From e5f4e68eed85fa8495d78cd966eecc2b27bb9e53 Mon Sep 17 00:00:00 2001 From: Doug Smythies Date: Mon, 3 Apr 2023 14:11:38 -0700 Subject: [PATCH 001/989] tools/power turbostat: Fix added raw MSR output When using --Summary mode, added MSRs in raw mode always print zeros. Print the actual register contents. Example, with patch: note the added column: --add msr0x64f,u32,package,raw,REASON Where: 0x64F is MSR_CORE_PERF_LIMIT_REASONS Busy% Bzy_MHz PkgTmp PkgWatt CorWatt REASON 0.00 4800 35 1.42 0.76 0x00000000 0.00 4801 34 1.42 0.76 0x00000000 80.08 4531 66 108.17 107.52 0x08000000 98.69 4530 66 133.21 132.54 0x08000000 99.28 4505 66 128.26 127.60 0x0c000400 99.65 4486 68 124.91 124.25 0x0c000400 99.63 4483 68 124.90 124.25 0x0c000400 79.34 4481 41 99.80 99.13 0x0c000000 0.00 4801 41 1.40 0.73 0x0c000000 Where, for the test processor (i5-10600K): PKG Limit #1: 125.000 Watts, 8.000000 sec MSR bit 26 = log; bit 10 = status PKG Limit #2: 136.000 Watts, 0.002441 sec MSR bit 27 = log; bit 11 = status Example, without patch: Busy% Bzy_MHz PkgTmp PkgWatt CorWatt REASON 0.01 4800 35 1.43 0.77 0x00000000 0.00 4801 35 1.39 0.73 0x00000000 83.49 4531 66 112.71 112.06 0x00000000 98.69 4530 68 133.35 132.69 0x00000000 99.31 4500 67 127.96 127.30 0x00000000 99.63 4483 69 124.91 124.25 0x00000000 99.61 4481 69 124.90 124.25 0x00000000 99.61 4481 71 124.92 124.25 0x00000000 59.35 4479 42 75.03 74.37 0x00000000 0.00 4800 42 1.39 0.73 0x00000000 0.00 4801 42 1.42 0.76 0x00000000 c000000 [lenb: simplified patch to apply only to package scope] Signed-off-by: Doug Smythies Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 7a334377f92b9..fca7913f6c84d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2444,9 +2444,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.rapl_dram_perf_status += p->rapl_dram_perf_status; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW) - continue; - average.packages.counter[i] += p->counter[i]; + if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0)) + average.packages.counter[i] = p->counter[i]; + else + average.packages.counter[i] += p->counter[i]; } return 0; } -- GitLab From 3ac1d14d0583a2de75d49a5234d767e2590384dd Mon Sep 17 00:00:00 2001 From: Wyes Karny Date: Tue, 3 Oct 2023 05:07:51 +0000 Subject: [PATCH 002/989] tools/power turbostat: Increase the limit for fd opened When running turbostat, a system with 512 cpus reaches the limit for maximum number of file descriptors that can be opened. To solve this problem, the limit is raised to 2^15, which is a large enough number. Below data is collected from AMD server systems while running turbostat: |-----------+-------------------------------| | # of cpus | # of opened fds for turbostat | |-----------+-------------------------------| | 128 | 260 | |-----------+-------------------------------| | 192 | 388 | |-----------+-------------------------------| | 512 | 1028 | |-----------+-------------------------------| So, the new max limit would be sufficient up to 2^14 cpus (but this also depends on how many counters are enabled). Reviewed-by: Doug Smythies Tested-by: Doug Smythies Signed-off-by: Wyes Karny Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index fca7913f6c84d..2550a0e35914f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -53,6 +53,8 @@ #define NAME_BYTES 20 #define PATH_BYTES 128 +#define MAX_NOFILE 0x8000 + enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; @@ -6705,6 +6707,22 @@ void cmdline(int argc, char **argv) } } +void set_rlimit(void) +{ + struct rlimit limit; + + if (getrlimit(RLIMIT_NOFILE, &limit) < 0) + err(1, "Failed to get rlimit"); + + if (limit.rlim_max < MAX_NOFILE) + limit.rlim_max = MAX_NOFILE; + if (limit.rlim_cur < MAX_NOFILE) + limit.rlim_cur = MAX_NOFILE; + + if (setrlimit(RLIMIT_NOFILE, &limit) < 0) + err(1, "Failed to set rlimit"); +} + int main(int argc, char **argv) { int fd, ret; @@ -6730,6 +6748,9 @@ skip_cgroup_setting: probe_sysfs(); + if (!getuid()) + set_rlimit(); + turbostat_init(); msr_sum_record(); -- GitLab From 0b13410b52c4636aacb6964a4253a797c0fa0d16 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Sat, 7 Oct 2023 13:46:22 +0800 Subject: [PATCH 003/989] tools/power turbostat: Fix Bzy_MHz documentation typo The code calculates Bzy_MHz by multiplying TSC_delta * APERF_delta/MPERF_delta The man page erroneously showed that TSC_delta was divided. Signed-off-by: Peng Liu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 8f08c3fd498d5..1ba6340d3b3da 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -370,7 +370,7 @@ below the processor's base frequency. Busy% = MPERF_delta/TSC_delta -Bzy_MHz = TSC_delta/APERF_delta/MPERF_delta/measurement_interval +Bzy_MHz = TSC_delta*APERF_delta/MPERF_delta/measurement_interval Note that these calculations depend on TSC_delta, so they are not reliable during intervals when TSC_MHz is not running at the base frequency. -- GitLab From 227ed18f456a68bbb69807294a9089208663a6d3 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Sun, 22 Oct 2023 13:52:21 +0800 Subject: [PATCH 004/989] tools/power turbostat: Do not print negative LPI residency turbostat prints the abnormal SYS%LPI across suspend-to-idle: SYS%LPI = 114479815993277.50 This is reproduced by: Run a freeze cycle, e.g. "sleepgraph -m freeze -rtcwake 15". Then do a reboot. After boot up, launch the suspend-idle-idle and check the SYS%LPI field. The slp_so residence counter is in LPIT table, and BIOS does not clears this register across reset. The PMC expects the OS to calculate the LPI residency based on the delta. However, there is an firmware issue that the LPIT gets cleared to 0 during the second suspend to idle after the reboot, which brings negative delta value. [lenb: updated to print "neg" upon this BIOS failure] Reported-by: Todd Brandt Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 30 +++++++++++++++++++-------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2550a0e35914f..c23703dd54aa1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -991,8 +991,8 @@ struct pkg_data { unsigned long long pc8; unsigned long long pc9; unsigned long long pc10; - unsigned long long cpu_lpi; - unsigned long long sys_lpi; + long long cpu_lpi; + long long sys_lpi; unsigned long long pkg_wtd_core_c0; unsigned long long pkg_any_core_c0; unsigned long long pkg_any_gfxe_c0; @@ -1978,12 +1978,22 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_Pkgpc10)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); - if (DO_BIC(BIC_CPU_LPI)) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->cpu_lpi / 1000000.0 / interval_float); - if (DO_BIC(BIC_SYS_LPI)) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float); + if (DO_BIC(BIC_CPU_LPI)) { + if (p->cpu_lpi >= 0) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + 100.0 * p->cpu_lpi / 1000000.0 / interval_float); + else + outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); + } + if (DO_BIC(BIC_SYS_LPI)) { + if (p->sys_lpi >= 0) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + 100.0 * p->sys_lpi / 1000000.0 / interval_float); + else + outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); + } if (DO_BIC(BIC_PkgWatt)) outp += @@ -3832,7 +3842,8 @@ void re_initialize(void) { free_all_buffers(); setup_all_buffers(false); - fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); + fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, + topo.allowed_cpus); } void set_max_cpu_num(void) @@ -6145,6 +6156,7 @@ void topology_update(void) topo.allowed_packages = 0; for_all_cpus(update_topo, ODD_COUNTERS); } + void setup_all_buffers(bool startup) { topology_probe(startup); -- GitLab From bb6181fa6bc942aac3f7f2fa8e3831952a2ef118 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 20 Dec 2023 13:11:05 -0500 Subject: [PATCH 005/989] tools/power turbostat: Expand probe_intel_uncore_frequency() Print current frequency along with the current (and initial) limits Probe and print uncore config also for machines using the new cluster API Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 84 ++++++++++++++++++++------- 1 file changed, 63 insertions(+), 21 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c23703dd54aa1..bbd2e0edadfae 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4581,20 +4581,15 @@ static void dump_sysfs_file(char *path) static void probe_intel_uncore_frequency(void) { int i, j; - char path[128]; + char path[256]; if (!genuine_intel) return; - if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00", R_OK)) - return; - - /* Cluster level sysfs not supported yet. */ - if (!access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK)) - return; + if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) + goto probe_cluster; - if (!access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) - BIC_PRESENT(BIC_UNCORE_MHZ); + BIC_PRESENT(BIC_UNCORE_MHZ); if (quiet) return; @@ -4602,26 +4597,73 @@ static void probe_intel_uncore_frequency(void) for (i = 0; i < topo.num_packages; ++i) { for (j = 0; j < topo.num_die; ++j) { int k, l; + char path_base[128]; + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i, + j); - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/min_freq_khz", - i, j); + sprintf(path, "%s/min_freq_khz", path_base); k = read_sysfs_int(path); - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/max_freq_khz", - i, j); + sprintf(path, "%s/max_freq_khz", path_base); l = read_sysfs_int(path); - fprintf(outf, "Uncore Frequency pkg%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000); + fprintf(outf, "Uncore Frequency package%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000); - sprintf(path, - "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/initial_min_freq_khz", - i, j); + sprintf(path, "%s/initial_min_freq_khz", path_base); k = read_sysfs_int(path); - sprintf(path, - "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/initial_max_freq_khz", - i, j); + sprintf(path, "%s/initial_max_freq_khz", path_base); l = read_sysfs_int(path); - fprintf(outf, "(%d - %d MHz)\n", k / 1000, l / 1000); + fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000); + + sprintf(path, "%s/current_freq_khz", path_base); + k = read_sysfs_int(path); + fprintf(outf, " %d MHz\n", k / 1000); } } + return; + +probe_cluster: + if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK)) + return; + + if (quiet) + return; + + for (i = 0;; ++i) { + int k, l; + char path_base[128]; + int package_id, domain_id, cluster_id; + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i); + + if (access(path_base, R_OK)) + break; + + sprintf(path, "%s/package_id", path_base); + package_id = read_sysfs_int(path); + + sprintf(path, "%s/domain_id", path_base); + domain_id = read_sysfs_int(path); + + sprintf(path, "%s/fabric_cluster_id", path_base); + cluster_id = read_sysfs_int(path); + + sprintf(path, "%s/min_freq_khz", path_base); + k = read_sysfs_int(path); + sprintf(path, "%s/max_freq_khz", path_base); + l = read_sysfs_int(path); + fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id, + cluster_id, k / 1000, l / 1000); + + sprintf(path, "%s/initial_min_freq_khz", path_base); + k = read_sysfs_int(path); + sprintf(path, "%s/initial_max_freq_khz", path_base); + l = read_sysfs_int(path); + fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000); + + sprintf(path, "%s/current_freq_khz", path_base); + k = read_sysfs_int(path); + fprintf(outf, " %d MHz\n", k / 1000); + } } static void probe_graphics(void) -- GitLab From fb5ceca046efc84f69fcf9779a013f8a0e63bbff Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 12 Jan 2024 13:48:14 +0100 Subject: [PATCH 006/989] tools/power turbostat: Print ucode revision only if valid If the MSR read were to fail, turbostat would print "microcode 0x0" Signed-off-by: Patryk Wlazlyn Reviewed-by: Len Brown Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bbd2e0edadfae..a4a40a6e1b957 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5679,6 +5679,7 @@ void process_cpuid() unsigned int eax, ebx, ecx, edx; unsigned int fms, family, model, stepping, ecx_flags, edx_flags; unsigned long long ucode_patch = 0; + bool ucode_patch_valid = false; eax = ebx = ecx = edx = 0; @@ -5708,6 +5709,8 @@ void process_cpuid() if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) warnx("get_msr(UCODE)"); + else + ucode_patch_valid = true; /* * check max extended function levels of CPUID. @@ -5718,9 +5721,12 @@ void process_cpuid() __cpuid(0x80000000, max_extended_level, ebx, ecx, edx); if (!quiet) { - fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n", - family, model, stepping, family, model, stepping, - (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)", + family, model, stepping, family, model, stepping); + if (ucode_patch_valid) + fprintf(outf, " microcode 0x%x", (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + fputc('\n', outf); + fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level); fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n", ecx_flags & (1 << 0) ? "SSE3" : "-", -- GitLab From ccb2280ec2f9e805d70f57a3a1c5deff0d532cb3 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 25 Oct 2023 01:59:13 -0400 Subject: [PATCH 007/989] x86/kvm: Use separate percpu variable to track the enabling of asyncpf Refer to commit fd10cde9294f ("KVM paravirt: Add async PF initialization to PV guest") and commit 344d9588a9df ("KVM: Add PV MSR to enable asynchronous page faults delivery"). It turns out that at the time when asyncpf was introduced, the purpose was defining the shared PV data 'struct kvm_vcpu_pv_apf_data' with the size of 64 bytes. However, it made a mistake and defined the size to 68 bytes, which failed to make fit in a cache line and made the code inconsistent with the documentation. Below justification quoted from Sean[*] KVM (the host side) has *never* read kvm_vcpu_pv_apf_data.enabled, and the documentation clearly states that enabling is based solely on the bit in the synthetic MSR. So rather than update the documentation, fix the goof by removing the enabled filed and use the separate percpu variable instread. KVM-as-a-host obviously doesn't enforce anything or consume the size, and changing the header will only affect guests that are rebuilt against the new header, so there's no chance of ABI breakage between KVM and its guests. The only possible breakage is if some other hypervisor is emulating KVM's async #PF (LOL) and relies on the guest to set kvm_vcpu_pv_apf_data.enabled. But (a) I highly doubt such a hypervisor exists, (b) that would arguably be a violation of KVM's "spec", and (c) the worst case scenario is that the guest would simply lose async #PF functionality. [*] https://lore.kernel.org/all/ZS7ERnnRqs8Fl0ZF@google.com/T/#u Suggested-by: Sean Christopherson Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20231025055914.1201792-2-xiaoyao.li@intel.com [sean: use true/false instead of 1/0 for booleans] Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/x86/msr.rst | 1 - arch/x86/include/uapi/asm/kvm_para.h | 1 - arch/x86/kernel/kvm.c | 11 ++++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/Documentation/virt/kvm/x86/msr.rst b/Documentation/virt/kvm/x86/msr.rst index 9315fc385fb0b..f6d70f99a1a73 100644 --- a/Documentation/virt/kvm/x86/msr.rst +++ b/Documentation/virt/kvm/x86/msr.rst @@ -204,7 +204,6 @@ data: __u32 token; __u8 pad[56]; - __u32 enabled; }; Bits 5-4 of the MSR are reserved and should be zero. Bit 0 is set to 1 diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 6e64b27b2c1ee..605899594ebb8 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -142,7 +142,6 @@ struct kvm_vcpu_pv_apf_data { __u32 token; __u8 pad[56]; - __u32 enabled; }; #define KVM_PV_EOI_BIT 0 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index dfe9945b9bece..e34f985c9a6fd 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -65,6 +65,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); +static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled); static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; @@ -244,7 +245,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void) { u32 flags = 0; - if (__this_cpu_read(apf_reason.enabled)) { + if (__this_cpu_read(async_pf_enabled)) { flags = __this_cpu_read(apf_reason.flags); __this_cpu_write(apf_reason.flags, 0); } @@ -295,7 +296,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) inc_irq_stat(irq_hv_callback_count); - if (__this_cpu_read(apf_reason.enabled)) { + if (__this_cpu_read(async_pf_enabled)) { token = __this_cpu_read(apf_reason.token); kvm_async_pf_task_wake(token); __this_cpu_write(apf_reason.token, 0); @@ -362,7 +363,7 @@ static void kvm_guest_cpu_init(void) wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); - __this_cpu_write(apf_reason.enabled, 1); + __this_cpu_write(async_pf_enabled, true); pr_debug("setup async PF for cpu %d\n", smp_processor_id()); } @@ -383,11 +384,11 @@ static void kvm_guest_cpu_init(void) static void kvm_pv_disable_apf(void) { - if (!__this_cpu_read(apf_reason.enabled)) + if (!__this_cpu_read(async_pf_enabled)) return; wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); - __this_cpu_write(apf_reason.enabled, 0); + __this_cpu_write(async_pf_enabled, false); pr_debug("disable async PF for cpu %d\n", smp_processor_id()); } -- GitLab From df01f0a1165c35e95b5f52c7ba25c19020352ff9 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 25 Oct 2023 01:59:14 -0400 Subject: [PATCH 008/989] KVM: x86: Improve documentation of MSR_KVM_ASYNC_PF_EN Fix some incorrect statement of MSR_KVM_ASYNC_PF_EN documentation and state clearly the token in 'struct kvm_vcpu_pv_apf_data' of 'page ready' event is matchted with the token in CR2 in 'page not present' event. Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20231025055914.1201792-3-xiaoyao.li@intel.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/x86/msr.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/virt/kvm/x86/msr.rst b/Documentation/virt/kvm/x86/msr.rst index f6d70f99a1a73..3aecf2a70e7b4 100644 --- a/Documentation/virt/kvm/x86/msr.rst +++ b/Documentation/virt/kvm/x86/msr.rst @@ -193,8 +193,8 @@ data: Asynchronous page fault (APF) control MSR. Bits 63-6 hold 64-byte aligned physical address of a 64 byte memory area - which must be in guest RAM and must be zeroed. This memory is expected - to hold a copy of the following structure:: + which must be in guest RAM. This memory is expected to hold the + following structure:: struct kvm_vcpu_pv_apf_data { /* Used for 'page not present' events delivered via #PF */ @@ -231,14 +231,14 @@ data: as regular page fault, guest must reset 'flags' to '0' before it does something that can generate normal page fault. - Bytes 5-7 of 64 byte memory location ('token') will be written to by the + Bytes 4-7 of 64 byte memory location ('token') will be written to by the hypervisor at the time of APF 'page ready' event injection. The content - of these bytes is a token which was previously delivered as 'page not - present' event. The event indicates the page in now available. Guest is - supposed to write '0' to 'token' when it is done handling 'page ready' - event and to write 1' to MSR_KVM_ASYNC_PF_ACK after clearing the location; - writing to the MSR forces KVM to re-scan its queue and deliver the next - pending notification. + of these bytes is a token which was previously delivered in CR2 as + 'page not present' event. The event indicates the page is now available. + Guest is supposed to write '0' to 'token' when it is done handling + 'page ready' event and to write '1' to MSR_KVM_ASYNC_PF_ACK after + clearing the location; writing to the MSR forces KVM to re-scan its + queue and deliver the next pending notification. Note, MSR_KVM_ASYNC_PF_INT MSR specifying the interrupt vector for 'page ready' APF delivery needs to be written to before enabling APF mechanism -- GitLab From cc4ce37bed85989daf8f38bf19ea591f3b36fb0c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 31 Jan 2024 15:56:06 -0800 Subject: [PATCH 009/989] KVM: SVM: Set sev->asid in sev_asid_new() instead of overloading the return Explicitly set sev->asid in sev_asid_new() when a new ASID is successfully allocated, and return '0' to indicate success instead of overloading the return value to multiplex the ASID with error codes. There is exactly one caller of sev_asid_new(), and sev_asid_free() already consumes sev->asid, i.e. returning the ASID isn't necessary for flexibility, nor does it provide symmetry between related APIs. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240131235609.4161407-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f760106c31f8a..7c000088bca61 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -179,7 +179,8 @@ again: mutex_unlock(&sev_bitmap_lock); - return asid; + sev->asid = asid; + return 0; e_uncharge: sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); @@ -246,7 +247,7 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - int asid, ret; + int ret; if (kvm->created_vcpus) return -EINVAL; @@ -257,10 +258,9 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) sev->active = true; sev->es_active = argp->id == KVM_SEV_ES_INIT; - asid = sev_asid_new(sev); - if (asid < 0) + ret = sev_asid_new(sev); + if (ret) goto e_no_asid; - sev->asid = asid; ret = sev_platform_init(&argp->error); if (ret) -- GitLab From 466eec4a22a76c462781bf6d45cb02cbedf21a61 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 31 Jan 2024 15:56:07 -0800 Subject: [PATCH 010/989] KVM: SVM: Use unsigned integers when dealing with ASIDs Convert all local ASID variables and parameters throughout the SEV code from signed integers to unsigned integers. As ASIDs are fundamentally unsigned values, and the global min/max variables are appropriately unsigned integers, too. Functionally, this is a glorified nop as KVM guarantees min_sev_asid is non-zero, and no CPU supports -1u as the _only_ asid, i.e. the signed vs. unsigned goof won't cause problems in practice. Opportunistically use sev_get_asid() in sev_flush_encrypted_page() instead of open coding an equivalent. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240131235609.4161407-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 18 ++++++++++-------- arch/x86/kvm/trace.h | 10 +++++----- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7c000088bca61..eeef43c795d8d 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -84,9 +84,10 @@ struct enc_region { }; /* Called with the sev_bitmap_lock held, or on shutdown */ -static int sev_flush_asids(int min_asid, int max_asid) +static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) { - int ret, asid, error = 0; + int ret, error = 0; + unsigned int asid; /* Check if there are any ASIDs to reclaim before performing a flush */ asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid); @@ -116,7 +117,7 @@ static inline bool is_mirroring_enc_context(struct kvm *kvm) } /* Must be called with the sev_bitmap_lock held */ -static bool __sev_recycle_asids(int min_asid, int max_asid) +static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid) { if (sev_flush_asids(min_asid, max_asid)) return false; @@ -143,8 +144,9 @@ static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) static int sev_asid_new(struct kvm_sev_info *sev) { - int asid, min_asid, max_asid, ret; + unsigned int asid, min_asid, max_asid; bool retry = true; + int ret; WARN_ON(sev->misc_cg); sev->misc_cg = get_current_misc_cg(); @@ -188,7 +190,7 @@ e_uncharge: return ret; } -static int sev_get_asid(struct kvm *kvm) +static unsigned int sev_get_asid(struct kvm *kvm) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -284,8 +286,8 @@ e_no_asid: static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) { + unsigned int asid = sev_get_asid(kvm); struct sev_data_activate activate; - int asid = sev_get_asid(kvm); int ret; /* activate ASID on the given handle */ @@ -2312,7 +2314,7 @@ int sev_cpu_init(struct svm_cpu_data *sd) */ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) { - int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid; + unsigned int asid = sev_get_asid(vcpu->kvm); /* * Note! The address must be a kernel address, as regular page walk @@ -2630,7 +2632,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) void pre_sev_run(struct vcpu_svm *svm, int cpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - int asid = sev_get_asid(svm->vcpu.kvm); + unsigned int asid = sev_get_asid(svm->vcpu.kvm); /* Assign the asid allocated with this SEV guest */ svm->asid = asid; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 83843379813ee..b82e6ed4f0241 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -732,13 +732,13 @@ TRACE_EVENT(kvm_nested_intr_vmexit, * Tracepoint for nested #vmexit because of interrupt pending */ TRACE_EVENT(kvm_invlpga, - TP_PROTO(__u64 rip, int asid, u64 address), + TP_PROTO(__u64 rip, unsigned int asid, u64 address), TP_ARGS(rip, asid, address), TP_STRUCT__entry( - __field( __u64, rip ) - __field( int, asid ) - __field( __u64, address ) + __field( __u64, rip ) + __field( unsigned int, asid ) + __field( __u64, address ) ), TP_fast_assign( @@ -747,7 +747,7 @@ TRACE_EVENT(kvm_invlpga, __entry->address = address; ), - TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx", + TP_printk("rip: 0x%016llx asid: %u address: 0x%016llx", __entry->rip, __entry->asid, __entry->address) ); -- GitLab From 0aa6b90ef9d75b4bd7b6d106d85f2a3437697f91 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Wed, 31 Jan 2024 15:56:08 -0800 Subject: [PATCH 011/989] KVM: SVM: Add support for allowing zero SEV ASIDs Some BIOSes allow the end user to set the minimum SEV ASID value (CPUID 0x8000001F_EDX) to be greater than the maximum number of encrypted guests, or maximum SEV ASID value (CPUID 0x8000001F_ECX) in order to dedicate all the SEV ASIDs to SEV-ES or SEV-SNP. The SEV support, as coded, does not handle the case where the minimum SEV ASID value can be greater than the maximum SEV ASID value. As a result, the following confusing message is issued: [ 30.715724] kvm_amd: SEV enabled (ASIDs 1007 - 1006) Fix the support to properly handle this case. Fixes: 916391a2d1dc ("KVM: SVM: Add support for SEV-ES capability in KVM") Suggested-by: Sean Christopherson Signed-off-by: Ashish Kalra Cc: stable@vger.kernel.org Acked-by: Tom Lendacky Link: https://lore.kernel.org/r/20240104190520.62510-1-Ashish.Kalra@amd.com Link: https://lore.kernel.org/r/20240131235609.4161407-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index eeef43c795d8d..5f8312edee36b 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -144,10 +144,21 @@ static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) static int sev_asid_new(struct kvm_sev_info *sev) { - unsigned int asid, min_asid, max_asid; + /* + * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. + * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. + * Note: min ASID can end up larger than the max if basic SEV support is + * effectively disabled by disallowing use of ASIDs for SEV guests. + */ + unsigned int min_asid = sev->es_active ? 1 : min_sev_asid; + unsigned int max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; + unsigned int asid; bool retry = true; int ret; + if (min_asid > max_asid) + return -ENOTTY; + WARN_ON(sev->misc_cg); sev->misc_cg = get_current_misc_cg(); ret = sev_misc_cg_try_charge(sev); @@ -159,12 +170,6 @@ static int sev_asid_new(struct kvm_sev_info *sev) mutex_lock(&sev_bitmap_lock); - /* - * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. - * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. - */ - min_asid = sev->es_active ? 1 : min_sev_asid; - max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; again: asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid); if (asid > max_asid) { @@ -2234,8 +2239,10 @@ void __init sev_hardware_setup(void) goto out; } - sev_asid_count = max_sev_asid - min_sev_asid + 1; - WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); + if (min_sev_asid <= max_sev_asid) { + sev_asid_count = max_sev_asid - min_sev_asid + 1; + WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); + } sev_supported = true; /* SEV-ES support requested? */ @@ -2266,7 +2273,9 @@ void __init sev_hardware_setup(void) out: if (boot_cpu_has(X86_FEATURE_SEV)) pr_info("SEV %s (ASIDs %u - %u)\n", - sev_supported ? "enabled" : "disabled", + sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" : + "unusable" : + "disabled", min_sev_asid, max_sev_asid); if (boot_cpu_has(X86_FEATURE_SEV_ES)) pr_info("SEV-ES %s (ASIDs %u - %u)\n", -- GitLab From fdd58834d132046149699b88a27a0db26829f4fb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 31 Jan 2024 15:56:09 -0800 Subject: [PATCH 012/989] KVM: SVM: Return -EINVAL instead of -EBUSY on attempt to re-init SEV/SEV-ES Return -EINVAL instead of -EBUSY if userspace attempts KVM_SEV{,ES}_INIT on a VM that already has SEV active. Returning -EBUSY is nonsencial as it's impossible to deactivate SEV without destroying the VM, i.e. the VM isn't "busy" in any sane sense of the word, and the odds of any userspace wanting exactly -EBUSY on a userspace bug are minuscule. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240131235609.4161407-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5f8312edee36b..f06f9e51ad9db 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -259,9 +259,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) if (kvm->created_vcpus) return -EINVAL; - ret = -EBUSY; if (unlikely(sev->active)) - return ret; + return -EINVAL; sev->active = true; sev->es_active = argp->id == KVM_SEV_ES_INIT; -- GitLab From 7013482ff5945aee7390c55ababcb390de1a4aad Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 11 Feb 2024 20:33:41 -0800 Subject: [PATCH 013/989] 9p/trans_fd: remove Excess kernel-doc comment Remove the "@req" kernel-doc description since there is not 'req' member in the struct p9_conn. Fixes one kernel-doc warning: trans_fd.c:133: warning: Excess struct member 'req' description in 'p9_conn' Signed-off-by: Randy Dunlap Cc: Eric Van Hensbergen Cc: Latchesar Ionkov Cc: Dominique Martinet Cc: v9fs@lists.linux.dev Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: netdev@vger.kernel.org Reviewed-by: Simon Horman Message-ID: <20240212043341.4631-1-rdunlap@infradead.org> Signed-off-by: Dominique Martinet --- net/9p/trans_fd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 1a3948b8c493e..196060dc6138a 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -95,7 +95,6 @@ struct p9_poll_wait { * @unsent_req_list: accounting for requests that haven't been sent * @rreq: read request * @wreq: write request - * @req: current request being processed (if any) * @tmp_buf: temporary buffer to read in header * @rc: temporary fcall for reading current frame * @wpos: write position for current frame -- GitLab From be3193e58ec210b2a72fb1134c2a0695088a911d Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Tue, 9 Jan 2024 12:39:03 +0900 Subject: [PATCH 014/989] 9p: Fix read/write debug statements to report server reply Previous conversion to iov missed these debug statements which would now always print the requested size instead of the actual server reply. Write also added a loop in a much older commit but we didn't report these, while reads do report each iteration -- it's more coherent to keep reporting all requests to server so move that at the same time. Fixes: 7f02464739da ("9p: convert to advancing variant of iov_iter_get_pages_alloc()") Signed-off-by: Dominique Martinet Message-ID: <20240109-9p-rw-trace-v1-1-327178114257@codewreck.org> --- net/9p/client.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index e265a0ca6bddd..f7e90b4769bba 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1583,7 +1583,7 @@ p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to, received = rsize; } - p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); + p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", received); if (non_zc) { int n = copy_to_iter(dataptr, received, to); @@ -1609,9 +1609,6 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) int total = 0; *err = 0; - p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %zd\n", - fid->fid, offset, iov_iter_count(from)); - while (iov_iter_count(from)) { int count = iov_iter_count(from); int rsize = fid->iounit; @@ -1623,6 +1620,9 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) if (count < rsize) rsize = count; + p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %d (/%d)\n", + fid->fid, offset, rsize, count); + /* Don't bother zerocopy for small IO (< 1024) */ if (clnt->trans_mod->zc_request && rsize > 1024) { req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0, @@ -1650,7 +1650,7 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) written = rsize; } - p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); + p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", written); p9_req_put(clnt, req); iov_iter_revert(from, count - written - iov_iter_count(from)); -- GitLab From 2a0505cdd8c8b12670f4b5a6eb5c996c0861c2d5 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:46:17 +0000 Subject: [PATCH 015/989] 9p: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed as of v6.8-rc1, so it became a dead flag since the commit 16a1d968358a ("mm/slab: remove mm/slab.c and slab_def.h"). And the series[1] went on to mark it obsolete to avoid confusion for users. Here we can just remove all its users, which has no functional change. Link: https://lore.kernel.org/all/20240223-slab-cleanup-flags-v2-1-02f1753e8303@suse.cz/ [1] Signed-off-by: Chengming Zhou Message-ID: <20240224134617.829016-1-chengming.zhou@linux.dev> Signed-off-by: Dominique Martinet --- fs/9p/v9fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 61dbe52bb3a32..281a1ed03a041 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -637,7 +637,7 @@ static int v9fs_init_inode_cache(void) v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", sizeof(struct v9fs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), v9fs_inode_init_once); if (!v9fs_inode_cache) return -ENOMEM; -- GitLab From 92e82cf632e85474e1e19a6f1cae813e87b07965 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 28 Feb 2024 11:18:35 +0100 Subject: [PATCH 016/989] KVM: x86: Introduce __kvm_get_hypervisor_cpuid() helper Similar to kvm_find_kvm_cpuid_features()/__kvm_find_kvm_cpuid_features(), introduce a helper to search for the specific hypervisor signature in any struct kvm_cpuid_entry2 array, not only in vcpu->arch.cpuid_entries. No functional change intended. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240228101837.93642-2-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index adba49afb5fe6..0e1e3e5df6dd4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -189,15 +189,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 return 0; } -static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, - const char *sig) +static struct kvm_hypervisor_cpuid __kvm_get_hypervisor_cpuid(struct kvm_cpuid_entry2 *entries, + int nent, const char *sig) { struct kvm_hypervisor_cpuid cpuid = {}; struct kvm_cpuid_entry2 *entry; u32 base; for_each_possible_hypervisor_cpuid_base(base) { - entry = kvm_find_cpuid_entry(vcpu, base); + entry = cpuid_entry2_find(entries, nent, base, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (entry) { u32 signature[3]; @@ -217,6 +217,13 @@ static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcp return cpuid; } +static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, + const char *sig) +{ + return __kvm_get_hypervisor_cpuid(vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent, sig); +} + static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, int nent) { -- GitLab From 4736d85f0d18ad0469439a0ebc7ccb0cd94bd754 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 28 Feb 2024 11:18:36 +0100 Subject: [PATCH 017/989] KVM: x86: Use actual kvm_cpuid.base for clearing KVM_FEATURE_PV_UNHALT Commit ee3a5f9e3d9b ("KVM: x86: Do runtime CPUID update before updating vcpu->arch.cpuid_entries") moved tweaking of the supplied CPUID data earlier in kvm_set_cpuid() but __kvm_update_cpuid_runtime() actually uses 'vcpu->arch.kvm_cpuid' (though __kvm_find_kvm_cpuid_features()) which gets set later in kvm_set_cpuid(). In some cases, e.g. when kvm_set_cpuid() is called for the first time and 'vcpu->arch.kvm_cpuid' is clear, __kvm_find_kvm_cpuid_features() fails to find KVM PV feature entry and the logic which clears KVM_FEATURE_PV_UNHALT after enabling KVM_X86_DISABLE_EXITS_HLT does not work. The logic, introduced by the commit ee3a5f9e3d9b ("KVM: x86: Do runtime CPUID update before updating vcpu->arch.cpuid_entries") must stay: the supplied CPUID data is tweaked by KVM first (__kvm_update_cpuid_runtime()) and checked later (kvm_check_cpuid()) and the actual data (vcpu->arch.cpuid_*, vcpu->arch.kvm_cpuid, vcpu->arch.xen.cpuid,..) is only updated on success. Switch to searching for KVM_SIGNATURE in the supplied CPUID data to discover KVM PV feature entry instead of using stale 'vcpu->arch.kvm_cpuid'. While on it, drop pointless "&& (best->eax & (1 << KVM_FEATURE_PV_UNHALT)" check when clearing KVM_FEATURE_PV_UNHALT bit. Fixes: ee3a5f9e3d9b ("KVM: x86: Do runtime CPUID update before updating vcpu->arch.cpuid_entries") Reported-and-tested-by: Li RongQing Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240228101837.93642-3-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0e1e3e5df6dd4..bfc0bfcb2bc60 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -224,22 +224,22 @@ static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcp vcpu->arch.cpuid_nent, sig); } -static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, - struct kvm_cpuid_entry2 *entries, int nent) +static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_cpuid_entry2 *entries, + int nent, u32 kvm_cpuid_base) { - u32 base = vcpu->arch.kvm_cpuid.base; - - if (!base) - return NULL; - - return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, + return cpuid_entry2_find(entries, nent, kvm_cpuid_base | KVM_CPUID_FEATURES, KVM_CPUID_INDEX_NOT_SIGNIFICANT); } static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) { - return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent); + u32 base = vcpu->arch.kvm_cpuid.base; + + if (!base) + return NULL; + + return __kvm_find_kvm_cpuid_features(vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent, base); } void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) @@ -273,6 +273,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e int nent) { struct kvm_cpuid_entry2 *best; + struct kvm_hypervisor_cpuid kvm_cpuid; best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { @@ -299,10 +300,12 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent); - if (kvm_hlt_in_guest(vcpu->kvm) && best && - (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) - best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + kvm_cpuid = __kvm_get_hypervisor_cpuid(entries, nent, KVM_SIGNATURE); + if (kvm_cpuid.base) { + best = __kvm_find_kvm_cpuid_features(entries, nent, kvm_cpuid.base); + if (kvm_hlt_in_guest(vcpu->kvm) && best) + best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + } if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); -- GitLab From c2585047c8e185b070ad5c7bd887ef59cee3941f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 28 Feb 2024 11:18:37 +0100 Subject: [PATCH 018/989] KVM: selftests: Check that PV_UNHALT is cleared when HLT exiting is disabled KVM_FEATURE_PV_UNHALT is expected to get cleared from KVM PV feature CPUID data when KVM_X86_DISABLE_EXITS_HLT is enabled. Add the corresponding test to kvm_pv_test. Note, the newly added code doesn't actually test KVM_FEATURE_PV_UNHALT and KVM_X86_DISABLE_EXITS_HLT features. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240228101837.93642-4-vkuznets@redhat.com [sean: add and use vcpu_cpuid_has()] Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86_64/processor.h | 9 +++++ .../selftests/kvm/x86_64/kvm_pv_test.c | 39 +++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index a84863503fcb4..8b5c804562f99 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -995,6 +995,15 @@ static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu) vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); } +static inline bool vcpu_cpuid_has(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature) +{ + struct kvm_cpuid_entry2 *entry; + + entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index); + return *((&entry->eax) + feature.reg) & BIT(feature.bit); +} + void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr); void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function); diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index 9e2879af7c201..40cc59f4e6501 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -133,6 +133,43 @@ static void enter_guest(struct kvm_vcpu *vcpu) } } +static void test_pv_unhalt(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_cpuid_entry2 *ent; + u32 kvm_sig_old; + + pr_info("testing KVM_FEATURE_PV_UNHALT\n"); + + TEST_REQUIRE(KVM_CAP_X86_DISABLE_EXITS); + + /* KVM_PV_UNHALT test */ + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_KVM_PV_UNHALT); + + TEST_ASSERT(vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), + "Enabling X86_FEATURE_KVM_PV_UNHALT had no effect"); + + /* Make sure KVM clears vcpu->arch.kvm_cpuid */ + ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); + kvm_sig_old = ent->ebx; + ent->ebx = 0xdeadbeef; + vcpu_set_cpuid(vcpu); + + vm_enable_cap(vm, KVM_CAP_X86_DISABLE_EXITS, KVM_X86_DISABLE_EXITS_HLT); + ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); + ent->ebx = kvm_sig_old; + vcpu_set_cpuid(vcpu); + + TEST_ASSERT(!vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), + "KVM_FEATURE_PV_UNHALT is set with KVM_CAP_X86_DISABLE_EXITS"); + + /* FIXME: actually test KVM_FEATURE_PV_UNHALT feature */ + + kvm_vm_free(vm); +} + int main(void) { struct kvm_vcpu *vcpu; @@ -151,4 +188,6 @@ int main(void) enter_guest(vcpu); kvm_vm_free(vm); + + test_pv_unhalt(); } -- GitLab From f193957b0fbbba397c8bddedf158b3bf7e4850fc Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 7 Mar 2024 11:02:27 +0000 Subject: [PATCH 019/989] ASoC: wm_adsp: Fix missing mutex_lock in wm_adsp_write_ctl() wm_adsp_write_ctl() must hold the pwr_lock mutex when calling cs_dsp_get_ctl(). This was previously partially fixed by commit 781118bc2fc1 ("ASoC: wm_adsp: Fix missing locking in wm_adsp_[read|write]_ctl()") but this only put locking around the call to cs_dsp_coeff_write_ctrl(), missing the call to cs_dsp_get_ctl(). Signed-off-by: Richard Fitzgerald Fixes: 781118bc2fc1 ("ASoC: wm_adsp: Fix missing locking in wm_adsp_[read|write]_ctl()") Link: https://msgid.link/r/20240307110227.41421-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/wm_adsp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index 36ea0dcdc7ab0..9cb9068c0462a 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -683,11 +683,12 @@ static void wm_adsp_control_remove(struct cs_dsp_coeff_ctl *cs_ctl) int wm_adsp_write_ctl(struct wm_adsp *dsp, const char *name, int type, unsigned int alg, void *buf, size_t len) { - struct cs_dsp_coeff_ctl *cs_ctl = cs_dsp_get_ctl(&dsp->cs_dsp, name, type, alg); + struct cs_dsp_coeff_ctl *cs_ctl; struct wm_coeff_ctl *ctl; int ret; mutex_lock(&dsp->cs_dsp.pwr_lock); + cs_ctl = cs_dsp_get_ctl(&dsp->cs_dsp, name, type, alg); ret = cs_dsp_coeff_write_ctrl(cs_ctl, 0, buf, len); mutex_unlock(&dsp->cs_dsp.pwr_lock); -- GitLab From 95f37eb52e18879a1b16e51b972d992b39e50a81 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:35 +0200 Subject: [PATCH 020/989] ARM: OMAP2+: fix bogus MMC GPIO labels on Nokia N8x0 The GPIO bank width is 32 on OMAP2, so all labels are incorrect. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-2-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/board-n8x0.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/arm/mach-omap2/board-n8x0.c b/arch/arm/mach-omap2/board-n8x0.c index 31755a378c736..3e48f34016c19 100644 --- a/arch/arm/mach-omap2/board-n8x0.c +++ b/arch/arm/mach-omap2/board-n8x0.c @@ -144,8 +144,7 @@ static struct gpiod_lookup_table nokia8xx_mmc_gpio_table = { .dev_id = "mmci-omap.0", .table = { /* Slot switch, GPIO 96 */ - GPIO_LOOKUP("gpio-80-111", 16, - "switch", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-96-127", 0, "switch", GPIO_ACTIVE_HIGH), { } }, }; @@ -154,11 +153,9 @@ static struct gpiod_lookup_table nokia810_mmc_gpio_table = { .dev_id = "mmci-omap.0", .table = { /* Slot index 1, VSD power, GPIO 23 */ - GPIO_LOOKUP_IDX("gpio-16-31", 7, - "vsd", 1, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("gpio-0-31", 23, "vsd", 1, GPIO_ACTIVE_HIGH), /* Slot index 1, VIO power, GPIO 9 */ - GPIO_LOOKUP_IDX("gpio-0-15", 9, - "vio", 1, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("gpio-0-31", 9, "vio", 1, GPIO_ACTIVE_HIGH), { } }, }; -- GitLab From 480d44d0820dd5ae043dc97c0b46dabbe53cb1cf Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:36 +0200 Subject: [PATCH 021/989] ARM: OMAP2+: fix N810 MMC gpiod table Trying to append a second table for the same dev_id doesn't seem to work. The second table is just silently ignored. As a result eMMC GPIOs are not present. Fix by using separate tables for N800 and N810. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-3-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/board-n8x0.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm/mach-omap2/board-n8x0.c b/arch/arm/mach-omap2/board-n8x0.c index 3e48f34016c19..c933a91751e4f 100644 --- a/arch/arm/mach-omap2/board-n8x0.c +++ b/arch/arm/mach-omap2/board-n8x0.c @@ -140,7 +140,7 @@ static int slot1_cover_open; static int slot2_cover_open; static struct device *mmc_device; -static struct gpiod_lookup_table nokia8xx_mmc_gpio_table = { +static struct gpiod_lookup_table nokia800_mmc_gpio_table = { .dev_id = "mmci-omap.0", .table = { /* Slot switch, GPIO 96 */ @@ -152,6 +152,8 @@ static struct gpiod_lookup_table nokia8xx_mmc_gpio_table = { static struct gpiod_lookup_table nokia810_mmc_gpio_table = { .dev_id = "mmci-omap.0", .table = { + /* Slot switch, GPIO 96 */ + GPIO_LOOKUP("gpio-96-127", 0, "switch", GPIO_ACTIVE_HIGH), /* Slot index 1, VSD power, GPIO 23 */ GPIO_LOOKUP_IDX("gpio-0-31", 23, "vsd", 1, GPIO_ACTIVE_HIGH), /* Slot index 1, VIO power, GPIO 9 */ @@ -412,8 +414,6 @@ static struct omap_mmc_platform_data *mmc_data[OMAP24XX_NR_MMC]; static void __init n8x0_mmc_init(void) { - gpiod_add_lookup_table(&nokia8xx_mmc_gpio_table); - if (board_is_n810()) { mmc1_data.slots[0].name = "external"; @@ -426,6 +426,8 @@ static void __init n8x0_mmc_init(void) mmc1_data.slots[1].name = "internal"; mmc1_data.slots[1].ban_openended = 1; gpiod_add_lookup_table(&nokia810_mmc_gpio_table); + } else { + gpiod_add_lookup_table(&nokia800_mmc_gpio_table); } mmc1_data.nr_slots = 2; -- GitLab From d4debbcbffa45c3de5df0040af2eea74a9e794a3 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:37 +0200 Subject: [PATCH 022/989] mmc: omap: fix broken slot switch lookup The lookup is done before host->dev is initialized. It will always just fail silently, and the MMC behaviour is totally unpredictable as the switch is left in an undefined state. Fix that. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-4-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- drivers/mmc/host/omap.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index 9fb8995b43a1c..aa40e1a9dc29e 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1384,13 +1384,6 @@ static int mmc_omap_probe(struct platform_device *pdev) if (IS_ERR(host->virt_base)) return PTR_ERR(host->virt_base); - host->slot_switch = gpiod_get_optional(host->dev, "switch", - GPIOD_OUT_LOW); - if (IS_ERR(host->slot_switch)) - return dev_err_probe(host->dev, PTR_ERR(host->slot_switch), - "error looking up slot switch GPIO\n"); - - INIT_WORK(&host->slot_release_work, mmc_omap_slot_release_work); INIT_WORK(&host->send_stop_work, mmc_omap_send_stop_work); @@ -1409,6 +1402,12 @@ static int mmc_omap_probe(struct platform_device *pdev) host->dev = &pdev->dev; platform_set_drvdata(pdev, host); + host->slot_switch = gpiod_get_optional(host->dev, "switch", + GPIOD_OUT_LOW); + if (IS_ERR(host->slot_switch)) + return dev_err_probe(host->dev, PTR_ERR(host->slot_switch), + "error looking up slot switch GPIO\n"); + host->id = pdev->id; host->irq = irq; host->phys_base = res->start; -- GitLab From f6862c7f156d04f81c38467e1c304b7e9517e810 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:38 +0200 Subject: [PATCH 023/989] mmc: omap: fix deferred probe After a deferred probe, GPIO descriptor lookup will fail with EBUSY. Fix by using managed descriptors. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-5-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- drivers/mmc/host/omap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index aa40e1a9dc29e..50408771ae01c 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1259,18 +1259,18 @@ static int mmc_omap_new_slot(struct mmc_omap_host *host, int id) slot->pdata = &host->pdata->slots[id]; /* Check for some optional GPIO controls */ - slot->vsd = gpiod_get_index_optional(host->dev, "vsd", - id, GPIOD_OUT_LOW); + slot->vsd = devm_gpiod_get_index_optional(host->dev, "vsd", + id, GPIOD_OUT_LOW); if (IS_ERR(slot->vsd)) return dev_err_probe(host->dev, PTR_ERR(slot->vsd), "error looking up VSD GPIO\n"); - slot->vio = gpiod_get_index_optional(host->dev, "vio", - id, GPIOD_OUT_LOW); + slot->vio = devm_gpiod_get_index_optional(host->dev, "vio", + id, GPIOD_OUT_LOW); if (IS_ERR(slot->vio)) return dev_err_probe(host->dev, PTR_ERR(slot->vio), "error looking up VIO GPIO\n"); - slot->cover = gpiod_get_index_optional(host->dev, "cover", - id, GPIOD_IN); + slot->cover = devm_gpiod_get_index_optional(host->dev, "cover", + id, GPIOD_IN); if (IS_ERR(slot->cover)) return dev_err_probe(host->dev, PTR_ERR(slot->cover), "error looking up cover switch GPIO\n"); @@ -1402,8 +1402,8 @@ static int mmc_omap_probe(struct platform_device *pdev) host->dev = &pdev->dev; platform_set_drvdata(pdev, host); - host->slot_switch = gpiod_get_optional(host->dev, "switch", - GPIOD_OUT_LOW); + host->slot_switch = devm_gpiod_get_optional(host->dev, "switch", + GPIOD_OUT_LOW); if (IS_ERR(host->slot_switch)) return dev_err_probe(host->dev, PTR_ERR(host->slot_switch), "error looking up slot switch GPIO\n"); -- GitLab From 894ad61b85d6ba8efd4274aa8719d9ff1c89ea54 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:39 +0200 Subject: [PATCH 024/989] mmc: omap: restore original power up/down steps Commit e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") moved Nokia N810 MMC power up/down from the board file into the MMC driver. The change removed some delays, and ordering without a valid reason. Restore power up/down to match the original code. This matters only on N810 where the 2nd GPIO is in use. Other boards will see an additional delay but that should be a lesser concern than omitting delays altogether. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-6-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- drivers/mmc/host/omap.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index 50408771ae01c..13fa8588e38c1 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1119,10 +1119,25 @@ static void mmc_omap_set_power(struct mmc_omap_slot *slot, int power_on, host = slot->host; - if (slot->vsd) - gpiod_set_value(slot->vsd, power_on); - if (slot->vio) - gpiod_set_value(slot->vio, power_on); + if (power_on) { + if (slot->vsd) { + gpiod_set_value(slot->vsd, power_on); + msleep(1); + } + if (slot->vio) { + gpiod_set_value(slot->vio, power_on); + msleep(1); + } + } else { + if (slot->vio) { + gpiod_set_value(slot->vio, power_on); + msleep(50); + } + if (slot->vsd) { + gpiod_set_value(slot->vsd, power_on); + msleep(50); + } + } if (slot->pdata->set_power != NULL) slot->pdata->set_power(mmc_dev(slot->mmc), slot->id, power_on, -- GitLab From 4421405e3634a3189b541cf1e34598e44260720d Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:16:56 +0200 Subject: [PATCH 025/989] ARM: OMAP2+: fix USB regression on Nokia N8x0 GPIO chip labels are wrong for OMAP2, so the USB does not work. Fix. Fixes: 8e0285ab95a9 ("ARM/musb: omap2: Remove global GPIO numbers from TUSB6010") Signed-off-by: Aaro Koskinen Reviewed-by: Linus Walleij Message-ID: <20240223181656.1099845-1-aaro.koskinen@iki.fi> Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/board-n8x0.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm/mach-omap2/board-n8x0.c b/arch/arm/mach-omap2/board-n8x0.c index c933a91751e4f..ff2a4a4d82204 100644 --- a/arch/arm/mach-omap2/board-n8x0.c +++ b/arch/arm/mach-omap2/board-n8x0.c @@ -79,10 +79,8 @@ static struct musb_hdrc_platform_data tusb_data = { static struct gpiod_lookup_table tusb_gpio_table = { .dev_id = "musb-tusb", .table = { - GPIO_LOOKUP("gpio-0-15", 0, "enable", - GPIO_ACTIVE_HIGH), - GPIO_LOOKUP("gpio-48-63", 10, "int", - GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-0-31", 0, "enable", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-32-63", 26, "int", GPIO_ACTIVE_HIGH), { } }, }; -- GitLab From fcf3f7e2fc8a53a6140beee46ec782a4c88e4744 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 8 Mar 2024 17:37:26 +0800 Subject: [PATCH 026/989] raid1: fix use-after-free for original bio in raid1_write_request() r1_bio->bios[] is used to record new bios that will be issued to underlying disks, however, in raid1_write_request(), r1_bio->bios[] will set to the original bio temporarily. Meanwhile, if blocked rdev is set, free_r1bio() will be called causing that all r1_bio->bios[] to be freed: raid1_write_request() r1_bio = alloc_r1bio(mddev, bio); -> r1_bio->bios[] is NULL for (i = 0; i < disks; i++) -> for each rdev in conf // first rdev is normal r1_bio->bios[0] = bio; -> set to original bio // second rdev is blocked if (test_bit(Blocked, &rdev->flags)) break if (blocked_rdev) free_r1bio() put_all_bios() bio_put(r1_bio->bios[0]) -> original bio is freed Test scripts: mdadm -CR /dev/md0 -l1 -n4 /dev/sd[abcd] --assume-clean fio -filename=/dev/md0 -ioengine=libaio -rw=write -bs=4k -numjobs=1 \ -iodepth=128 -name=test -direct=1 echo blocked > /sys/block/md0/md/rd2/state Test result: BUG bio-264 (Not tainted): Object already free ----------------------------------------------------------------------------- Allocated in mempool_alloc_slab+0x24/0x50 age=1 cpu=1 pid=869 kmem_cache_alloc+0x324/0x480 mempool_alloc_slab+0x24/0x50 mempool_alloc+0x6e/0x220 bio_alloc_bioset+0x1af/0x4d0 blkdev_direct_IO+0x164/0x8a0 blkdev_write_iter+0x309/0x440 aio_write+0x139/0x2f0 io_submit_one+0x5ca/0xb70 __do_sys_io_submit+0x86/0x270 __x64_sys_io_submit+0x22/0x30 do_syscall_64+0xb1/0x210 entry_SYSCALL_64_after_hwframe+0x6c/0x74 Freed in mempool_free_slab+0x1f/0x30 age=1 cpu=1 pid=869 kmem_cache_free+0x28c/0x550 mempool_free_slab+0x1f/0x30 mempool_free+0x40/0x100 bio_free+0x59/0x80 bio_put+0xf0/0x220 free_r1bio+0x74/0xb0 raid1_make_request+0xadf/0x1150 md_handle_request+0xc7/0x3b0 md_submit_bio+0x76/0x130 __submit_bio+0xd8/0x1d0 submit_bio_noacct_nocheck+0x1eb/0x5c0 submit_bio_noacct+0x169/0xd40 submit_bio+0xee/0x1d0 blkdev_direct_IO+0x322/0x8a0 blkdev_write_iter+0x309/0x440 aio_write+0x139/0x2f0 Since that bios for underlying disks are not allocated yet, fix this problem by using mempool_free() directly to free the r1_bio. Fixes: 992db13a4aee ("md/raid1: free the r1bio before waiting for blocked rdev") Cc: stable@vger.kernel.org # v6.6+ Reported-by: Coly Li Signed-off-by: Yu Kuai Tested-by: Coly Li Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240308093726.1047420-1-yukuai1@huaweicloud.com --- drivers/md/raid1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index be8ac24f50b6a..7b8a71ca66dde 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1558,7 +1558,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); - free_r1bio(r1_bio); + mempool_free(r1_bio, &conf->r1bio_pool); allow_barrier(conf, bio->bi_iter.bi_sector); if (bio->bi_opf & REQ_NOWAIT) { -- GitLab From 59097a2a5ecadb0f025232c665fd11c8ae1e1f58 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Sat, 2 Mar 2024 03:22:49 +0100 Subject: [PATCH 027/989] interconnect: qcom: x1e80100: Remove inexistent ACV_PERF BCM Booting the kernel on X1E results in a message like: [ 2.561524] qnoc-x1e80100 interconnect-0: ACV_PERF could not find RPMh address And indeed, taking a look at cmd-db, no such BCM exists. Remove it. Fixes: 9f196772841e ("interconnect: qcom: Add X1E80100 interconnect provider driver") Signed-off-by: Konrad Dybcio Reviewed-by: Mike Tipton Link: https://lore.kernel.org/r/20240302-topic-faux_bcm_x1e-v1-1-c40fab7c4bc5@linaro.org Signed-off-by: Georgi Djakov --- drivers/interconnect/qcom/x1e80100.c | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/drivers/interconnect/qcom/x1e80100.c b/drivers/interconnect/qcom/x1e80100.c index cbaf4f9c41be6..06f0a6d6cbbc0 100644 --- a/drivers/interconnect/qcom/x1e80100.c +++ b/drivers/interconnect/qcom/x1e80100.c @@ -116,15 +116,6 @@ static struct qcom_icc_node xm_sdc2 = { .links = { X1E80100_SLAVE_A2NOC_SNOC }, }; -static struct qcom_icc_node ddr_perf_mode_master = { - .name = "ddr_perf_mode_master", - .id = X1E80100_MASTER_DDR_PERF_MODE, - .channels = 1, - .buswidth = 4, - .num_links = 1, - .links = { X1E80100_SLAVE_DDR_PERF_MODE }, -}; - static struct qcom_icc_node qup0_core_master = { .name = "qup0_core_master", .id = X1E80100_MASTER_QUP_CORE_0, @@ -832,14 +823,6 @@ static struct qcom_icc_node qns_a2noc_snoc = { .links = { X1E80100_MASTER_A2NOC_SNOC }, }; -static struct qcom_icc_node ddr_perf_mode_slave = { - .name = "ddr_perf_mode_slave", - .id = X1E80100_SLAVE_DDR_PERF_MODE, - .channels = 1, - .buswidth = 4, - .num_links = 0, -}; - static struct qcom_icc_node qup0_core_slave = { .name = "qup0_core_slave", .id = X1E80100_SLAVE_QUP_CORE_0, @@ -1591,12 +1574,6 @@ static struct qcom_icc_bcm bcm_acv = { .nodes = { &ebi }, }; -static struct qcom_icc_bcm bcm_acv_perf = { - .name = "ACV_PERF", - .num_nodes = 1, - .nodes = { &ddr_perf_mode_slave }, -}; - static struct qcom_icc_bcm bcm_ce0 = { .name = "CE0", .num_nodes = 1, @@ -1863,18 +1840,15 @@ static const struct qcom_icc_desc x1e80100_aggre2_noc = { }; static struct qcom_icc_bcm * const clk_virt_bcms[] = { - &bcm_acv_perf, &bcm_qup0, &bcm_qup1, &bcm_qup2, }; static struct qcom_icc_node * const clk_virt_nodes[] = { - [MASTER_DDR_PERF_MODE] = &ddr_perf_mode_master, [MASTER_QUP_CORE_0] = &qup0_core_master, [MASTER_QUP_CORE_1] = &qup1_core_master, [MASTER_QUP_CORE_2] = &qup2_core_master, - [SLAVE_DDR_PERF_MODE] = &ddr_perf_mode_slave, [SLAVE_QUP_CORE_0] = &qup0_core_slave, [SLAVE_QUP_CORE_1] = &qup1_core_slave, [SLAVE_QUP_CORE_2] = &qup2_core_slave, -- GitLab From de1bf25b6d771abdb52d43546cf57ad775fb68a1 Mon Sep 17 00:00:00 2001 From: Mike Tipton Date: Tue, 5 Mar 2024 14:56:52 -0800 Subject: [PATCH 028/989] interconnect: Don't access req_list while it's being manipulated The icc_lock mutex was split into separate icc_lock and icc_bw_lock mutexes in [1] to avoid lockdep splats. However, this didn't adequately protect access to icc_node::req_list. The icc_set_bw() function will eventually iterate over req_list while only holding icc_bw_lock, but req_list can be modified while only holding icc_lock. This causes races between icc_set_bw(), of_icc_get(), and icc_put(). Example A: CPU0 CPU1 ---- ---- icc_set_bw(path_a) mutex_lock(&icc_bw_lock); icc_put(path_b) mutex_lock(&icc_lock); aggregate_requests() hlist_for_each_entry(r, ... hlist_del(... Example B: CPU0 CPU1 ---- ---- icc_set_bw(path_a) mutex_lock(&icc_bw_lock); path_b = of_icc_get() of_icc_get_by_index() mutex_lock(&icc_lock); path_find() path_init() aggregate_requests() hlist_for_each_entry(r, ... hlist_add_head(... Fix this by ensuring icc_bw_lock is always held before manipulating icc_node::req_list. The additional places icc_bw_lock is held don't perform any memory allocations, so we should still be safe from the original lockdep splats that motivated the separate locks. [1] commit af42269c3523 ("interconnect: Fix locking for runpm vs reclaim") Signed-off-by: Mike Tipton Fixes: af42269c3523 ("interconnect: Fix locking for runpm vs reclaim") Reviewed-by: Rob Clark Link: https://lore.kernel.org/r/20240305225652.22872-1-quic_mdtipton@quicinc.com Signed-off-by: Georgi Djakov --- drivers/interconnect/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c index 50bac2d79d9b5..68edb07d4443e 100644 --- a/drivers/interconnect/core.c +++ b/drivers/interconnect/core.c @@ -176,6 +176,8 @@ static struct icc_path *path_init(struct device *dev, struct icc_node *dst, path->num_nodes = num_nodes; + mutex_lock(&icc_bw_lock); + for (i = num_nodes - 1; i >= 0; i--) { node->provider->users++; hlist_add_head(&path->reqs[i].req_node, &node->req_list); @@ -186,6 +188,8 @@ static struct icc_path *path_init(struct device *dev, struct icc_node *dst, node = node->reverse; } + mutex_unlock(&icc_bw_lock); + return path; } @@ -792,12 +796,16 @@ void icc_put(struct icc_path *path) pr_err("%s: error (%d)\n", __func__, ret); mutex_lock(&icc_lock); + mutex_lock(&icc_bw_lock); + for (i = 0; i < path->num_nodes; i++) { node = path->reqs[i].node; hlist_del(&path->reqs[i].req_node); if (!WARN_ON(!node->provider->users)) node->provider->users--; } + + mutex_unlock(&icc_bw_lock); mutex_unlock(&icc_lock); kfree_const(path->name); -- GitLab From f3c80061c0d35c60709088ccb019305796d3f6ff Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 9 Feb 2024 13:37:33 -0500 Subject: [PATCH 029/989] KVM: SEV: fix compat ABI for KVM_MEMORY_ENCRYPT_OP The data structs for KVM_MEMORY_ENCRYPT_OP have different sizes for 32- and 64-bit userspace, but they do not make any attempt to convert from one ABI to the other when 32-bit userspace is running on 64-bit kernels. This configuration never worked, and SEV is only for 64-bit kernels so we're not breaking ABI on 32-bit kernels. Fix this by adding the appropriate padding; no functional change intended for 64-bit userspace. Reviewed-by: Michael Roth Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index ad29984d5e398..ef11aa4cab425 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -694,6 +694,7 @@ enum sev_cmd_id { struct kvm_sev_cmd { __u32 id; + __u32 pad0; __u64 data; __u32 error; __u32 sev_fd; @@ -704,28 +705,35 @@ struct kvm_sev_launch_start { __u32 policy; __u64 dh_uaddr; __u32 dh_len; + __u32 pad0; __u64 session_uaddr; __u32 session_len; + __u32 pad1; }; struct kvm_sev_launch_update_data { __u64 uaddr; __u32 len; + __u32 pad0; }; struct kvm_sev_launch_secret { __u64 hdr_uaddr; __u32 hdr_len; + __u32 pad0; __u64 guest_uaddr; __u32 guest_len; + __u32 pad1; __u64 trans_uaddr; __u32 trans_len; + __u32 pad2; }; struct kvm_sev_launch_measure { __u64 uaddr; __u32 len; + __u32 pad0; }; struct kvm_sev_guest_status { @@ -738,33 +746,43 @@ struct kvm_sev_dbg { __u64 src_uaddr; __u64 dst_uaddr; __u32 len; + __u32 pad0; }; struct kvm_sev_attestation_report { __u8 mnonce[16]; __u64 uaddr; __u32 len; + __u32 pad0; }; struct kvm_sev_send_start { __u32 policy; + __u32 pad0; __u64 pdh_cert_uaddr; __u32 pdh_cert_len; + __u32 pad1; __u64 plat_certs_uaddr; __u32 plat_certs_len; + __u32 pad2; __u64 amd_certs_uaddr; __u32 amd_certs_len; + __u32 pad3; __u64 session_uaddr; __u32 session_len; + __u32 pad4; }; struct kvm_sev_send_update_data { __u64 hdr_uaddr; __u32 hdr_len; + __u32 pad0; __u64 guest_uaddr; __u32 guest_len; + __u32 pad1; __u64 trans_uaddr; __u32 trans_len; + __u32 pad2; }; struct kvm_sev_receive_start { @@ -772,17 +790,22 @@ struct kvm_sev_receive_start { __u32 policy; __u64 pdh_uaddr; __u32 pdh_len; + __u32 pad0; __u64 session_uaddr; __u32 session_len; + __u32 pad1; }; struct kvm_sev_receive_update_data { __u64 hdr_uaddr; __u32 hdr_len; + __u32 pad0; __u64 guest_uaddr; __u32 guest_len; + __u32 pad1; __u64 trans_uaddr; __u32 trans_len; + __u32 pad2; }; #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) -- GitLab From 19cebbab995b2c99e927d6c5f3f24ded0740efd5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 9 Feb 2024 13:37:35 -0500 Subject: [PATCH 030/989] Documentation: kvm/sev: separate description of firmware The description of firmware is included part under the "SEV Key Management" header, part under the KVM_SEV_INIT ioctl. Put these two bits together and and rename "SEV Key Management" to what it actually is, namely a description of the KVM_MEMORY_ENCRYPT_OP API. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- .../virt/kvm/x86/amd-memory-encryption.rst | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index 995780088eb23..4f2eb441c718f 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -46,14 +46,8 @@ SEV hardware uses ASIDs to associate a memory encryption key with a VM. Hence, the ASID for the SEV-enabled guests must be from 1 to a maximum value defined in the CPUID 0x8000001f[ecx] field. -SEV Key Management -================== - -The SEV guest key management is handled by a separate processor called the AMD -Secure Processor (AMD-SP). Firmware running inside the AMD-SP provides a secure -key management interface to perform common hypervisor activities such as -encrypting bootstrap code, snapshot, migrating and debugging the guest. For more -information, see the SEV Key Management spec [api-spec]_ +The KVM_MEMORY_ENCRYPT_OP ioctl +=============================== The main ioctl to access SEV is KVM_MEMORY_ENCRYPT_OP. If the argument to KVM_MEMORY_ENCRYPT_OP is NULL, the ioctl returns 0 if SEV is enabled @@ -87,10 +81,6 @@ guests, such as launching, running, snapshotting, migrating and decommissioning. The KVM_SEV_INIT command is used by the hypervisor to initialize the SEV platform context. In a typical workflow, this command should be the first command issued. -The firmware can be initialized either by using its own non-volatile storage or -the OS can manage the NV storage for the firmware using the module parameter -``init_ex_path``. If the file specified by ``init_ex_path`` does not exist or -is invalid, the OS will create or override the file with output from PSP. Returns: 0 on success, -negative on error @@ -434,6 +424,21 @@ issued by the hypervisor to make the guest ready for execution. Returns: 0 on success, -negative on error +Firmware Management +=================== + +The SEV guest key management is handled by a separate processor called the AMD +Secure Processor (AMD-SP). Firmware running inside the AMD-SP provides a secure +key management interface to perform common hypervisor activities such as +encrypting bootstrap code, snapshot, migrating and debugging the guest. For more +information, see the SEV Key Management spec [api-spec]_ + +The AMD-SP firmware can be initialized either by using its own non-volatile +storage or the OS can manage the NV storage for the firmware using +parameter ``init_ex_path`` of the ``ccp`` module. If the file specified +by ``init_ex_path`` does not exist or is invalid, the OS will create or +override the file with PSP non-volatile storage. + References ========== -- GitLab From c20722c412f1cc1879e994fe169278bd0f322ad9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 18 Mar 2024 12:11:41 -0400 Subject: [PATCH 031/989] Documentation: kvm/sev: clarify usage of KVM_MEMORY_ENCRYPT_OP Explain that it operates on the VM file descriptor, and also clarify how detection of SEV operates on old kernels predating commit 2da1ed62d55c ("KVM: SVM: document KVM_MEM_ENCRYPT_OP, let userspace detect if SEV is available"). Signed-off-by: Paolo Bonzini --- .../virt/kvm/x86/amd-memory-encryption.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index 4f2eb441c718f..84335d119ff13 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -49,12 +49,13 @@ defined in the CPUID 0x8000001f[ecx] field. The KVM_MEMORY_ENCRYPT_OP ioctl =============================== -The main ioctl to access SEV is KVM_MEMORY_ENCRYPT_OP. If the argument -to KVM_MEMORY_ENCRYPT_OP is NULL, the ioctl returns 0 if SEV is enabled -and ``ENOTTY`` if it is disabled (on some older versions of Linux, -the ioctl runs normally even with a NULL argument, and therefore will -likely return ``EFAULT``). If non-NULL, the argument to KVM_MEMORY_ENCRYPT_OP -must be a struct kvm_sev_cmd:: +The main ioctl to access SEV is KVM_MEMORY_ENCRYPT_OP, which operates on +the VM file descriptor. If the argument to KVM_MEMORY_ENCRYPT_OP is NULL, +the ioctl returns 0 if SEV is enabled and ``ENOTTY`` if it is disabled +(on some older versions of Linux, the ioctl tries to run normally even +with a NULL argument, and therefore will likely return ``EFAULT`` instead +of zero if SEV is enabled). If non-NULL, the argument to +KVM_MEMORY_ENCRYPT_OP must be a struct kvm_sev_cmd:: struct kvm_sev_cmd { __u32 id; -- GitLab From e249884e106b81c34f8050d23935ffc12623843f Mon Sep 17 00:00:00 2001 From: Erni Sri Satya Vennela Date: Thu, 21 Mar 2024 01:22:05 -0700 Subject: [PATCH 032/989] x86/hyperv: Cosmetic changes for hv_apic.c Fix issues reported by checkpatch.pl script for hv_apic.c file - Alignment should match open parenthesis - Remove unnecessary parenthesis No functional changes intended. Signed-off-by: Erni Sri Satya Vennela Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/1711009325-21894-1-git-send-email-ernis@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1711009325-21894-1-git-send-email-ernis@linux.microsoft.com> --- arch/x86/hyperv/hv_apic.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 5fc45543e9550..0569f579338b5 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -105,7 +105,7 @@ static bool cpu_is_self(int cpu) * IPI implementation on Hyper-V. */ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, - bool exclude_self) + bool exclude_self) { struct hv_send_ipi_ex *ipi_arg; unsigned long flags; @@ -132,8 +132,8 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) { ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; - nr_bank = cpumask_to_vpset_skip(&(ipi_arg->vp_set), mask, - exclude_self ? cpu_is_self : NULL); + nr_bank = cpumask_to_vpset_skip(&ipi_arg->vp_set, mask, + exclude_self ? cpu_is_self : NULL); /* * 'nr_bank <= 0' means some CPUs in cpumask can't be @@ -147,7 +147,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, } status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, - ipi_arg, NULL); + ipi_arg, NULL); ipi_mask_ex_done: local_irq_restore(flags); @@ -155,7 +155,7 @@ ipi_mask_ex_done: } static bool __send_ipi_mask(const struct cpumask *mask, int vector, - bool exclude_self) + bool exclude_self) { int cur_cpu, vcpu, this_cpu = smp_processor_id(); struct hv_send_ipi ipi_arg; @@ -181,7 +181,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, return false; } - if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR) return false; /* @@ -218,7 +218,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, } status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector, - ipi_arg.cpu_mask); + ipi_arg.cpu_mask); return hv_result_success(status); do_ex_hypercall: @@ -241,7 +241,7 @@ static bool __send_ipi_one(int cpu, int vector) return false; } - if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR) return false; if (vp >= 64) -- GitLab From 1f1dc442c57ec61c08d21d47e4c5b4f16446fe00 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 22 Mar 2024 14:10:26 -0700 Subject: [PATCH 033/989] mshyperv: Introduce hv_numa_node_to_pxm_info() Factor out logic for converting numa node to hv_proximity_domain_info into a helper function. Change hv_proximity_domain_info to a struct to improve readability. While at it, rename hv_add_logical_processor_* structs to the correct hv_input_/hv_output_ prefix, and remove the flags field which is not present in the ABI. Signed-off-by: Nuno Das Neves Reviewed-by: Wei Liu Link: https://lore.kernel.org/r/1711141826-9458-1-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1711141826-9458-1-git-send-email-nunodasneves@linux.microsoft.com> --- arch/x86/hyperv/hv_proc.c | 22 ++++------------------ include/asm-generic/hyperv-tlfs.h | 19 +++++++------------ include/asm-generic/mshyperv.h | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 30 deletions(-) diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c index 68a0843d4750f..3fa1f2ee7b0d0 100644 --- a/arch/x86/hyperv/hv_proc.c +++ b/arch/x86/hyperv/hv_proc.c @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -116,12 +115,11 @@ free_buf: int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) { - struct hv_add_logical_processor_in *input; - struct hv_add_logical_processor_out *output; + struct hv_input_add_logical_processor *input; + struct hv_output_add_logical_processor *output; u64 status; unsigned long flags; int ret = HV_STATUS_SUCCESS; - int pxm = node_to_pxm(node); /* * When adding a logical processor, the hypervisor may return @@ -137,11 +135,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) input->lp_index = lp_index; input->apic_id = apic_id; - input->flags = 0; - input->proximity_domain_info.domain_id = pxm; - input->proximity_domain_info.flags.reserved = 0; - input->proximity_domain_info.flags.proximity_info_valid = 1; - input->proximity_domain_info.flags.proximity_preferred = 1; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR, input, output); local_irq_restore(flags); @@ -166,7 +160,6 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) u64 status; unsigned long irq_flags; int ret = HV_STATUS_SUCCESS; - int pxm = node_to_pxm(node); /* Root VPs don't seem to need pages deposited */ if (partition_id != hv_current_partition_id) { @@ -185,14 +178,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) input->vp_index = vp_index; input->flags = flags; input->subnode_type = HvSubnodeAny; - if (node != NUMA_NO_NODE) { - input->proximity_domain_info.domain_id = pxm; - input->proximity_domain_info.flags.reserved = 0; - input->proximity_domain_info.flags.proximity_info_valid = 1; - input->proximity_domain_info.flags.proximity_preferred = 1; - } else { - input->proximity_domain_info.as_uint64 = 0; - } + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL); local_irq_restore(irq_flags); diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index fdac4a1714ec0..69f3f68dc249a 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -512,13 +512,9 @@ struct hv_proximity_domain_flags { u32 proximity_info_valid : 1; } __packed; -/* Not a union in windows but useful for zeroing */ -union hv_proximity_domain_info { - struct { - u32 domain_id; - struct hv_proximity_domain_flags flags; - }; - u64 as_uint64; +struct hv_proximity_domain_info { + u32 domain_id; + struct hv_proximity_domain_flags flags; } __packed; struct hv_lp_startup_status { @@ -532,14 +528,13 @@ struct hv_lp_startup_status { } __packed; /* HvAddLogicalProcessor hypercall */ -struct hv_add_logical_processor_in { +struct hv_input_add_logical_processor { u32 lp_index; u32 apic_id; - union hv_proximity_domain_info proximity_domain_info; - u64 flags; + struct hv_proximity_domain_info proximity_domain_info; } __packed; -struct hv_add_logical_processor_out { +struct hv_output_add_logical_processor { struct hv_lp_startup_status startup_status; } __packed; @@ -560,7 +555,7 @@ struct hv_create_vp { u8 padding[3]; u8 subnode_type; u64 subnode_id; - union hv_proximity_domain_info proximity_domain_info; + struct hv_proximity_domain_info proximity_domain_info; u64 flags; } __packed; diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 430f0ae0dde2d..cfb0b51a09982 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -67,6 +68,19 @@ extern u64 hv_do_fast_hypercall8(u16 control, u64 input8); bool hv_isolation_type_snp(void); bool hv_isolation_type_tdx(void); +static inline struct hv_proximity_domain_info hv_numa_node_to_pxm_info(int node) +{ + struct hv_proximity_domain_info pxm_info = {}; + + if (node != NUMA_NO_NODE) { + pxm_info.domain_id = node_to_pxm(node); + pxm_info.flags.proximity_info_valid = 1; + pxm_info.flags.proximity_preferred = 1; + } + + return pxm_info; +} + /* Helper functions that provide a consistent pattern for checking Hyper-V hypercall status. */ static inline int hv_result(u64 status) { -- GitLab From 5448d9282af57c2c89a3033f1d56b31689d09b73 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 7 Mar 2024 08:19:51 +0000 Subject: [PATCH 034/989] KVM: selftests: Fix spelling mistake "trigged" -> "triggered" There are spelling mistakes in __GUEST_ASSERT messages. Fix them. Signed-off-by: Colin Ian King Acked-by: Oliver Upton Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20240307081951.1954830-1-colin.i.king@gmail.com --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 2 +- tools/testing/selftests/kvm/riscv/arch_timer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index ddba2c2fb5deb..16ac74d07d680 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -136,7 +136,7 @@ static void guest_run_stage(struct test_vcpu_shared_data *shared_data, irq_iter = READ_ONCE(shared_data->nr_iter); __GUEST_ASSERT(config_iter + 1 == irq_iter, "config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n" - " Guest timer interrupt was not trigged within the specified\n" + " Guest timer interrupt was not triggered within the specified\n" " interval, try to increase the error margin by [-e] option.\n", config_iter + 1, irq_iter); } diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c index e22848f747c01..0f9cabd99fd45 100644 --- a/tools/testing/selftests/kvm/riscv/arch_timer.c +++ b/tools/testing/selftests/kvm/riscv/arch_timer.c @@ -60,7 +60,7 @@ static void guest_run(struct test_vcpu_shared_data *shared_data) irq_iter = READ_ONCE(shared_data->nr_iter); __GUEST_ASSERT(config_iter + 1 == irq_iter, "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" - " Guest timer interrupt was not trigged within the specified\n" + " Guest timer interrupt was not triggered within the specified\n" " interval, try to increase the error margin by [-e] option.\n", config_iter + 1, irq_iter); } -- GitLab From 7fd99b7ab57057f219bb6cb2a1ff0b88a2b84420 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 15 Mar 2024 09:29:14 +0000 Subject: [PATCH 035/989] RISC-V: KVM: Remove second semicolon There is a statement with two semicolons. Remove the second one, it is redundant. Signed-off-by: Colin Ian King Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20240315092914.2431214-1-colin.i.king@gmail.com --- arch/riscv/kvm/vcpu_onereg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index f4a6124d25c93..994adc26db4b1 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -986,7 +986,7 @@ static int copy_isa_ext_reg_indices(const struct kvm_vcpu *vcpu, static inline unsigned long num_isa_ext_regs(const struct kvm_vcpu *vcpu) { - return copy_isa_ext_reg_indices(vcpu, NULL);; + return copy_isa_ext_reg_indices(vcpu, NULL); } static int copy_sbi_ext_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) -- GitLab From d8dd9f113e16bef3b29c9dcceb584a6f144f55e4 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 21 Mar 2024 14:20:40 +0530 Subject: [PATCH 036/989] RISC-V: KVM: Fix APLIC setipnum_le/be write emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The writes to setipnum_le/be register for APLIC in MSI-mode have special consideration for level-triggered interrupts as-per the section "4.9.2 Special consideration for level-sensitive interrupt sources" of the RISC-V AIA specification. Particularly, the below text from the RISC-V AIA specification defines the behaviour of writes to setipnum_le/be register for level-triggered interrupts: "A second option is for the interrupt service routine to write the APLIC’s source identity number for the interrupt to the domain’s setipnum register just before exiting. This will cause the interrupt’s pending bit to be set to one again if the source is still asserting an interrupt, but not if the source is not asserting an interrupt." Fix setipnum_le/be write emulation for in-kernel APLIC by implementing the above behaviour in aplic_write_pending() function. Cc: stable@vger.kernel.org Fixes: 74967aa208e2 ("RISC-V: KVM: Add in-kernel emulation of AIA APLIC") Signed-off-by: Anup Patel Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20240321085041.1955293-2-apatel@ventanamicro.com --- arch/riscv/kvm/aia_aplic.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kvm/aia_aplic.c b/arch/riscv/kvm/aia_aplic.c index 39e72aa016a4c..5e842b92dc461 100644 --- a/arch/riscv/kvm/aia_aplic.c +++ b/arch/riscv/kvm/aia_aplic.c @@ -137,11 +137,21 @@ static void aplic_write_pending(struct aplic *aplic, u32 irq, bool pending) raw_spin_lock_irqsave(&irqd->lock, flags); sm = irqd->sourcecfg & APLIC_SOURCECFG_SM_MASK; - if (!pending && - ((sm == APLIC_SOURCECFG_SM_LEVEL_HIGH) || - (sm == APLIC_SOURCECFG_SM_LEVEL_LOW))) + if (sm == APLIC_SOURCECFG_SM_INACTIVE) goto skip_write_pending; + if (sm == APLIC_SOURCECFG_SM_LEVEL_HIGH || + sm == APLIC_SOURCECFG_SM_LEVEL_LOW) { + if (!pending) + goto skip_write_pending; + if ((irqd->state & APLIC_IRQ_STATE_INPUT) && + sm == APLIC_SOURCECFG_SM_LEVEL_LOW) + goto skip_write_pending; + if (!(irqd->state & APLIC_IRQ_STATE_INPUT) && + sm == APLIC_SOURCECFG_SM_LEVEL_HIGH) + goto skip_write_pending; + } + if (pending) irqd->state |= APLIC_IRQ_STATE_PENDING; else -- GitLab From 1a4bd2b128fb5ca62e4d1c5ca298d3d06b9c1e8e Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Mon, 11 Mar 2024 12:07:00 +0100 Subject: [PATCH 037/989] firmware: arm_ffa: Fix the partition ID check in ffa_notification_info_get() FFA_NOTIFICATION_INFO_GET retrieves information about pending notifications. Notifications can be either global or per VCPU. Global notifications are reported with the partition ID only in the list of endpoints with pending notifications. ffa_notification_info_get() incorrectly expect no ID at all for global notifications. Fix this by checking for ID = 1 instead of ID = 0. Fixes: 3522be48d82b ("firmware: arm_ffa: Implement the NOTIFICATION_INFO_GET interface") Signed-off-by: Jens Wiklander Reviewed-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20240311110700.2367142-1-jens.wiklander@linaro.org Signed-off-by: Sudeep Holla --- drivers/firmware/arm_ffa/driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c index f2556a8e94015..9bc2e10381afd 100644 --- a/drivers/firmware/arm_ffa/driver.c +++ b/drivers/firmware/arm_ffa/driver.c @@ -790,7 +790,7 @@ static void ffa_notification_info_get(void) part_id = packed_id_list[ids_processed++]; - if (!ids_count[list]) { /* Global Notification */ + if (ids_count[list] == 1) { /* Global Notification */ __do_sched_recv_cb(part_id, 0, false); continue; } -- GitLab From 17f243adf1653bdbaeec767e3e74c9ad089f470b Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Mon, 11 Mar 2024 10:04:12 +0100 Subject: [PATCH 038/989] firmware: arm_scmi: Fix wrong fastchannel initialization Fastchannels are initialized with an incorrect index(POWERCAP_PAI_GET) in: commit 2441caa84aac ("firmware: arm_scmi: Populate fast channel rate_limit") Fix this and provide a correct index(POWERCAP_FC_PAI) Fixes: 2441caa84aac ("firmware: arm_scmi: Populate fast channel rate_limit") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202403100744.7Op3PI8L-lkp@intel.com/ Signed-off-by: Pierre Gondois Link: https://lore.kernel.org/r/20240311090413.1710725-1-pierre.gondois@arm.com Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/powercap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_scmi/powercap.c b/drivers/firmware/arm_scmi/powercap.c index ea9201e7044cb..1fa79bba492e8 100644 --- a/drivers/firmware/arm_scmi/powercap.c +++ b/drivers/firmware/arm_scmi/powercap.c @@ -736,7 +736,7 @@ static void scmi_powercap_domain_init_fc(const struct scmi_protocol_handle *ph, ph->hops->fastchannel_init(ph, POWERCAP_DESCRIBE_FASTCHANNEL, POWERCAP_PAI_GET, 4, domain, &fc[POWERCAP_FC_PAI].get_addr, NULL, - &fc[POWERCAP_PAI_GET].rate_limit); + &fc[POWERCAP_FC_PAI].rate_limit); *p_fc = fc; } -- GitLab From b70c7996d4ffb2e02895132e8a79a37cee66504f Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Fri, 15 Mar 2024 14:03:24 +0000 Subject: [PATCH 039/989] firmware: arm_scmi: Make raw debugfs entries non-seekable SCMI raw debugfs entries are used to inject and snoop messages out of the SCMI core and, as such, the underlying virtual files have no reason to support seeking. Modify the related file_operations descriptors to be non-seekable. Fixes: 3c3d818a9317 ("firmware: arm_scmi: Add core raw transmission support") Signed-off-by: Cristian Marussi Link: https://lore.kernel.org/r/20240315140324.231830-1-cristian.marussi@arm.com Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/raw_mode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/arm_scmi/raw_mode.c b/drivers/firmware/arm_scmi/raw_mode.c index 3505735185033..130d13e9cd6be 100644 --- a/drivers/firmware/arm_scmi/raw_mode.c +++ b/drivers/firmware/arm_scmi/raw_mode.c @@ -921,7 +921,7 @@ static int scmi_dbg_raw_mode_open(struct inode *inode, struct file *filp) rd->raw = raw; filp->private_data = rd; - return 0; + return nonseekable_open(inode, filp); } static int scmi_dbg_raw_mode_release(struct inode *inode, struct file *filp) @@ -950,6 +950,7 @@ static const struct file_operations scmi_dbg_raw_mode_reset_fops = { .open = scmi_dbg_raw_mode_open, .release = scmi_dbg_raw_mode_release, .write = scmi_dbg_raw_mode_reset_write, + .llseek = no_llseek, .owner = THIS_MODULE, }; @@ -959,6 +960,7 @@ static const struct file_operations scmi_dbg_raw_mode_message_fops = { .read = scmi_dbg_raw_mode_message_read, .write = scmi_dbg_raw_mode_message_write, .poll = scmi_dbg_raw_mode_message_poll, + .llseek = no_llseek, .owner = THIS_MODULE, }; @@ -975,6 +977,7 @@ static const struct file_operations scmi_dbg_raw_mode_message_async_fops = { .read = scmi_dbg_raw_mode_message_read, .write = scmi_dbg_raw_mode_message_async_write, .poll = scmi_dbg_raw_mode_message_poll, + .llseek = no_llseek, .owner = THIS_MODULE, }; @@ -998,6 +1001,7 @@ static const struct file_operations scmi_dbg_raw_mode_notification_fops = { .release = scmi_dbg_raw_mode_release, .read = scmi_test_dbg_raw_mode_notif_read, .poll = scmi_test_dbg_raw_mode_notif_poll, + .llseek = no_llseek, .owner = THIS_MODULE, }; @@ -1021,6 +1025,7 @@ static const struct file_operations scmi_dbg_raw_mode_errors_fops = { .release = scmi_dbg_raw_mode_release, .read = scmi_test_dbg_raw_mode_errors_read, .poll = scmi_test_dbg_raw_mode_errors_poll, + .llseek = no_llseek, .owner = THIS_MODULE, }; -- GitLab From 8aebf68dfd4b482f7fb1f82864fdda08020f62b4 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 30 Jan 2024 15:59:17 -0600 Subject: [PATCH 040/989] MAINTAINERS: Add TPM DT bindings to TPM maintainers Bindings for a given device class generally go to the respective subsystem maintainers. Add the TPM bindings to the TPM maintainers entry. Reviewed-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20240130215917.2473250-1-robh@kernel.org Signed-off-by: Rob Herring --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index aa3b947fb0801..08ea59b78df07 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22384,6 +22384,7 @@ S: Maintained W: https://kernsec.org/wiki/index.php/Linux_Kernel_Integrity Q: https://patchwork.kernel.org/project/linux-integrity/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git +F: Documentation/devicetree/bindings/tpm/ F: drivers/char/tpm/ TPS546D24 DRIVER -- GitLab From 3d9b8e6db9bda4463bbf2a97411f0716215254da Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 25 Mar 2024 10:11:39 +0100 Subject: [PATCH 041/989] docs: dt-bindings: add missing address/size-cells to example Complete the example of recommended order of properties by adding missing address/size-cells. They are not necessary to illustrate the style, but lack of them us bit really correct DTS code which might confuse readers. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Tudor Ambarus Link: https://lore.kernel.org/r/20240325091139.18602-1-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/dts-coding-style.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/dts-coding-style.rst b/Documentation/devicetree/bindings/dts-coding-style.rst index a9bdd2b59dcab..8a68331075a09 100644 --- a/Documentation/devicetree/bindings/dts-coding-style.rst +++ b/Documentation/devicetree/bindings/dts-coding-style.rst @@ -144,6 +144,8 @@ Example:: #dma-cells = <1>; clocks = <&clock_controller 0>, <&clock_controller 1>; clock-names = "bus", "host"; + #address-cells = <1>; + #size-cells = <1>; vendor,custom-property = <2>; status = "disabled"; -- GitLab From fb9f8125ed9d9b8e11f309a7dbfbe7b40de48fba Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:07:58 +0200 Subject: [PATCH 042/989] ASoC: SOF: Add dsp_max_burst_size_in_ms member to snd_sof_pcm_stream The dsp_max_burst_size_in_ms can be used to save the length of the maximum burst size in ms the host DMA will use. Platform code can place constraint using this to avoid user space requesting too small ALSA buffer which will result xruns. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-2-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/sof-audio.h | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/sof/sof-audio.h b/sound/soc/sof/sof-audio.h index 9ea2ac5adac79..04e5cb2c70a7c 100644 --- a/sound/soc/sof/sof-audio.h +++ b/sound/soc/sof/sof-audio.h @@ -322,6 +322,7 @@ struct snd_sof_pcm_stream { struct work_struct period_elapsed_work; struct snd_soc_dapm_widget_list *list; /* list of connected DAPM widgets */ bool d0i3_compatible; /* DSP can be in D0I3 when this pcm is opened */ + unsigned int dsp_max_burst_size_in_ms; /* The maximum size of the host DMA burst in ms */ /* * flag to indicate that the DSP pipelines should be kept * active or not while suspending the stream -- GitLab From 842bb8b62cc6f3546d61eb63115b32ebc6dd4a87 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:07:59 +0200 Subject: [PATCH 043/989] ASoC: SOF: ipc4-topology: Save the DMA maximum burst size for PCMs When setting up the pcm widget, save the DSP buffer size (in ms) for platform code to place a constraint on playback. On playback the DMA will fill the buffer on start and if the period size is smaller it will immediately overrun. On capture the DMA will move data in 1ms bursts. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-3-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-topology.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index da4a83afb87a8..bb4cf6dd1e189 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -412,8 +412,9 @@ static int sof_ipc4_widget_setup_pcm(struct snd_sof_widget *swidget) struct sof_ipc4_available_audio_format *available_fmt; struct snd_soc_component *scomp = swidget->scomp; struct sof_ipc4_copier *ipc4_copier; + struct snd_sof_pcm *spcm; int node_type = 0; - int ret; + int ret, dir; ipc4_copier = kzalloc(sizeof(*ipc4_copier), GFP_KERNEL); if (!ipc4_copier) @@ -447,6 +448,25 @@ static int sof_ipc4_widget_setup_pcm(struct snd_sof_widget *swidget) } dev_dbg(scomp->dev, "host copier '%s' node_type %u\n", swidget->widget->name, node_type); + spcm = snd_sof_find_spcm_comp(scomp, swidget->comp_id, &dir); + if (!spcm) + goto skip_gtw_cfg; + + if (dir == SNDRV_PCM_STREAM_PLAYBACK) { + struct snd_sof_pcm_stream *sps = &spcm->stream[dir]; + + sof_update_ipc_object(scomp, &sps->dsp_max_burst_size_in_ms, + SOF_COPIER_DEEP_BUFFER_TOKENS, + swidget->tuples, + swidget->num_tuples, sizeof(u32), 1); + /* Set default DMA buffer size if it is not specified in topology */ + if (!sps->dsp_max_burst_size_in_ms) + sps->dsp_max_burst_size_in_ms = SOF_IPC4_MIN_DMA_BUFFER_SIZE; + } else { + /* Capture data is copied from DSP to host in 1ms bursts */ + spcm->stream[dir].dsp_max_burst_size_in_ms = 1; + } + skip_gtw_cfg: ipc4_copier->gtw_attr = kzalloc(sizeof(*ipc4_copier->gtw_attr), GFP_KERNEL); if (!ipc4_copier->gtw_attr) { -- GitLab From fe76d2e75a6da97edd2b4ec5cfb9efd541be087a Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:00 +0200 Subject: [PATCH 044/989] ASoC: SOF: Intel: hda-pcm: Use dsp_max_burst_size_in_ms to place constraint If the PCM have the dsp_max_burst_size_in_ms set then place a constraint to limit the minimum buffer time to avoid xruns caused by DMA bursts spinning on the ALSA buffer. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-4-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-pcm.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sound/soc/sof/intel/hda-pcm.c b/sound/soc/sof/intel/hda-pcm.c index 18f07364d2198..69fefcd1abc5b 100644 --- a/sound/soc/sof/intel/hda-pcm.c +++ b/sound/soc/sof/intel/hda-pcm.c @@ -259,6 +259,27 @@ int hda_dsp_pcm_open(struct snd_sof_dev *sdev, snd_pcm_hw_constraint_mask64(substream->runtime, SNDRV_PCM_HW_PARAM_FORMAT, SNDRV_PCM_FMTBIT_S16 | SNDRV_PCM_FMTBIT_S32); + /* + * The dsp_max_burst_size_in_ms is the length of the maximum burst size + * of the host DMA in the ALSA buffer. + * + * On playback start the DMA will transfer dsp_max_burst_size_in_ms + * amount of data in one initial burst to fill up the host DMA buffer. + * Consequent DMA burst sizes are shorter and their length can vary. + * To make sure that userspace allocate large enough ALSA buffer we need + * to place a constraint on the buffer time. + * + * On capture the DMA will transfer 1ms chunks. + * + * Exact dsp_max_burst_size_in_ms constraint is racy, so set the + * constraint to a minimum of 2x dsp_max_burst_size_in_ms. + */ + if (spcm->stream[direction].dsp_max_burst_size_in_ms) + snd_pcm_hw_constraint_minmax(substream->runtime, + SNDRV_PCM_HW_PARAM_BUFFER_TIME, + spcm->stream[direction].dsp_max_burst_size_in_ms * USEC_PER_MSEC * 2, + UINT_MAX); + /* binding pcm substream to hda stream */ substream->runtime->private_data = &dsp_stream->hstream; return 0; -- GitLab From 67b182bea08a8d1092b91b57aefdfe420fce1634 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:01 +0200 Subject: [PATCH 045/989] ASoC: SOF: Intel: hda: Implement get_stream_position (Linear Link Position) When the Linear Link Position is not available in firmware SRAM window we use the host accessible position registers to read it. The address of the PPLCLLPL/U registers depend on the number of streams (playback+capture). At probe time the pplc_addr is calculated for each stream and we can use it to read the LLP without the need of address re-calculation. Set the get_stream_position callback in sof_hda_common_ops for all platforms: The callback is used for IPC4 delay calculations only but the register is a generic HDA register, not tied to any specific IPC version. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Rander Wang Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-5-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-common-ops.c | 2 ++ sound/soc/sof/intel/hda-stream.c | 32 ++++++++++++++++++++++++++++ sound/soc/sof/intel/hda.h | 3 +++ 3 files changed, 37 insertions(+) diff --git a/sound/soc/sof/intel/hda-common-ops.c b/sound/soc/sof/intel/hda-common-ops.c index 2b385cddc385c..80a69599a8c30 100644 --- a/sound/soc/sof/intel/hda-common-ops.c +++ b/sound/soc/sof/intel/hda-common-ops.c @@ -57,6 +57,8 @@ struct snd_sof_dsp_ops sof_hda_common_ops = { .pcm_pointer = hda_dsp_pcm_pointer, .pcm_ack = hda_dsp_pcm_ack, + .get_stream_position = hda_dsp_get_stream_llp, + /* firmware loading */ .load_firmware = snd_sof_load_firmware_raw, diff --git a/sound/soc/sof/intel/hda-stream.c b/sound/soc/sof/intel/hda-stream.c index b387b1a69d7ea..48ea187f72302 100644 --- a/sound/soc/sof/intel/hda-stream.c +++ b/sound/soc/sof/intel/hda-stream.c @@ -1063,3 +1063,35 @@ snd_pcm_uframes_t hda_dsp_stream_get_position(struct hdac_stream *hstream, return pos; } + +/** + * hda_dsp_get_stream_llp - Retrieve the LLP (Linear Link Position) of the stream + * @sdev: SOF device + * @component: ASoC component + * @substream: PCM substream + * + * Returns the raw Linear Link Position value + */ +u64 hda_dsp_get_stream_llp(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream) +{ + struct hdac_stream *hstream = substream->runtime->private_data; + struct hdac_ext_stream *hext_stream = stream_to_hdac_ext_stream(hstream); + u32 llp_l, llp_u; + + /* + * The pplc_addr have been calculated during probe in + * hda_dsp_stream_init(): + * pplc_addr = sdev->bar[HDA_DSP_PP_BAR] + + * SOF_HDA_PPLC_BASE + + * SOF_HDA_PPLC_MULTI * total_stream + + * SOF_HDA_PPLC_INTERVAL * stream_index + * + * Use this pre-calculated address to avoid repeated re-calculation. + */ + llp_l = readl(hext_stream->pplc_addr + AZX_REG_PPLCLLPL); + llp_u = readl(hext_stream->pplc_addr + AZX_REG_PPLCLLPU); + + return ((u64)llp_u << 32) | llp_l; +} diff --git a/sound/soc/sof/intel/hda.h b/sound/soc/sof/intel/hda.h index b36eb7c789133..9d26cad785fe0 100644 --- a/sound/soc/sof/intel/hda.h +++ b/sound/soc/sof/intel/hda.h @@ -662,6 +662,9 @@ bool hda_dsp_check_stream_irq(struct snd_sof_dev *sdev); snd_pcm_uframes_t hda_dsp_stream_get_position(struct hdac_stream *hstream, int direction, bool can_sleep); +u64 hda_dsp_get_stream_llp(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream); struct hdac_ext_stream * hda_dsp_stream_get(struct snd_sof_dev *sdev, int direction, u32 flags); -- GitLab From 4374f698d7d9f849b66f3fa8f7a64f0bc1a53d7f Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:02 +0200 Subject: [PATCH 046/989] ASoC: SOF: Intel: mtl/lnl: Use the generic get_stream_position callback Drop the MTL mtl_dsp_get_stream_hda_link_position() function and related defines since it can only work on platforms which have 19 streams because of the use of 0x948 as base offset for the LLP registers. The generic hda_dsp_get_stream_hda_link_position() takes the number of streams into consideration when reading the LLP registers for the stream and can handle different HDA configurations. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Rander Wang Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-6-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/lnl.c | 2 -- sound/soc/sof/intel/mtl.c | 14 -------------- sound/soc/sof/intel/mtl.h | 10 ---------- 3 files changed, 26 deletions(-) diff --git a/sound/soc/sof/intel/lnl.c b/sound/soc/sof/intel/lnl.c index 7ae017a00184e..d1c73d407e68e 100644 --- a/sound/soc/sof/intel/lnl.c +++ b/sound/soc/sof/intel/lnl.c @@ -134,8 +134,6 @@ int sof_lnl_ops_init(struct snd_sof_dev *sdev) sof_lnl_ops.runtime_resume = lnl_hda_dsp_runtime_resume; } - sof_lnl_ops.get_stream_position = mtl_dsp_get_stream_hda_link_position; - /* dsp core get/put */ sof_lnl_ops.core_get = mtl_dsp_core_get; sof_lnl_ops.core_put = mtl_dsp_core_put; diff --git a/sound/soc/sof/intel/mtl.c b/sound/soc/sof/intel/mtl.c index df05dc77b8d5e..060c34988e90d 100644 --- a/sound/soc/sof/intel/mtl.c +++ b/sound/soc/sof/intel/mtl.c @@ -626,18 +626,6 @@ static int mtl_dsp_disable_interrupts(struct snd_sof_dev *sdev) return mtl_enable_interrupts(sdev, false); } -u64 mtl_dsp_get_stream_hda_link_position(struct snd_sof_dev *sdev, - struct snd_soc_component *component, - struct snd_pcm_substream *substream) -{ - struct hdac_stream *hstream = substream->runtime->private_data; - u32 llp_l, llp_u; - - llp_l = snd_sof_dsp_read(sdev, HDA_DSP_HDA_BAR, MTL_PPLCLLPL(hstream->index)); - llp_u = snd_sof_dsp_read(sdev, HDA_DSP_HDA_BAR, MTL_PPLCLLPU(hstream->index)); - return ((u64)llp_u << 32) | llp_l; -} - int mtl_dsp_core_get(struct snd_sof_dev *sdev, int core) { const struct sof_ipc_pm_ops *pm_ops = sdev->ipc->ops->pm; @@ -707,8 +695,6 @@ int sof_mtl_ops_init(struct snd_sof_dev *sdev) sof_mtl_ops.core_get = mtl_dsp_core_get; sof_mtl_ops.core_put = mtl_dsp_core_put; - sof_mtl_ops.get_stream_position = mtl_dsp_get_stream_hda_link_position; - sdev->private = kzalloc(sizeof(struct sof_ipc4_fw_data), GFP_KERNEL); if (!sdev->private) return -ENOMEM; diff --git a/sound/soc/sof/intel/mtl.h b/sound/soc/sof/intel/mtl.h index cc5a1f46fd095..ea8c1b83f7127 100644 --- a/sound/soc/sof/intel/mtl.h +++ b/sound/soc/sof/intel/mtl.h @@ -6,12 +6,6 @@ * Copyright(c) 2020-2022 Intel Corporation. All rights reserved. */ -/* HDA Registers */ -#define MTL_PPLCLLPL_BASE 0x948 -#define MTL_PPLCLLPU_STRIDE 0x10 -#define MTL_PPLCLLPL(x) (MTL_PPLCLLPL_BASE + (x) * MTL_PPLCLLPU_STRIDE) -#define MTL_PPLCLLPU(x) (MTL_PPLCLLPL_BASE + 0x4 + (x) * MTL_PPLCLLPU_STRIDE) - /* DSP Registers */ #define MTL_HFDSSCS 0x1000 #define MTL_HFDSSCS_SPA_MASK BIT(16) @@ -103,9 +97,5 @@ int mtl_dsp_ipc_get_window_offset(struct snd_sof_dev *sdev, u32 id); void mtl_ipc_dump(struct snd_sof_dev *sdev); -u64 mtl_dsp_get_stream_hda_link_position(struct snd_sof_dev *sdev, - struct snd_soc_component *component, - struct snd_pcm_substream *substream); - int mtl_dsp_core_get(struct snd_sof_dev *sdev, int core); int mtl_dsp_core_put(struct snd_sof_dev *sdev, int core); -- GitLab From ce2faa9a180c1984225689b6b1cb26045f8b7470 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:03 +0200 Subject: [PATCH 047/989] ASoC: SOF: Introduce a new callback pair to be used for PCM delay reporting For delay calculation we need two information: Number of bytes transferred between the DSP and host memory (ALSA buffer) Number of frames transferred between the DSP and external device (link/codec/DMIC/etc). The reason for the different units (bytes vs frames) on host and dai side is that the format on the dai side is decided by the firmware and might not be the same as on the host side, thus the expectation is that the counter reflects the number of frames. The kernel know the host side format and in there we have access to the DMA position which is in bytes. In a simplified way, the DSP caused delay is the difference between the two counters. The existing get_stream_position callback is defined to retrieve the frame counter on the DAI side but it's name is too generic to be intuitive and makes it hard to define a callback for the host side. This patch introduces a new set of callbacks to replace the get_stream_position and define the host side equivalent: get_dai_frame_counter get_host_byte_counter Subsequent patches will remove the old callback. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-7-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ops.h | 24 ++++++++++++++++++++++++ sound/soc/sof/sof-priv.h | 21 +++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/sound/soc/sof/ops.h b/sound/soc/sof/ops.h index 6cf21e829e072..d83cd771015cb 100644 --- a/sound/soc/sof/ops.h +++ b/sound/soc/sof/ops.h @@ -533,6 +533,30 @@ static inline u64 snd_sof_pcm_get_stream_position(struct snd_sof_dev *sdev, return 0; } +static inline u64 +snd_sof_pcm_get_dai_frame_counter(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream) +{ + if (sof_ops(sdev) && sof_ops(sdev)->get_dai_frame_counter) + return sof_ops(sdev)->get_dai_frame_counter(sdev, component, + substream); + + return 0; +} + +static inline u64 +snd_sof_pcm_get_host_byte_counter(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream) +{ + if (sof_ops(sdev) && sof_ops(sdev)->get_host_byte_counter) + return sof_ops(sdev)->get_host_byte_counter(sdev, component, + substream); + + return 0; +} + /* machine driver */ static inline int snd_sof_machine_register(struct snd_sof_dev *sdev, void *pdata) diff --git a/sound/soc/sof/sof-priv.h b/sound/soc/sof/sof-priv.h index d453a4ce3b219..91043f177dfa3 100644 --- a/sound/soc/sof/sof-priv.h +++ b/sound/soc/sof/sof-priv.h @@ -270,6 +270,27 @@ struct snd_sof_dsp_ops { struct snd_soc_component *component, struct snd_pcm_substream *substream); /* optional */ + /* + * optional callback to retrieve the number of frames left/arrived from/to + * the DSP on the DAI side (link/codec/DMIC/etc). + * + * The callback is used when the firmware does not provide this information + * via the shared SRAM window and it can be retrieved by host. + */ + u64 (*get_dai_frame_counter)(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream); /* optional */ + + /* + * Optional callback to retrieve the number of bytes left/arrived from/to + * the DSP on the host side (bytes between host ALSA buffer and DSP). + * + * The callback is needed for ALSA delay reporting. + */ + u64 (*get_host_byte_counter)(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream); /* optional */ + /* host read DSP stream data */ int (*ipc_msg_data)(struct snd_sof_dev *sdev, struct snd_sof_pcm_stream *sps, -- GitLab From fd6f6a0632bc891673490bf4a92304172251825c Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:04 +0200 Subject: [PATCH 048/989] ASoC: SOF: Intel: Set the dai/host get frame/byte counter callbacks Add implementation for reading the LDP (Linear DMA Position) to be used as get_host_byte_counter(). The LDP is counting the number of bytes moved between the DSP and host memory. Set the get_dai_frame_counter to hda_dsp_get_stream_llp, which is counting the frames on the link side of the DSP. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-8-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-common-ops.c | 2 ++ sound/soc/sof/intel/hda-stream.c | 31 ++++++++++++++++++++++++++++ sound/soc/sof/intel/hda.h | 3 +++ 3 files changed, 36 insertions(+) diff --git a/sound/soc/sof/intel/hda-common-ops.c b/sound/soc/sof/intel/hda-common-ops.c index 80a69599a8c30..4d7ea18604eee 100644 --- a/sound/soc/sof/intel/hda-common-ops.c +++ b/sound/soc/sof/intel/hda-common-ops.c @@ -58,6 +58,8 @@ struct snd_sof_dsp_ops sof_hda_common_ops = { .pcm_ack = hda_dsp_pcm_ack, .get_stream_position = hda_dsp_get_stream_llp, + .get_dai_frame_counter = hda_dsp_get_stream_llp, + .get_host_byte_counter = hda_dsp_get_stream_ldp, /* firmware loading */ .load_firmware = snd_sof_load_firmware_raw, diff --git a/sound/soc/sof/intel/hda-stream.c b/sound/soc/sof/intel/hda-stream.c index 48ea187f72302..8504a4f27b608 100644 --- a/sound/soc/sof/intel/hda-stream.c +++ b/sound/soc/sof/intel/hda-stream.c @@ -1095,3 +1095,34 @@ u64 hda_dsp_get_stream_llp(struct snd_sof_dev *sdev, return ((u64)llp_u << 32) | llp_l; } + +/** + * hda_dsp_get_stream_ldp - Retrieve the LDP (Linear DMA Position) of the stream + * @sdev: SOF device + * @component: ASoC component + * @substream: PCM substream + * + * Returns the raw Linear Link Position value + */ +u64 hda_dsp_get_stream_ldp(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream) +{ + struct hdac_stream *hstream = substream->runtime->private_data; + struct hdac_ext_stream *hext_stream = stream_to_hdac_ext_stream(hstream); + u32 ldp_l, ldp_u; + + /* + * The pphc_addr have been calculated during probe in + * hda_dsp_stream_init(): + * pphc_addr = sdev->bar[HDA_DSP_PP_BAR] + + * SOF_HDA_PPHC_BASE + + * SOF_HDA_PPHC_INTERVAL * stream_index + * + * Use this pre-calculated address to avoid repeated re-calculation. + */ + ldp_l = readl(hext_stream->pphc_addr + AZX_REG_PPHCLDPL); + ldp_u = readl(hext_stream->pphc_addr + AZX_REG_PPHCLDPU); + + return ((u64)ldp_u << 32) | ldp_l; +} diff --git a/sound/soc/sof/intel/hda.h b/sound/soc/sof/intel/hda.h index 9d26cad785fe0..81a1d4606d3cd 100644 --- a/sound/soc/sof/intel/hda.h +++ b/sound/soc/sof/intel/hda.h @@ -665,6 +665,9 @@ snd_pcm_uframes_t hda_dsp_stream_get_position(struct hdac_stream *hstream, u64 hda_dsp_get_stream_llp(struct snd_sof_dev *sdev, struct snd_soc_component *component, struct snd_pcm_substream *substream); +u64 hda_dsp_get_stream_ldp(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream); struct hdac_ext_stream * hda_dsp_stream_get(struct snd_sof_dev *sdev, int direction, u32 flags); -- GitLab From 37679a1bd372c8308a3faccf3438c9df642565b3 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:05 +0200 Subject: [PATCH 049/989] ASoC: SOF: ipc4-pcm: Use the snd_sof_pcm_get_dai_frame_counter() for pcm_delay Switch to the new callback to retrieve the DAI (link) frame counter. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-9-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index 0f332c8cdbe6a..d0795f77cc153 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -897,11 +897,12 @@ static snd_pcm_sframes_t sof_ipc4_pcm_delay(struct snd_soc_component *component, } /* - * HDaudio links don't support the LLP counter reported by firmware - * the link position is read directly from hardware registers. + * If the LLP counter is not reported by firmware in the SRAM window + * then read the dai (link) position via host accessible means if + * available. */ if (!time_info->llp_offset) { - tmp_ptr = snd_sof_pcm_get_stream_position(sdev, component, substream); + tmp_ptr = snd_sof_pcm_get_dai_frame_counter(sdev, component, substream); if (!tmp_ptr) return 0; } else { -- GitLab From 4ab6c38c664442c1fc9911eb3c5c6953d3dbcca5 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:06 +0200 Subject: [PATCH 050/989] ASoC: SOF: Intel: hda-common-ops: Do not set the get_stream_position callback The get_stream_position has been replaced by get_dai_frame_counter, it should not be set to allow it to be dropped from core code. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-10-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-common-ops.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/soc/sof/intel/hda-common-ops.c b/sound/soc/sof/intel/hda-common-ops.c index 4d7ea18604eee..d71bb66b99911 100644 --- a/sound/soc/sof/intel/hda-common-ops.c +++ b/sound/soc/sof/intel/hda-common-ops.c @@ -57,7 +57,6 @@ struct snd_sof_dsp_ops sof_hda_common_ops = { .pcm_pointer = hda_dsp_pcm_pointer, .pcm_ack = hda_dsp_pcm_ack, - .get_stream_position = hda_dsp_get_stream_llp, .get_dai_frame_counter = hda_dsp_get_stream_llp, .get_host_byte_counter = hda_dsp_get_stream_ldp, -- GitLab From 07007b8ac42cffc23043d00e56b0f67a75dc4b22 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:07 +0200 Subject: [PATCH 051/989] ASoC: SOF: Remove the get_stream_position callback The get_stream_position has been replaced by get_dai_frame_counter and all related code can be dropped form the core. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-11-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ops.h | 10 ---------- sound/soc/sof/sof-priv.h | 9 --------- 2 files changed, 19 deletions(-) diff --git a/sound/soc/sof/ops.h b/sound/soc/sof/ops.h index d83cd771015cb..3cd748e134609 100644 --- a/sound/soc/sof/ops.h +++ b/sound/soc/sof/ops.h @@ -523,16 +523,6 @@ static inline int snd_sof_pcm_platform_ack(struct snd_sof_dev *sdev, return 0; } -static inline u64 snd_sof_pcm_get_stream_position(struct snd_sof_dev *sdev, - struct snd_soc_component *component, - struct snd_pcm_substream *substream) -{ - if (sof_ops(sdev) && sof_ops(sdev)->get_stream_position) - return sof_ops(sdev)->get_stream_position(sdev, component, substream); - - return 0; -} - static inline u64 snd_sof_pcm_get_dai_frame_counter(struct snd_sof_dev *sdev, struct snd_soc_component *component, diff --git a/sound/soc/sof/sof-priv.h b/sound/soc/sof/sof-priv.h index 91043f177dfa3..d3c436f826046 100644 --- a/sound/soc/sof/sof-priv.h +++ b/sound/soc/sof/sof-priv.h @@ -261,15 +261,6 @@ struct snd_sof_dsp_ops { /* pcm ack */ int (*pcm_ack)(struct snd_sof_dev *sdev, struct snd_pcm_substream *substream); /* optional */ - /* - * optional callback to retrieve the link DMA position for the substream - * when the position is not reported in the shared SRAM windows but - * instead from a host-accessible hardware counter. - */ - u64 (*get_stream_position)(struct snd_sof_dev *sdev, - struct snd_soc_component *component, - struct snd_pcm_substream *substream); /* optional */ - /* * optional callback to retrieve the number of frames left/arrived from/to * the DSP on the DAI side (link/codec/DMIC/etc). -- GitLab From 31d2874d083ba6cc2a4f4b26dab73c3be1c92658 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:08 +0200 Subject: [PATCH 052/989] ASoC: SOF: ipc4-pcm: Move struct sof_ipc4_timestamp_info definition locally The sof_ipc4_timestamp_info is only used by ipc4-pcm.c internally, it should not be in a generic header implying that it might be used elsewhere. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-12-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 14 ++++++++++++++ sound/soc/sof/ipc4-priv.h | 14 -------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index d0795f77cc153..2d7295221884f 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -15,6 +15,20 @@ #include "ipc4-topology.h" #include "ipc4-fw-reg.h" +/** + * struct sof_ipc4_timestamp_info - IPC4 timestamp info + * @host_copier: the host copier of the pcm stream + * @dai_copier: the dai copier of the pcm stream + * @stream_start_offset: reported by fw in memory window + * @llp_offset: llp offset in memory window + */ +struct sof_ipc4_timestamp_info { + struct sof_ipc4_copier *host_copier; + struct sof_ipc4_copier *dai_copier; + u64 stream_start_offset; + u32 llp_offset; +}; + static int sof_ipc4_set_multi_pipeline_state(struct snd_sof_dev *sdev, u32 state, struct ipc4_pipeline_set_state_data *trigger_list) { diff --git a/sound/soc/sof/ipc4-priv.h b/sound/soc/sof/ipc4-priv.h index f3b908b093f95..afed618a15f06 100644 --- a/sound/soc/sof/ipc4-priv.h +++ b/sound/soc/sof/ipc4-priv.h @@ -92,20 +92,6 @@ struct sof_ipc4_fw_data { struct mutex pipeline_state_mutex; /* protect pipeline triggers, ref counts and states */ }; -/** - * struct sof_ipc4_timestamp_info - IPC4 timestamp info - * @host_copier: the host copier of the pcm stream - * @dai_copier: the dai copier of the pcm stream - * @stream_start_offset: reported by fw in memory window - * @llp_offset: llp offset in memory window - */ -struct sof_ipc4_timestamp_info { - struct sof_ipc4_copier *host_copier; - struct sof_ipc4_copier *dai_copier; - u64 stream_start_offset; - u32 llp_offset; -}; - extern const struct sof_ipc_fw_loader_ops ipc4_loader_ops; extern const struct sof_ipc_tplg_ops ipc4_tplg_ops; extern const struct sof_ipc_tplg_control_ops tplg_ipc4_control_ops; -- GitLab From 55ca6ca227bfc5a8d0a0c2c5d6e239777226a604 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:09 +0200 Subject: [PATCH 053/989] ASoC: SOF: ipc4-pcm: Combine the SOF_IPC4_PIPE_PAUSED cases in pcm_trigger The SNDRV_PCM_TRIGGER_PAUSE_PUSH does not need to be a separate case, it can be handled along with STOP and SUSPEND Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-13-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index 2d7295221884f..4e41b16d32054 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -478,14 +478,12 @@ static int sof_ipc4_pcm_trigger(struct snd_soc_component *component, /* determine the pipeline state */ switch (cmd) { - case SNDRV_PCM_TRIGGER_PAUSE_PUSH: - state = SOF_IPC4_PIPE_PAUSED; - break; case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: case SNDRV_PCM_TRIGGER_RESUME: case SNDRV_PCM_TRIGGER_START: state = SOF_IPC4_PIPE_RUNNING; break; + case SNDRV_PCM_TRIGGER_PAUSE_PUSH: case SNDRV_PCM_TRIGGER_SUSPEND: case SNDRV_PCM_TRIGGER_STOP: state = SOF_IPC4_PIPE_PAUSED; -- GitLab From 3ce3bc36d91510389955b47e36ea4c4e94fcbdd3 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:10 +0200 Subject: [PATCH 054/989] ASoC: SOF: ipc4-pcm: Invalidate the stream_start_offset in PAUSED state When the final state is SOF_IPC4_PIPE_PAUSED, it is possible that the stream will be restarted (resume or start) in which case we need to update the offset from the firmware. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-14-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index 4e41b16d32054..905dbc4852b1e 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -437,8 +437,19 @@ static int sof_ipc4_trigger_pipelines(struct snd_soc_component *component, } /* return if this is the final state */ - if (state == SOF_IPC4_PIPE_PAUSED) + if (state == SOF_IPC4_PIPE_PAUSED) { + struct sof_ipc4_timestamp_info *time_info; + + /* + * Invalidate the stream_start_offset to make sure that it is + * going to be updated if the stream resumes + */ + time_info = spcm->stream[substream->stream].private; + if (time_info) + time_info->stream_start_offset = SOF_IPC4_INVALID_STREAM_POSITION; + goto free; + } skip_pause_transition: /* else set the RUNNING/RESET state in the DSP */ ret = sof_ipc4_set_multi_pipeline_state(sdev, state, trigger_list); -- GitLab From 77165bd955d55114028b06787a530b8f9220e4b0 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:11 +0200 Subject: [PATCH 055/989] ASoC: SOF: sof-pcm: Add pointer callback to sof_ipc_pcm_ops The IPC specific pointer callback can be used when additional or custom handling is needed during the pointer calculation, like executing a delay calculation at the same time to minimize drift between the reported pointer and the calculated delay. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-15-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/pcm.c | 8 ++++++++ sound/soc/sof/sof-audio.h | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/pcm.c b/sound/soc/sof/pcm.c index 33d576b176478..f03cee94bce62 100644 --- a/sound/soc/sof/pcm.c +++ b/sound/soc/sof/pcm.c @@ -388,13 +388,21 @@ static snd_pcm_uframes_t sof_pcm_pointer(struct snd_soc_component *component, { struct snd_soc_pcm_runtime *rtd = snd_soc_substream_to_rtd(substream); struct snd_sof_dev *sdev = snd_soc_component_get_drvdata(component); + const struct sof_ipc_pcm_ops *pcm_ops = sof_ipc_get_ops(sdev, pcm); struct snd_sof_pcm *spcm; snd_pcm_uframes_t host, dai; + int ret = -EOPNOTSUPP; /* nothing to do for BE */ if (rtd->dai_link->no_pcm) return 0; + if (pcm_ops && pcm_ops->pointer) + ret = pcm_ops->pointer(component, substream, &host); + + if (ret != -EOPNOTSUPP) + return ret ? ret : host; + /* use dsp ops pointer callback directly if set */ if (sof_ops(sdev)->pcm_pointer) return sof_ops(sdev)->pcm_pointer(sdev, substream); diff --git a/sound/soc/sof/sof-audio.h b/sound/soc/sof/sof-audio.h index 04e5cb2c70a7c..86bbb531e142c 100644 --- a/sound/soc/sof/sof-audio.h +++ b/sound/soc/sof/sof-audio.h @@ -103,7 +103,10 @@ struct snd_sof_dai_config_data { * additional memory in the SOF PCM stream structure * @pcm_free: Function pointer for PCM free that can be used for freeing any * additional memory in the SOF PCM stream structure - * @delay: Function pointer for pcm delay calculation + * @pointer: Function pointer for pcm pointer + * Note: the @pointer callback may return -EOPNOTSUPP which should be + * handled in a same way as if the callback is not provided + * @delay: Function pointer for pcm delay reporting * @reset_hw_params_during_stop: Flag indicating whether the hw_params should be reset during the * STOP pcm trigger * @ipc_first_on_start: Send IPC before invoking platform trigger during @@ -124,6 +127,9 @@ struct sof_ipc_pcm_ops { int (*dai_link_fixup)(struct snd_soc_pcm_runtime *rtd, struct snd_pcm_hw_params *params); int (*pcm_setup)(struct snd_sof_dev *sdev, struct snd_sof_pcm *spcm); void (*pcm_free)(struct snd_sof_dev *sdev, struct snd_sof_pcm *spcm); + int (*pointer)(struct snd_soc_component *component, + struct snd_pcm_substream *substream, + snd_pcm_uframes_t *pointer); snd_pcm_sframes_t (*delay)(struct snd_soc_component *component, struct snd_pcm_substream *substream); bool reset_hw_params_during_stop; -- GitLab From 0ea06680dfcb4464ac6c05968433d060efb44345 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:12 +0200 Subject: [PATCH 056/989] ASoC: SOF: ipc4-pcm: Correct the delay calculation This patch improves the delay calculation by relying on the LLP (Linear Link Position) on the DAI side and the LDP (Linear Data Pointer) on the host side. The LDP provides the same DMA position as LPIB, but with a linear count instead of a position in the ALSA ring buffer. The LDP values are provided in bytes and must be converted to frames. The difference in units means that the host counter will wrap earlier than the LLP. We need to wrap the LLP at the same boundary as the host counter. The ASoC framework relies on separate pointer and delay callback. Measurement errors can be reduced by processing all the counter values in the pointer callback. The delay value is stored, and will be reported to higher levels in the delay callback. For playback, the firmware provides a stream_start offset to handle mixing/pause usages, where the DAI might have started earlier than the PCM device. The delay calculation must be special-cased when the link counter has not reached the start offset value, i.e. no valid audio has left the DSP. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-16-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 159 +++++++++++++++++++++++++++++++-------- 1 file changed, 127 insertions(+), 32 deletions(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index 905dbc4852b1e..e915f9f87a6c3 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -19,14 +19,22 @@ * struct sof_ipc4_timestamp_info - IPC4 timestamp info * @host_copier: the host copier of the pcm stream * @dai_copier: the dai copier of the pcm stream - * @stream_start_offset: reported by fw in memory window + * @stream_start_offset: reported by fw in memory window (converted to frames) + * @stream_end_offset: reported by fw in memory window (converted to frames) * @llp_offset: llp offset in memory window + * @boundary: wrap boundary should be used for the LLP frame counter + * @delay: Calculated and stored in pointer callback. The stored value is + * returned in the delay callback. */ struct sof_ipc4_timestamp_info { struct sof_ipc4_copier *host_copier; struct sof_ipc4_copier *dai_copier; u64 stream_start_offset; + u64 stream_end_offset; u32 llp_offset; + + u64 boundary; + snd_pcm_sframes_t delay; }; static int sof_ipc4_set_multi_pipeline_state(struct snd_sof_dev *sdev, u32 state, @@ -726,6 +734,10 @@ static int sof_ipc4_pcm_setup(struct snd_sof_dev *sdev, struct snd_sof_pcm *spcm if (abi_version < SOF_IPC4_FW_REGS_ABI_VER) support_info = false; + /* For delay reporting the get_host_byte_counter callback is needed */ + if (!sof_ops(sdev) || !sof_ops(sdev)->get_host_byte_counter) + support_info = false; + for_each_pcm_streams(stream) { pipeline_list = &spcm->stream[stream].pipeline_list; @@ -858,7 +870,6 @@ static int sof_ipc4_get_stream_start_offset(struct snd_sof_dev *sdev, struct sof_ipc4_copier *host_copier = time_info->host_copier; struct sof_ipc4_copier *dai_copier = time_info->dai_copier; struct sof_ipc4_pipeline_registers ppl_reg; - u64 stream_start_position; u32 dai_sample_size; u32 ch, node_index; u32 offset; @@ -875,38 +886,51 @@ static int sof_ipc4_get_stream_start_offset(struct snd_sof_dev *sdev, if (ppl_reg.stream_start_offset == SOF_IPC4_INVALID_STREAM_POSITION) return -EINVAL; - stream_start_position = ppl_reg.stream_start_offset; ch = dai_copier->data.out_format.fmt_cfg; ch = SOF_IPC4_AUDIO_FORMAT_CFG_CHANNELS_COUNT(ch); dai_sample_size = (dai_copier->data.out_format.bit_depth >> 3) * ch; - /* convert offset to sample count */ - do_div(stream_start_position, dai_sample_size); - time_info->stream_start_offset = stream_start_position; + + /* convert offsets to frame count */ + time_info->stream_start_offset = ppl_reg.stream_start_offset; + do_div(time_info->stream_start_offset, dai_sample_size); + time_info->stream_end_offset = ppl_reg.stream_end_offset; + do_div(time_info->stream_end_offset, dai_sample_size); + + /* + * Calculate the wrap boundary need to be used for delay calculation + * The host counter is in bytes, it will wrap earlier than the frames + * based link counter. + */ + time_info->boundary = div64_u64(~((u64)0), + frames_to_bytes(substream->runtime, 1)); + /* Initialize the delay value to 0 (no delay) */ + time_info->delay = 0; return 0; } -static snd_pcm_sframes_t sof_ipc4_pcm_delay(struct snd_soc_component *component, - struct snd_pcm_substream *substream) +static int sof_ipc4_pcm_pointer(struct snd_soc_component *component, + struct snd_pcm_substream *substream, + snd_pcm_uframes_t *pointer) { struct snd_sof_dev *sdev = snd_soc_component_get_drvdata(component); struct snd_soc_pcm_runtime *rtd = snd_soc_substream_to_rtd(substream); struct sof_ipc4_timestamp_info *time_info; struct sof_ipc4_llp_reading_slot llp; - snd_pcm_uframes_t head_ptr, tail_ptr; + snd_pcm_uframes_t head_cnt, tail_cnt; struct snd_sof_pcm_stream *stream; + u64 dai_cnt, host_cnt, host_ptr; struct snd_sof_pcm *spcm; - u64 tmp_ptr; int ret; spcm = snd_sof_find_spcm_dai(component, rtd); if (!spcm) - return 0; + return -EOPNOTSUPP; stream = &spcm->stream[substream->stream]; time_info = stream->private; if (!time_info) - return 0; + return -EOPNOTSUPP; /* * stream_start_offset is updated to memory window by FW based on @@ -916,46 +940,116 @@ static snd_pcm_sframes_t sof_ipc4_pcm_delay(struct snd_soc_component *component, if (time_info->stream_start_offset == SOF_IPC4_INVALID_STREAM_POSITION) { ret = sof_ipc4_get_stream_start_offset(sdev, substream, stream, time_info); if (ret < 0) - return 0; + return -EOPNOTSUPP; } + /* For delay calculation we need the host counter */ + host_cnt = snd_sof_pcm_get_host_byte_counter(sdev, component, substream); + host_ptr = host_cnt; + + /* convert the host_cnt to frames */ + host_cnt = div64_u64(host_cnt, frames_to_bytes(substream->runtime, 1)); + /* * If the LLP counter is not reported by firmware in the SRAM window - * then read the dai (link) position via host accessible means if + * then read the dai (link) counter via host accessible means if * available. */ if (!time_info->llp_offset) { - tmp_ptr = snd_sof_pcm_get_dai_frame_counter(sdev, component, substream); - if (!tmp_ptr) - return 0; + dai_cnt = snd_sof_pcm_get_dai_frame_counter(sdev, component, substream); + if (!dai_cnt) + return -EOPNOTSUPP; } else { sof_mailbox_read(sdev, time_info->llp_offset, &llp, sizeof(llp)); - tmp_ptr = ((u64)llp.reading.llp_u << 32) | llp.reading.llp_l; + dai_cnt = ((u64)llp.reading.llp_u << 32) | llp.reading.llp_l; } + dai_cnt += time_info->stream_end_offset; - /* In two cases dai dma position is not accurate + /* In two cases dai dma counter is not accurate * (1) dai pipeline is started before host pipeline - * (2) multiple streams mixed into one. Each stream has the same dai dma position + * (2) multiple streams mixed into one. Each stream has the same dai dma + * counter + * + * Firmware calculates correct stream_start_offset for all cases + * including above two. + * Driver subtracts stream_start_offset from dai dma counter to get + * accurate one + */ + + /* + * On stream start the dai counter might not yet have reached the + * stream_start_offset value which means that no frames have left the + * DSP yet from the audio stream (on playback, capture streams have + * offset of 0 as we start capturing right away). + * In this case we need to adjust the distance between the counters by + * increasing the host counter by (offset - dai_counter). + * Otherwise the dai_counter needs to be adjusted to reflect the number + * of valid frames passed on the DAI side. * - * Firmware calculates correct stream_start_offset for all cases including above two. - * Driver subtracts stream_start_offset from dai dma position to get accurate one + * The delay is the difference between the counters on the two + * sides of the DSP. */ - tmp_ptr -= time_info->stream_start_offset; + if (dai_cnt < time_info->stream_start_offset) { + host_cnt += time_info->stream_start_offset - dai_cnt; + dai_cnt = 0; + } else { + dai_cnt -= time_info->stream_start_offset; + } + + /* Wrap the dai counter at the boundary where the host counter wraps */ + div64_u64_rem(dai_cnt, time_info->boundary, &dai_cnt); - /* Calculate the delay taking into account that both pointer can wrap */ - div64_u64_rem(tmp_ptr, substream->runtime->boundary, &tmp_ptr); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { - head_ptr = substream->runtime->status->hw_ptr; - tail_ptr = tmp_ptr; + head_cnt = host_cnt; + tail_cnt = dai_cnt; } else { - head_ptr = tmp_ptr; - tail_ptr = substream->runtime->status->hw_ptr; + head_cnt = dai_cnt; + tail_cnt = host_cnt; + } + + if (head_cnt < tail_cnt) { + time_info->delay = time_info->boundary - tail_cnt + head_cnt; + goto out; } - if (head_ptr < tail_ptr) - return substream->runtime->boundary - tail_ptr + head_ptr; + time_info->delay = head_cnt - tail_cnt; + +out: + /* + * Convert the host byte counter to PCM pointer which wraps in buffer + * and it is in frames + */ + div64_u64_rem(host_ptr, snd_pcm_lib_buffer_bytes(substream), &host_ptr); + *pointer = bytes_to_frames(substream->runtime, host_ptr); + + return 0; +} + +static snd_pcm_sframes_t sof_ipc4_pcm_delay(struct snd_soc_component *component, + struct snd_pcm_substream *substream) +{ + struct snd_soc_pcm_runtime *rtd = snd_soc_substream_to_rtd(substream); + struct sof_ipc4_timestamp_info *time_info; + struct snd_sof_pcm_stream *stream; + struct snd_sof_pcm *spcm; + + spcm = snd_sof_find_spcm_dai(component, rtd); + if (!spcm) + return 0; + + stream = &spcm->stream[substream->stream]; + time_info = stream->private; + /* + * Report the stored delay value calculated in the pointer callback. + * In the unlikely event that the calculation was skipped/aborted, the + * default 0 delay returned. + */ + if (time_info) + return time_info->delay; + + /* No delay information available, report 0 as delay */ + return 0; - return head_ptr - tail_ptr; } const struct sof_ipc_pcm_ops ipc4_pcm_ops = { @@ -965,6 +1059,7 @@ const struct sof_ipc_pcm_ops ipc4_pcm_ops = { .dai_link_fixup = sof_ipc4_pcm_dai_link_fixup, .pcm_setup = sof_ipc4_pcm_setup, .pcm_free = sof_ipc4_pcm_free, + .pointer = sof_ipc4_pcm_pointer, .delay = sof_ipc4_pcm_delay, .ipc_first_on_start = true, .platform_stop_during_hw_free = true, -- GitLab From f9eeb6bb13fb5d7af1ea5b74a10b1f8ead962540 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:13 +0200 Subject: [PATCH 057/989] ALSA: hda: Add pplcllpl/u members to hdac_ext_stream The pplcllpl/u can be used to save the Link Connection Linear Link Position register value to be used for compensation of the LLP register value in case the counter is not reset (after pause/resume or stop/start without closing the stream). The LLP can be used along with PPHCLDP to calculate delay caused by the DSP processing for HDA links. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-17-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- include/sound/hdaudio_ext.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/sound/hdaudio_ext.h b/include/sound/hdaudio_ext.h index a8bebac1e4b28..957295364a5e3 100644 --- a/include/sound/hdaudio_ext.h +++ b/include/sound/hdaudio_ext.h @@ -56,6 +56,9 @@ struct hdac_ext_stream { u32 pphcldpl; u32 pphcldpu; + u32 pplcllpl; + u32 pplcllpu; + bool decoupled:1; bool link_locked:1; bool link_prepared; -- GitLab From 1abc2642588e06f6180b3fbb21968cf5d0ba9e5f Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:14 +0200 Subject: [PATCH 058/989] ASoC: SOF: Intel: hda: Compensate LLP in case it is not reset During pause/reset or stop/start the LLP counter is not reset, which will result broken delay reporting. Read the LLP value on STOP/PAUSE trigger and use it in LLP reading to normalize the LLP from the register. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-18-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-dai-ops.c | 11 +++++++++++ sound/soc/sof/intel/hda-pcm.c | 8 ++++++++ sound/soc/sof/intel/hda-stream.c | 9 ++++++++- 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/intel/hda-dai-ops.c b/sound/soc/sof/intel/hda-dai-ops.c index c50ca9e72d373..b073720b4cf43 100644 --- a/sound/soc/sof/intel/hda-dai-ops.c +++ b/sound/soc/sof/intel/hda-dai-ops.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -362,6 +363,16 @@ static int hda_trigger(struct snd_sof_dev *sdev, struct snd_soc_dai *cpu_dai, case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_PAUSE_PUSH: snd_hdac_ext_stream_clear(hext_stream); + + /* + * Save the LLP registers in case the stream is + * restarting due PAUSE_RELEASE, or START without a pcm + * close/open since in this case the LLP register is not reset + * to 0 and the delay calculation will return with invalid + * results. + */ + hext_stream->pplcllpl = readl(hext_stream->pplc_addr + AZX_REG_PPLCLLPL); + hext_stream->pplcllpu = readl(hext_stream->pplc_addr + AZX_REG_PPLCLLPU); break; default: dev_err(sdev->dev, "unknown trigger command %d\n", cmd); diff --git a/sound/soc/sof/intel/hda-pcm.c b/sound/soc/sof/intel/hda-pcm.c index 69fefcd1abc5b..d7b446f3f973e 100644 --- a/sound/soc/sof/intel/hda-pcm.c +++ b/sound/soc/sof/intel/hda-pcm.c @@ -282,6 +282,14 @@ int hda_dsp_pcm_open(struct snd_sof_dev *sdev, /* binding pcm substream to hda stream */ substream->runtime->private_data = &dsp_stream->hstream; + + /* + * Reset the llp cache values (they are used for LLP compensation in + * case the counter is not reset) + */ + dsp_stream->pplcllpl = 0; + dsp_stream->pplcllpu = 0; + return 0; } diff --git a/sound/soc/sof/intel/hda-stream.c b/sound/soc/sof/intel/hda-stream.c index 8504a4f27b608..0c189d3b19c1a 100644 --- a/sound/soc/sof/intel/hda-stream.c +++ b/sound/soc/sof/intel/hda-stream.c @@ -1064,6 +1064,8 @@ snd_pcm_uframes_t hda_dsp_stream_get_position(struct hdac_stream *hstream, return pos; } +#define merge_u64(u32_u, u32_l) (((u64)(u32_u) << 32) | (u32_l)) + /** * hda_dsp_get_stream_llp - Retrieve the LLP (Linear Link Position) of the stream * @sdev: SOF device @@ -1093,7 +1095,12 @@ u64 hda_dsp_get_stream_llp(struct snd_sof_dev *sdev, llp_l = readl(hext_stream->pplc_addr + AZX_REG_PPLCLLPL); llp_u = readl(hext_stream->pplc_addr + AZX_REG_PPLCLLPU); - return ((u64)llp_u << 32) | llp_l; + /* Compensate the LLP counter with the saved offset */ + if (hext_stream->pplcllpl || hext_stream->pplcllpu) + return merge_u64(llp_u, llp_l) - + merge_u64(hext_stream->pplcllpu, hext_stream->pplcllpl); + + return merge_u64(llp_u, llp_l); } /** -- GitLab From c61115b37ff964d63191dbf4a058f481daabdf57 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 22 Mar 2024 13:25:04 +0200 Subject: [PATCH 059/989] ASoC: SOF: Intel: hda-dsp: Skip IMR boot on ACE platforms in case of S3 suspend SoCs with ACE architecture are tailored to use s2idle instead deep (S3) suspend state and the IMR content is lost when the system is forced to enter even to S3. When waking up from S3 state the IMR boot will fail as the content is lost. Set the skip_imr_boot flag to make sure that we don't try IMR in this case. Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Link: https://msgid.link/r/20240322112504.4192-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-dsp.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/sound/soc/sof/intel/hda-dsp.c b/sound/soc/sof/intel/hda-dsp.c index 31ffa1a8f2ac0..ef5c915db8ffb 100644 --- a/sound/soc/sof/intel/hda-dsp.c +++ b/sound/soc/sof/intel/hda-dsp.c @@ -681,17 +681,27 @@ static int hda_suspend(struct snd_sof_dev *sdev, bool runtime_suspend) struct sof_intel_hda_dev *hda = sdev->pdata->hw_pdata; const struct sof_intel_dsp_desc *chip = hda->desc; struct hdac_bus *bus = sof_to_bus(sdev); + bool imr_lost = false; int ret, j; /* - * The memory used for IMR boot loses its content in deeper than S3 state - * We must not try IMR boot on next power up (as it will fail). - * + * The memory used for IMR boot loses its content in deeper than S3 + * state on CAVS platforms. + * On ACE platforms due to the system architecture the IMR content is + * lost at S3 state already, they are tailored for s2idle use. + * We must not try IMR boot on next power up in these cases as it will + * fail. + */ + if (sdev->system_suspend_target > SOF_SUSPEND_S3 || + (chip->hw_ip_version >= SOF_INTEL_ACE_1_0 && + sdev->system_suspend_target == SOF_SUSPEND_S3)) + imr_lost = true; + + /* * In case of firmware crash or boot failure set the skip_imr_boot to true * as well in order to try to re-load the firmware to do a 'cold' boot. */ - if (sdev->system_suspend_target > SOF_SUSPEND_S3 || - sdev->fw_state == SOF_FW_CRASHED || + if (imr_lost || sdev->fw_state == SOF_FW_CRASHED || sdev->fw_state == SOF_FW_BOOT_FAILED) hda->skip_imr_boot = true; -- GitLab From e2d7ad717a6b0880843dbc60855a5b97ad0395f8 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Mon, 25 Mar 2024 14:44:50 +0000 Subject: [PATCH 060/989] ASoC: cs-amp-lib: Check for no firmware controls when writing calibration When a wmfw file has not been loaded the firmware control descriptions necessary to write a stored calibration are not present. In this case print a more descriptive error message. The message is logged at info level because it is not fatal, and does not necessarily imply that anything is broken. Signed-off-by: Simon Trimmer Signed-off-by: Richard Fitzgerald Link: https://msgid.link/r/20240325144450.293630-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs-amp-lib.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c index 01ef4db5407da..287ac01a38735 100644 --- a/sound/soc/codecs/cs-amp-lib.c +++ b/sound/soc/codecs/cs-amp-lib.c @@ -56,6 +56,11 @@ static int _cs_amp_write_cal_coeffs(struct cs_dsp *dsp, dev_dbg(dsp->dev, "Calibration: Ambient=%#x, Status=%#x, CalR=%d\n", data->calAmbient, data->calStatus, data->calR); + if (list_empty(&dsp->ctl_list)) { + dev_info(dsp->dev, "Calibration disabled due to missing firmware controls\n"); + return -ENOENT; + } + ret = cs_amp_write_cal_coeff(dsp, controls, controls->ambient, data->calAmbient); if (ret) return ret; -- GitLab From 708181c50b7763c689ecaba5db8075c2d03719c4 Mon Sep 17 00:00:00 2001 From: Rander Wang Date: Fri, 22 Mar 2024 13:27:03 +0200 Subject: [PATCH 061/989] ASoC: SOF: mtrace: rework mtrace timestamp setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original timestamp is built base on windows epoch time which is not fit for Linux system and difficult to be used for kernel debugging. This patch adopts syslog timestamp so that we can simply use dmesg to check the timestamp between fw and kernel. Signed-off-by: Rander Wang Reviewed-by: Péter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Kai Vehmanen Signed-off-by: Peter Ujfalusi Link: https://msgid.link/r/20240322112703.4549-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-mtrace.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sound/soc/sof/ipc4-mtrace.c b/sound/soc/sof/ipc4-mtrace.c index 9f1e33ee88261..0e04bea9432dd 100644 --- a/sound/soc/sof/ipc4-mtrace.c +++ b/sound/soc/sof/ipc4-mtrace.c @@ -4,6 +4,7 @@ #include #include +#include #include #include "sof-priv.h" #include "ipc4-priv.h" @@ -412,7 +413,6 @@ static int ipc4_mtrace_enable(struct snd_sof_dev *sdev) const struct sof_ipc_ops *iops = sdev->ipc->ops; struct sof_ipc4_msg msg; u64 system_time; - ktime_t kt; int ret; if (priv->mtrace_state != SOF_MTRACE_DISABLED) @@ -424,9 +424,12 @@ static int ipc4_mtrace_enable(struct snd_sof_dev *sdev) msg.primary |= SOF_IPC4_MOD_INSTANCE(SOF_IPC4_MOD_INIT_BASEFW_INSTANCE_ID); msg.extension = SOF_IPC4_MOD_EXT_MSG_PARAM_ID(SOF_IPC4_FW_PARAM_SYSTEM_TIME); - /* The system time is in usec, UTC, epoch is 1601-01-01 00:00:00 */ - kt = ktime_add_us(ktime_get_real(), FW_EPOCH_DELTA * USEC_PER_SEC); - system_time = ktime_to_us(kt); + /* + * local_clock() is used to align with dmesg, so both kernel and firmware logs have + * the same base and a minor delta due to the IPC. system time is in us format but + * local_clock() returns the time in ns, so convert to ns. + */ + system_time = div64_u64(local_clock(), NSEC_PER_USEC); msg.data_size = sizeof(system_time); msg.data_ptr = &system_time; ret = iops->set_get_data(sdev, &msg, msg.data_size, true); -- GitLab From a469158eaf8f4b10263b417856d923dfa38ae96d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Apitzsch?= Date: Mon, 25 Mar 2024 19:05:09 +0100 Subject: [PATCH 062/989] regulator: tps65132: Add of_match table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add of_match table for "ti,tps65132" compatible string. This fixes automatic driver loading when using device-tree, and if built as a module like major linux distributions do. Signed-off-by: André Apitzsch Link: https://msgid.link/r/20240325-of_tps65132-v1-1-86a5f7ef4ede@apitzsch.eu Signed-off-by: Mark Brown --- drivers/regulator/tps65132-regulator.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/regulator/tps65132-regulator.c b/drivers/regulator/tps65132-regulator.c index a06f5f2d79329..9c2f0dd42613d 100644 --- a/drivers/regulator/tps65132-regulator.c +++ b/drivers/regulator/tps65132-regulator.c @@ -267,10 +267,17 @@ static const struct i2c_device_id tps65132_id[] = { }; MODULE_DEVICE_TABLE(i2c, tps65132_id); +static const struct of_device_id __maybe_unused tps65132_of_match[] = { + { .compatible = "ti,tps65132" }, + {}, +}; +MODULE_DEVICE_TABLE(of, tps65132_of_match); + static struct i2c_driver tps65132_i2c_driver = { .driver = { .name = "tps65132", .probe_type = PROBE_PREFER_ASYNCHRONOUS, + .of_match_table = of_match_ptr(tps65132_of_match), }, .probe = tps65132_probe, .id_table = tps65132_id, -- GitLab From 8e936e98718f005c986be0bfa1ee6b355acf96be Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 21 Mar 2024 14:20:41 +0530 Subject: [PATCH 063/989] RISC-V: KVM: Fix APLIC in_clrip[x] read emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reads to APLIC in_clrip[x] registers returns rectified input values of the interrupt sources. A rectified input value of an interrupt source is defined by the section "4.5.2 Source configurations (sourcecfg[1]–sourcecfg[1023])" of the RISC-V AIA specification as: rectified input value = (incoming wire value) XOR (source is inverted) Update the riscv_aplic_input() implementation to match the above. Cc: stable@vger.kernel.org Fixes: 74967aa208e2 ("RISC-V: KVM: Add in-kernel emulation of AIA APLIC") Signed-off-by: Anup Patel Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20240321085041.1955293-3-apatel@ventanamicro.com --- arch/riscv/kvm/aia_aplic.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kvm/aia_aplic.c b/arch/riscv/kvm/aia_aplic.c index 5e842b92dc461..b467ba5ed9100 100644 --- a/arch/riscv/kvm/aia_aplic.c +++ b/arch/riscv/kvm/aia_aplic.c @@ -197,16 +197,31 @@ static void aplic_write_enabled(struct aplic *aplic, u32 irq, bool enabled) static bool aplic_read_input(struct aplic *aplic, u32 irq) { - bool ret; - unsigned long flags; + u32 sourcecfg, sm, raw_input, irq_inverted; struct aplic_irq *irqd; + unsigned long flags; + bool ret = false; if (!irq || aplic->nr_irqs <= irq) return false; irqd = &aplic->irqs[irq]; raw_spin_lock_irqsave(&irqd->lock, flags); - ret = (irqd->state & APLIC_IRQ_STATE_INPUT) ? true : false; + + sourcecfg = irqd->sourcecfg; + if (sourcecfg & APLIC_SOURCECFG_D) + goto skip; + + sm = sourcecfg & APLIC_SOURCECFG_SM_MASK; + if (sm == APLIC_SOURCECFG_SM_INACTIVE) + goto skip; + + raw_input = (irqd->state & APLIC_IRQ_STATE_INPUT) ? 1 : 0; + irq_inverted = (sm == APLIC_SOURCECFG_SM_LEVEL_LOW || + sm == APLIC_SOURCECFG_SM_EDGE_FALL) ? 1 : 0; + ret = !!(raw_input ^ irq_inverted); + +skip: raw_spin_unlock_irqrestore(&irqd->lock, flags); return ret; -- GitLab From e89c928bedd77d181edc2df01cb6672184775140 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 5 Mar 2024 18:48:39 +0000 Subject: [PATCH 064/989] KVM: arm64: Fix host-programmed guest events in nVHE Programming PMU events in the host that count during guest execution is a feature supported by perf, e.g. perf stat -e cpu_cycles:G ./lkvm run While this works for VHE, the guest/host event bitmaps are not carried through to the hypervisor in the nVHE configuration. Make kvm_pmu_update_vcpu_events() conditional on whether or not _hardware_ supports PMUv3 rather than if the vCPU as vPMU enabled. Cc: stable@vger.kernel.org Fixes: 84d751a019a9 ("KVM: arm64: Pass pmu events to hyp via vcpu") Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240305184840.636212-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- include/kvm/arm_pmu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index eb4c369a79eb3..35d4ca4f6122c 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -86,7 +86,7 @@ void kvm_vcpu_pmu_resync_el0(void); */ #define kvm_pmu_update_vcpu_events(vcpu) \ do { \ - if (!has_vhe() && kvm_vcpu_has_pmu(vcpu)) \ + if (!has_vhe() && kvm_arm_support_pmu_v3()) \ vcpu->arch.pmu.events = *kvm_get_pmu_events(); \ } while (0) -- GitLab From f5fe0adeed6019df495497a64cb57d563ead2296 Mon Sep 17 00:00:00 2001 From: Wujie Duan Date: Mon, 18 Mar 2024 17:47:35 +0800 Subject: [PATCH 065/989] KVM: arm64: Fix out-of-IPA space translation fault handling Commit 11e5ea5242e3 ("KVM: arm64: Use helpers to classify exception types reported via ESR") tried to abstract the translation fault check when handling an out-of IPA space condition, but incorrectly replaced it with a permission fault check. Restore the previous translation fault check. Fixes: 11e5ea5242e3 ("KVM: arm64: Use helpers to classify exception types reported via ESR") Acked-by: Ard Biesheuvel Reviewed-by: Marc Zyngier Signed-off-by: Wujie Duan Cc: stable@vger.kernel.org Link: https://lore.kernel.org/kvmarm/864jd3269g.wl-maz@kernel.org/ Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 18680771cdb0e..dc04bc7678659 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1637,7 +1637,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); - if (esr_fsc_is_permission_fault(esr)) { + if (esr_fsc_is_translation_fault(esr)) { /* Beyond sanitised PARange (which is the IPA limit) */ if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { kvm_inject_size_fault(vcpu); -- GitLab From 29b0075ed61cda250449f556fbe007a5c469440c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 10:51:15 -0700 Subject: [PATCH 066/989] KVM: selftests: Fix __GUEST_ASSERT() format warnings in ARM's arch timer test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use %x instead of %lx when printing uint32_t variables to fix format warnings in ARM's arch timer test. aarch64/arch_timer.c: In function ‘guest_run_stage’: aarch64/arch_timer.c:138:33: warning: format ‘%lx’ expects argument of type ‘long unsigned int’, but argument 6 has type ‘uint32_t’ {aka ‘unsigned int’} [-Wformat=] 138 | "config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n" | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ...... 141 | config_iter + 1, irq_iter); | ~~~~~~~~~~~~~~~ | | | uint32_t {aka unsigned int} Fixes: d1dafd065a23 ("KVM: arm64: selftests: Enable tuning of error margin in arch_timer test") Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20240314175116.2366301-1-seanjc@google.com Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index ddba2c2fb5deb..93100b3f1312a 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -135,7 +135,7 @@ static void guest_run_stage(struct test_vcpu_shared_data *shared_data, irq_iter = READ_ONCE(shared_data->nr_iter); __GUEST_ASSERT(config_iter + 1 == irq_iter, - "config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n" + "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" " Guest timer interrupt was not trigged within the specified\n" " interval, try to increase the error margin by [-e] option.\n", config_iter + 1, irq_iter); -- GitLab From 502892bbd2021fbe20f0d702b6b0bae281a742fa Mon Sep 17 00:00:00 2001 From: Irui Wang Date: Wed, 6 Mar 2024 10:12:23 +0800 Subject: [PATCH 067/989] media: mediatek: vcodec: Handle VP9 superframe bitstream with 8 sub-frames The VP9 bitstream uses superframes, which each contain 8 sub-frames, enable accessing the last superframe by increasing the range of the index vaidation as the maximum number of superframes is 8 and not 7, so that the last sub-frame can be decoded normally with the stateful VP9 decoder. Signed-off-by: Irui Wang Signed-off-by: Sebastian Fricke Signed-off-by: Hans Verkuil --- .../mediatek/vcodec/decoder/vdec/vdec_vp9_if.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_if.c index 55355fa700908..039082f600c81 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_if.c @@ -16,6 +16,7 @@ #include "../vdec_drv_base.h" #include "../vdec_vpu_if.h" +#define VP9_MAX_SUPER_FRAMES_NUM 8 #define VP9_SUPER_FRAME_BS_SZ 64 #define MAX_VP9_DPB_SIZE 9 @@ -133,11 +134,11 @@ struct vp9_sf_ref_fb { */ struct vdec_vp9_vsi { unsigned char sf_bs_buf[VP9_SUPER_FRAME_BS_SZ]; - struct vp9_sf_ref_fb sf_ref_fb[VP9_MAX_FRM_BUF_NUM-1]; + struct vp9_sf_ref_fb sf_ref_fb[VP9_MAX_SUPER_FRAMES_NUM]; int sf_next_ref_fb_idx; unsigned int sf_frm_cnt; - unsigned int sf_frm_offset[VP9_MAX_FRM_BUF_NUM-1]; - unsigned int sf_frm_sz[VP9_MAX_FRM_BUF_NUM-1]; + unsigned int sf_frm_offset[VP9_MAX_SUPER_FRAMES_NUM]; + unsigned int sf_frm_sz[VP9_MAX_SUPER_FRAMES_NUM]; unsigned int sf_frm_idx; unsigned int sf_init; struct vdec_fb fb; @@ -526,7 +527,7 @@ static void vp9_swap_frm_bufs(struct vdec_vp9_inst *inst) /* if this super frame and it is not last sub-frame, get next fb for * sub-frame decode */ - if (vsi->sf_frm_cnt > 0 && vsi->sf_frm_idx != vsi->sf_frm_cnt - 1) + if (vsi->sf_frm_cnt > 0 && vsi->sf_frm_idx != vsi->sf_frm_cnt) vsi->sf_next_ref_fb_idx = vp9_get_sf_ref_fb(inst); } @@ -735,7 +736,7 @@ static void get_free_fb(struct vdec_vp9_inst *inst, struct vdec_fb **out_fb) static int validate_vsi_array_indexes(struct vdec_vp9_inst *inst, struct vdec_vp9_vsi *vsi) { - if (vsi->sf_frm_idx >= VP9_MAX_FRM_BUF_NUM - 1) { + if (vsi->sf_frm_idx > VP9_MAX_SUPER_FRAMES_NUM) { mtk_vdec_err(inst->ctx, "Invalid vsi->sf_frm_idx=%u.", vsi->sf_frm_idx); return -EIO; } -- GitLab From 97c75ee5de060d271d80109b0c47cb6008439e5b Mon Sep 17 00:00:00 2001 From: Nicolas Dufresne Date: Mon, 26 Feb 2024 16:19:52 -0500 Subject: [PATCH 068/989] media: mediatek: vcodec: Fix oops when HEVC init fails The stateless HEVC decoder saves the instance pointer in the context regardless if the initialization worked or not. This caused a use after free, when the pointer is freed in case of a failure in the deinit function. Only store the instance pointer when the initialization was successful, to solve this issue. Hardware name: Acer Tomato (rev3 - 4) board (DT) pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : vcodec_vpu_send_msg+0x4c/0x190 [mtk_vcodec_dec] lr : vcodec_send_ap_ipi+0x78/0x170 [mtk_vcodec_dec] sp : ffff80008750bc20 x29: ffff80008750bc20 x28: ffff1299f6d70000 x27: 0000000000000000 x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000 x23: ffff80008750bc98 x22: 000000000000a003 x21: ffffd45c4cfae000 x20: 0000000000000010 x19: ffff1299fd668310 x18: 000000000000001a x17: 000000040044ffff x16: ffffd45cb15dc648 x15: 0000000000000000 x14: ffff1299c08da1c0 x13: ffffd45cb1f87a10 x12: ffffd45cb2f5fe80 x11: 0000000000000001 x10: 0000000000001b30 x9 : ffffd45c4d12b488 x8 : 1fffe25339380d81 x7 : 0000000000000001 x6 : ffff1299c9c06c00 x5 : 0000000000000132 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 0000000000000010 x1 : ffff80008750bc98 x0 : 0000000000000000 Call trace: vcodec_vpu_send_msg+0x4c/0x190 [mtk_vcodec_dec] vcodec_send_ap_ipi+0x78/0x170 [mtk_vcodec_dec] vpu_dec_deinit+0x1c/0x30 [mtk_vcodec_dec] vdec_hevc_slice_deinit+0x30/0x98 [mtk_vcodec_dec] vdec_if_deinit+0x38/0x68 [mtk_vcodec_dec] mtk_vcodec_dec_release+0x20/0x40 [mtk_vcodec_dec] fops_vcodec_release+0x64/0x118 [mtk_vcodec_dec] v4l2_release+0x7c/0x100 __fput+0x80/0x2d8 __fput_sync+0x58/0x70 __arm64_sys_close+0x40/0x90 invoke_syscall+0x50/0x128 el0_svc_common.constprop.0+0x48/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x38/0xd8 el0t_64_sync_handler+0xc0/0xc8 el0t_64_sync+0x1a8/0x1b0 Code: d503201f f9401660 b900127f b900227f (f9400400) Signed-off-by: Nicolas Dufresne Fixes: 2674486aac7d ("media: mediatek: vcodec: support stateless hevc decoder") Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Sebastian Fricke Signed-off-by: Hans Verkuil --- .../mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c index 06ed47df693bf..21836dd6ef85a 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c @@ -869,7 +869,6 @@ static int vdec_hevc_slice_init(struct mtk_vcodec_dec_ctx *ctx) inst->vpu.codec_type = ctx->current_codec; inst->vpu.capture_type = ctx->capture_fourcc; - ctx->drv_handle = inst; err = vpu_dec_init(&inst->vpu); if (err) { mtk_vdec_err(ctx, "vdec_hevc init err=%d", err); @@ -898,6 +897,7 @@ static int vdec_hevc_slice_init(struct mtk_vcodec_dec_ctx *ctx) mtk_vdec_debug(ctx, "lat hevc instance >> %p, codec_type = 0x%x", inst, inst->vpu.codec_type); + ctx->drv_handle = inst; return 0; error_free_inst: kfree(inst); -- GitLab From 6467cda18c9f9b5f2f9a0aa1e2861c653e41f382 Mon Sep 17 00:00:00 2001 From: Yunfei Dong Date: Thu, 14 Mar 2024 20:30:08 +0800 Subject: [PATCH 069/989] media: mediatek: vcodec: adding lock to protect decoder context list Add a lock for the ctx_list, to avoid accessing a NULL pointer within the 'vpu_dec_ipi_handler' function when the ctx_list has been deleted due to an unexpected behavior on the SCP IP block. Hardware name: Google juniper sku16 board (DT) pstate: 20400005 (nzCv daif +PAN -UAO -TCO BTYPE=--) pc : vpu_dec_ipi_handler+0x58/0x1f8 [mtk_vcodec_dec] lr : scp_ipi_handler+0xd0/0x194 [mtk_scp] sp : ffffffc0131dbbd0 x29: ffffffc0131dbbd0 x28: 0000000000000000 x27: ffffff9bb277f348 x26: ffffff9bb242ad00 x25: ffffffd2d440d3b8 x24: ffffffd2a13ff1d4 x23: ffffff9bb7fe85a0 x22: ffffffc0133fbdb0 x21: 0000000000000010 x20: ffffff9b050ea328 x19: ffffffc0131dbc08 x18: 0000000000001000 x17: 0000000000000000 x16: ffffffd2d461c6e0 x15: 0000000000000242 x14: 000000000000018f x13: 000000000000004d x12: 0000000000000000 x11: 0000000000000001 x10: fffffffffffffff0 x9 : ffffff9bb6e793a8 x8 : 0000000000000000 x7 : 0000000000000000 x6 : 000000000000003f x5 : 0000000000000040 x4 : fffffffffffffff0 x3 : 0000000000000020 x2 : ffffff9bb6e79080 x1 : 0000000000000010 x0 : ffffffc0131dbc08 Call trace: vpu_dec_ipi_handler+0x58/0x1f8 [mtk_vcodec_dec (HASH:6c3f 2)] scp_ipi_handler+0xd0/0x194 [mtk_scp (HASH:7046 3)] mt8183_scp_irq_handler+0x44/0x88 [mtk_scp (HASH:7046 3)] scp_irq_handler+0x48/0x90 [mtk_scp (HASH:7046 3)] irq_thread_fn+0x38/0x94 irq_thread+0x100/0x1c0 kthread+0x140/0x1fc ret_from_fork+0x10/0x30 Code: 54000088 f94ca50a eb14015f 54000060 (f9400108) ---[ end trace ace43ce36cbd5c93 ]--- Kernel panic - not syncing: Oops: Fatal exception SMP: stopping secondary CPUs Kernel Offset: 0x12c4000000 from 0xffffffc010000000 PHYS_OFFSET: 0xffffffe580000000 CPU features: 0x08240002,2188200c Memory Limit: none Fixes: 655b86e52eac ("media: mediatek: vcodec: Fix possible invalid memory access for decoder") Signed-off-by: Yunfei Dong Reviewed-by: Nicolas Dufresne Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Sebastian Fricke Signed-off-by: Hans Verkuil --- .../platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c | 4 ++-- .../platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c | 5 +++++ .../platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h | 2 ++ drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c | 2 ++ 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c index 4c34344dc7dcb..62cafe25fed94 100644 --- a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c +++ b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c @@ -50,12 +50,12 @@ static void mtk_vcodec_vpu_reset_dec_handler(void *priv) dev_err(&dev->plat_dev->dev, "Watchdog timeout!!"); - mutex_lock(&dev->dev_mutex); + mutex_lock(&dev->dev_ctx_lock); list_for_each_entry(ctx, &dev->ctx_list, list) { ctx->state = MTK_STATE_ABORT; mtk_v4l2_vdec_dbg(0, ctx, "[%d] Change to state MTK_STATE_ABORT", ctx->id); } - mutex_unlock(&dev->dev_mutex); + mutex_unlock(&dev->dev_ctx_lock); } static void mtk_vcodec_vpu_reset_enc_handler(void *priv) diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c index f47c98faf068b..2073781ccadb1 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c @@ -268,7 +268,9 @@ static int fops_vcodec_open(struct file *file) ctx->dev->vdec_pdata->init_vdec_params(ctx); + mutex_lock(&dev->dev_ctx_lock); list_add(&ctx->list, &dev->ctx_list); + mutex_unlock(&dev->dev_ctx_lock); mtk_vcodec_dbgfs_create(ctx); mutex_unlock(&dev->dev_mutex); @@ -311,7 +313,9 @@ static int fops_vcodec_release(struct file *file) v4l2_ctrl_handler_free(&ctx->ctrl_hdl); mtk_vcodec_dbgfs_remove(dev, ctx->id); + mutex_lock(&dev->dev_ctx_lock); list_del_init(&ctx->list); + mutex_unlock(&dev->dev_ctx_lock); kfree(ctx); mutex_unlock(&dev->dev_mutex); return 0; @@ -404,6 +408,7 @@ static int mtk_vcodec_probe(struct platform_device *pdev) for (i = 0; i < MTK_VDEC_HW_MAX; i++) mutex_init(&dev->dec_mutex[i]); mutex_init(&dev->dev_mutex); + mutex_init(&dev->dev_ctx_lock); spin_lock_init(&dev->irqlock); snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), "%s", diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h index 849b89dd205c2..85b2c0d3d8bcd 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h +++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h @@ -241,6 +241,7 @@ struct mtk_vcodec_dec_ctx { * * @dec_mutex: decoder hardware lock * @dev_mutex: video_device lock + * @dev_ctx_lock: the lock of context list * @decode_workqueue: decode work queue * * @irqlock: protect data access by irq handler and work thread @@ -282,6 +283,7 @@ struct mtk_vcodec_dec_dev { /* decoder hardware mutex lock */ struct mutex dec_mutex[MTK_VDEC_HW_MAX]; struct mutex dev_mutex; + struct mutex dev_ctx_lock; struct workqueue_struct *decode_workqueue; spinlock_t irqlock; diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c index 82e57ae983d55..da6be556727bb 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c @@ -77,12 +77,14 @@ static bool vpu_dec_check_ap_inst(struct mtk_vcodec_dec_dev *dec_dev, struct vde struct mtk_vcodec_dec_ctx *ctx; int ret = false; + mutex_lock(&dec_dev->dev_ctx_lock); list_for_each_entry(ctx, &dec_dev->ctx_list, list) { if (!IS_ERR_OR_NULL(ctx) && ctx->vpu_inst == vpu) { ret = true; break; } } + mutex_unlock(&dec_dev->dev_ctx_lock); return ret; } -- GitLab From afaaf3a0f647a24a7bf6a2145d8ade37baaf75ad Mon Sep 17 00:00:00 2001 From: Yunfei Dong Date: Thu, 14 Mar 2024 20:30:09 +0800 Subject: [PATCH 070/989] media: mediatek: vcodec: adding lock to protect encoder context list Add a lock for the ctx_list, to avoid accessing a NULL pointer within the 'vpu_enc_ipi_handler' function when the ctx_list has been deleted due to an unexpected behavior on the SCP IP block. Fixes: 1972e32431ed ("media: mediatek: vcodec: Fix possible invalid memory access for encoder") Signed-off-by: Yunfei Dong Reviewed-by: Nicolas Dufresne Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Sebastian Fricke Signed-off-by: Hans Verkuil --- .../platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c | 4 ++-- .../platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c | 5 +++++ .../platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h | 2 ++ drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c | 2 ++ 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c index 62cafe25fed94..d7027d600208f 100644 --- a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c +++ b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c @@ -65,12 +65,12 @@ static void mtk_vcodec_vpu_reset_enc_handler(void *priv) dev_err(&dev->plat_dev->dev, "Watchdog timeout!!"); - mutex_lock(&dev->dev_mutex); + mutex_lock(&dev->dev_ctx_lock); list_for_each_entry(ctx, &dev->ctx_list, list) { ctx->state = MTK_STATE_ABORT; mtk_v4l2_vdec_dbg(0, ctx, "[%d] Change to state MTK_STATE_ABORT", ctx->id); } - mutex_unlock(&dev->dev_mutex); + mutex_unlock(&dev->dev_ctx_lock); } static const struct mtk_vcodec_fw_ops mtk_vcodec_vpu_msg = { diff --git a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c index 6319f24bc714b..3cb8a16222220 100644 --- a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c +++ b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c @@ -177,7 +177,9 @@ static int fops_vcodec_open(struct file *file) mtk_v4l2_venc_dbg(2, ctx, "Create instance [%d]@%p m2m_ctx=%p ", ctx->id, ctx, ctx->m2m_ctx); + mutex_lock(&dev->dev_ctx_lock); list_add(&ctx->list, &dev->ctx_list); + mutex_unlock(&dev->dev_ctx_lock); mutex_unlock(&dev->dev_mutex); mtk_v4l2_venc_dbg(0, ctx, "%s encoder [%d]", dev_name(&dev->plat_dev->dev), @@ -212,7 +214,9 @@ static int fops_vcodec_release(struct file *file) v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->ctrl_hdl); + mutex_lock(&dev->dev_ctx_lock); list_del_init(&ctx->list); + mutex_unlock(&dev->dev_ctx_lock); kfree(ctx); mutex_unlock(&dev->dev_mutex); return 0; @@ -294,6 +298,7 @@ static int mtk_vcodec_probe(struct platform_device *pdev) mutex_init(&dev->enc_mutex); mutex_init(&dev->dev_mutex); + mutex_init(&dev->dev_ctx_lock); spin_lock_init(&dev->irqlock); snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), "%s", diff --git a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h index a042f607ed8d1..0bd85d0fb379a 100644 --- a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h +++ b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h @@ -178,6 +178,7 @@ struct mtk_vcodec_enc_ctx { * * @enc_mutex: encoder hardware lock. * @dev_mutex: video_device lock + * @dev_ctx_lock: the lock of context list * @encode_workqueue: encode work queue * * @enc_irq: h264 encoder irq resource @@ -205,6 +206,7 @@ struct mtk_vcodec_enc_dev { /* encoder hardware mutex lock */ struct mutex enc_mutex; struct mutex dev_mutex; + struct mutex dev_ctx_lock; struct workqueue_struct *encode_workqueue; int enc_irq; diff --git a/drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c b/drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c index 84ad1cc6ad171..51bb7ee141b9e 100644 --- a/drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c +++ b/drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c @@ -47,12 +47,14 @@ static bool vpu_enc_check_ap_inst(struct mtk_vcodec_enc_dev *enc_dev, struct ven struct mtk_vcodec_enc_ctx *ctx; int ret = false; + mutex_lock(&enc_dev->dev_ctx_lock); list_for_each_entry(ctx, &enc_dev->ctx_list, list) { if (!IS_ERR_OR_NULL(ctx) && ctx->vpu_inst == vpu) { ret = true; break; } } + mutex_unlock(&enc_dev->dev_ctx_lock); return ret; } -- GitLab From d353c3c34af08cfd4eaafc8c55f664eacec274ee Mon Sep 17 00:00:00 2001 From: Yunfei Dong Date: Wed, 6 Mar 2024 20:19:02 +0800 Subject: [PATCH 071/989] media: mediatek: vcodec: support 36 bits physical address The physical address on the MT8188 platform is larger than 32 bits, change the type from unsigned int to dma_addr_t to be able to access the high bits of the address. Signed-off-by: Yunfei Dong Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Sebastian Fricke Signed-off-by: Hans Verkuil --- .../media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c | 2 +- .../mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c index 19407f9bc773c..987b3d71b662a 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c @@ -449,7 +449,7 @@ static int vdec_vp8_decode(void *h_vdec, struct mtk_vcodec_mem *bs, inst->frm_cnt, y_fb_dma, c_fb_dma, fb); inst->cur_fb = fb; - dec->bs_dma = (unsigned long)bs->dma_addr; + dec->bs_dma = (uint64_t)bs->dma_addr; dec->bs_sz = bs->size; dec->cur_y_fb_dma = y_fb_dma; dec->cur_c_fb_dma = c_fb_dma; diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c index cf48d09b78d7a..eea709d938209 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c @@ -1074,7 +1074,7 @@ static int vdec_vp9_slice_setup_tile_buffer(struct vdec_vp9_slice_instance *inst unsigned int mi_row; unsigned int mi_col; unsigned int offset; - unsigned int pa; + dma_addr_t pa; unsigned int size; struct vdec_vp9_slice_tiles *tiles; unsigned char *pos; @@ -1109,7 +1109,7 @@ static int vdec_vp9_slice_setup_tile_buffer(struct vdec_vp9_slice_instance *inst pos = va + offset; end = va + bs->size; /* truncated */ - pa = (unsigned int)bs->dma_addr + offset; + pa = bs->dma_addr + offset; tb = instance->tile.va; for (i = 0; i < rows; i++) { for (j = 0; j < cols; j++) { -- GitLab From 56ebbd19c2989f7450341f581e2724a149d0f08e Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 26 Mar 2024 10:54:34 +0000 Subject: [PATCH 072/989] ASoC: cs42l43: Correct extraction of data pointer in suspend/resume The current code is pulling the wrong pointer causing it to disable the wrong IRQ. Correct the code to pull the correct cs42l43 core data pointer. Fixes: 64353af49fec ("ASoC: cs42l43: Add system suspend ops to disable IRQ") Signed-off-by: Charles Keepax Link: https://msgid.link/r/20240326105434.852907-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l43.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sound/soc/codecs/cs42l43.c b/sound/soc/codecs/cs42l43.c index 860d5cda67bff..94685449f0f48 100644 --- a/sound/soc/codecs/cs42l43.c +++ b/sound/soc/codecs/cs42l43.c @@ -2364,7 +2364,8 @@ static int cs42l43_codec_runtime_resume(struct device *dev) static int cs42l43_codec_suspend(struct device *dev) { - struct cs42l43 *cs42l43 = dev_get_drvdata(dev); + struct cs42l43_codec *priv = dev_get_drvdata(dev); + struct cs42l43 *cs42l43 = priv->core; disable_irq(cs42l43->irq); @@ -2373,7 +2374,8 @@ static int cs42l43_codec_suspend(struct device *dev) static int cs42l43_codec_suspend_noirq(struct device *dev) { - struct cs42l43 *cs42l43 = dev_get_drvdata(dev); + struct cs42l43_codec *priv = dev_get_drvdata(dev); + struct cs42l43 *cs42l43 = priv->core; enable_irq(cs42l43->irq); @@ -2382,7 +2384,8 @@ static int cs42l43_codec_suspend_noirq(struct device *dev) static int cs42l43_codec_resume(struct device *dev) { - struct cs42l43 *cs42l43 = dev_get_drvdata(dev); + struct cs42l43_codec *priv = dev_get_drvdata(dev); + struct cs42l43 *cs42l43 = priv->core; enable_irq(cs42l43->irq); @@ -2391,7 +2394,8 @@ static int cs42l43_codec_resume(struct device *dev) static int cs42l43_codec_resume_noirq(struct device *dev) { - struct cs42l43 *cs42l43 = dev_get_drvdata(dev); + struct cs42l43_codec *priv = dev_get_drvdata(dev); + struct cs42l43 *cs42l43 = priv->core; disable_irq(cs42l43->irq); -- GitLab From 674bc0168e6b68070c75df22e97ab63b6eb60d89 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Fri, 1 Mar 2024 12:18:32 -0800 Subject: [PATCH 073/989] riscv: mm: Fix prototype to avoid discarding const __flush_tlb_range() does not modify the provided cpumask, so its cmask parameter can be pointer-to-const. This avoids the unsafe cast of cpu_online_mask. Fixes: 54d7431af73e ("riscv: Add support for BATCHED_UNMAP_TLB_FLUSH") Signed-off-by: Samuel Holland Reviewed-by: Andrew Jones Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240301201837.2826172-1-samuel.holland@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/tlbflush.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 893566e004b73..07d743f87b3f6 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -99,7 +99,7 @@ static void __ipi_flush_tlb_range_asid(void *info) local_flush_tlb_range_asid(d->start, d->size, d->stride, d->asid); } -static void __flush_tlb_range(struct cpumask *cmask, unsigned long asid, +static void __flush_tlb_range(const struct cpumask *cmask, unsigned long asid, unsigned long start, unsigned long size, unsigned long stride) { @@ -200,7 +200,7 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, void flush_tlb_kernel_range(unsigned long start, unsigned long end) { - __flush_tlb_range((struct cpumask *)cpu_online_mask, FLUSH_TLB_NO_ASID, + __flush_tlb_range(cpu_online_mask, FLUSH_TLB_NO_ASID, start, end - start, PAGE_SIZE); } -- GitLab From ea558de7238bb12c3435c47f0631e9d17bf4a09f Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Sat, 16 Mar 2024 12:38:29 +0100 Subject: [PATCH 074/989] i40e: Enforce software interrupt during busy-poll exit As for ice bug fixed by commit b7306b42beaf ("ice: manage interrupts during poll exit") followed by commit 23be7075b318 ("ice: fix software generating extra interrupts") I'm seeing the similar issue also with i40e driver. In certain situation when busy-loop is enabled together with adaptive coalescing, the driver occasionally misses that there are outstanding descriptors to clean when exiting busy poll. Try to catch the remaining work by triggering a software interrupt when exiting busy poll. No extra interrupts will be generated when busy polling is not used. The issue was found when running sockperf ping-pong tcp test with adaptive coalescing and busy poll enabled (50 as value busy_pool and busy_read sysctl knobs) and results in huge latency spikes with more than 100000us. The fix is inspired from the ice driver and do the following: 1) During napi poll exit in case of busy-poll (napo_complete_done() returns false) this is recorded to q_vector that we were in busy loop. 2) Extends i40e_buildreg_itr() to be able to add an enforced software interrupt into built value 2) In i40e_update_enable_itr() enforces a software interrupt trigger if we are exiting busy poll to catch any pending clean-ups 3) Reuses unused 3rd ITR (interrupt throttle) index and set it to 20K interrupts per second to limit the number of these sw interrupts. Test results ============ Prior: [root@dell-per640-07 net]# sockperf ping-pong -i 10.9.9.1 --tcp -m 1000 --mps=max -t 120 sockperf: == version #3.10-no.git == sockperf[CLIENT] send on:sockperf: using recvfrom() to block on socket(s) [ 0] IP = 10.9.9.1 PORT = 11111 # TCP sockperf: Warmup stage (sending a few dummy messages)... sockperf: Starting test... sockperf: Test end (interrupted by timer) sockperf: Test ended sockperf: [Total Run] RunTime=119.999 sec; Warm up time=400 msec; SentMessages=2438563; ReceivedMessages=2438562 sockperf: ========= Printing statistics for Server No: 0 sockperf: [Valid Duration] RunTime=119.549 sec; SentMessages=2429473; ReceivedMessages=2429473 sockperf: ====> avg-latency=24.571 (std-dev=93.297, mean-ad=4.904, median-ad=1.510, siqr=1.063, cv=3.797, std-error=0.060, 99.0% ci=[24.417, 24.725]) sockperf: # dropped messages = 0; # duplicated messages = 0; # out-of-order messages = 0 sockperf: Summary: Latency is 24.571 usec sockperf: Total 2429473 observations; each percentile contains 24294.73 observations sockperf: ---> observation = 103294.331 sockperf: ---> percentile 99.999 = 45.633 sockperf: ---> percentile 99.990 = 37.013 sockperf: ---> percentile 99.900 = 35.910 sockperf: ---> percentile 99.000 = 33.390 sockperf: ---> percentile 90.000 = 28.626 sockperf: ---> percentile 75.000 = 27.741 sockperf: ---> percentile 50.000 = 26.743 sockperf: ---> percentile 25.000 = 25.614 sockperf: ---> observation = 12.220 After: [root@dell-per640-07 net]# sockperf ping-pong -i 10.9.9.1 --tcp -m 1000 --mps=max -t 120 sockperf: == version #3.10-no.git == sockperf[CLIENT] send on:sockperf: using recvfrom() to block on socket(s) [ 0] IP = 10.9.9.1 PORT = 11111 # TCP sockperf: Warmup stage (sending a few dummy messages)... sockperf: Starting test... sockperf: Test end (interrupted by timer) sockperf: Test ended sockperf: [Total Run] RunTime=119.999 sec; Warm up time=400 msec; SentMessages=2400055; ReceivedMessages=2400054 sockperf: ========= Printing statistics for Server No: 0 sockperf: [Valid Duration] RunTime=119.549 sec; SentMessages=2391186; ReceivedMessages=2391186 sockperf: ====> avg-latency=24.965 (std-dev=5.934, mean-ad=4.642, median-ad=1.485, siqr=1.067, cv=0.238, std-error=0.004, 99.0% ci=[24.955, 24.975]) sockperf: # dropped messages = 0; # duplicated messages = 0; # out-of-order messages = 0 sockperf: Summary: Latency is 24.965 usec sockperf: Total 2391186 observations; each percentile contains 23911.86 observations sockperf: ---> observation = 195.841 sockperf: ---> percentile 99.999 = 45.026 sockperf: ---> percentile 99.990 = 39.009 sockperf: ---> percentile 99.900 = 35.922 sockperf: ---> percentile 99.000 = 33.482 sockperf: ---> percentile 90.000 = 28.902 sockperf: ---> percentile 75.000 = 27.821 sockperf: ---> percentile 50.000 = 26.860 sockperf: ---> percentile 25.000 = 25.685 sockperf: ---> observation = 12.277 Fixes: 0bcd952feec7 ("ethernet/intel: consolidate NAPI and NAPI exit") Reported-by: Hugo Ferreira Reviewed-by: Michal Schmidt Signed-off-by: Ivan Vecera Reviewed-by: Jesse Brandeburg Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++ .../net/ethernet/intel/i40e/i40e_register.h | 3 + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 82 ++++++++++++++----- drivers/net/ethernet/intel/i40e/i40e_txrx.h | 1 + 5 files changed, 72 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index ba24f3fa92c37..2fbabcdb5bb5f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -955,6 +955,7 @@ struct i40e_q_vector { struct rcu_head rcu; /* to avoid race with update stats on free */ char name[I40E_INT_NAME_STR_LEN]; bool arm_wb_state; + bool in_busy_poll; int irq_num; /* IRQ assigned to this q_vector */ } ____cacheline_internodealigned_in_smp; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index f86578857e8ae..6576a00810936 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3911,6 +3911,12 @@ static void i40e_vsi_configure_msix(struct i40e_vsi *vsi) q_vector->tx.target_itr >> 1); q_vector->tx.current_itr = q_vector->tx.target_itr; + /* Set ITR for software interrupts triggered after exiting + * busy-loop polling. + */ + wr32(hw, I40E_PFINT_ITRN(I40E_SW_ITR, vector - 1), + I40E_ITR_20K); + wr32(hw, I40E_PFINT_RATEN(vector - 1), i40e_intrl_usec_to_reg(vsi->int_rate_limit)); diff --git a/drivers/net/ethernet/intel/i40e/i40e_register.h b/drivers/net/ethernet/intel/i40e/i40e_register.h index 14ab642cafdb2..432afbb642013 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_register.h +++ b/drivers/net/ethernet/intel/i40e/i40e_register.h @@ -333,8 +333,11 @@ #define I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT 3 #define I40E_PFINT_DYN_CTLN_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) #define I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT 5 +#define I40E_PFINT_DYN_CTLN_INTERVAL_MASK I40E_MASK(0xFFF, I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT) #define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT 24 #define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT) +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT 25 +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT) #define I40E_PFINT_ICR0 0x00038780 /* Reset: CORER */ #define I40E_PFINT_ICR0_INTEVENT_SHIFT 0 #define I40E_PFINT_ICR0_INTEVENT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_INTEVENT_SHIFT) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 0d7177083708f..1a12b732818ee 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2630,7 +2630,22 @@ process_next: return failure ? budget : (int)total_rx_packets; } -static inline u32 i40e_buildreg_itr(const int type, u16 itr) +/** + * i40e_buildreg_itr - build a value for writing to I40E_PFINT_DYN_CTLN register + * @itr_idx: interrupt throttling index + * @interval: interrupt throttling interval value in usecs + * @force_swint: force software interrupt + * + * The function builds a value for I40E_PFINT_DYN_CTLN register that + * is used to update interrupt throttling interval for specified ITR index + * and optionally enforces a software interrupt. If the @itr_idx is equal + * to I40E_ITR_NONE then no interval change is applied and only @force_swint + * parameter is taken into account. If the interval change and enforced + * software interrupt are not requested then the built value just enables + * appropriate vector interrupt. + **/ +static u32 i40e_buildreg_itr(enum i40e_dyn_idx itr_idx, u16 interval, + bool force_swint) { u32 val; @@ -2644,23 +2659,33 @@ static inline u32 i40e_buildreg_itr(const int type, u16 itr) * an event in the PBA anyway so we need to rely on the automask * to hold pending events for us until the interrupt is re-enabled * - * The itr value is reported in microseconds, and the register - * value is recorded in 2 microsecond units. For this reason we - * only need to shift by the interval shift - 1 instead of the - * full value. + * We have to shift the given value as it is reported in microseconds + * and the register value is recorded in 2 microsecond units. */ - itr &= I40E_ITR_MASK; + interval >>= 1; + /* 1. Enable vector interrupt + * 2. Update the interval for the specified ITR index + * (I40E_ITR_NONE in the register is used to indicate that + * no interval update is requested) + */ val = I40E_PFINT_DYN_CTLN_INTENA_MASK | - (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) | - (itr << (I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT - 1)); + FIELD_PREP(I40E_PFINT_DYN_CTLN_ITR_INDX_MASK, itr_idx) | + FIELD_PREP(I40E_PFINT_DYN_CTLN_INTERVAL_MASK, interval); + + /* 3. Enforce software interrupt trigger if requested + * (These software interrupts rate is limited by ITR2 that is + * set to 20K interrupts per second) + */ + if (force_swint) + val |= I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK | + I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK | + FIELD_PREP(I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK, + I40E_SW_ITR); return val; } -/* a small macro to shorten up some long lines */ -#define INTREG I40E_PFINT_DYN_CTLN - /* The act of updating the ITR will cause it to immediately trigger. In order * to prevent this from throwing off adaptive update statistics we defer the * update so that it can only happen so often. So after either Tx or Rx are @@ -2679,8 +2704,10 @@ static inline u32 i40e_buildreg_itr(const int type, u16 itr) static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector) { + enum i40e_dyn_idx itr_idx = I40E_ITR_NONE; struct i40e_hw *hw = &vsi->back->hw; - u32 intval; + u16 interval = 0; + u32 itr_val; /* If we don't have MSIX, then we only need to re-enable icr0 */ if (!test_bit(I40E_FLAG_MSIX_ENA, vsi->back->flags)) { @@ -2702,8 +2729,8 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, */ if (q_vector->rx.target_itr < q_vector->rx.current_itr) { /* Rx ITR needs to be reduced, this is highest priority */ - intval = i40e_buildreg_itr(I40E_RX_ITR, - q_vector->rx.target_itr); + itr_idx = I40E_RX_ITR; + interval = q_vector->rx.target_itr; q_vector->rx.current_itr = q_vector->rx.target_itr; q_vector->itr_countdown = ITR_COUNTDOWN_START; } else if ((q_vector->tx.target_itr < q_vector->tx.current_itr) || @@ -2712,25 +2739,36 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, /* Tx ITR needs to be reduced, this is second priority * Tx ITR needs to be increased more than Rx, fourth priority */ - intval = i40e_buildreg_itr(I40E_TX_ITR, - q_vector->tx.target_itr); + itr_idx = I40E_TX_ITR; + interval = q_vector->tx.target_itr; q_vector->tx.current_itr = q_vector->tx.target_itr; q_vector->itr_countdown = ITR_COUNTDOWN_START; } else if (q_vector->rx.current_itr != q_vector->rx.target_itr) { /* Rx ITR needs to be increased, third priority */ - intval = i40e_buildreg_itr(I40E_RX_ITR, - q_vector->rx.target_itr); + itr_idx = I40E_RX_ITR; + interval = q_vector->rx.target_itr; q_vector->rx.current_itr = q_vector->rx.target_itr; q_vector->itr_countdown = ITR_COUNTDOWN_START; } else { /* No ITR update, lowest priority */ - intval = i40e_buildreg_itr(I40E_ITR_NONE, 0); if (q_vector->itr_countdown) q_vector->itr_countdown--; } - if (!test_bit(__I40E_VSI_DOWN, vsi->state)) - wr32(hw, INTREG(q_vector->reg_idx), intval); + /* Do not update interrupt control register if VSI is down */ + if (test_bit(__I40E_VSI_DOWN, vsi->state)) + return; + + /* Update ITR interval if necessary and enforce software interrupt + * if we are exiting busy poll. + */ + if (q_vector->in_busy_poll) { + itr_val = i40e_buildreg_itr(itr_idx, interval, true); + q_vector->in_busy_poll = false; + } else { + itr_val = i40e_buildreg_itr(itr_idx, interval, false); + } + wr32(hw, I40E_PFINT_DYN_CTLN(q_vector->reg_idx), itr_val); } /** @@ -2845,6 +2883,8 @@ tx_only: */ if (likely(napi_complete_done(napi, work_done))) i40e_update_enable_itr(vsi, q_vector); + else + q_vector->in_busy_poll = true; return min(work_done, budget - 1); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index abf15067eb5de..2cdc7de6301c1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -68,6 +68,7 @@ enum i40e_dyn_idx { /* these are indexes into ITRN registers */ #define I40E_RX_ITR I40E_IDX_ITR0 #define I40E_TX_ITR I40E_IDX_ITR1 +#define I40E_SW_ITR I40E_IDX_ITR2 /* Supported RSS offloads */ #define I40E_DEFAULT_RSS_HENA ( \ -- GitLab From d080a08b06b6266cc3e0e86c5acfd80db937cb6b Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Mon, 11 Mar 2024 19:19:13 -0700 Subject: [PATCH 075/989] riscv: Fix spurious errors from __get/put_kernel_nofault These macros did not initialize __kr_err, so they could fail even if the access did not fault. Cc: stable@vger.kernel.org Fixes: d464118cdc41 ("riscv: implement __get_kernel_nofault and __put_user_nofault") Signed-off-by: Samuel Holland Reviewed-by: Alexandre Ghiti Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240312022030.320789-1-samuel.holland@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index ec0cab9fbddd0..72ec1d9bd3f31 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -319,7 +319,7 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ - long __kr_err; \ + long __kr_err = 0; \ \ __get_user_nocheck(*((type *)(dst)), (type *)(src), __kr_err); \ if (unlikely(__kr_err)) \ @@ -328,7 +328,7 @@ do { \ #define __put_kernel_nofault(dst, src, type, err_label) \ do { \ - long __kr_err; \ + long __kr_err = 0; \ \ __put_user_nocheck(*((type *)(src)), (type *)(dst), __kr_err); \ if (unlikely(__kr_err)) \ -- GitLab From eb58c598ce45b7e787568fe27016260417c3d807 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Wed, 13 Mar 2024 10:44:00 +0100 Subject: [PATCH 076/989] i40e: fix i40e_count_filters() to count only active/new filters The bug usually affects untrusted VFs, because they are limited to 18 MACs, it affects them badly, not letting to create MAC all filters. Not stable to reproduce, it happens when VF user creates MAC filters when other MACVLAN operations are happened in parallel. But consequence is that VF can't receive desired traffic. Fix counter to be bumped only for new or active filters. Fixes: 621650cabee5 ("i40e: Refactoring VF MAC filters counting to make more reliable") Signed-off-by: Aleksandr Loktionov Reviewed-by: Arkadiusz Kubalewski Reviewed-by: Paul Menzel Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 6576a00810936..48b9ddb2b1b38 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1253,8 +1253,11 @@ int i40e_count_filters(struct i40e_vsi *vsi) int bkt; int cnt = 0; - hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) - ++cnt; + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { + if (f->state == I40E_FILTER_NEW || + f->state == I40E_FILTER_ACTIVE) + ++cnt; + } return cnt; } -- GitLab From f37c4eac99c258111d414d31b740437e1925b8e8 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Wed, 13 Mar 2024 10:56:39 +0100 Subject: [PATCH 077/989] i40e: fix vf may be used uninitialized in this function warning To fix the regression introduced by commit 52424f974bc5, which causes servers hang in very hard to reproduce conditions with resets races. Using two sources for the information is the root cause. In this function before the fix bumping v didn't mean bumping vf pointer. But the code used this variables interchangeably, so stale vf could point to different/not intended vf. Remove redundant "v" variable and iterate via single VF pointer across whole function instead to guarantee VF pointer validity. Fixes: 52424f974bc5 ("i40e: Fix VF hang when reset is triggered on another VF") Signed-off-by: Aleksandr Loktionov Reviewed-by: Arkadiusz Kubalewski Reviewed-by: Przemek Kitszel Reviewed-by: Paul Menzel Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 83a34e98bdc79..4c13f78af6756 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -1624,8 +1624,8 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) { struct i40e_hw *hw = &pf->hw; struct i40e_vf *vf; - int i, v; u32 reg; + int i; /* If we don't have any VFs, then there is nothing to reset */ if (!pf->num_alloc_vfs) @@ -1636,11 +1636,10 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) return false; /* Begin reset on all VFs at once */ - for (v = 0; v < pf->num_alloc_vfs; v++) { - vf = &pf->vf[v]; + for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { /* If VF is being reset no need to trigger reset again */ if (!test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) - i40e_trigger_vf_reset(&pf->vf[v], flr); + i40e_trigger_vf_reset(vf, flr); } /* HW requires some time to make sure it can flush the FIFO for a VF @@ -1649,14 +1648,13 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) * the VFs using a simple iterator that increments once that VF has * finished resetting. */ - for (i = 0, v = 0; i < 10 && v < pf->num_alloc_vfs; i++) { + for (i = 0, vf = &pf->vf[0]; i < 10 && vf < &pf->vf[pf->num_alloc_vfs]; ++i) { usleep_range(10000, 20000); /* Check each VF in sequence, beginning with the VF to fail * the previous check. */ - while (v < pf->num_alloc_vfs) { - vf = &pf->vf[v]; + while (vf < &pf->vf[pf->num_alloc_vfs]) { if (!test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) { reg = rd32(hw, I40E_VPGEN_VFRSTAT(vf->vf_id)); if (!(reg & I40E_VPGEN_VFRSTAT_VFRD_MASK)) @@ -1666,7 +1664,7 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) /* If the current VF has finished resetting, move on * to the next VF in sequence. */ - v++; + ++vf; } } @@ -1676,39 +1674,39 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) /* Display a warning if at least one VF didn't manage to reset in * time, but continue on with the operation. */ - if (v < pf->num_alloc_vfs) + if (vf < &pf->vf[pf->num_alloc_vfs]) dev_err(&pf->pdev->dev, "VF reset check timeout on VF %d\n", - pf->vf[v].vf_id); + vf->vf_id); usleep_range(10000, 20000); /* Begin disabling all the rings associated with VFs, but do not wait * between each VF. */ - for (v = 0; v < pf->num_alloc_vfs; v++) { + for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { /* On initial reset, we don't have any queues to disable */ - if (pf->vf[v].lan_vsi_idx == 0) + if (vf->lan_vsi_idx == 0) continue; /* If VF is reset in another thread just continue */ if (test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) continue; - i40e_vsi_stop_rings_no_wait(pf->vsi[pf->vf[v].lan_vsi_idx]); + i40e_vsi_stop_rings_no_wait(pf->vsi[vf->lan_vsi_idx]); } /* Now that we've notified HW to disable all of the VF rings, wait * until they finish. */ - for (v = 0; v < pf->num_alloc_vfs; v++) { + for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { /* On initial reset, we don't have any queues to disable */ - if (pf->vf[v].lan_vsi_idx == 0) + if (vf->lan_vsi_idx == 0) continue; /* If VF is reset in another thread just continue */ if (test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) continue; - i40e_vsi_wait_queues_disabled(pf->vsi[pf->vf[v].lan_vsi_idx]); + i40e_vsi_wait_queues_disabled(pf->vsi[vf->lan_vsi_idx]); } /* Hw may need up to 50ms to finish disabling the RX queues. We @@ -1717,12 +1715,12 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) mdelay(50); /* Finish the reset on each VF */ - for (v = 0; v < pf->num_alloc_vfs; v++) { + for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { /* If VF is reset in another thread just continue */ if (test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) continue; - i40e_cleanup_reset_vf(&pf->vf[v]); + i40e_cleanup_reset_vf(vf); } i40e_flush(hw); -- GitLab From b7c59b038c656214f56432867056997c2e0fc268 Mon Sep 17 00:00:00 2001 From: Yuquan Wang Date: Mon, 18 Mar 2024 10:29:28 +0800 Subject: [PATCH 078/989] cxl/mem: Fix for the index of Clear Event Record Handle The dev_dbg info for Clear Event Records mailbox command would report the handle of the next record to clear not the current one. This was because the index 'i' had incremented before printing the current handle value. Fixes: 6ebe28f9ec72 ("cxl/mem: Read, trace, and clear events on driver load") Signed-off-by: Yuquan Wang Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Reviewed-by: Fan Ni Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 9adda4795eb78..50146161887d5 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -915,7 +915,7 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds, payload->handles[i++] = gen->hdr.handle; dev_dbg(mds->cxlds.dev, "Event log '%d': Clearing %u\n", log, - le16_to_cpu(payload->handles[i])); + le16_to_cpu(payload->handles[i - 1])); if (i == max_handles) { payload->nr_recs = i; -- GitLab From 5c88a9ccd4c431d58b532e4158b6999a8350062c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 19 Mar 2024 11:15:08 -0700 Subject: [PATCH 079/989] cxl/core/regs: Fix usage of map->reg_type in cxl_decode_regblock() before assigned In the error path, map->reg_type is being used for kernel warning before its value is setup. Found by code inspection. Exposure to user is wrong reg_type being emitted via kernel log. Use a local var for reg_type and retrieve value for usage. Fixes: 6c7f4f1e51c2 ("cxl/core/regs: Make cxl_map_{component, device}_regs() device generic") Reviewed-by: Dan Williams Reviewed-by: Davidlohr Bueso Signed-off-by: Dave Jiang --- drivers/cxl/core/regs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 372786f809555..3c42f984eeafa 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -271,6 +271,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_map_device_regs, CXL); static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, struct cxl_register_map *map) { + u8 reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); int bar = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo); u64 offset = ((u64)reg_hi << 32) | (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK); @@ -278,11 +279,11 @@ static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, if (offset > pci_resource_len(pdev, bar)) { dev_warn(&pdev->dev, "BAR%d: %pr: too small (offset: %pa, type: %d)\n", bar, - &pdev->resource[bar], &offset, map->reg_type); + &pdev->resource[bar], &offset, reg_type); return false; } - map->reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); + map->reg_type = reg_type; map->resource = pci_resource_start(pdev, bar) + offset; map->max_size = pci_resource_len(pdev, bar) - offset; return true; -- GitLab From 0462c56c290a99a7f03e817ae5b843116dfb575c Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 25 Mar 2024 16:21:25 +0100 Subject: [PATCH 080/989] driver core: Introduce device_link_wait_removal() The commit 80dd33cf72d1 ("drivers: base: Fix device link removal") introduces a workqueue to release the consumer and supplier devices used in the devlink. In the job queued, devices are release and in turn, when all the references to these devices are dropped, the release function of the device itself is called. Nothing is present to provide some synchronisation with this workqueue in order to ensure that all ongoing releasing operations are done and so, some other operations can be started safely. For instance, in the following sequence: 1) of_platform_depopulate() 2) of_overlay_remove() During the step 1, devices are released and related devlinks are removed (jobs pushed in the workqueue). During the step 2, OF nodes are destroyed but, without any synchronisation with devlink removal jobs, of_overlay_remove() can raise warnings related to missing of_node_put(): ERROR: memory leak, expected refcount 1 instead of 2 Indeed, the missing of_node_put() call is going to be done, too late, from the workqueue job execution. Introduce device_link_wait_removal() to offer a way to synchronize operations waiting for the end of devlink removals (i.e. end of workqueue jobs). Also, as a flushing operation is done on the workqueue, the workqueue used is moved from a system-wide workqueue to a local one. Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Tested-by: Luca Ceresoli Reviewed-by: Nuno Sa Reviewed-by: Saravana Kannan Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240325152140.198219-2-herve.codina@bootlin.com Signed-off-by: Rob Herring --- drivers/base/core.c | 26 +++++++++++++++++++++++--- include/linux/device.h | 1 + 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index b93f3c5716aee..5f4e03336e68e 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -44,6 +44,7 @@ static bool fw_devlink_is_permissive(void); static void __fw_devlink_link_to_consumers(struct device *dev); static bool fw_devlink_drv_reg_done; static bool fw_devlink_best_effort; +static struct workqueue_struct *device_link_wq; /** * __fwnode_link_add - Create a link between two fwnode_handles. @@ -533,12 +534,26 @@ static void devlink_dev_release(struct device *dev) /* * It may take a while to complete this work because of the SRCU * synchronization in device_link_release_fn() and if the consumer or - * supplier devices get deleted when it runs, so put it into the "long" - * workqueue. + * supplier devices get deleted when it runs, so put it into the + * dedicated workqueue. */ - queue_work(system_long_wq, &link->rm_work); + queue_work(device_link_wq, &link->rm_work); } +/** + * device_link_wait_removal - Wait for ongoing devlink removal jobs to terminate + */ +void device_link_wait_removal(void) +{ + /* + * devlink removal jobs are queued in the dedicated work queue. + * To be sure that all removal jobs are terminated, ensure that any + * scheduled work has run to completion. + */ + flush_workqueue(device_link_wq); +} +EXPORT_SYMBOL_GPL(device_link_wait_removal); + static struct class devlink_class = { .name = "devlink", .dev_groups = devlink_groups, @@ -4164,9 +4179,14 @@ int __init devices_init(void) sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj); if (!sysfs_dev_char_kobj) goto char_kobj_err; + device_link_wq = alloc_workqueue("device_link_wq", 0, 0); + if (!device_link_wq) + goto wq_err; return 0; + wq_err: + kobject_put(sysfs_dev_char_kobj); char_kobj_err: kobject_put(sysfs_dev_block_kobj); block_kobj_err: diff --git a/include/linux/device.h b/include/linux/device.h index 97c4b046c09d9..b9f5464f44ed8 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1247,6 +1247,7 @@ void device_link_del(struct device_link *link); void device_link_remove(void *consumer, struct device *supplier); void device_links_supplier_sync_state_pause(void); void device_links_supplier_sync_state_resume(void); +void device_link_wait_removal(void); /* Create alias, so I can be autoloaded. */ #define MODULE_ALIAS_CHARDEV(major,minor) \ -- GitLab From 8917e7385346bd6584890ed362985c219fe6ae84 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 25 Mar 2024 16:21:26 +0100 Subject: [PATCH 081/989] of: dynamic: Synchronize of_changeset_destroy() with the devlink removals In the following sequence: 1) of_platform_depopulate() 2) of_overlay_remove() During the step 1, devices are destroyed and devlinks are removed. During the step 2, OF nodes are destroyed but __of_changeset_entry_destroy() can raise warnings related to missing of_node_put(): ERROR: memory leak, expected refcount 1 instead of 2 ... Indeed, during the devlink removals performed at step 1, the removal itself releasing the device (and the attached of_node) is done by a job queued in a workqueue and so, it is done asynchronously with respect to function calls. When the warning is present, of_node_put() will be called but wrongly too late from the workqueue job. In order to be sure that any ongoing devlink removals are done before the of_node destruction, synchronize the of_changeset_destroy() with the devlink removals. Fixes: 80dd33cf72d1 ("drivers: base: Fix device link removal") Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Reviewed-by: Saravana Kannan Tested-by: Luca Ceresoli Reviewed-by: Nuno Sa Link: https://lore.kernel.org/r/20240325152140.198219-3-herve.codina@bootlin.com Signed-off-by: Rob Herring --- drivers/of/dynamic.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c index 3bf27052832f3..4d57a4e341054 100644 --- a/drivers/of/dynamic.c +++ b/drivers/of/dynamic.c @@ -9,6 +9,7 @@ #define pr_fmt(fmt) "OF: " fmt +#include #include #include #include @@ -667,6 +668,17 @@ void of_changeset_destroy(struct of_changeset *ocs) { struct of_changeset_entry *ce, *cen; + /* + * When a device is deleted, the device links to/from it are also queued + * for deletion. Until these device links are freed, the devices + * themselves aren't freed. If the device being deleted is due to an + * overlay change, this device might be holding a reference to a device + * node that will be freed. So, wait until all already pending device + * links are deleted before freeing a device node. This ensures we don't + * free any device node that has a non-zero reference count. + */ + device_link_wait_removal(); + list_for_each_entry_safe_reverse(ce, cen, &ocs->entries, node) __of_changeset_entry_destroy(ce); } -- GitLab From ad14f7ca9f0d9fdf73d1fd61aaf8248d46ffc849 Mon Sep 17 00:00:00 2001 From: Vladimir Isaev Date: Wed, 13 Mar 2024 10:35:46 +0300 Subject: [PATCH 082/989] riscv: hwprobe: do not produce frtace relocation Such relocation causes crash of android linker similar to one described in commit e05d57dcb8c7 ("riscv: Fixup __vdso_gettimeofday broke dynamic ftrace"). Looks like this relocation is added by CONFIG_DYNAMIC_FTRACE which is disabled in the default android kernel. Before: readelf -rW arch/riscv/kernel/vdso/vdso.so: Relocation section '.rela.dyn' at offset 0xd00 contains 1 entry: Offset Info Type 0000000000000d20 0000000000000003 R_RISCV_RELATIVE objdump: 0000000000000c86 <__vdso_riscv_hwprobe@@LINUX_4.15>: c86: 0001 nop c88: 0001 nop c8a: 0001 nop c8c: 0001 nop c8e: e211 bnez a2,c92 <__vdso_riscv_hwprobe... After: readelf -rW arch/riscv/kernel/vdso/vdso.so: There are no relocations in this file. objdump: 0000000000000c86 <__vdso_riscv_hwprobe@@LINUX_4.15>: c86: e211 bnez a2,c8a <__vdso_riscv_hwprobe... c88: c6b9 beqz a3,cd6 <__vdso_riscv_hwprobe... c8a: e739 bnez a4,cd8 <__vdso_riscv_hwprobe... c8c: ffffd797 auipc a5,0xffffd Also disable SCS since it also should not be available in vdso. Fixes: aa5af0aa90ba ("RISC-V: Add hwprobe vDSO function and data") Signed-off-by: Roman Artemev Signed-off-by: Vladimir Isaev Reviewed-by: Alexandre Ghiti Reviewed-by: Guo Ren Link: https://lore.kernel.org/r/20240313085843.17661-1-vladimir.isaev@syntacore.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vdso/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 9b517fe1b8a8e..272c431ac5b9f 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -37,6 +37,7 @@ endif # Disable -pg to prevent insert call site CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) +CFLAGS_REMOVE_hwprobe.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) # Disable profiling and instrumentation for VDSO code GCOV_PROFILE := n -- GitLab From 4b0bf9a0127029054c2fa18ba5b3f3ddc45f54ed Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2023 21:58:07 +0900 Subject: [PATCH 083/989] riscv: compat_vdso: install compat_vdso.so.dbg to /lib/modules/*/vdso/ 'make vdso_install' installs debug vdso files to /lib/modules/*/vdso/. Only for the compat vdso on riscv, the installation destination differs; compat_vdso.so.dbg is installed to /lib/module/*/compat_vdso/. To follow the standard install destination and simplify the vdso_install logic, change the install destination to standard /lib/modules/*/vdso/. Signed-off-by: Masahiro Yamada Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231117125807.1058477-1-masahiroy@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 252d63942f34e..5b3115a198522 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -151,7 +151,7 @@ endif endif vdso-install-y += arch/riscv/kernel/vdso/vdso.so.dbg -vdso-install-$(CONFIG_COMPAT) += arch/riscv/kernel/compat_vdso/compat_vdso.so.dbg:../compat_vdso/compat_vdso.so +vdso-install-$(CONFIG_COMPAT) += arch/riscv/kernel/compat_vdso/compat_vdso.so.dbg ifneq ($(CONFIG_XIP_KERNEL),y) ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_ARCH_CANAAN),yy) -- GitLab From ea6873118493019474abbf57d5a800da365734df Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Tue, 12 Mar 2024 01:20:53 +0000 Subject: [PATCH 084/989] drivers/perf: riscv: Disable PERF_SAMPLE_BRANCH_* while not supported RISC-V perf driver does not yet support branch sampling. Although the specification is in the works [0], it is best to disable such events until support is available, otherwise we will get unexpected results. Due to this reason, two riscv bpf testcases get_branch_snapshot and perf_branches/perf_branches_hw fail. Link: https://github.com/riscv/riscv-control-transfer-records [0] Fixes: f5bfa23f576f ("RISC-V: Add a perf core library for pmu drivers") Signed-off-by: Pu Lehui Reviewed-by: Atish Patra Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20240312012053.1178140-1-pulehui@huaweicloud.com Signed-off-by: Palmer Dabbelt --- drivers/perf/riscv_pmu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c index c78a6fd6c57f6..b4efdddb2ad91 100644 --- a/drivers/perf/riscv_pmu.c +++ b/drivers/perf/riscv_pmu.c @@ -313,6 +313,10 @@ static int riscv_pmu_event_init(struct perf_event *event) u64 event_config = 0; uint64_t cmask; + /* driver does not support branch stack sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + hwc->flags = 0; mapped_event = rvpmu->event_map(event, &event_config); if (mapped_event < 0) { -- GitLab From 653650c468be211752aa56eae79af1ae67c5e70c Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Tue, 26 Mar 2024 15:37:13 +0000 Subject: [PATCH 085/989] riscv: Mark __se_sys_* functions __used Clang doesn't think ___se_sys_* functions used even though they are aliased to __se_sys_*, resulting in -Wunused-function warnings when building rv32. For example: mm/oom_kill.c:1195:1: warning: unused function '___se_sys_process_mrelease' [-Wunused-function] 1195 | SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/syscalls.h:221:36: note: expanded from macro 'SYSCALL_DEFINE2' 221 | #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/syscalls.h:231:2: note: expanded from macro 'SYSCALL_DEFINEx' 231 | __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/riscv/include/asm/syscall_wrapper.h:81:2: note: expanded from macro '__SYSCALL_DEFINEx' 81 | __SYSCALL_SE_DEFINEx(x, sys, name, __VA_ARGS__) \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/riscv/include/asm/syscall_wrapper.h:40:14: note: expanded from macro '__SYSCALL_SE_DEFINEx' 40 | static long ___se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)) | ^~~~~~~~~~~~~~~~~~~~ :30:1: note: expanded from here 30 | ___se_sys_process_mrelease | ^~~~~~~~~~~~~~~~~~~~~~~~~~ 1 warning generated. Mark the functions __used explicitly to fix the Clang warnings. Fixes: a9ad73295cc1 ("riscv: Fix syscall wrapper for >word-size arguments") Reported-by: Linux Kernel Functional Testing Tested-by: Linux Kernel Functional Testing Signed-off-by: Sami Tolvanen Reviewed-by: Alexandre Ghiti Tested-by: Conor Dooley Link: https://lore.kernel.org/r/20240326153712.1839482-2-samitolvanen@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/syscall_wrapper.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/syscall_wrapper.h b/arch/riscv/include/asm/syscall_wrapper.h index 980094c2e9761..ac80216549ffa 100644 --- a/arch/riscv/include/asm/syscall_wrapper.h +++ b/arch/riscv/include/asm/syscall_wrapper.h @@ -36,7 +36,8 @@ asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *); ulong) \ __attribute__((alias(__stringify(___se_##prefix##name)))); \ __diag_pop(); \ - static long noinline ___se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static long noinline ___se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + __used; \ static long ___se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)) #define SC_RISCV_REGS_TO_ARGS(x, ...) \ -- GitLab From ddd65e19c60140673ea9f7249af0a672f1820623 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 23 Mar 2024 17:11:19 +0100 Subject: [PATCH 086/989] block: handle BLK_OPEN_RESTRICT_WRITES correctly Last kernel release we introduce CONFIG_BLK_DEV_WRITE_MOUNTED. By default this option is set. When it is set the long-standing behavior of being able to write to mounted block devices is enabled. But in order to guard against unintended corruption by writing to the block device buffer cache CONFIG_BLK_DEV_WRITE_MOUNTED can be turned off. In that case it isn't possible to write to mounted block devices anymore. A filesystem may open its block devices with BLK_OPEN_RESTRICT_WRITES which disallows concurrent BLK_OPEN_WRITE access. When we still had the bdev handle around we could recognize BLK_OPEN_RESTRICT_WRITES because the mode was passed around. Since we managed to get rid of the bdev handle we changed that logic to recognize BLK_OPEN_RESTRICT_WRITES based on whether the file was opened writable and writes to that block device are blocked. That logic doesn't work because we do allow BLK_OPEN_RESTRICT_WRITES to be specified without BLK_OPEN_WRITE. Fix the detection logic and use an FMODE_* bit. We could've also abused O_EXCL as an indicator that BLK_OPEN_RESTRICT_WRITES has been requested. For userspace open paths O_EXCL will never be retained but for internal opens where we open files that are never installed into a file descriptor table this is fine. But it would be a gamble that this doesn't cause bugs. Note that BLK_OPEN_RESTRICT_WRITES is an internal only flag that cannot directly be raised by userspace. It is implicitly raised during mounting. Passes xftests and blktests with CONFIG_BLK_DEV_WRITE_MOUNTED set and unset. Link: https://lore.kernel.org/r/ZfyyEwu9Uq5Pgb94@casper.infradead.org Link: https://lore.kernel.org/r/20240323-zielbereich-mittragen-6fdf14876c3e@brauner Fixes: 321de651fa56 ("block: don't rely on BLK_OPEN_RESTRICT_WRITES when yielding write access") Reviewed-by: Yu Kuai Reviewed-by: Jan Kara Reported-by: Matthew Wilcox Signed-off-by: Christian Brauner --- block/bdev.c | 14 +++++++------- include/linux/fs.h | 2 ++ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 7a5f611c3d2e3..ab9cae0e05f14 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -821,13 +821,11 @@ static void bdev_yield_write_access(struct file *bdev_file) return; bdev = file_bdev(bdev_file); - /* Yield exclusive or shared write access. */ - if (bdev_file->f_mode & FMODE_WRITE) { - if (bdev_writes_blocked(bdev)) - bdev_unblock_writes(bdev); - else - bdev->bd_writers--; - } + + if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED) + bdev_unblock_writes(bdev); + else if (bdev_file->f_mode & FMODE_WRITE) + bdev->bd_writers--; } /** @@ -907,6 +905,8 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; if (bdev_nowait(bdev)) bdev_file->f_mode |= FMODE_NOWAIT; + if (mode & BLK_OPEN_RESTRICT_WRITES) + bdev_file->f_mode |= FMODE_WRITE_RESTRICTED; bdev_file->f_mapping = bdev->bd_inode->i_mapping; bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); bdev_file->private_data = holder; diff --git a/include/linux/fs.h b/include/linux/fs.h index 00fc429b0af0f..8dfd53b52744a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -121,6 +121,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_PWRITE ((__force fmode_t)0x10) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)0x20) +/* File writes are restricted (block device specific) */ +#define FMODE_WRITE_RESTRICTED ((__force fmode_t)0x40) /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)0x200) /* 64bit hashes as llseek() offset (for directories) */ -- GitLab From 3ff56e285de5a375fbfab3c3f1af81bbd23db36d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 23 Mar 2024 17:11:20 +0100 Subject: [PATCH 087/989] block: count BLK_OPEN_RESTRICT_WRITES openers The original changes in v6.8 do allow for a block device to be reopened with BLK_OPEN_RESTRICT_WRITES provided the same holder is used as per bdev_may_open(). I think this has a bug. The first opener @f1 of that block device will set bdev->bd_writers to -1. The second opener @f2 using the same holder will pass the check in bdev_may_open() that bdev->bd_writers must not be greater than zero. The first opener @f1 now closes the block device and in bdev_release() will end up calling bdev_yield_write_access() which calls bdev_writes_blocked() and sets bdev->bd_writers to 0 again. Now @f2 holds a file to that block device which was opened with exclusive write access but bdev->bd_writers has been reset to 0. So now @f3 comes along and succeeds in opening the block device with BLK_OPEN_WRITE betraying @f2's request to have exclusive write access. This isn't a practical issue yet because afaict there's no codepath inside the kernel that reopenes the same block device with BLK_OPEN_RESTRICT_WRITES but it will be if there is. Fix this by counting the number of BLK_OPEN_RESTRICT_WRITES openers. So we only allow writes again once all BLK_OPEN_RESTRICT_WRITES openers are done. Link: https://lore.kernel.org/r/20240323-abtauchen-klauen-c2953810082d@brauner Fixes: ed5cc702d311 ("block: Add config option to not allow writing to mounted devices") Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/bdev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index ab9cae0e05f14..a1946a902df36 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -776,17 +776,17 @@ void blkdev_put_no_open(struct block_device *bdev) static bool bdev_writes_blocked(struct block_device *bdev) { - return bdev->bd_writers == -1; + return bdev->bd_writers < 0; } static void bdev_block_writes(struct block_device *bdev) { - bdev->bd_writers = -1; + bdev->bd_writers--; } static void bdev_unblock_writes(struct block_device *bdev) { - bdev->bd_writers = 0; + bdev->bd_writers++; } static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode) -- GitLab From 22650a99821dda3d05f1c334ea90330b4982de56 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 26 Mar 2024 13:47:22 +0100 Subject: [PATCH 088/989] fs,block: yield devices early Currently a device is only really released once the umount returns to userspace due to how file closing works. That ultimately could cause an old umount assumption to be violated that concurrent umount and mount don't fail. So an exclusively held device with a temporary holder should be yielded before the filesystem is gone. Add a helper that allows callers to do that. This also allows us to remove the two holder ops that Linus wasn't excited about. Link: https://lore.kernel.org/r/20240326-vfs-bdev-end_holder-v1-1-20af85202918@kernel.org Fixes: f3a608827d1f ("bdev: open block device as files") # mainline only Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner --- block/bdev.c | 64 ++++++++++++++++++++++++++++----- drivers/mtd/devices/block2mtd.c | 2 +- fs/bcachefs/super-io.c | 2 +- fs/cramfs/inode.c | 2 +- fs/ext4/super.c | 8 ++--- fs/f2fs/super.c | 2 +- fs/jfs/jfs_logmgr.c | 4 +-- fs/reiserfs/journal.c | 2 +- fs/romfs/super.c | 2 +- fs/super.c | 24 ++----------- fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_super.c | 6 ++-- include/linux/blkdev.h | 11 +----- 13 files changed, 76 insertions(+), 55 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index a1946a902df36..b8e32d933a636 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -583,9 +583,6 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder, mutex_unlock(&bdev->bd_holder_lock); bd_clear_claiming(whole, holder); mutex_unlock(&bdev_lock); - - if (hops && hops->get_holder) - hops->get_holder(holder); } /** @@ -608,7 +605,6 @@ EXPORT_SYMBOL(bd_abort_claiming); static void bd_end_claim(struct block_device *bdev, void *holder) { struct block_device *whole = bdev_whole(bdev); - const struct blk_holder_ops *hops = bdev->bd_holder_ops; bool unblock = false; /* @@ -631,9 +627,6 @@ static void bd_end_claim(struct block_device *bdev, void *holder) whole->bd_holder = NULL; mutex_unlock(&bdev_lock); - if (hops && hops->put_holder) - hops->put_holder(holder); - /* * If this was the last claim, remove holder link and unblock evpoll if * it was a write holder. @@ -813,6 +806,11 @@ static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) bdev->bd_writers++; } +static inline bool bdev_unclaimed(const struct file *bdev_file) +{ + return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host); +} + static void bdev_yield_write_access(struct file *bdev_file) { struct block_device *bdev; @@ -820,6 +818,9 @@ static void bdev_yield_write_access(struct file *bdev_file) if (bdev_allow_write_mounted) return; + if (bdev_unclaimed(bdev_file)) + return; + bdev = file_bdev(bdev_file); if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED) @@ -1012,6 +1013,20 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, } EXPORT_SYMBOL(bdev_file_open_by_path); +static inline void bd_yield_claim(struct file *bdev_file) +{ + struct block_device *bdev = file_bdev(bdev_file); + void *holder = bdev_file->private_data; + + lockdep_assert_held(&bdev->bd_disk->open_mutex); + + if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder))) + return; + + if (!bdev_unclaimed(bdev_file)) + bd_end_claim(bdev, holder); +} + void bdev_release(struct file *bdev_file) { struct block_device *bdev = file_bdev(bdev_file); @@ -1036,7 +1051,7 @@ void bdev_release(struct file *bdev_file) bdev_yield_write_access(bdev_file); if (holder) - bd_end_claim(bdev, holder); + bd_yield_claim(bdev_file); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE @@ -1056,6 +1071,39 @@ put_no_open: blkdev_put_no_open(bdev); } +/** + * bdev_fput - yield claim to the block device and put the file + * @bdev_file: open block device + * + * Yield claim on the block device and put the file. Ensure that the + * block device can be reclaimed before the file is closed which is a + * deferred operation. + */ +void bdev_fput(struct file *bdev_file) +{ + if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops)) + return; + + if (bdev_file->private_data) { + struct block_device *bdev = file_bdev(bdev_file); + struct gendisk *disk = bdev->bd_disk; + + mutex_lock(&disk->open_mutex); + bdev_yield_write_access(bdev_file); + bd_yield_claim(bdev_file); + /* + * Tell release we already gave up our hold on the + * device and if write restrictions are available that + * we already gave up write access to the device. + */ + bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host); + mutex_unlock(&disk->open_mutex); + } + + fput(bdev_file); +} +EXPORT_SYMBOL(bdev_fput); + /** * lookup_bdev() - Look up a struct block_device by name. * @pathname: Name of the block device in the filesystem. diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index 97a00ec9a4d48..caacdc0a38194 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -209,7 +209,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) if (dev->bdev_file) { invalidate_mapping_pages(dev->bdev_file->f_mapping, 0, -1); - fput(dev->bdev_file); + bdev_fput(dev->bdev_file); } kfree(dev); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ad28e370b6404..cb7b4de11a49e 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -143,7 +143,7 @@ void bch2_free_super(struct bch_sb_handle *sb) { kfree(sb->bio); if (!IS_ERR_OR_NULL(sb->s_bdev_file)) - fput(sb->s_bdev_file); + bdev_fput(sb->s_bdev_file); kfree(sb->holder); kfree(sb->sb_name); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 39e75131fd5aa..9901057a15ba7 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb) sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { sync_blockdev(sb->s_bdev); - fput(sb->s_bdev_file); + bdev_fput(sb->s_bdev_file); } kfree(sbi); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cfb8449c731f9..044135796f2b6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5668,7 +5668,7 @@ failed_mount: brelse(sbi->s_sbh); if (sbi->s_journal_bdev_file) { invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); - fput(sbi->s_journal_bdev_file); + bdev_fput(sbi->s_journal_bdev_file); } out_fail: invalidate_bdev(sb->s_bdev); @@ -5913,7 +5913,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb, out_bh: brelse(bh); out_bdev: - fput(bdev_file); + bdev_fput(bdev_file); return ERR_PTR(errno); } @@ -5952,7 +5952,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, out_journal: jbd2_journal_destroy(journal); out_bdev: - fput(bdev_file); + bdev_fput(bdev_file); return ERR_PTR(errno); } @@ -7327,7 +7327,7 @@ static void ext4_kill_sb(struct super_block *sb) kill_block_super(sb); if (bdev_file) - fput(bdev_file); + bdev_fput(bdev_file); } static struct file_system_type ext4_fs_type = { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a6867f26f1418..a4bc26dfdb1af 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1558,7 +1558,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) for (i = 0; i < sbi->s_ndevs; i++) { if (i > 0) - fput(FDEV(i).bdev_file); + bdev_fput(FDEV(i).bdev_file); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); #endif diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 73389c68e2517..9609349e92e5e 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1141,7 +1141,7 @@ journal_found: lbmLogShutdown(log); close: /* close external log device */ - fput(bdev_file); + bdev_fput(bdev_file); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1485,7 +1485,7 @@ int lmLogClose(struct super_block *sb) bdev_file = log->bdev_file; rc = lmLogShutdown(log); - fput(bdev_file); + bdev_fput(bdev_file); kfree(log); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 6474529c42530..e539ccd39e1ee 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2589,7 +2589,7 @@ static void journal_list_init(struct super_block *sb) static void release_journal_dev(struct reiserfs_journal *journal) { if (journal->j_bdev_file) { - fput(journal->j_bdev_file); + bdev_fput(journal->j_bdev_file); journal->j_bdev_file = NULL; } } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 2be227532f399..2cbb924620747 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -594,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb) #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { sync_blockdev(sb->s_bdev); - fput(sb->s_bdev_file); + bdev_fput(sb->s_bdev_file); } #endif } diff --git a/fs/super.c b/fs/super.c index 71d9779c42b10..69ce6c6009684 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1515,29 +1515,11 @@ static int fs_bdev_thaw(struct block_device *bdev) return error; } -static void fs_bdev_super_get(void *data) -{ - struct super_block *sb = data; - - spin_lock(&sb_lock); - sb->s_count++; - spin_unlock(&sb_lock); -} - -static void fs_bdev_super_put(void *data) -{ - struct super_block *sb = data; - - put_super(sb); -} - const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, .freeze = fs_bdev_freeze, .thaw = fs_bdev_thaw, - .get_holder = fs_bdev_super_get, - .put_holder = fs_bdev_super_put, }; EXPORT_SYMBOL_GPL(fs_holder_ops); @@ -1562,7 +1544,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - fput(bdev_file); + bdev_fput(bdev_file); return -EACCES; } @@ -1573,7 +1555,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - fput(bdev_file); + bdev_fput(bdev_file); return -EBUSY; } spin_lock(&sb_lock); @@ -1693,7 +1675,7 @@ void kill_block_super(struct super_block *sb) generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); - fput(sb->s_bdev_file); + bdev_fput(sb->s_bdev_file); } } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1a18c381127e2..f0fa02264edaa 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2030,7 +2030,7 @@ xfs_free_buftarg( fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) - fput(btp->bt_bdev_file); + bdev_fput(btp->bt_bdev_file); kfree(btp); } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c21f10ab0f5db..bce020374c5eb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -485,7 +485,7 @@ xfs_open_devices( mp->m_logdev_targp = mp->m_ddev_targp; /* Handle won't be used, drop it */ if (logdev_file) - fput(logdev_file); + bdev_fput(logdev_file); } return 0; @@ -497,10 +497,10 @@ xfs_open_devices( xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: if (rtdev_file) - fput(rtdev_file); + bdev_fput(rtdev_file); out_close_logdev: if (logdev_file) - fput(logdev_file); + bdev_fput(logdev_file); return error; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c3e8f7cf96be9..172c918799995 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1505,16 +1505,6 @@ struct blk_holder_ops { * Thaw the file system mounted on the block device. */ int (*thaw)(struct block_device *bdev); - - /* - * If needed, get a reference to the holder. - */ - void (*get_holder)(void *holder); - - /* - * Release the holder. - */ - void (*put_holder)(void *holder); }; /* @@ -1585,6 +1575,7 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev) int bdev_freeze(struct block_device *bdev); int bdev_thaw(struct block_device *bdev); +void bdev_fput(struct file *bdev_file); struct io_comp_batch { struct request *req_list; -- GitLab From 13dddf9319808badd2c1f5d7007b4e82838a648e Mon Sep 17 00:00:00 2001 From: Victor Isaev Date: Fri, 15 Dec 2023 23:27:20 -0500 Subject: [PATCH 089/989] RISC-V: Update AT_VECTOR_SIZE_ARCH for new AT_MINSIGSTKSZ "riscv: signal: Report signal frame size to userspace via auxv" (e92f469) has added new constant AT_MINSIGSTKSZ but failed to increment the size of auxv, keeping AT_VECTOR_SIZE_ARCH at 9. This fix correctly increments AT_VECTOR_SIZE_ARCH to 10, following the approach in the commit 94b07c1 ("arm64: signal: Report signal frame size to userspace via auxv"). Link: https://lore.kernel.org/r/73883406.20231215232720@torrio.net Link: https://lore.kernel.org/all/20240102133617.3649-1-victor@torrio.net/ Reported-by: Ivan Komarov Closes: https://lore.kernel.org/linux-riscv/CY3Z02NYV1C4.11BLB9PLVW9G1@fedora/ Fixes: e92f469b0771 ("riscv: signal: Report signal frame size to userspace via auxv") Signed-off-by: Victor Isaev Signed-off-by: Palmer Dabbelt --- arch/riscv/include/uapi/asm/auxvec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/uapi/asm/auxvec.h b/arch/riscv/include/uapi/asm/auxvec.h index 10aaa83db89ef..95050ebe9ad00 100644 --- a/arch/riscv/include/uapi/asm/auxvec.h +++ b/arch/riscv/include/uapi/asm/auxvec.h @@ -34,7 +34,7 @@ #define AT_L3_CACHEGEOMETRY 47 /* entries in ARCH_DLINFO */ -#define AT_VECTOR_SIZE_ARCH 9 +#define AT_VECTOR_SIZE_ARCH 10 #define AT_MINSIGSTKSZ 51 #endif /* _UAPI_ASM_RISCV_AUXVEC_H */ -- GitLab From 7115ff4a8bfed3b9294bad2e111744e6abeadf1a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2023 21:58:43 +0900 Subject: [PATCH 090/989] riscv: compat_vdso: align VDSOAS build log Add one more space after "VDSOAS" for better alignment in the build log. [Before] LDS arch/riscv/kernel/compat_vdso/compat_vdso.lds VDSOAS arch/riscv/kernel/compat_vdso/rt_sigreturn.o VDSOAS arch/riscv/kernel/compat_vdso/getcpu.o VDSOAS arch/riscv/kernel/compat_vdso/flush_icache.o VDSOAS arch/riscv/kernel/compat_vdso/note.o VDSOLD arch/riscv/kernel/compat_vdso/compat_vdso.so.dbg VDSOSYM include/generated/compat_vdso-offsets.h [After] LDS arch/riscv/kernel/compat_vdso/compat_vdso.lds VDSOAS arch/riscv/kernel/compat_vdso/rt_sigreturn.o VDSOAS arch/riscv/kernel/compat_vdso/getcpu.o VDSOAS arch/riscv/kernel/compat_vdso/flush_icache.o VDSOAS arch/riscv/kernel/compat_vdso/note.o VDSOLD arch/riscv/kernel/compat_vdso/compat_vdso.so.dbg VDSOSYM include/generated/compat_vdso-offsets.h Signed-off-by: Masahiro Yamada Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231117125843.1058553-1-masahiroy@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/compat_vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/compat_vdso/Makefile b/arch/riscv/kernel/compat_vdso/Makefile index 62fa393b2eb2e..3df4cb788c1fa 100644 --- a/arch/riscv/kernel/compat_vdso/Makefile +++ b/arch/riscv/kernel/compat_vdso/Makefile @@ -74,5 +74,5 @@ quiet_cmd_compat_vdsold = VDSOLD $@ rm $@.tmp # actual build commands -quiet_cmd_compat_vdsoas = VDSOAS $@ +quiet_cmd_compat_vdsoas = VDSOAS $@ cmd_compat_vdsoas = $(COMPAT_CC) $(a_flags) $(COMPAT_CC_FLAGS) -c -o $@ $< -- GitLab From 0ffe1ae7026dd129d86318388ed62ba61f085730 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 22 Nov 2023 00:06:37 +0800 Subject: [PATCH 091/989] riscv: mm: implement pgprot_nx commit cca98e9f8b5e ("mm: enforce that vmap can't map pages executable") enforces the W^X protection by not allowing remapping existing pages as executable. Add riscv bits so that riscv can benefit the same protection. Signed-off-by: Jisheng Zhang Reviewed-by: Samuel Holland Tested-by: Samuel Holland Reviewed-by: Christoph Hellwig Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231121160637.3856-1-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 97fcde30e2477..9f8ea0e33eb10 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -593,6 +593,12 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, return ptep_test_and_clear_young(vma, address, ptep); } +#define pgprot_nx pgprot_nx +static inline pgprot_t pgprot_nx(pgprot_t _prot) +{ + return __pgprot(pgprot_val(_prot) & ~_PAGE_EXEC); +} + #define pgprot_noncached pgprot_noncached static inline pgprot_t pgprot_noncached(pgprot_t _prot) { -- GitLab From ad91c1d77fd0a489706b7b784a70e464a4a03490 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 26 Mar 2024 18:46:30 +0100 Subject: [PATCH 092/989] dt-bindings: ufs: qcom: document SC8180X UFS Document already upstreamed and used Qualcomm SC8180x UFS host controller to fix dtbs_check warnings like: sc8180x-primus.dtb: ufshc@1d84000: compatible:0: 'qcom,sc8180x-ufshc' is not one of ['qcom,msm8994-ufshc', ... ] sc8180x-primus.dtb: ufshc@1d84000: Unevaluated properties are not allowed ('compatible' was unexpected) Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240326174632.209745-1-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/ufs/qcom,ufs.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml index 10c146424baa1..1ab3d16917acb 100644 --- a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml +++ b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml @@ -28,6 +28,7 @@ properties: - qcom,msm8998-ufshc - qcom,sa8775p-ufshc - qcom,sc7280-ufshc + - qcom,sc8180x-ufshc - qcom,sc8280xp-ufshc - qcom,sdm845-ufshc - qcom,sm6115-ufshc @@ -120,6 +121,7 @@ allOf: - qcom,msm8998-ufshc - qcom,sa8775p-ufshc - qcom,sc7280-ufshc + - qcom,sc8180x-ufshc - qcom,sc8280xp-ufshc - qcom,sm8250-ufshc - qcom,sm8350-ufshc -- GitLab From 7fb5aafc0a702c4c0bb22410d1e67a732e320511 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 26 Mar 2024 18:46:31 +0100 Subject: [PATCH 093/989] dt-bindings: ufs: qcom: document SC7180 UFS Document already upstreamed and used Qualcomm SC7180 UFS host controller to fix dtbs_check warnings like: sc7180-idp.dtb: ufshc@1d84000: compatible:0: 'qcom,sc7180-ufshc' is not one of ... sc7180-idp.dtb: ufshc@1d84000: clocks: [[39, 99], [39, 7], [39, 98], [39, 107], [36, 0], [39, 106], [39, 105]] is too short sc7180-idp.dtb: ufshc@1d84000: clock-names: ['core_clk', 'bus_aggr_clk', 'iface_clk', 'core_clk_unipro', ...] is too short Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240326174632.209745-2-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- .../devicetree/bindings/ufs/qcom,ufs.yaml | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml index 1ab3d16917acb..7e6d442545ad9 100644 --- a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml +++ b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml @@ -27,6 +27,7 @@ properties: - qcom,msm8996-ufshc - qcom,msm8998-ufshc - qcom,sa8775p-ufshc + - qcom,sc7180-ufshc - qcom,sc7280-ufshc - qcom,sc8180x-ufshc - qcom,sc8280xp-ufshc @@ -43,11 +44,11 @@ properties: - const: jedec,ufs-2.0 clocks: - minItems: 8 + minItems: 7 maxItems: 11 clock-names: - minItems: 8 + minItems: 7 maxItems: 11 dma-coherent: true @@ -113,6 +114,31 @@ required: allOf: - $ref: ufs-common.yaml + - if: + properties: + compatible: + contains: + enum: + - qcom,sc7180-ufshc + then: + properties: + clocks: + minItems: 7 + maxItems: 7 + clock-names: + items: + - const: core_clk + - const: bus_aggr_clk + - const: iface_clk + - const: core_clk_unipro + - const: ref_clk + - const: tx_lane0_sync_clk + - const: rx_lane0_sync_clk + reg: + maxItems: 1 + reg-names: + maxItems: 1 + - if: properties: compatible: @@ -250,7 +276,7 @@ allOf: reg: maxItems: 1 clocks: - minItems: 8 + minItems: 7 maxItems: 8 else: properties: @@ -258,7 +284,7 @@ allOf: minItems: 1 maxItems: 2 clocks: - minItems: 8 + minItems: 7 maxItems: 11 unevaluatedProperties: false -- GitLab From b5237d0bdb3cb164b7792cc4f1ff2ecafbfac661 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 26 Mar 2024 18:46:32 +0100 Subject: [PATCH 094/989] dt-bindings: ufs: qcom: document SM6125 UFS Document already upstreamed and used Qualcomm SM6125 UFS host controller to fix dtbs_check warnings like: sm6125-xiaomi-laurel-sprout.dtb: ufs@4804000: compatible:0: 'qcom,sm6125-ufshc' is not one of ['qcom,msm8994-ufshc', ... sm6125-xiaomi-laurel-sprout.dtb: ufs@4804000: Unevaluated properties are not allowed ('compatible' was unexpected) Signed-off-by: Krzysztof Kozlowski Acked-by: Krzysztof Kozlowski Reviewed-by: Krzysztof Kozlowski Reviewed-by: Martin Botka Link: https://lore.kernel.org/r/20240326174632.209745-3-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/ufs/qcom,ufs.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml index 7e6d442545ad9..cd3680dc002f9 100644 --- a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml +++ b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml @@ -33,6 +33,7 @@ properties: - qcom,sc8280xp-ufshc - qcom,sdm845-ufshc - qcom,sm6115-ufshc + - qcom,sm6125-ufshc - qcom,sm6350-ufshc - qcom,sm8150-ufshc - qcom,sm8250-ufshc @@ -243,6 +244,7 @@ allOf: contains: enum: - qcom,sm6115-ufshc + - qcom,sm6125-ufshc then: properties: clocks: -- GitLab From 4af565de9f8c74b9f6035924ce0d40adec211246 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Wed, 27 Mar 2024 16:16:53 +0530 Subject: [PATCH 095/989] ASoC: amd: acp: fix for acp pdm configuration check ACP PDM configuration has to be verified for all combinations. Remove FLAG_AMD_LEGACY_ONLY_DMIC check. Fixes: 3a94c8ad0aae ("ASoC: amd: acp: add code for scanning acp pdm controller") Signed-off-by: Vijendar Mukunda Link: https://msgid.link/r/20240327104657.3537664-2-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-pci.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sound/soc/amd/acp/acp-pci.c b/sound/soc/amd/acp/acp-pci.c index 8c8b1dcac6281..440b91d4f261c 100644 --- a/sound/soc/amd/acp/acp-pci.c +++ b/sound/soc/amd/acp/acp-pci.c @@ -133,11 +133,9 @@ static int acp_pci_probe(struct pci_dev *pci, const struct pci_device_id *pci_id } } - if (flag == FLAG_AMD_LEGACY_ONLY_DMIC) { - ret = check_acp_pdm(pci, chip); - if (ret < 0) - goto skip_pdev_creation; - } + ret = check_acp_pdm(pci, chip); + if (ret < 0) + goto skip_pdev_creation; chip->flag = flag; memset(&pdevinfo, 0, sizeof(pdevinfo)); -- GitLab From 00bb549d7d63a21532e76e4a334d7807a54d9f31 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Wed, 27 Mar 2024 11:44:06 +0000 Subject: [PATCH 096/989] regmap: maple: Fix cache corruption in regcache_maple_drop() When keeping the upper end of a cache block entry, the entry[] array must be indexed by the offset from the base register of the block, i.e. max - mas.index. The code was indexing entry[] by only the register address, leading to an out-of-bounds access that copied some part of the kernel memory over the cache contents. This bug was not detected by the regmap KUnit test because it only tests with a block of registers starting at 0, so mas.index == 0. Signed-off-by: Richard Fitzgerald Fixes: f033c26de5a5 ("regmap: Add maple tree based register cache") Link: https://msgid.link/r/20240327114406.976986-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/base/regmap/regcache-maple.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/regcache-maple.c b/drivers/base/regmap/regcache-maple.c index 41edd6a430eb4..c1776127a5724 100644 --- a/drivers/base/regmap/regcache-maple.c +++ b/drivers/base/regmap/regcache-maple.c @@ -145,7 +145,7 @@ static int regcache_maple_drop(struct regmap *map, unsigned int min, upper_index = max + 1; upper_last = mas.last; - upper = kmemdup(&entry[max + 1], + upper = kmemdup(&entry[max - mas.index + 1], ((mas.last - max) * sizeof(unsigned long)), map->alloc_flags); -- GitLab From 3cf5abf2860bc538620fc3dfca06f403964b787b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 26 Mar 2024 14:21:30 +0530 Subject: [PATCH 097/989] MAINTAINERS: Drop Gustavo Pimentel as PCI DWC Maintainer Gustavo Pimentel seems to have left Synopsys, so his email is bouncing. And there is no indication from him expressing willingless to continue contributing to the driver. Drop him from the MAINTAINERS entry and add a CREDITS entry. Link: https://lore.kernel.org/r/20240326085130.12487-1-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam [bhelgaas: add CREDITS entry] Signed-off-by: Bjorn Helgaas --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index c55c5a0ee4ff6..0107047f807bf 100644 --- a/CREDITS +++ b/CREDITS @@ -3146,6 +3146,10 @@ S: Triftstra=DFe 55 S: 13353 Berlin S: Germany +N: Gustavo Pimental +E: gustavo.pimentel@synopsys.com +D: PCI driver for Synopsys DesignWare + N: Emanuel Pirker E: epirker@edu.uni-klu.ac.at D: AIC5800 IEEE 1394, RAW I/O on 1394 diff --git a/MAINTAINERS b/MAINTAINERS index aa3b947fb0801..f615407222690 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16966,7 +16966,6 @@ F: drivers/pci/controller/dwc/pci-exynos.c PCI DRIVER FOR SYNOPSYS DESIGNWARE M: Jingoo Han -M: Gustavo Pimentel M: Manivannan Sadhasivam L: linux-pci@vger.kernel.org S: Maintained -- GitLab From 6dbdd4de0362c37e54e8b049781402e5a409e7d0 Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Thu, 4 Jan 2024 16:16:52 +0200 Subject: [PATCH 098/989] e1000e: Workaround for sporadic MDI error on Meteor Lake systems On some Meteor Lake systems accessing the PHY via the MDIO interface may result in an MDI error. This issue happens sporadically and in most cases a second access to the PHY via the MDIO interface results in success. As a workaround, introduce a retry counter which is set to 3 on Meteor Lake systems. The driver will only return an error if 3 consecutive PHY access attempts fail. The retry mechanism is disabled in specific flows, where MDI errors are expected. Fixes: cc23f4f0b6b9 ("e1000e: Add support for Meteor Lake") Suggested-by: Nikolay Mushayev Co-developed-by: Nir Efrati Signed-off-by: Nir Efrati Signed-off-by: Vitaly Lifshits Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/hw.h | 2 + drivers/net/ethernet/intel/e1000e/ich8lan.c | 33 ++++ drivers/net/ethernet/intel/e1000e/phy.c | 182 ++++++++++++-------- drivers/net/ethernet/intel/e1000e/phy.h | 2 + 4 files changed, 150 insertions(+), 69 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/hw.h b/drivers/net/ethernet/intel/e1000e/hw.h index 1fef6bb5a5fbc..4b6e7536170ab 100644 --- a/drivers/net/ethernet/intel/e1000e/hw.h +++ b/drivers/net/ethernet/intel/e1000e/hw.h @@ -628,6 +628,7 @@ struct e1000_phy_info { u32 id; u32 reset_delay_us; /* in usec */ u32 revision; + u32 retry_count; enum e1000_media_type media_type; @@ -644,6 +645,7 @@ struct e1000_phy_info { bool polarity_correction; bool speed_downgraded; bool autoneg_wait_to_complete; + bool retry_enabled; }; struct e1000_nvm_info { diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 19e450a5bd314..d8e97669f31b0 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -222,11 +222,18 @@ out: if (hw->mac.type >= e1000_pch_lpt) { /* Only unforce SMBus if ME is not active */ if (!(er32(FWSM) & E1000_ICH_FWSM_FW_VALID)) { + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + /* Unforce SMBus mode in PHY */ e1e_rphy_locked(hw, CV_SMB_CTRL, &phy_reg); phy_reg &= ~CV_SMB_CTRL_FORCE_SMBUS; e1e_wphy_locked(hw, CV_SMB_CTRL, phy_reg); + e1000e_enable_phy_retry(hw); + /* Unforce SMBus mode in MAC */ mac_reg = er32(CTRL_EXT); mac_reg &= ~E1000_CTRL_EXT_FORCE_SMBUS; @@ -310,6 +317,11 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw) goto out; } + /* There is no guarantee that the PHY is accessible at this time + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + /* The MAC-PHY interconnect may be in SMBus mode. If the PHY is * inaccessible and resetting the PHY is not blocked, toggle the * LANPHYPC Value bit to force the interconnect to PCIe mode. @@ -380,6 +392,8 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw) break; } + e1000e_enable_phy_retry(hw); + hw->phy.ops.release(hw); if (!ret_val) { @@ -449,6 +463,11 @@ static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw) phy->id = e1000_phy_unknown; + if (hw->mac.type == e1000_pch_mtp) { + phy->retry_count = 2; + e1000e_enable_phy_retry(hw); + } + ret_val = e1000_init_phy_workarounds_pchlan(hw); if (ret_val) return ret_val; @@ -1146,6 +1165,11 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) if (ret_val) goto out; + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + /* Force SMBus mode in PHY */ ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); if (ret_val) @@ -1153,6 +1177,8 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) phy_reg |= CV_SMB_CTRL_FORCE_SMBUS; e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); + e1000e_enable_phy_retry(hw); + /* Force SMBus mode in MAC */ mac_reg = er32(CTRL_EXT); mac_reg |= E1000_CTRL_EXT_FORCE_SMBUS; @@ -1313,6 +1339,11 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw *hw, bool force) /* Toggle LANPHYPC Value bit */ e1000_toggle_lanphypc_pch_lpt(hw); + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + /* Unforce SMBus mode in PHY */ ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); if (ret_val) { @@ -1333,6 +1364,8 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw *hw, bool force) phy_reg &= ~CV_SMB_CTRL_FORCE_SMBUS; e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); + e1000e_enable_phy_retry(hw); + /* Unforce SMBus mode in MAC */ mac_reg = er32(CTRL_EXT); mac_reg &= ~E1000_CTRL_EXT_FORCE_SMBUS; diff --git a/drivers/net/ethernet/intel/e1000e/phy.c b/drivers/net/ethernet/intel/e1000e/phy.c index 5e329156d1bae..93544f1cc2a51 100644 --- a/drivers/net/ethernet/intel/e1000e/phy.c +++ b/drivers/net/ethernet/intel/e1000e/phy.c @@ -107,6 +107,16 @@ s32 e1000e_phy_reset_dsp(struct e1000_hw *hw) return e1e_wphy(hw, M88E1000_PHY_GEN_CONTROL, 0); } +void e1000e_disable_phy_retry(struct e1000_hw *hw) +{ + hw->phy.retry_enabled = false; +} + +void e1000e_enable_phy_retry(struct e1000_hw *hw) +{ + hw->phy.retry_enabled = true; +} + /** * e1000e_read_phy_reg_mdic - Read MDI control register * @hw: pointer to the HW structure @@ -118,55 +128,73 @@ s32 e1000e_phy_reset_dsp(struct e1000_hw *hw) **/ s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data) { + u32 i, mdic = 0, retry_counter, retry_max; struct e1000_phy_info *phy = &hw->phy; - u32 i, mdic = 0; + bool success; if (offset > MAX_PHY_REG_ADDRESS) { e_dbg("PHY Address %d is out of range\n", offset); return -E1000_ERR_PARAM; } + retry_max = phy->retry_enabled ? phy->retry_count : 0; + /* Set up Op-code, Phy Address, and register offset in the MDI * Control register. The MAC will take care of interfacing with the * PHY to retrieve the desired data. */ - mdic = ((offset << E1000_MDIC_REG_SHIFT) | - (phy->addr << E1000_MDIC_PHY_SHIFT) | - (E1000_MDIC_OP_READ)); + for (retry_counter = 0; retry_counter <= retry_max; retry_counter++) { + success = true; - ew32(MDIC, mdic); + mdic = ((offset << E1000_MDIC_REG_SHIFT) | + (phy->addr << E1000_MDIC_PHY_SHIFT) | + (E1000_MDIC_OP_READ)); - /* Poll the ready bit to see if the MDI read completed - * Increasing the time out as testing showed failures with - * the lower time out - */ - for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { - udelay(50); - mdic = er32(MDIC); - if (mdic & E1000_MDIC_READY) - break; - } - if (!(mdic & E1000_MDIC_READY)) { - e_dbg("MDI Read PHY Reg Address %d did not complete\n", offset); - return -E1000_ERR_PHY; - } - if (mdic & E1000_MDIC_ERROR) { - e_dbg("MDI Read PHY Reg Address %d Error\n", offset); - return -E1000_ERR_PHY; - } - if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { - e_dbg("MDI Read offset error - requested %d, returned %d\n", - offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); - return -E1000_ERR_PHY; + ew32(MDIC, mdic); + + /* Poll the ready bit to see if the MDI read completed + * Increasing the time out as testing showed failures with + * the lower time out + */ + for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { + usleep_range(50, 60); + mdic = er32(MDIC); + if (mdic & E1000_MDIC_READY) + break; + } + if (!(mdic & E1000_MDIC_READY)) { + e_dbg("MDI Read PHY Reg Address %d did not complete\n", + offset); + success = false; + } + if (mdic & E1000_MDIC_ERROR) { + e_dbg("MDI Read PHY Reg Address %d Error\n", offset); + success = false; + } + if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { + e_dbg("MDI Read offset error - requested %d, returned %d\n", + offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); + success = false; + } + + /* Allow some time after each MDIC transaction to avoid + * reading duplicate data in the next MDIC transaction. + */ + if (hw->mac.type == e1000_pch2lan) + usleep_range(100, 150); + + if (success) { + *data = (u16)mdic; + return 0; + } + + if (retry_counter != retry_max) { + e_dbg("Perform retry on PHY transaction...\n"); + mdelay(10); + } } - *data = (u16)mdic; - /* Allow some time after each MDIC transaction to avoid - * reading duplicate data in the next MDIC transaction. - */ - if (hw->mac.type == e1000_pch2lan) - udelay(100); - return 0; + return -E1000_ERR_PHY; } /** @@ -179,56 +207,72 @@ s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data) **/ s32 e1000e_write_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 data) { + u32 i, mdic = 0, retry_counter, retry_max; struct e1000_phy_info *phy = &hw->phy; - u32 i, mdic = 0; + bool success; if (offset > MAX_PHY_REG_ADDRESS) { e_dbg("PHY Address %d is out of range\n", offset); return -E1000_ERR_PARAM; } + retry_max = phy->retry_enabled ? phy->retry_count : 0; + /* Set up Op-code, Phy Address, and register offset in the MDI * Control register. The MAC will take care of interfacing with the * PHY to retrieve the desired data. */ - mdic = (((u32)data) | - (offset << E1000_MDIC_REG_SHIFT) | - (phy->addr << E1000_MDIC_PHY_SHIFT) | - (E1000_MDIC_OP_WRITE)); + for (retry_counter = 0; retry_counter <= retry_max; retry_counter++) { + success = true; - ew32(MDIC, mdic); + mdic = (((u32)data) | + (offset << E1000_MDIC_REG_SHIFT) | + (phy->addr << E1000_MDIC_PHY_SHIFT) | + (E1000_MDIC_OP_WRITE)); - /* Poll the ready bit to see if the MDI read completed - * Increasing the time out as testing showed failures with - * the lower time out - */ - for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { - udelay(50); - mdic = er32(MDIC); - if (mdic & E1000_MDIC_READY) - break; - } - if (!(mdic & E1000_MDIC_READY)) { - e_dbg("MDI Write PHY Reg Address %d did not complete\n", offset); - return -E1000_ERR_PHY; - } - if (mdic & E1000_MDIC_ERROR) { - e_dbg("MDI Write PHY Red Address %d Error\n", offset); - return -E1000_ERR_PHY; - } - if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { - e_dbg("MDI Write offset error - requested %d, returned %d\n", - offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); - return -E1000_ERR_PHY; - } + ew32(MDIC, mdic); - /* Allow some time after each MDIC transaction to avoid - * reading duplicate data in the next MDIC transaction. - */ - if (hw->mac.type == e1000_pch2lan) - udelay(100); + /* Poll the ready bit to see if the MDI read completed + * Increasing the time out as testing showed failures with + * the lower time out + */ + for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { + usleep_range(50, 60); + mdic = er32(MDIC); + if (mdic & E1000_MDIC_READY) + break; + } + if (!(mdic & E1000_MDIC_READY)) { + e_dbg("MDI Write PHY Reg Address %d did not complete\n", + offset); + success = false; + } + if (mdic & E1000_MDIC_ERROR) { + e_dbg("MDI Write PHY Reg Address %d Error\n", offset); + success = false; + } + if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { + e_dbg("MDI Write offset error - requested %d, returned %d\n", + offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); + success = false; + } - return 0; + /* Allow some time after each MDIC transaction to avoid + * reading duplicate data in the next MDIC transaction. + */ + if (hw->mac.type == e1000_pch2lan) + usleep_range(100, 150); + + if (success) + return 0; + + if (retry_counter != retry_max) { + e_dbg("Perform retry on PHY transaction...\n"); + mdelay(10); + } + } + + return -E1000_ERR_PHY; } /** diff --git a/drivers/net/ethernet/intel/e1000e/phy.h b/drivers/net/ethernet/intel/e1000e/phy.h index c48777d095235..049bb325b4b14 100644 --- a/drivers/net/ethernet/intel/e1000e/phy.h +++ b/drivers/net/ethernet/intel/e1000e/phy.h @@ -51,6 +51,8 @@ s32 e1000e_read_phy_reg_bm2(struct e1000_hw *hw, u32 offset, u16 *data); s32 e1000e_write_phy_reg_bm2(struct e1000_hw *hw, u32 offset, u16 data); void e1000_power_up_phy_copper(struct e1000_hw *hw); void e1000_power_down_phy_copper(struct e1000_hw *hw); +void e1000e_disable_phy_retry(struct e1000_hw *hw); +void e1000e_enable_phy_retry(struct e1000_hw *hw); s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data); s32 e1000e_write_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 data); s32 e1000_read_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 *data); -- GitLab From 861e8086029e003305750b4126ecd6617465f5c7 Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Sun, 3 Mar 2024 12:51:32 +0200 Subject: [PATCH 099/989] e1000e: move force SMBUS from enable ulp function to avoid PHY loss issue Forcing SMBUS inside the ULP enabling flow leads to sporadic PHY loss on some systems. It is suspected to be caused by initiating PHY transactions before the interface settles. Separating this configuration from the ULP enabling flow and moving it to the shutdown function allows enough time for the interface to settle and avoids adding a delay. Fixes: 6607c99e7034 ("e1000e: i219 - fix to enable both ULP and EEE in Sx state") Co-developed-by: Dima Ruinskiy Signed-off-by: Dima Ruinskiy Signed-off-by: Vitaly Lifshits Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 19 ------------------- drivers/net/ethernet/intel/e1000e/netdev.c | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index d8e97669f31b0..f9e94be36e97f 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1165,25 +1165,6 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) if (ret_val) goto out; - /* Switching PHY interface always returns MDI error - * so disable retry mechanism to avoid wasting time - */ - e1000e_disable_phy_retry(hw); - - /* Force SMBus mode in PHY */ - ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); - if (ret_val) - goto release; - phy_reg |= CV_SMB_CTRL_FORCE_SMBUS; - e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); - - e1000e_enable_phy_retry(hw); - - /* Force SMBus mode in MAC */ - mac_reg = er32(CTRL_EXT); - mac_reg |= E1000_CTRL_EXT_FORCE_SMBUS; - ew32(CTRL_EXT, mac_reg); - /* Si workaround for ULP entry flow on i127/rev6 h/w. Enable * LPLU and disable Gig speed when entering ULP */ diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index cc8c531ec3dff..3692fce201959 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6623,6 +6623,7 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime) struct e1000_hw *hw = &adapter->hw; u32 ctrl, ctrl_ext, rctl, status, wufc; int retval = 0; + u16 smb_ctrl; /* Runtime suspend should only enable wakeup for link changes */ if (runtime) @@ -6696,6 +6697,23 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime) if (retval) return retval; } + + /* Force SMBUS to allow WOL */ + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + + e1e_rphy(hw, CV_SMB_CTRL, &smb_ctrl); + smb_ctrl |= CV_SMB_CTRL_FORCE_SMBUS; + e1e_wphy(hw, CV_SMB_CTRL, smb_ctrl); + + e1000e_enable_phy_retry(hw); + + /* Force SMBus mode in MAC */ + ctrl_ext = er32(CTRL_EXT); + ctrl_ext |= E1000_CTRL_EXT_FORCE_SMBUS; + ew32(CTRL_EXT, ctrl_ext); } /* Ensure that the appropriate bits are set in LPI_CTRL -- GitLab From a1aa5390cc912934fee76ce80af5f940452fa987 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Wed, 27 Mar 2024 19:52:49 +0300 Subject: [PATCH 100/989] of: module: prevent NULL pointer dereference in vsnprintf() In of_modalias(), we can get passed the str and len parameters which would cause a kernel oops in vsnprintf() since it only allows passing a NULL ptr when the length is also 0. Also, we need to filter out the negative values of the len parameter as these will result in a really huge buffer since snprintf() takes size_t parameter while ours is ssize_t... Found by Linux Verification Center (linuxtesting.org) with the Svace static analysis tool. Signed-off-by: Sergey Shtylyov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1d211023-3923-685b-20f0-f3f90ea56e1f@omp.ru Signed-off-by: Rob Herring --- drivers/of/module.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/of/module.c b/drivers/of/module.c index 0e8aa974f0f2b..f58e624953a20 100644 --- a/drivers/of/module.c +++ b/drivers/of/module.c @@ -16,6 +16,14 @@ ssize_t of_modalias(const struct device_node *np, char *str, ssize_t len) ssize_t csize; ssize_t tsize; + /* + * Prevent a kernel oops in vsnprintf() -- it only allows passing a + * NULL ptr when the length is also 0. Also filter out the negative + * lengths... + */ + if ((len > 0 && !str) || len < 0) + return -EINVAL; + /* Name & Type */ /* %p eats all alphanum characters, so %c must be used here */ csize = snprintf(str, len, "of:N%pOFn%c%s", np, 'T', -- GitLab From daf6c4681a74034d5723e2fb761e0d7f3a1ca18f Mon Sep 17 00:00:00 2001 From: Christoffer Sandberg Date: Thu, 28 Mar 2024 11:27:57 +0100 Subject: [PATCH 101/989] ALSA: hda/realtek - Fix inactive headset mic jack This patch adds the existing fixup to certain TF platforms implementing the ALC274 codec with a headset jack. It fixes/activates the inactive microphone of the headset. Signed-off-by: Christoffer Sandberg Signed-off-by: Werner Sembach Cc: Message-ID: <20240328102757.50310-1-wse@tuxedocomputers.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a17c36a36aa53..c31e9be257a9b 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10403,6 +10403,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1d05, 0x1147, "TongFang GMxTGxx", ALC269_FIXUP_NO_SHUTUP), SND_PCI_QUIRK(0x1d05, 0x115c, "TongFang GMxTGxx", ALC269_FIXUP_NO_SHUTUP), SND_PCI_QUIRK(0x1d05, 0x121b, "TongFang GMxAGxx", ALC269_FIXUP_NO_SHUTUP), + SND_PCI_QUIRK(0x1d05, 0x1387, "TongFang GMxIXxx", ALC2XX_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1602, "RedmiBook", ALC255_FIXUP_XIAOMI_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1701, "XiaomiNotebook Pro", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1d72, 0x1901, "RedmiBook 14", ALC256_FIXUP_ASUS_HEADSET_MIC), -- GitLab From 2d0401ee38d43ab0e4cdd02dfc9d402befb2b5c8 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Thu, 28 Mar 2024 12:13:55 +0000 Subject: [PATCH 102/989] ALSA: hda: cs35l56: Add ACPI device match tables Adding the ACPI HIDs to the match table triggers the cs35l56-hda modules to be loaded on boot so that Serial Multi Instantiate can add the devices to the bus and begin the driver init sequence. Signed-off-by: Simon Trimmer Fixes: 73cfbfa9caea ("ALSA: hda/cs35l56: Add driver for Cirrus Logic CS35L56 amplifier") Message-ID: <20240328121355.18972-1-simont@opensource.cirrus.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l56_hda_i2c.c | 13 +++++++++++-- sound/pci/hda/cs35l56_hda_spi.c | 13 +++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/sound/pci/hda/cs35l56_hda_i2c.c b/sound/pci/hda/cs35l56_hda_i2c.c index 13beee807308f..40f2f97944d54 100644 --- a/sound/pci/hda/cs35l56_hda_i2c.c +++ b/sound/pci/hda/cs35l56_hda_i2c.c @@ -56,10 +56,19 @@ static const struct i2c_device_id cs35l56_hda_i2c_id[] = { {} }; +static const struct acpi_device_id cs35l56_acpi_hda_match[] = { + { "CSC3554", 0 }, + { "CSC3556", 0 }, + { "CSC3557", 0 }, + {} +}; +MODULE_DEVICE_TABLE(acpi, cs35l56_acpi_hda_match); + static struct i2c_driver cs35l56_hda_i2c_driver = { .driver = { - .name = "cs35l56-hda", - .pm = &cs35l56_hda_pm_ops, + .name = "cs35l56-hda", + .acpi_match_table = cs35l56_acpi_hda_match, + .pm = &cs35l56_hda_pm_ops, }, .id_table = cs35l56_hda_i2c_id, .probe = cs35l56_hda_i2c_probe, diff --git a/sound/pci/hda/cs35l56_hda_spi.c b/sound/pci/hda/cs35l56_hda_spi.c index a3b2fa76663d3..7f02155fe61e3 100644 --- a/sound/pci/hda/cs35l56_hda_spi.c +++ b/sound/pci/hda/cs35l56_hda_spi.c @@ -56,10 +56,19 @@ static const struct spi_device_id cs35l56_hda_spi_id[] = { {} }; +static const struct acpi_device_id cs35l56_acpi_hda_match[] = { + { "CSC3554", 0 }, + { "CSC3556", 0 }, + { "CSC3557", 0 }, + {} +}; +MODULE_DEVICE_TABLE(acpi, cs35l56_acpi_hda_match); + static struct spi_driver cs35l56_hda_spi_driver = { .driver = { - .name = "cs35l56-hda", - .pm = &cs35l56_hda_pm_ops, + .name = "cs35l56-hda", + .acpi_match_table = cs35l56_acpi_hda_match, + .pm = &cs35l56_hda_pm_ops, }, .id_table = cs35l56_hda_spi_id, .probe = cs35l56_hda_spi_probe, -- GitLab From cd25e15e57e68a6b18dc9323047fe9c68b99290b Mon Sep 17 00:00:00 2001 From: Joakim Sindholt Date: Mon, 18 Mar 2024 12:22:31 +0100 Subject: [PATCH 103/989] fs/9p: only translate RWX permissions for plain 9P2000 Garbage in plain 9P2000's perm bits is allowed through, which causes it to be able to set (among others) the suid bit. This was presumably not the intent since the unix extended bits are handled explicitly and conditionally on .u. Signed-off-by: Joakim Sindholt Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b01b1bbf24937..9612fdb563a3e 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -83,7 +83,7 @@ static int p9mode2perm(struct v9fs_session_info *v9ses, int res; int mode = stat->mode; - res = mode & S_IALLUGO; + res = mode & 0777; /* S_IRWXUGO */ if (v9fs_proto_dotu(v9ses)) { if ((mode & P9_DMSETUID) == P9_DMSETUID) res |= S_ISUID; -- GitLab From 87de39e70503e04ddb58965520b15eb9efa7eef3 Mon Sep 17 00:00:00 2001 From: Joakim Sindholt Date: Mon, 18 Mar 2024 12:22:33 +0100 Subject: [PATCH 104/989] fs/9p: translate O_TRUNC into OTRUNC This one hits both 9P2000 and .u as it appears v9fs has never translated the O_TRUNC flag. Signed-off-by: Joakim Sindholt Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 9612fdb563a3e..c5b4d3631c47e 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -178,6 +178,9 @@ int v9fs_uflags2omode(int uflags, int extended) break; } + if (uflags & O_TRUNC) + ret |= P9_OTRUNC; + if (extended) { if (uflags & O_EXCL) ret |= P9_OEXCL; -- GitLab From 4e5d208cc9bd5fbc95d536fa223b4b14c37b8ca8 Mon Sep 17 00:00:00 2001 From: Joakim Sindholt Date: Mon, 18 Mar 2024 12:22:34 +0100 Subject: [PATCH 105/989] fs/9p: fix the cache always being enabled on files with qid flags I'm not sure why this check was ever here. After updating to 6.6 I suddenly found caching had been turned on by default and neither cache=none nor the new directio would turn it off. After walking through the new code very manually I realized that it's because the caching has to be, in effect, turned off explicitly by setting P9L_DIRECT and whenever a file has a flag, in my case QTAPPEND, it doesn't get set. Setting aside QTDIR which seems to ignore the new fid->mode entirely, the rest of these either should be subject to the same cache rules as every other QTFILE or perhaps very explicitly not cached in the case of QTAUTH. Signed-off-by: Joakim Sindholt Signed-off-by: Eric Van Hensbergen --- fs/9p/fid.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/9p/fid.h b/fs/9p/fid.h index 29281b7c38870..0d6138bee2a3d 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -49,9 +49,6 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry) static inline void v9fs_fid_add_modes(struct p9_fid *fid, unsigned int s_flags, unsigned int s_cache, unsigned int f_flags) { - if (fid->qid.type != P9_QTFILE) - return; - if ((!s_cache) || ((fid->qid.version == 0) && !(s_flags & V9FS_IGNORE_QV)) || (s_flags & V9FS_DIRECT_IO) || (f_flags & O_DIRECT)) { -- GitLab From 2bd02f5a0bac4bb13e0da18652dc75ba0e4958ec Mon Sep 17 00:00:00 2001 From: Christian Hewitt Date: Fri, 22 Mar 2024 16:45:25 +0000 Subject: [PATCH 106/989] drm/panfrost: fix power transition timeout warnings Increase the timeout value to prevent system logs on Amlogic boards flooding with power transition warnings: [ 13.047638] panfrost ffe40000.gpu: shader power transition timeout [ 13.048674] panfrost ffe40000.gpu: l2 power transition timeout [ 13.937324] panfrost ffe40000.gpu: shader power transition timeout [ 13.938351] panfrost ffe40000.gpu: l2 power transition timeout ... [39829.506904] panfrost ffe40000.gpu: shader power transition timeout [39829.507938] panfrost ffe40000.gpu: l2 power transition timeout [39949.508369] panfrost ffe40000.gpu: shader power transition timeout [39949.509405] panfrost ffe40000.gpu: l2 power transition timeout The 2000 value has been found through trial and error testing with devices using G52 and G31 GPUs. Fixes: 22aa1a209018 ("drm/panfrost: Really power off GPU cores in panfrost_gpu_power_off()") Signed-off-by: Christian Hewitt Reviewed-by: Steven Price Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Steven Price Link: https://patchwork.freedesktop.org/patch/msgid/20240322164525.2617508-1-christianshewitt@gmail.com --- drivers/gpu/drm/panfrost/panfrost_gpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c b/drivers/gpu/drm/panfrost/panfrost_gpu.c index 9063ce2546422..fd8e44992184f 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gpu.c +++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c @@ -441,19 +441,19 @@ void panfrost_gpu_power_off(struct panfrost_device *pfdev) gpu_write(pfdev, SHADER_PWROFF_LO, pfdev->features.shader_present); ret = readl_relaxed_poll_timeout(pfdev->iomem + SHADER_PWRTRANS_LO, - val, !val, 1, 1000); + val, !val, 1, 2000); if (ret) dev_err(pfdev->dev, "shader power transition timeout"); gpu_write(pfdev, TILER_PWROFF_LO, pfdev->features.tiler_present); ret = readl_relaxed_poll_timeout(pfdev->iomem + TILER_PWRTRANS_LO, - val, !val, 1, 1000); + val, !val, 1, 2000); if (ret) dev_err(pfdev->dev, "tiler power transition timeout"); gpu_write(pfdev, L2_PWROFF_LO, pfdev->features.l2_present); ret = readl_poll_timeout(pfdev->iomem + L2_PWRTRANS_LO, - val, !val, 0, 1000); + val, !val, 0, 2000); if (ret) dev_err(pfdev->dev, "l2 power transition timeout"); } -- GitLab From c60ebc58f2a82d27006cfc30af406bfd2ec204cc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 15 Mar 2024 09:09:30 +0000 Subject: [PATCH 107/989] drm/nouveau/gr/gf100: Remove second semicolon There is a statement with two semicolons. Remove the second one, it is redundant. Signed-off-by: Colin Ian King Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240315090930.2429958-1-colin.i.king@gmail.com --- drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c index 986e8d547c942..060c74a80eb14 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c @@ -420,7 +420,7 @@ gf100_gr_chan_new(struct nvkm_gr *base, struct nvkm_chan *fifoch, return ret; } else { ret = nvkm_memory_map(gr->attrib_cb, 0, chan->vmm, chan->attrib_cb, - &args, sizeof(args));; + &args, sizeof(args)); if (ret) return ret; } -- GitLab From be141849ec00ef39935bf169c0f194ac70bf85ce Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 28 Mar 2024 12:43:16 +1000 Subject: [PATCH 108/989] nouveau/uvmm: fix addr/range calcs for remap operations dEQP-VK.sparse_resources.image_rebind.2d_array.r64i.128_128_8 was causing a remap operation like the below. op_remap: prev: 0000003fffed0000 00000000000f0000 00000000a5abd18a 0000000000000000 op_remap: next: op_remap: unmap: 0000003fffed0000 0000000000100000 0 op_map: map: 0000003ffffc0000 0000000000010000 000000005b1ba33c 00000000000e0000 This was resulting in an unmap operation from 0x3fffed0000+0xf0000, 0x100000 which was corrupting the pagetables and oopsing the kernel. Fixes the prev + unmap range calcs to use start/end and map back to addr/range. Signed-off-by: Dave Airlie Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI") Cc: Danilo Krummrich Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240328024317.2041851-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nouveau_uvmm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c index 0a0a11dc9ec03..ee02cd833c5e4 100644 --- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c +++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c @@ -812,15 +812,15 @@ op_remap(struct drm_gpuva_op_remap *r, struct drm_gpuva_op_unmap *u = r->unmap; struct nouveau_uvma *uvma = uvma_from_va(u->va); u64 addr = uvma->va.va.addr; - u64 range = uvma->va.va.range; + u64 end = uvma->va.va.addr + uvma->va.va.range; if (r->prev) addr = r->prev->va.addr + r->prev->va.range; if (r->next) - range = r->next->va.addr - addr; + end = r->next->va.addr; - op_unmap_range(u, addr, range); + op_unmap_range(u, addr, end - addr); } static int -- GitLab From a4ec240f6b7c21cf846d10017c3ce423a0eae92c Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 22 Mar 2024 14:48:01 -0700 Subject: [PATCH 109/989] drm/prime: Unbreak virtgpu dma-buf export virtgpu "vram" GEM objects do not implement obj->get_sg_table(). But they also don't use drm_gem_map_dma_buf(). In fact they may not even have guest visible pages. But it is perfectly fine to export and share with other virtual devices. Reported-by: Dominik Behr Fixes: 207395da5a97 ("drm/prime: reject DMA-BUF attach when get_sg_table is missing") Signed-off-by: Rob Clark Reviewed-by: Simon Ser Signed-off-by: Simon Ser Link: https://patchwork.freedesktop.org/patch/msgid/20240322214801.319975-1-robdclark@gmail.com --- drivers/gpu/drm/drm_prime.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c index 7352bde299d54..03bd3c7bd0dc2 100644 --- a/drivers/gpu/drm/drm_prime.c +++ b/drivers/gpu/drm/drm_prime.c @@ -582,7 +582,12 @@ int drm_gem_map_attach(struct dma_buf *dma_buf, { struct drm_gem_object *obj = dma_buf->priv; - if (!obj->funcs->get_sg_table) + /* + * drm_gem_map_dma_buf() requires obj->get_sg_table(), but drivers + * that implement their own ->map_dma_buf() do not. + */ + if (dma_buf->ops->map_dma_buf == drm_gem_map_dma_buf && + !obj->funcs->get_sg_table) return -ENOSYS; return drm_gem_pin(obj); -- GitLab From 310a5caa4e861616a27a83c3e8bda17d65026fa8 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 25 Mar 2024 17:18:12 -0500 Subject: [PATCH 110/989] ASoC: rt5682-sdw: fix locking sequence The disable_irq_lock protects the 'disable_irq' value, we need to lock before testing it. Fixes: 02fb23d72720 ("ASoC: rt5682-sdw: fix for JD event handling in ClockStop Mode0") Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Chao Song Link: https://msgid.link/r/20240325221817.206465-2-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5682-sdw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/rt5682-sdw.c b/sound/soc/codecs/rt5682-sdw.c index e67c2e19cb1a7..1fdbef5fd6cba 100644 --- a/sound/soc/codecs/rt5682-sdw.c +++ b/sound/soc/codecs/rt5682-sdw.c @@ -763,12 +763,12 @@ static int __maybe_unused rt5682_dev_resume(struct device *dev) return 0; if (!slave->unattach_request) { + mutex_lock(&rt5682->disable_irq_lock); if (rt5682->disable_irq == true) { - mutex_lock(&rt5682->disable_irq_lock); sdw_write_no_pm(slave, SDW_SCP_INTMASK1, SDW_SCP_INT1_IMPL_DEF); rt5682->disable_irq = false; - mutex_unlock(&rt5682->disable_irq_lock); } + mutex_unlock(&rt5682->disable_irq_lock); goto regmap_sync; } -- GitLab From ee287771644394d071e6a331951ee8079b64f9a7 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 25 Mar 2024 17:18:13 -0500 Subject: [PATCH 111/989] ASoC: rt711-sdca: fix locking sequence The disable_irq_lock protects the 'disable_irq' value, we need to lock before testing it. Fixes: 23adeb7056ac ("ASoC: rt711-sdca: fix for JD event handling in ClockStop Mode0") Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Chao Song Link: https://msgid.link/r/20240325221817.206465-3-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt711-sdca-sdw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/rt711-sdca-sdw.c b/sound/soc/codecs/rt711-sdca-sdw.c index 935e597022d32..b8471b2d8f4f1 100644 --- a/sound/soc/codecs/rt711-sdca-sdw.c +++ b/sound/soc/codecs/rt711-sdca-sdw.c @@ -438,13 +438,13 @@ static int __maybe_unused rt711_sdca_dev_resume(struct device *dev) return 0; if (!slave->unattach_request) { + mutex_lock(&rt711->disable_irq_lock); if (rt711->disable_irq == true) { - mutex_lock(&rt711->disable_irq_lock); sdw_write_no_pm(slave, SDW_SCP_SDCA_INTMASK1, SDW_SCP_SDCA_INTMASK_SDCA_0); sdw_write_no_pm(slave, SDW_SCP_SDCA_INTMASK2, SDW_SCP_SDCA_INTMASK_SDCA_8); rt711->disable_irq = false; - mutex_unlock(&rt711->disable_irq_lock); } + mutex_unlock(&rt711->disable_irq_lock); goto regmap_sync; } -- GitLab From aae86cfd8790bcc7693a5a0894df58de5cb5128c Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 25 Mar 2024 17:18:14 -0500 Subject: [PATCH 112/989] ASoC: rt711-sdw: fix locking sequence The disable_irq_lock protects the 'disable_irq' value, we need to lock before testing it. Fixes: b69de265bd0e ("ASoC: rt711: fix for JD event handling in ClockStop Mode0") Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Chao Song Link: https://msgid.link/r/20240325221817.206465-4-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt711-sdw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/rt711-sdw.c b/sound/soc/codecs/rt711-sdw.c index 3f5773310ae8c..988451f24a756 100644 --- a/sound/soc/codecs/rt711-sdw.c +++ b/sound/soc/codecs/rt711-sdw.c @@ -536,12 +536,12 @@ static int __maybe_unused rt711_dev_resume(struct device *dev) return 0; if (!slave->unattach_request) { + mutex_lock(&rt711->disable_irq_lock); if (rt711->disable_irq == true) { - mutex_lock(&rt711->disable_irq_lock); sdw_write_no_pm(slave, SDW_SCP_INTMASK1, SDW_SCP_INT1_IMPL_DEF); rt711->disable_irq = false; - mutex_unlock(&rt711->disable_irq_lock); } + mutex_unlock(&rt711->disable_irq_lock); goto regmap_sync; } -- GitLab From c8b2e5c1b959d100990e4f0cbad38e7d047bb97c Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 25 Mar 2024 17:18:15 -0500 Subject: [PATCH 113/989] ASoC: rt712-sdca-sdw: fix locking sequence The disable_irq_lock protects the 'disable_irq' value, we need to lock before testing it. Fixes: 7a8735c1551e ("ASoC: rt712-sdca: fix for JD event handling in ClockStop Mode0") Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Chao Song Link: https://msgid.link/r/20240325221817.206465-5-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt712-sdca-sdw.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/rt712-sdca-sdw.c b/sound/soc/codecs/rt712-sdca-sdw.c index 01ac555cd79b8..36d0dd532b8d5 100644 --- a/sound/soc/codecs/rt712-sdca-sdw.c +++ b/sound/soc/codecs/rt712-sdca-sdw.c @@ -438,13 +438,14 @@ static int __maybe_unused rt712_sdca_dev_resume(struct device *dev) return 0; if (!slave->unattach_request) { + mutex_lock(&rt712->disable_irq_lock); if (rt712->disable_irq == true) { - mutex_lock(&rt712->disable_irq_lock); + sdw_write_no_pm(slave, SDW_SCP_SDCA_INTMASK1, SDW_SCP_SDCA_INTMASK_SDCA_0); sdw_write_no_pm(slave, SDW_SCP_SDCA_INTMASK2, SDW_SCP_SDCA_INTMASK_SDCA_8); rt712->disable_irq = false; - mutex_unlock(&rt712->disable_irq_lock); } + mutex_unlock(&rt712->disable_irq_lock); goto regmap_sync; } -- GitLab From adb354bbc231b23d3a05163ce35c1d598512ff64 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 25 Mar 2024 17:18:16 -0500 Subject: [PATCH 114/989] ASoC: rt722-sdca-sdw: fix locking sequence The disable_irq_lock protects the 'disable_irq' value, we need to lock before testing it. Fixes: a0b7c59ac1a9 ("ASoC: rt722-sdca: fix for JD event handling in ClockStop Mode0") Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Chao Song Link: https://msgid.link/r/20240325221817.206465-6-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt722-sdca-sdw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/rt722-sdca-sdw.c b/sound/soc/codecs/rt722-sdca-sdw.c index eb76f4c675b67..65d584c1886e8 100644 --- a/sound/soc/codecs/rt722-sdca-sdw.c +++ b/sound/soc/codecs/rt722-sdca-sdw.c @@ -467,13 +467,13 @@ static int __maybe_unused rt722_sdca_dev_resume(struct device *dev) return 0; if (!slave->unattach_request) { + mutex_lock(&rt722->disable_irq_lock); if (rt722->disable_irq == true) { - mutex_lock(&rt722->disable_irq_lock); sdw_write_no_pm(slave, SDW_SCP_SDCA_INTMASK1, SDW_SCP_SDCA_INTMASK_SDCA_6); sdw_write_no_pm(slave, SDW_SCP_SDCA_INTMASK2, SDW_SCP_SDCA_INTMASK_SDCA_8); rt722->disable_irq = false; - mutex_unlock(&rt722->disable_irq_lock); } + mutex_unlock(&rt722->disable_irq_lock); goto regmap_sync; } -- GitLab From f892e66fcabc6161cd38c0fc86e769208174b840 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 25 Mar 2024 17:18:17 -0500 Subject: [PATCH 115/989] ASoC: rt-sdw*: add __func__ to all error logs The drivers for Realtek SoundWire codecs use similar logs, which is problematic to analyze problems reported by CI tools, e.g. "Failed to get private value: 752001 => 0000 ret=-5". It's not uncommon to have several Realtek devices on the same platform, having the same log thrown makes support difficult. This patch adds __func__ to all error logs which didn't already include it. No functionality change, only error logs are modified. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Link: https://msgid.link/r/20240325221817.206465-7-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt1316-sdw.c | 8 +++--- sound/soc/codecs/rt1318-sdw.c | 8 +++--- sound/soc/codecs/rt5682-sdw.c | 12 ++++---- sound/soc/codecs/rt700.c | 16 +++++------ sound/soc/codecs/rt711-sdca-sdw.c | 2 +- sound/soc/codecs/rt711-sdca.c | 18 ++++++------ sound/soc/codecs/rt711-sdw.c | 4 +-- sound/soc/codecs/rt711.c | 16 +++++------ sound/soc/codecs/rt712-sdca-dmic.c | 24 +++++++++------- sound/soc/codecs/rt712-sdca-sdw.c | 2 +- sound/soc/codecs/rt712-sdca.c | 20 ++++++------- sound/soc/codecs/rt715-sdca-sdw.c | 2 +- sound/soc/codecs/rt715-sdca.c | 46 +++++++++++++++--------------- sound/soc/codecs/rt715-sdw.c | 4 +-- sound/soc/codecs/rt715.c | 24 ++++++++-------- sound/soc/codecs/rt722-sdca.c | 21 +++++++------- 16 files changed, 115 insertions(+), 112 deletions(-) diff --git a/sound/soc/codecs/rt1316-sdw.c b/sound/soc/codecs/rt1316-sdw.c index 47511f70119ae..0b3bf920bcab2 100644 --- a/sound/soc/codecs/rt1316-sdw.c +++ b/sound/soc/codecs/rt1316-sdw.c @@ -537,7 +537,7 @@ static int rt1316_sdw_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt1316->sdw_slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } @@ -577,12 +577,12 @@ static int rt1316_sdw_parse_dt(struct rt1316_sdw_priv *rt1316, struct device *de if (rt1316->bq_params_cnt) { rt1316->bq_params = devm_kzalloc(dev, rt1316->bq_params_cnt, GFP_KERNEL); if (!rt1316->bq_params) { - dev_err(dev, "Could not allocate bq_params memory\n"); + dev_err(dev, "%s: Could not allocate bq_params memory\n", __func__); ret = -ENOMEM; } else { ret = device_property_read_u8_array(dev, "realtek,bq-params", rt1316->bq_params, rt1316->bq_params_cnt); if (ret < 0) - dev_err(dev, "Could not read list of realtek,bq-params\n"); + dev_err(dev, "%s: Could not read list of realtek,bq-params\n", __func__); } } @@ -759,7 +759,7 @@ static int __maybe_unused rt1316_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT1316_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt1318-sdw.c b/sound/soc/codecs/rt1318-sdw.c index ff364bde4a084..462c9a4b1be5d 100644 --- a/sound/soc/codecs/rt1318-sdw.c +++ b/sound/soc/codecs/rt1318-sdw.c @@ -606,7 +606,7 @@ static int rt1318_sdw_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt1318->sdw_slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } @@ -631,8 +631,8 @@ static int rt1318_sdw_hw_params(struct snd_pcm_substream *substream, sampling_rate = RT1318_SDCA_RATE_192000HZ; break; default: - dev_err(component->dev, "Rate %d is not supported\n", - params_rate(params)); + dev_err(component->dev, "%s: Rate %d is not supported\n", + __func__, params_rate(params)); return -EINVAL; } @@ -835,7 +835,7 @@ static int __maybe_unused rt1318_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT1318_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); return -ETIMEDOUT; } diff --git a/sound/soc/codecs/rt5682-sdw.c b/sound/soc/codecs/rt5682-sdw.c index 1fdbef5fd6cba..f9ee42c13dbac 100644 --- a/sound/soc/codecs/rt5682-sdw.c +++ b/sound/soc/codecs/rt5682-sdw.c @@ -132,7 +132,7 @@ static int rt5682_sdw_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt5682->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } @@ -315,8 +315,8 @@ static int rt5682_sdw_init(struct device *dev, struct regmap *regmap, &rt5682_sdw_indirect_regmap); if (IS_ERR(rt5682->regmap)) { ret = PTR_ERR(rt5682->regmap); - dev_err(dev, "Failed to allocate register map: %d\n", - ret); + dev_err(dev, "%s: Failed to allocate register map: %d\n", + __func__, ret); return ret; } @@ -400,7 +400,7 @@ static int rt5682_io_init(struct device *dev, struct sdw_slave *slave) } if (val != DEVICE_ID) { - dev_err(dev, "Device with ID register %x is not rt5682\n", val); + dev_err(dev, "%s: Device with ID register %x is not rt5682\n", __func__, val); ret = -ENODEV; goto err_nodev; } @@ -648,7 +648,7 @@ static int rt5682_bus_config(struct sdw_slave *slave, ret = rt5682_clock_config(&slave->dev); if (ret < 0) - dev_err(&slave->dev, "Invalid clk config"); + dev_err(&slave->dev, "%s: Invalid clk config", __func__); return ret; } @@ -775,7 +775,7 @@ static int __maybe_unused rt5682_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT5682_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt700.c b/sound/soc/codecs/rt700.c index 0ebf344a1b609..434b926f96c83 100644 --- a/sound/soc/codecs/rt700.c +++ b/sound/soc/codecs/rt700.c @@ -37,8 +37,8 @@ static int rt700_index_write(struct regmap *regmap, ret = regmap_write(regmap, addr, value); if (ret < 0) - pr_err("Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + pr_err("%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -52,8 +52,8 @@ static int rt700_index_read(struct regmap *regmap, *value = 0; ret = regmap_read(regmap, addr, value); if (ret < 0) - pr_err("Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + pr_err("%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -930,14 +930,14 @@ static int rt700_pcm_hw_params(struct snd_pcm_substream *substream, port_config.num += 2; break; default: - dev_err(component->dev, "Invalid DAI id %d\n", dai->id); + dev_err(component->dev, "%s: Invalid DAI id %d\n", __func__, dai->id); return -EINVAL; } retval = sdw_stream_add_slave(rt700->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } @@ -945,8 +945,8 @@ static int rt700_pcm_hw_params(struct snd_pcm_substream *substream, /* bit 3:0 Number of Channel */ val |= (params_channels(params) - 1); } else { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } diff --git a/sound/soc/codecs/rt711-sdca-sdw.c b/sound/soc/codecs/rt711-sdca-sdw.c index b8471b2d8f4f1..2636c2eea4bc8 100644 --- a/sound/soc/codecs/rt711-sdca-sdw.c +++ b/sound/soc/codecs/rt711-sdca-sdw.c @@ -451,7 +451,7 @@ static int __maybe_unused rt711_sdca_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT711_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt711-sdca.c b/sound/soc/codecs/rt711-sdca.c index 447154cb60104..1e8dbfc3ecd96 100644 --- a/sound/soc/codecs/rt711-sdca.c +++ b/sound/soc/codecs/rt711-sdca.c @@ -36,8 +36,8 @@ static int rt711_sdca_index_write(struct rt711_sdca_priv *rt711, ret = regmap_write(regmap, addr, value); if (ret < 0) dev_err(&rt711->slave->dev, - "Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + "%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -52,8 +52,8 @@ static int rt711_sdca_index_read(struct rt711_sdca_priv *rt711, ret = regmap_read(regmap, addr, value); if (ret < 0) dev_err(&rt711->slave->dev, - "Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + "%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -1293,13 +1293,13 @@ static int rt711_sdca_pcm_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt711->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } if (params_channels(params) > 16) { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } @@ -1318,8 +1318,8 @@ static int rt711_sdca_pcm_hw_params(struct snd_pcm_substream *substream, sampling_rate = RT711_SDCA_RATE_192000HZ; break; default: - dev_err(component->dev, "Rate %d is not supported\n", - params_rate(params)); + dev_err(component->dev, "%s: Rate %d is not supported\n", + __func__, params_rate(params)); return -EINVAL; } diff --git a/sound/soc/codecs/rt711-sdw.c b/sound/soc/codecs/rt711-sdw.c index 988451f24a756..0d3b43dd22e63 100644 --- a/sound/soc/codecs/rt711-sdw.c +++ b/sound/soc/codecs/rt711-sdw.c @@ -408,7 +408,7 @@ static int rt711_bus_config(struct sdw_slave *slave, ret = rt711_clock_config(&slave->dev); if (ret < 0) - dev_err(&slave->dev, "Invalid clk config"); + dev_err(&slave->dev, "%s: Invalid clk config", __func__); return ret; } @@ -548,7 +548,7 @@ static int __maybe_unused rt711_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT711_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); return -ETIMEDOUT; } diff --git a/sound/soc/codecs/rt711.c b/sound/soc/codecs/rt711.c index 66eaed13b0d6a..5446f9506a167 100644 --- a/sound/soc/codecs/rt711.c +++ b/sound/soc/codecs/rt711.c @@ -37,8 +37,8 @@ static int rt711_index_write(struct regmap *regmap, ret = regmap_write(regmap, addr, value); if (ret < 0) - pr_err("Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + pr_err("%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -52,8 +52,8 @@ static int rt711_index_read(struct regmap *regmap, *value = 0; ret = regmap_read(regmap, addr, value); if (ret < 0) - pr_err("Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + pr_err("%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -428,7 +428,7 @@ static void rt711_jack_init(struct rt711_priv *rt711) RT711_HP_JD_FINAL_RESULT_CTL_JD12); break; default: - dev_warn(rt711->component->dev, "Wrong JD source\n"); + dev_warn(rt711->component->dev, "%s: Wrong JD source\n", __func__); break; } @@ -1020,7 +1020,7 @@ static int rt711_pcm_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt711->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } @@ -1028,8 +1028,8 @@ static int rt711_pcm_hw_params(struct snd_pcm_substream *substream, /* bit 3:0 Number of Channel */ val |= (params_channels(params) - 1); } else { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } diff --git a/sound/soc/codecs/rt712-sdca-dmic.c b/sound/soc/codecs/rt712-sdca-dmic.c index 0926b26619bd4..012b79e72cf6b 100644 --- a/sound/soc/codecs/rt712-sdca-dmic.c +++ b/sound/soc/codecs/rt712-sdca-dmic.c @@ -139,8 +139,8 @@ static int rt712_sdca_dmic_index_write(struct rt712_sdca_dmic_priv *rt712, ret = regmap_write(regmap, addr, value); if (ret < 0) dev_err(&rt712->slave->dev, - "Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + "%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -155,8 +155,8 @@ static int rt712_sdca_dmic_index_read(struct rt712_sdca_dmic_priv *rt712, ret = regmap_read(regmap, addr, value); if (ret < 0) dev_err(&rt712->slave->dev, - "Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + "%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -317,7 +317,8 @@ static int rt712_sdca_dmic_set_gain_put(struct snd_kcontrol *kcontrol, for (i = 0; i < p->count; i++) { err = regmap_write(rt712->mbq_regmap, p->reg_base + i, gain_val[i]); if (err < 0) - dev_err(&rt712->slave->dev, "0x%08x can't be set\n", p->reg_base + i); + dev_err(&rt712->slave->dev, "%s: 0x%08x can't be set\n", + __func__, p->reg_base + i); } return changed; @@ -667,13 +668,13 @@ static int rt712_sdca_dmic_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt712->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } if (params_channels(params) > 4) { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } @@ -698,8 +699,8 @@ static int rt712_sdca_dmic_hw_params(struct snd_pcm_substream *substream, sampling_rate = RT712_SDCA_RATE_192000HZ; break; default: - dev_err(component->dev, "Rate %d is not supported\n", - params_rate(params)); + dev_err(component->dev, "%s: Rate %d is not supported\n", + __func__, params_rate(params)); return -EINVAL; } @@ -923,7 +924,8 @@ static int __maybe_unused rt712_sdca_dmic_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT712_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", + __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt712-sdca-sdw.c b/sound/soc/codecs/rt712-sdca-sdw.c index 36d0dd532b8d5..4e9ab3ef135b3 100644 --- a/sound/soc/codecs/rt712-sdca-sdw.c +++ b/sound/soc/codecs/rt712-sdca-sdw.c @@ -452,7 +452,7 @@ static int __maybe_unused rt712_sdca_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT712_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt712-sdca.c b/sound/soc/codecs/rt712-sdca.c index 6954fbe7ec5f3..b503de9fda80e 100644 --- a/sound/soc/codecs/rt712-sdca.c +++ b/sound/soc/codecs/rt712-sdca.c @@ -34,8 +34,8 @@ static int rt712_sdca_index_write(struct rt712_sdca_priv *rt712, ret = regmap_write(regmap, addr, value); if (ret < 0) dev_err(&rt712->slave->dev, - "Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + "%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -50,8 +50,8 @@ static int rt712_sdca_index_read(struct rt712_sdca_priv *rt712, ret = regmap_read(regmap, addr, value); if (ret < 0) dev_err(&rt712->slave->dev, - "Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + "%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -1060,13 +1060,13 @@ static int rt712_sdca_pcm_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt712->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } if (params_channels(params) > 16) { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } @@ -1085,8 +1085,8 @@ static int rt712_sdca_pcm_hw_params(struct snd_pcm_substream *substream, sampling_rate = RT712_SDCA_RATE_192000HZ; break; default: - dev_err(component->dev, "Rate %d is not supported\n", - params_rate(params)); + dev_err(component->dev, "%s: Rate %d is not supported\n", + __func__, params_rate(params)); return -EINVAL; } @@ -1106,7 +1106,7 @@ static int rt712_sdca_pcm_hw_params(struct snd_pcm_substream *substream, sampling_rate); break; default: - dev_err(component->dev, "Wrong DAI id\n"); + dev_err(component->dev, "%s: Wrong DAI id\n", __func__); return -EINVAL; } diff --git a/sound/soc/codecs/rt715-sdca-sdw.c b/sound/soc/codecs/rt715-sdca-sdw.c index ab54a67a27ebb..ee450126106f9 100644 --- a/sound/soc/codecs/rt715-sdca-sdw.c +++ b/sound/soc/codecs/rt715-sdca-sdw.c @@ -237,7 +237,7 @@ static int __maybe_unused rt715_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->enumeration_complete, msecs_to_jiffies(RT715_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Enumeration not complete, timed out\n"); + dev_err(&slave->dev, "%s: Enumeration not complete, timed out\n", __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt715-sdca.c b/sound/soc/codecs/rt715-sdca.c index 4533eedd7e189..3fb7b9adb61de 100644 --- a/sound/soc/codecs/rt715-sdca.c +++ b/sound/soc/codecs/rt715-sdca.c @@ -41,8 +41,8 @@ static int rt715_sdca_index_write(struct rt715_sdca_priv *rt715, ret = regmap_write(regmap, addr, value); if (ret < 0) dev_err(&rt715->slave->dev, - "Failed to set private value: %08x <= %04x %d\n", - addr, value, ret); + "%s: Failed to set private value: %08x <= %04x %d\n", + __func__, addr, value, ret); return ret; } @@ -59,8 +59,8 @@ static int rt715_sdca_index_read(struct rt715_sdca_priv *rt715, ret = regmap_read(regmap, addr, value); if (ret < 0) dev_err(&rt715->slave->dev, - "Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + "%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -152,8 +152,8 @@ static int rt715_sdca_set_amp_gain_put(struct snd_kcontrol *kcontrol, mc->shift); ret = regmap_write(rt715->mbq_regmap, mc->reg + i, gain_val); if (ret != 0) { - dev_err(component->dev, "Failed to write 0x%x=0x%x\n", - mc->reg + i, gain_val); + dev_err(component->dev, "%s: Failed to write 0x%x=0x%x\n", + __func__, mc->reg + i, gain_val); return ret; } } @@ -188,8 +188,8 @@ static int rt715_sdca_set_amp_gain_4ch_put(struct snd_kcontrol *kcontrol, ret = regmap_write(rt715->mbq_regmap, reg_base + i, gain_val); if (ret != 0) { - dev_err(component->dev, "Failed to write 0x%x=0x%x\n", - reg_base + i, gain_val); + dev_err(component->dev, "%s: Failed to write 0x%x=0x%x\n", + __func__, reg_base + i, gain_val); return ret; } } @@ -224,8 +224,8 @@ static int rt715_sdca_set_amp_gain_8ch_put(struct snd_kcontrol *kcontrol, reg = i < 7 ? reg_base + i : (reg_base - 1) | BIT(15); ret = regmap_write(rt715->mbq_regmap, reg, gain_val); if (ret != 0) { - dev_err(component->dev, "Failed to write 0x%x=0x%x\n", - reg, gain_val); + dev_err(component->dev, "%s: Failed to write 0x%x=0x%x\n", + __func__, reg, gain_val); return ret; } } @@ -246,8 +246,8 @@ static int rt715_sdca_set_amp_gain_get(struct snd_kcontrol *kcontrol, for (i = 0; i < 2; i++) { ret = regmap_read(rt715->mbq_regmap, mc->reg + i, &val); if (ret < 0) { - dev_err(component->dev, "Failed to read 0x%x, ret=%d\n", - mc->reg + i, ret); + dev_err(component->dev, "%s: Failed to read 0x%x, ret=%d\n", + __func__, mc->reg + i, ret); return ret; } ucontrol->value.integer.value[i] = rt715_sdca_get_gain(val, mc->shift); @@ -271,8 +271,8 @@ static int rt715_sdca_set_amp_gain_4ch_get(struct snd_kcontrol *kcontrol, for (i = 0; i < 4; i++) { ret = regmap_read(rt715->mbq_regmap, reg_base + i, &val); if (ret < 0) { - dev_err(component->dev, "Failed to read 0x%x, ret=%d\n", - reg_base + i, ret); + dev_err(component->dev, "%s: Failed to read 0x%x, ret=%d\n", + __func__, reg_base + i, ret); return ret; } ucontrol->value.integer.value[i] = rt715_sdca_get_gain(val, gain_sft); @@ -297,8 +297,8 @@ static int rt715_sdca_set_amp_gain_8ch_get(struct snd_kcontrol *kcontrol, for (i = 0; i < 8; i += 2) { ret = regmap_read(rt715->mbq_regmap, reg_base + i, &val_l); if (ret < 0) { - dev_err(component->dev, "Failed to read 0x%x, ret=%d\n", - reg_base + i, ret); + dev_err(component->dev, "%s: Failed to read 0x%x, ret=%d\n", + __func__, reg_base + i, ret); return ret; } ucontrol->value.integer.value[i] = (val_l >> gain_sft) / 10; @@ -306,8 +306,8 @@ static int rt715_sdca_set_amp_gain_8ch_get(struct snd_kcontrol *kcontrol, reg = (i == 6) ? (reg_base - 1) | BIT(15) : reg_base + 1 + i; ret = regmap_read(rt715->mbq_regmap, reg, &val_r); if (ret < 0) { - dev_err(component->dev, "Failed to read 0x%x, ret=%d\n", - reg, ret); + dev_err(component->dev, "%s: Failed to read 0x%x, ret=%d\n", + __func__, reg, ret); return ret; } ucontrol->value.integer.value[i + 1] = (val_r >> gain_sft) / 10; @@ -834,15 +834,15 @@ static int rt715_sdca_pcm_hw_params(struct snd_pcm_substream *substream, 0xaf00); break; default: - dev_err(component->dev, "Invalid DAI id %d\n", dai->id); + dev_err(component->dev, "%s: Invalid DAI id %d\n", __func__, dai->id); return -EINVAL; } retval = sdw_stream_add_slave(rt715->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(component->dev, "Unable to configure port, retval:%d\n", - retval); + dev_err(component->dev, "%s: Unable to configure port, retval:%d\n", + __func__, retval); return retval; } @@ -893,8 +893,8 @@ static int rt715_sdca_pcm_hw_params(struct snd_pcm_substream *substream, val = 0xf; break; default: - dev_err(component->dev, "Unsupported sample rate %d\n", - params_rate(params)); + dev_err(component->dev, "%s: Unsupported sample rate %d\n", + __func__, params_rate(params)); return -EINVAL; } diff --git a/sound/soc/codecs/rt715-sdw.c b/sound/soc/codecs/rt715-sdw.c index 21f37babd148a..7e13868ff99f0 100644 --- a/sound/soc/codecs/rt715-sdw.c +++ b/sound/soc/codecs/rt715-sdw.c @@ -482,7 +482,7 @@ static int rt715_bus_config(struct sdw_slave *slave, ret = rt715_clock_config(&slave->dev); if (ret < 0) - dev_err(&slave->dev, "Invalid clk config"); + dev_err(&slave->dev, "%s: Invalid clk config", __func__); return 0; } @@ -554,7 +554,7 @@ static int __maybe_unused rt715_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT715_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt715.c b/sound/soc/codecs/rt715.c index 9f732a5abd53f..299c9b12377c6 100644 --- a/sound/soc/codecs/rt715.c +++ b/sound/soc/codecs/rt715.c @@ -40,8 +40,8 @@ static int rt715_index_write(struct regmap *regmap, unsigned int reg, ret = regmap_write(regmap, addr, value); if (ret < 0) { - pr_err("Failed to set private value: %08x <= %04x %d\n", - addr, value, ret); + pr_err("%s: Failed to set private value: %08x <= %04x %d\n", + __func__, addr, value, ret); } return ret; @@ -55,8 +55,8 @@ static int rt715_index_write_nid(struct regmap *regmap, ret = regmap_write(regmap, addr, value); if (ret < 0) - pr_err("Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + pr_err("%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -70,8 +70,8 @@ static int rt715_index_read_nid(struct regmap *regmap, *value = 0; ret = regmap_read(regmap, addr, value); if (ret < 0) - pr_err("Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + pr_err("%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -862,14 +862,14 @@ static int rt715_pcm_hw_params(struct snd_pcm_substream *substream, rt715_index_write(rt715->regmap, RT715_SDW_INPUT_SEL, 0xa000); break; default: - dev_err(component->dev, "Invalid DAI id %d\n", dai->id); + dev_err(component->dev, "%s: Invalid DAI id %d\n", __func__, dai->id); return -EINVAL; } retval = sdw_stream_add_slave(rt715->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } @@ -883,8 +883,8 @@ static int rt715_pcm_hw_params(struct snd_pcm_substream *substream, val |= 0x0 << 8; break; default: - dev_err(component->dev, "Unsupported sample rate %d\n", - params_rate(params)); + dev_err(component->dev, "%s: Unsupported sample rate %d\n", + __func__, params_rate(params)); return -EINVAL; } @@ -892,8 +892,8 @@ static int rt715_pcm_hw_params(struct snd_pcm_substream *substream, /* bit 3:0 Number of Channel */ val |= (params_channels(params) - 1); } else { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } diff --git a/sound/soc/codecs/rt722-sdca.c b/sound/soc/codecs/rt722-sdca.c index 0e1c65a20392a..e0ea3a23f7cc6 100644 --- a/sound/soc/codecs/rt722-sdca.c +++ b/sound/soc/codecs/rt722-sdca.c @@ -35,8 +35,8 @@ int rt722_sdca_index_write(struct rt722_sdca_priv *rt722, ret = regmap_write(regmap, addr, value); if (ret < 0) dev_err(&rt722->slave->dev, - "Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + "%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -51,8 +51,8 @@ int rt722_sdca_index_read(struct rt722_sdca_priv *rt722, ret = regmap_read(regmap, addr, value); if (ret < 0) dev_err(&rt722->slave->dev, - "Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + "%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -663,7 +663,8 @@ static int rt722_sdca_dmic_set_gain_put(struct snd_kcontrol *kcontrol, for (i = 0; i < p->count; i++) { err = regmap_write(rt722->mbq_regmap, p->reg_base + i, gain_val[i]); if (err < 0) - dev_err(&rt722->slave->dev, "%#08x can't be set\n", p->reg_base + i); + dev_err(&rt722->slave->dev, "%s: %#08x can't be set\n", + __func__, p->reg_base + i); } return changed; @@ -1211,13 +1212,13 @@ static int rt722_sdca_pcm_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt722->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } if (params_channels(params) > 16) { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } @@ -1236,8 +1237,8 @@ static int rt722_sdca_pcm_hw_params(struct snd_pcm_substream *substream, sampling_rate = RT722_SDCA_RATE_192000HZ; break; default: - dev_err(component->dev, "Rate %d is not supported\n", - params_rate(params)); + dev_err(component->dev, "%s: Rate %d is not supported\n", + __func__, params_rate(params)); return -EINVAL; } -- GitLab From 7a84602297d36617dbdadeba55a2567031e5165b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 19 Mar 2024 12:34:45 -0400 Subject: [PATCH 116/989] 9p: explicitly deny setlease attempts 9p is a remote network protocol, and it doesn't support asynchronous notifications from the server. Ensure that we don't hand out any leases since we can't guarantee they'll be broken when a file's contents change. Signed-off-by: Jeff Layton Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index abdbbaee51846..348cc90bf9c56 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -520,6 +520,7 @@ const struct file_operations v9fs_file_operations = { .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync, + .setlease = simple_nosetlease, }; const struct file_operations v9fs_file_operations_dotl = { @@ -534,4 +535,5 @@ const struct file_operations v9fs_file_operations_dotl = { .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync_dotl, + .setlease = simple_nosetlease, }; -- GitLab From fc563aa900659a850e2ada4af26b9d7a3de6c591 Mon Sep 17 00:00:00 2001 From: Stephen Lee Date: Mon, 25 Mar 2024 18:01:31 -0700 Subject: [PATCH 117/989] ASoC: ops: Fix wraparound for mask in snd_soc_get_volsw In snd_soc_info_volsw(), mask is generated by figuring out the index of the most significant bit set in max and converting the index to a bitmask through bit shift 1. Unintended wraparound occurs when max is an integer value with msb bit set. Since the bit shift value 1 is treated as an integer type, the left shift operation will wraparound and set mask to 0 instead of all 1's. In order to fix this, we type cast 1 as `1ULL` to prevent the wraparound. Fixes: 7077148fb50a ("ASoC: core: Split ops out of soc-core.c") Signed-off-by: Stephen Lee Link: https://msgid.link/r/20240326010131.6211-1-slee08177@gmail.com Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 2d25748ca7066..b27e89ff6a167 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -263,7 +263,7 @@ int snd_soc_get_volsw(struct snd_kcontrol *kcontrol, int max = mc->max; int min = mc->min; int sign_bit = mc->sign_bit; - unsigned int mask = (1 << fls(max)) - 1; + unsigned int mask = (1ULL << fls(max)) - 1; unsigned int invert = mc->invert; int val; int ret; -- GitLab From 7f1dd39aedfccf60772328c5b88d56dbd39954c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 27 Mar 2024 08:33:10 +0100 Subject: [PATCH 118/989] clk: Provide !COMMON_CLK dummy for devm_clk_rate_exclusive_get() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be able to compile drivers using devm_clk_rate_exclusive_get() also on platforms without the common clk framework, add a dummy implementation that does the same as clk_rate_exclusive_get() in that case (i.e. nothing). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403270305.ydvX9xq1-lkp@intel.com/ Fixes: b0cde62e4c54 ("clk: Add a devm variant of clk_rate_exclusive_get()") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240327073310.520950-2-u.kleine-koenig@pengutronix.de Signed-off-by: Stephen Boyd --- include/linux/clk.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/clk.h b/include/linux/clk.h index 00623f4de5e19..0fa56d6725321 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -286,6 +286,11 @@ static inline int clk_rate_exclusive_get(struct clk *clk) return 0; } +static inline int devm_clk_rate_exclusive_get(struct device *dev, struct clk *clk) +{ + return 0; +} + static inline void clk_rate_exclusive_put(struct clk *clk) {} #endif -- GitLab From c90847bcbfb65d0f1c48fcc73a2b3a2d4ceac6a1 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Tue, 26 Mar 2024 22:45:24 -0700 Subject: [PATCH 119/989] cache: sifive_ccache: Partially convert to a platform driver Commit 8ec99b033147 ("irqchip/sifive-plic: Convert PLIC driver into a platform driver") broke ccache initialization because the PLIC IRQ domain is no longer available during an arch_initcall: [ 0.087229] irq: no irq domain found for interrupt-controller@c000000 ! [ 0.087255] CCACHE: Could not request IRQ 0 Fix this by moving the IRQ handling code to a platform driver. Fixes: 8ec99b033147 ("irqchip/sifive-plic: Convert PLIC driver into a platform driver") Signed-off-by: Samuel Holland Tested-by: Geert Uytterhoeven Signed-off-by: Conor Dooley --- drivers/cache/sifive_ccache.c | 72 ++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/drivers/cache/sifive_ccache.c b/drivers/cache/sifive_ccache.c index 89ed6cd6b059e..e9cc8b4786fbf 100644 --- a/drivers/cache/sifive_ccache.c +++ b/drivers/cache/sifive_ccache.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include @@ -247,13 +249,49 @@ static irqreturn_t ccache_int_handler(int irq, void *device) return IRQ_HANDLED; } +static int sifive_ccache_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + unsigned long quirks; + int intr_num, rc; + + quirks = (unsigned long)device_get_match_data(dev); + + intr_num = platform_irq_count(pdev); + if (!intr_num) + return dev_err_probe(dev, -ENODEV, "No interrupts property\n"); + + for (int i = 0; i < intr_num; i++) { + if (i == DATA_UNCORR && (quirks & QUIRK_BROKEN_DATA_UNCORR)) + continue; + + g_irq[i] = platform_get_irq(pdev, i); + if (g_irq[i] < 0) + return g_irq[i]; + + rc = devm_request_irq(dev, g_irq[i], ccache_int_handler, 0, "ccache_ecc", NULL); + if (rc) + return dev_err_probe(dev, rc, "Could not request IRQ %d\n", g_irq[i]); + } + + return 0; +} + +static struct platform_driver sifive_ccache_driver = { + .probe = sifive_ccache_probe, + .driver = { + .name = "sifive_ccache", + .of_match_table = sifive_ccache_ids, + }, +}; + static int __init sifive_ccache_init(void) { struct device_node *np; struct resource res; - int i, rc, intr_num; const struct of_device_id *match; unsigned long quirks; + int rc; np = of_find_matching_node_and_match(NULL, sifive_ccache_ids, &match); if (!np) @@ -277,28 +315,6 @@ static int __init sifive_ccache_init(void) goto err_unmap; } - intr_num = of_property_count_u32_elems(np, "interrupts"); - if (!intr_num) { - pr_err("No interrupts property\n"); - rc = -ENODEV; - goto err_unmap; - } - - for (i = 0; i < intr_num; i++) { - g_irq[i] = irq_of_parse_and_map(np, i); - - if (i == DATA_UNCORR && (quirks & QUIRK_BROKEN_DATA_UNCORR)) - continue; - - rc = request_irq(g_irq[i], ccache_int_handler, 0, "ccache_ecc", - NULL); - if (rc) { - pr_err("Could not request IRQ %d\n", g_irq[i]); - goto err_free_irq; - } - } - of_node_put(np); - #ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS if (quirks & QUIRK_NONSTANDARD_CACHE_OPS) { riscv_cbom_block_size = SIFIVE_CCACHE_LINE_SIZE; @@ -315,11 +331,15 @@ static int __init sifive_ccache_init(void) #ifdef CONFIG_DEBUG_FS setup_sifive_debug(); #endif + + rc = platform_driver_register(&sifive_ccache_driver); + if (rc) + goto err_unmap; + + of_node_put(np); + return 0; -err_free_irq: - while (--i >= 0) - free_irq(g_irq[i], NULL); err_unmap: iounmap(ccache_base); err_node_put: -- GitLab From 931ec1e4cb7fd81fe01e85419238a9cfb9d930c9 Mon Sep 17 00:00:00 2001 From: William Tu Date: Mon, 25 Mar 2024 11:12:28 -0700 Subject: [PATCH 120/989] Documentation: Add documentation for eswitch attribute Provide devlink documentation for three eswitch attributes: mode, inline-mode, and encap-mode. Signed-off-by: William Tu Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240325181228.6244-1-witu@nvidia.com Signed-off-by: Jakub Kicinski --- .../devlink/devlink-eswitch-attr.rst | 76 +++++++++++++++++++ Documentation/networking/devlink/index.rst | 1 + Documentation/networking/representors.rst | 1 + 3 files changed, 78 insertions(+) create mode 100644 Documentation/networking/devlink/devlink-eswitch-attr.rst diff --git a/Documentation/networking/devlink/devlink-eswitch-attr.rst b/Documentation/networking/devlink/devlink-eswitch-attr.rst new file mode 100644 index 0000000000000..08bb39ab15286 --- /dev/null +++ b/Documentation/networking/devlink/devlink-eswitch-attr.rst @@ -0,0 +1,76 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +Devlink E-Switch Attribute +========================== + +Devlink E-Switch supports two modes of operation: legacy and switchdev. +Legacy mode operates based on traditional MAC/VLAN steering rules. Switching +decisions are made based on MAC addresses, VLANs, etc. There is limited ability +to offload switching rules to hardware. + +On the other hand, switchdev mode allows for more advanced offloading +capabilities of the E-Switch to hardware. In switchdev mode, more switching +rules and logic can be offloaded to the hardware switch ASIC. It enables +representor netdevices that represent the slow path of virtual functions (VFs) +or scalable-functions (SFs) of the device. See more information about +:ref:`Documentation/networking/switchdev.rst ` and +:ref:`Documentation/networking/representors.rst `. + +In addition, the devlink E-Switch also comes with other attributes listed +in the following section. + +Attributes Description +====================== + +The following is a list of E-Switch attributes. + +.. list-table:: E-Switch attributes + :widths: 8 5 45 + + * - Name + - Type + - Description + * - ``mode`` + - enum + - The mode of the device. The mode can be one of the following: + + * ``legacy`` operates based on traditional MAC/VLAN steering + rules. + * ``switchdev`` allows for more advanced offloading capabilities of + the E-Switch to hardware. + * - ``inline-mode`` + - enum + - Some HWs need the VF driver to put part of the packet + headers on the TX descriptor so the e-switch can do proper + matching and steering. Support for both switchdev mode and legacy mode. + + * ``none`` none. + * ``link`` L2 mode. + * ``network`` L3 mode. + * ``transport`` L4 mode. + * - ``encap-mode`` + - enum + - The encapsulation mode of the device. Support for both switchdev mode + and legacy mode. The mode can be one of the following: + + * ``none`` Disable encapsulation support. + * ``basic`` Enable encapsulation support. + +Example Usage +============= + +.. code:: shell + + # enable switchdev mode + $ devlink dev eswitch set pci/0000:08:00.0 mode switchdev + + # set inline-mode and encap-mode + $ devlink dev eswitch set pci/0000:08:00.0 inline-mode none encap-mode basic + + # display devlink device eswitch attributes + $ devlink dev eswitch show pci/0000:08:00.0 + pci/0000:08:00.0: mode switchdev inline-mode none encap-mode basic + + # enable encap-mode with legacy mode + $ devlink dev eswitch set pci/0000:08:00.0 mode legacy inline-mode none encap-mode basic diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst index e14d7a701b72b..948c8c44e233f 100644 --- a/Documentation/networking/devlink/index.rst +++ b/Documentation/networking/devlink/index.rst @@ -67,6 +67,7 @@ general. devlink-selftests devlink-trap devlink-linecard + devlink-eswitch-attr Driver-specific documentation ----------------------------- diff --git a/Documentation/networking/representors.rst b/Documentation/networking/representors.rst index decb39c19b9ed..5e23386f69687 100644 --- a/Documentation/networking/representors.rst +++ b/Documentation/networking/representors.rst @@ -1,4 +1,5 @@ .. SPDX-License-Identifier: GPL-2.0 +.. _representors: ============================= Network Function Representors -- GitLab From fa84513997e9703fbac94b73bbe50aafdb29040e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 27 Mar 2024 09:14:13 +0100 Subject: [PATCH 121/989] ptp: MAINTAINERS: drop Jeff Sipek Emails to Jeff Sipek bounce: Your message to jsipek@vmware.com couldn't be delivered. Recipient is not authorized to accept external mail Status code: 550 5.7.1_ETR Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240327081413.306054-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6a233e1a3cf2e..7b8f97bf79751 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23661,7 +23661,6 @@ F: drivers/scsi/vmw_pvscsi.c F: drivers/scsi/vmw_pvscsi.h VMWARE VIRTUAL PTP CLOCK DRIVER -M: Jeff Sipek R: Ajay Kaher R: Alexey Makhalov R: VMware PV-Drivers Reviewers -- GitLab From 037965402a010898d34f4e35327d22c0a95cd51f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Mar 2024 13:14:56 +0100 Subject: [PATCH 122/989] xen-netfront: Add missing skb_mark_for_recycle Notice that skb_mark_for_recycle() is introduced later than fixes tag in commit 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling"). It is believed that fixes tag were missing a call to page_pool_release_page() between v5.9 to v5.14, after which is should have used skb_mark_for_recycle(). Since v6.6 the call page_pool_release_page() were removed (in commit 535b9c61bdef ("net: page_pool: hide page_pool_release_page()") and remaining callers converted (in commit 6bfef2ec0172 ("Merge branch 'net-page_pool-remove-page_pool_release_page'")). This leak became visible in v6.8 via commit dba1b8a7ab68 ("mm/page_pool: catch page_pool memory leaks"). Cc: stable@vger.kernel.org Fixes: 6c5aa6fc4def ("xen networking: add basic XDP support for xen-netfront") Reported-by: Leonidas Spyropoulos Link: https://bugzilla.kernel.org/show_bug.cgi?id=218654 Reported-by: Arthur Borsboom Signed-off-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/171154167446.2671062.9127105384591237363.stgit@firesoul Signed-off-by: Jakub Kicinski --- drivers/net/xen-netfront.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index ad29f370034e4..8d2aee88526c6 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -285,6 +285,7 @@ static struct sk_buff *xennet_alloc_one_rx_buffer(struct netfront_queue *queue) return NULL; } skb_add_rx_frag(skb, 0, page, 0, 0, PAGE_SIZE); + skb_mark_for_recycle(skb); /* Align ip header to a 16 bytes boundary */ skb_reserve(skb, NET_IP_ALIGN); -- GitLab From e9c856cabefb71d47b2eeb197f72c9c88e9b45b0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 27 Mar 2024 22:24:25 -0700 Subject: [PATCH 123/989] bpf: put uprobe link's path and task in release callback There is no need to delay putting either path or task to deallocation step. It can be done right after bpf_uprobe_unregister. Between release and dealloc, there could be still some running BPF programs, but they don't access either task or path, only data in link->uprobes, so it is safe to do. On the other hand, doing path_put() in dealloc callback makes this dealloc sleepable because path_put() itself might sleep. Which is problematic due to the need to call uprobe's dealloc through call_rcu(), which is what is done in the next bug fix patch. So solve the problem by releasing these resources early. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240328052426.3042617-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0a5c4efc73c36..0b73fe5f7206b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3157,6 +3157,9 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link) umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); + if (umulti_link->task) + put_task_struct(umulti_link->task); + path_put(&umulti_link->path); } static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) @@ -3164,9 +3167,6 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); - if (umulti_link->task) - put_task_struct(umulti_link->task); - path_put(&umulti_link->path); kvfree(umulti_link->uprobes); kfree(umulti_link); } -- GitLab From 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 27 Mar 2024 22:24:26 -0700 Subject: [PATCH 124/989] bpf: support deferring bpf_link dealloc to after RCU grace period BPF link for some program types is passed as a "context" which can be used by those BPF programs to look up additional information. E.g., for multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values. Because of this runtime dependency, when bpf_link refcnt drops to zero there could still be active BPF programs running accessing link data. This patch adds generic support to defer bpf_link dealloc callback to after RCU GP, if requested. This is done by exposing two different deallocation callbacks, one synchronous and one deferred. If deferred one is provided, bpf_link_free() will schedule dealloc_deferred() callback to happen after RCU GP. BPF is using two flavors of RCU: "classic" non-sleepable one and RCU tasks trace one. The latter is used when sleepable BPF programs are used. bpf_link_free() accommodates that by checking underlying BPF program's sleepable flag, and goes either through normal RCU GP only for non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP (taking into account rcu_trace_implies_rcu_gp() optimization), if BPF program is sleepable. We use this for multi-kprobe and multi-uprobe links, which dereference link during program run. We also preventively switch raw_tp link to use deferred dealloc callback, as upcoming changes in bpf-next tree expose raw_tp link data (specifically, cookie value) to BPF program at runtime as well. Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 +++++++++++++++- kernel/bpf/syscall.c | 35 ++++++++++++++++++++++++++++++++--- kernel/trace/bpf_trace.c | 4 ++-- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4f20f62f9d63d..890e152d553ea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1574,12 +1574,26 @@ struct bpf_link { enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; - struct work_struct work; + /* rcu is used before freeing, work can be used to schedule that + * RCU-based freeing before that, so they never overlap + */ + union { + struct rcu_head rcu; + struct work_struct work; + }; }; struct bpf_link_ops { void (*release)(struct bpf_link *link); + /* deallocate link resources callback, called without RCU grace period + * waiting + */ void (*dealloc)(struct bpf_link *link); + /* deallocate link resources callback, called after RCU grace period; + * if underlying BPF program is sleepable we go through tasks trace + * RCU GP and then "classic" RCU GP + */ + void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae2ff73bde7e7..c287925471f68 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3024,17 +3024,46 @@ void bpf_link_inc(struct bpf_link *link) atomic64_inc(&link->refcnt); } +static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) +{ + struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); + + /* free bpf_link and its containing memory */ + link->ops->dealloc_deferred(link); +} + +static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) +{ + if (rcu_trace_implies_rcu_gp()) + bpf_link_defer_dealloc_rcu_gp(rcu); + else + call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp); +} + /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + bool sleepable = false; + bpf_link_free_id(link->id); if (link->prog) { + sleepable = link->prog->sleepable; /* detach BPF program, clean up used resources */ link->ops->release(link); bpf_prog_put(link->prog); } - /* free bpf_link and its containing memory */ - link->ops->dealloc(link); + if (link->ops->dealloc_deferred) { + /* schedule BPF link deallocation; if underlying BPF program + * is sleepable, we need to first wait for RCU tasks trace + * sync, then go through "classic" RCU grace period + */ + if (sleepable) + call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); + else + call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); + } + if (link->ops->dealloc) + link->ops->dealloc(link); } static void bpf_link_put_deferred(struct work_struct *work) @@ -3544,7 +3573,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, - .dealloc = bpf_raw_tp_link_dealloc, + .dealloc_deferred = bpf_raw_tp_link_dealloc, .show_fdinfo = bpf_raw_tp_link_show_fdinfo, .fill_link_info = bpf_raw_tp_link_fill_link_info, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0b73fe5f7206b..9dc605f08a231 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2728,7 +2728,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, - .dealloc = bpf_kprobe_multi_link_dealloc, + .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, }; @@ -3242,7 +3242,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, - .dealloc = bpf_uprobe_multi_link_dealloc, + .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, }; -- GitLab From 62248b22d01e96a4d669cde0d7005bd51ebf9e76 Mon Sep 17 00:00:00 2001 From: Natanael Copa Date: Thu, 28 Mar 2024 11:59:13 +0100 Subject: [PATCH 125/989] tools/resolve_btfids: fix build with musl libc Include the header that defines u32. This fixes build of 6.6.23 and 6.1.83 kernels for Alpine Linux, which uses musl libc. I assume that GNU libc indirecly pulls in linux/types.h. Fixes: 9707ac4fe2f5 ("tools/resolve_btfids: Refactor set sorting with types from btf_ids.h") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218647 Cc: stable@vger.kernel.org Signed-off-by: Natanael Copa Tested-by: Greg Thelen Link: https://lore.kernel.org/r/20240328110103.28734-1-ncopa@alpinelinux.org Signed-off-by: Alexei Starovoitov --- tools/include/linux/btf_ids.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h index 72535f00572f6..72ea363d434db 100644 --- a/tools/include/linux/btf_ids.h +++ b/tools/include/linux/btf_ids.h @@ -3,6 +3,8 @@ #ifndef _LINUX_BTF_IDS_H #define _LINUX_BTF_IDS_H +#include /* for u32 */ + struct btf_id_set { u32 cnt; u32 ids[]; -- GitLab From 8cb10cba124c4798b6cb333245ecdc8dde78aeae Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Wed, 28 Feb 2024 12:02:15 -0800 Subject: [PATCH 126/989] arm64: dts: freescale: imx8mp-venice-gw72xx-2x: fix USB vbus regulator When using usb-conn-gpio to control USB role and VBUS, the vbus-supply property must be present in the usb-conn-gpio node. Additionally it should not be present in the phy node as that isn't what controls vbus and will upset the use count. This resolves an issue where VBUS is enabled with OTG in peripheral mode. Fixes: ad9a12f7a522 ("arm64: dts: imx8mp-venice: Fix USB connector description") Signed-off-by: Tim Harvey Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi index 41c79d2ebdd62..f24b14744799e 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi @@ -14,6 +14,7 @@ pinctrl-0 = <&pinctrl_usbcon1>; type = "micro"; label = "otg"; + vbus-supply = <®_usb1_vbus>; id-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; port { @@ -183,7 +184,6 @@ }; &usb3_phy0 { - vbus-supply = <®_usb1_vbus>; status = "okay"; }; -- GitLab From 6f8e0aca838e163e81fde176e945161d50679339 Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Wed, 28 Feb 2024 12:02:16 -0800 Subject: [PATCH 127/989] arm64: dts: freescale: imx8mp-venice-gw73xx-2x: fix USB vbus regulator When using usb-conn-gpio to control USB role and VBUS, the vbus-supply property must be present in the usb-conn-gpio node. Additionally it should not be present in the phy node as that isn't what controls vbus and will upset the use count. This resolves an issue where VBUS is enabled with OTG in peripheral mode. Fixes: ad9a12f7a522 ("arm64: dts: imx8mp-venice: Fix USB connector description") Signed-off-by: Tim Harvey Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi index d5c400b355af5..f5491a608b2f3 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi @@ -14,6 +14,7 @@ pinctrl-0 = <&pinctrl_usbcon1>; type = "micro"; label = "otg"; + vbus-supply = <®_usb1_vbus>; id-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; port { @@ -202,7 +203,6 @@ }; &usb3_phy0 { - vbus-supply = <®_usb1_vbus>; status = "okay"; }; -- GitLab From 831ec5e3538e989c7995137b5c5c661991a09504 Mon Sep 17 00:00:00 2001 From: Gergo Koteles Date: Thu, 28 Mar 2024 23:47:37 +0100 Subject: [PATCH 128/989] ASoC: tas2781: mark dvc_tlv with __maybe_unused Since we put dvc_tlv static variable to a header file it's copied to each module that includes the header. But not all of them are actually used it. Fix this W=1 build warning: include/sound/tas2781-tlv.h:18:35: warning: 'dvc_tlv' defined but not used [-Wunused-const-variable=] Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403290354.v0StnRpc-lkp@intel.com/ Fixes: ae065d0ce9e3 ("ALSA: hda/tas2781: remove digital gain kcontrol") Signed-off-by: Gergo Koteles Message-ID: <0e461545a2a6e9b6152985143e50526322e5f76b.1711665731.git.soyer@irl.hu> Signed-off-by: Takashi Iwai --- include/sound/tas2781-tlv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sound/tas2781-tlv.h b/include/sound/tas2781-tlv.h index 4038dd421150a..1dc59005d241f 100644 --- a/include/sound/tas2781-tlv.h +++ b/include/sound/tas2781-tlv.h @@ -15,7 +15,7 @@ #ifndef __TAS2781_TLV_H__ #define __TAS2781_TLV_H__ -static const DECLARE_TLV_DB_SCALE(dvc_tlv, -10000, 100, 0); +static const __maybe_unused DECLARE_TLV_DB_SCALE(dvc_tlv, -10000, 100, 0); static const DECLARE_TLV_DB_SCALE(amp_vol_tlv, 1100, 50, 0); #endif -- GitLab From 10e52ad5ced2a7dcdb3fb18c9cef111d5f30471d Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Tue, 26 Mar 2024 09:56:49 +0100 Subject: [PATCH 129/989] net: hsr: Use full string description when opening HSR network device Up till now only single character ('A' or 'B') was used to provide information of HSR slave network device status. As it is also possible and valid, that Interlink network device may be supported as well, the description must be more verbose. As a result the full string description is now used. Signed-off-by: Lukasz Majewski Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index c98b5b71ad7c3..e9d45133d6412 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -132,30 +132,29 @@ static int hsr_dev_open(struct net_device *dev) { struct hsr_priv *hsr; struct hsr_port *port; - char designation; + const char *designation = NULL; hsr = netdev_priv(dev); - designation = '\0'; hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { case HSR_PT_SLAVE_A: - designation = 'A'; + designation = "Slave A"; break; case HSR_PT_SLAVE_B: - designation = 'B'; + designation = "Slave B"; break; default: - designation = '?'; + designation = "Unknown"; } if (!is_slave_up(port->dev)) - netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a fully working HSR network\n", + netdev_warn(dev, "%s (%s) is not up; please bring it up to get a fully working HSR network\n", designation, port->dev->name); } - if (designation == '\0') + if (!designation) netdev_warn(dev, "No slave devices configured\n"); return 0; -- GitLab From 3d010c8031e39f5fa1e8b13ada77e0321091011f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:33:58 +0100 Subject: [PATCH 130/989] udp: do not accept non-tunnel GSO skbs landing in a tunnel When rx-udp-gro-forwarding is enabled UDP packets might be GROed when being forwarded. If such packets might land in a tunnel this can cause various issues and udp_gro_receive makes sure this isn't the case by looking for a matching socket. This is performed in udp4/6_gro_lookup_skb but only in the current netns. This is an issue with tunneled packets when the endpoint is in another netns. In such cases the packets will be GROed at the UDP level, which leads to various issues later on. The same thing can happen with rx-gro-list. We saw this with geneve packets being GROed at the UDP level. In such case gso_size is set; later the packet goes through the geneve rx path, the geneve header is pulled, the offset are adjusted and frag_list skbs are not adjusted with regard to geneve. When those skbs hit skb_fragment, it will misbehave. Different outcomes are possible depending on what the GROed skbs look like; from corrupted packets to kernel crashes. One example is a BUG_ON[1] triggered in skb_segment while processing the frag_list. Because gso_size is wrong (geneve header was pulled) skb_segment thinks there is "geneve header size" of data in frag_list, although it's in fact the next packet. The BUG_ON itself has nothing to do with the issue. This is only one of the potential issues. Looking up for a matching socket in udp_gro_receive is fragile: the lookup could be extended to all netns (not speaking about performances) but nothing prevents those packets from being modified in between and we could still not find a matching socket. It's OK to keep the current logic there as it should cover most cases but we also need to make sure we handle tunnel packets being GROed too early. This is done by extending the checks in udp_unexpected_gso: GSO packets lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits and landing in a tunnel must be segmented. [1] kernel BUG at net/core/skbuff.c:4408! RIP: 0010:skb_segment+0xd2a/0xf70 __udp_gso_segment+0xaa/0x560 Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Fixes: 36707061d6ba ("udp: allow forwarding of plain (non-fraglisted) UDP GRO packets") Signed-off-by: Antoine Tenart Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 28 ++++++++++++++++++++++++++++ net/ipv4/udp.c | 7 +++++++ net/ipv4/udp_offload.c | 6 ++++-- net/ipv6/udp.c | 2 +- 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 3748e82b627b7..17539d0896661 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -150,6 +150,24 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk, } } +DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key); +#if IS_ENABLED(CONFIG_IPV6) +DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +#endif + +static inline bool udp_encap_needed(void) +{ + if (static_branch_unlikely(&udp_encap_needed_key)) + return true; + +#if IS_ENABLED(CONFIG_IPV6) + if (static_branch_unlikely(&udpv6_encap_needed_key)) + return true; +#endif + + return false; +} + static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) { if (!skb_is_gso(skb)) @@ -163,6 +181,16 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) !udp_test_bit(ACCEPT_FRAGLIST, sk)) return true; + /* GSO packets lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits might still + * land in a tunnel as the socket check in udp_gro_receive cannot be + * foolproof. + */ + if (udp_encap_needed() && + READ_ONCE(udp_sk(sk)->encap_rcv) && + !(skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM))) + return true; + return false; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 661d0e0d273f6..c02bf011d4a6f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -582,6 +582,13 @@ static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk, } DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); +EXPORT_SYMBOL(udp_encap_needed_key); + +#if IS_ENABLED(CONFIG_IPV6) +DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +EXPORT_SYMBOL(udpv6_encap_needed_key); +#endif + void udp_encap_enable(void) { static_branch_inc(&udp_encap_needed_key); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b9880743765c6..e9719afe91cf5 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -551,8 +551,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, unsigned int off = skb_gro_offset(skb); int flush = 1; - /* we can do L4 aggregation only if the packet can't land in a tunnel - * otherwise we could corrupt the inner stream + /* We can do L4 aggregation only if the packet can't land in a tunnel + * otherwise we could corrupt the inner stream. Detecting such packets + * cannot be foolproof and the aggregation might still happen in some + * cases. Such packets should be caught in udp_unexpected_gso later. */ NAPI_GRO_CB(skb)->is_flist = 0; if (!sk || !udp_sk(sk)->gro_receive) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7c1e6469d091d..8b1dd7f512491 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -447,7 +447,7 @@ csum_copy_err: goto try_again; } -DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void) { static_branch_inc(&udpv6_encap_needed_key); -- GitLab From ed4cccef64c1d0d5b91e69f7a8a6697c3a865486 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:33:59 +0100 Subject: [PATCH 131/989] gro: fix ownership transfer If packets are GROed with fraglist they might be segmented later on and continue their journey in the stack. In skb_segment_list those skbs can be reused as-is. This is an issue as their destructor was removed in skb_gro_receive_list but not the reference to their socket, and then they can't be orphaned. Fix this by also removing the reference to the socket. For example this could be observed, kernel BUG at include/linux/skbuff.h:3131! (skb_orphan) RIP: 0010:ip6_rcv_core+0x11bc/0x19a0 Call Trace: ipv6_list_rcv+0x250/0x3f0 __netif_receive_skb_list_core+0x49d/0x8f0 netif_receive_skb_list_internal+0x634/0xd40 napi_complete_done+0x1d2/0x7d0 gro_cell_poll+0x118/0x1f0 A similar construction is found in skb_gro_receive, apply the same change there. Fixes: 5e10da5385d2 ("skbuff: allow 'slow_gro' for skb carring sock reference") Signed-off-by: Antoine Tenart Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/gro.c | 3 ++- net/ipv4/udp_offload.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/core/gro.c b/net/core/gro.c index ee30d4f0c0387..83f35d99a682c 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -192,8 +192,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) } merge: - /* sk owenrship - if any - completely transferred to the aggregated packet */ + /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; + skb->sk = NULL; delta_truesize = skb->truesize; if (offset > headlen) { unsigned int eat = offset - headlen; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e9719afe91cf5..3bb69464930bf 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -449,8 +449,9 @@ static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) NAPI_GRO_CB(p)->count++; p->data_len += skb->len; - /* sk owenrship - if any - completely transferred to the aggregated packet */ + /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; + skb->sk = NULL; p->truesize += skb->truesize; p->len += skb->len; -- GitLab From f0b8c30345565344df2e33a8417a27503589247d Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:34:00 +0100 Subject: [PATCH 132/989] udp: do not transition UDP GRO fraglist partial checksums to unnecessary UDP GRO validates checksums and in udp4/6_gro_complete fraglist packets are converted to CHECKSUM_UNNECESSARY to avoid later checks. However this is an issue for CHECKSUM_PARTIAL packets as they can be looped in an egress path and then their partial checksums are not fixed. Different issues can be observed, from invalid checksum on packets to traces like: gen01: hw csum failure skb len=3008 headroom=160 headlen=1376 tailroom=0 mac=(106,14) net=(120,40) trans=160 shinfo(txflags=0 nr_frags=0 gso(size=0 type=0 segs=0)) csum(0xffff232e ip_summed=2 complete_sw=0 valid=0 level=0) hash(0x77e3d716 sw=1 l4=1) proto=0x86dd pkttype=0 iif=12 ... Fix this by only converting CHECKSUM_NONE packets to CHECKSUM_UNNECESSARY by reusing __skb_incr_checksum_unnecessary. All other checksum types are kept as-is, including CHECKSUM_COMPLETE as fraglist packets being segmented back would have their skb->csum valid. Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Signed-off-by: Antoine Tenart Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 8 +------- net/ipv6/udp_offload.c | 8 +------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 3bb69464930bf..548476d782371 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -722,13 +722,7 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - if (skb->csum_level < SKB_MAX_CSUM_LEVEL) - skb->csum_level++; - } else { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = 0; - } + __skb_incr_checksum_unnecessary(skb); return 0; } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 312bcaeea96fb..bbd347de00b45 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -174,13 +174,7 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - if (skb->csum_level < SKB_MAX_CSUM_LEVEL) - skb->csum_level++; - } else { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = 0; - } + __skb_incr_checksum_unnecessary(skb); return 0; } -- GitLab From 64235eabc4b5b18c507c08a1f16cdac6c5661220 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:34:01 +0100 Subject: [PATCH 133/989] udp: prevent local UDP tunnel packets from being GROed GRO has a fundamental issue with UDP tunnel packets as it can't detect those in a foolproof way and GRO could happen before they reach the tunnel endpoint. Previous commits have fixed issues when UDP tunnel packets come from a remote host, but if those packets are issued locally they could run into checksum issues. If the inner packet has a partial checksum the information will be lost in the GRO logic, either in udp4/6_gro_complete or in udp_gro_complete_segment and packets will have an invalid checksum when leaving the host. Prevent local UDP tunnel packets from ever being GROed at the outer UDP level. Due to skb->encapsulation being wrongly used in some drivers this is actually only preventing UDP tunnel packets with a partial checksum to be GROed (see iptunnel_handle_offloads) but those were also the packets triggering issues so in practice this should be sufficient. Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Fixes: 36707061d6ba ("udp: allow forwarding of plain (non-fraglisted) UDP GRO packets") Suggested-by: Paolo Abeni Signed-off-by: Antoine Tenart Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 548476d782371..3498dd1d0694d 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -559,6 +559,12 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, */ NAPI_GRO_CB(skb)->is_flist = 0; if (!sk || !udp_sk(sk)->gro_receive) { + /* If the packet was locally encapsulated in a UDP tunnel that + * wasn't detected above, do not GRO. + */ + if (skb->encapsulation) + goto out; + if (skb->dev->features & NETIF_F_GRO_FRAGLIST) NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1; -- GitLab From 0fb101be97ca27850c5ecdbd1269423ce4d1f607 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:34:02 +0100 Subject: [PATCH 134/989] selftests: net: gro fwd: update vxlan GRO test expectations UDP tunnel packets can't be GRO in-between their endpoints as this causes different issues. The UDP GRO fwd vxlan tests were relying on this and their expectations have to be fixed. We keep both vxlan tests and expected no GRO from happening. The vxlan UDP GRO bench test was removed as it's not providing any valuable information now. Fixes: a062260a9d5f ("selftests: net: add UDP GRO forwarding self-tests") Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- tools/testing/selftests/net/udpgro_fwd.sh | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh index 380cb15e942e4..83ed987cff340 100755 --- a/tools/testing/selftests/net/udpgro_fwd.sh +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -244,7 +244,7 @@ for family in 4 6; do create_vxlan_pair ip netns exec $NS_DST ethtool -K veth$DST generic-receive-offload on ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on - run_test "GRO frag list over UDP tunnel" $OL_NET$DST 1 1 + run_test "GRO frag list over UDP tunnel" $OL_NET$DST 10 10 cleanup # use NAT to circumvent GRO FWD check @@ -258,13 +258,7 @@ for family in 4 6; do # load arp cache before running the test to reduce the amount of # stray traffic on top of the UDP tunnel ip netns exec $NS_SRC $PING -q -c 1 $OL_NET$DST_NAT >/dev/null - run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 1 1 $OL_NET$DST - cleanup - - create_vxlan_pair - run_bench "UDP tunnel fwd perf" $OL_NET$DST - ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on - run_bench "UDP tunnel GRO fwd perf" $OL_NET$DST + run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 10 10 $OL_NET$DST cleanup done -- GitLab From 0ba80d96585662299d4ea4624043759ce9015421 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Tue, 26 Mar 2024 17:51:49 +0530 Subject: [PATCH 135/989] octeontx2-af: Fix issue with loading coalesced KPU profiles The current implementation for loading coalesced KPU profiles has a limitation. The "offset" field, which is used to locate profiles within the profile is restricted to a u16. This restricts the number of profiles that can be loaded. This patch addresses this limitation by increasing the size of the "offset" field. Fixes: 11c730bfbf5b ("octeontx2-af: support for coalescing KPU profiles") Signed-off-by: Hariprasad Kelam Reviewed-by: Kalesh AP Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index e350242bbafba..be709f83f3318 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -1657,7 +1657,7 @@ static int npc_fwdb_detect_load_prfl_img(struct rvu *rvu, uint64_t prfl_sz, struct npc_coalesced_kpu_prfl *img_data = NULL; int i = 0, rc = -EINVAL; void __iomem *kpu_prfl_addr; - u16 offset; + u32 offset; img_data = (struct npc_coalesced_kpu_prfl __force *)rvu->kpu_prfl_addr; if (le64_to_cpu(img_data->signature) == KPU_SIGN && -- GitLab From 73dfe970c038d0548beccc5bfb2707e1d543b01f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 29 Mar 2024 11:35:40 +0100 Subject: [PATCH 136/989] pwm: Fix setting period with #pwm-cells = <1> and of_pwm_single_xlate() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For drivers making use of of_pwm_single_xlate() (i.e. those that don't pass a hwpwm index) and also don't pass flags, setting period was wrongly skipped. This affects the pwm-pxa and ti-sn65dsi86 drivers. Reported-by: Karel Balej Link: https://lore.kernel.org/r/D05IVTPYH35N.2CLDG6LSILRSN@matfyz.cz Fixes: 40ade0c2e794 ("pwm: Let the of_xlate callbacks accept references without period") Tested-by: Karel Balej Link: https://lore.kernel.org/r/20240329103544.545290-2-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index d70f793ce4b38..403525cc17833 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -443,7 +443,7 @@ of_pwm_single_xlate(struct pwm_chip *chip, const struct of_phandle_args *args) if (IS_ERR(pwm)) return pwm; - if (args->args_count > 1) + if (args->args_count > 0) pwm->args.period = args->args[0]; pwm->args.polarity = PWM_POLARITY_NORMAL; -- GitLab From a3d3eab627bbbb0cb175910cf8d0f7022628a642 Mon Sep 17 00:00:00 2001 From: Jaewon Kim Date: Fri, 29 Mar 2024 17:58:40 +0900 Subject: [PATCH 137/989] spi: s3c64xx: Use DMA mode from fifo size If the SPI data size is smaller than FIFO, it operates in PIO mode, and if it is larger than FIFO size, it oerates in DMA mode. If the SPI data size is equal to fifo, it operates in PIO mode and it is separated to 2 transfers. To prevent it, it must operate in DMA mode from the case where the data size and the fifo size are the same. Fixes: 1ee806718d5e ("spi: s3c64xx: support interrupt based pio mode") Signed-off-by: Jaewon Kim Reviewed-by: Sam Protsenko Link: https://lore.kernel.org/r/20240329085840.65856-1-jaewon02.kim@samsung.com Signed-off-by: Mark Brown --- drivers/spi/spi-s3c64xx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c index 9fcbe040cb2f2..f726d86704287 100644 --- a/drivers/spi/spi-s3c64xx.c +++ b/drivers/spi/spi-s3c64xx.c @@ -430,7 +430,7 @@ static bool s3c64xx_spi_can_dma(struct spi_controller *host, struct s3c64xx_spi_driver_data *sdd = spi_controller_get_devdata(host); if (sdd->rx_dma.ch && sdd->tx_dma.ch) - return xfer->len > sdd->fifo_depth; + return xfer->len >= sdd->fifo_depth; return false; } @@ -826,10 +826,9 @@ static int s3c64xx_spi_transfer_one(struct spi_controller *host, return status; } - if (!is_polling(sdd) && (xfer->len > fifo_len) && + if (!is_polling(sdd) && xfer->len >= fifo_len && sdd->rx_dma.ch && sdd->tx_dma.ch) { use_dma = 1; - } else if (xfer->len >= fifo_len) { tx_buf = xfer->tx_buf; rx_buf = xfer->rx_buf; -- GitLab From 4790a73ace86f3d165bbedba898e0758e6e1b82d Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 14 Mar 2024 09:44:12 +0100 Subject: [PATCH 138/989] Revert "Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT" This reverts commit 7dcd3e014aa7faeeaf4047190b22d8a19a0db696. Qualcomm Bluetooth controllers like WCN6855 do not have persistent storage for the Bluetooth address and must therefore start as unconfigured to allow the user to set a valid address unless one has been provided by the boot firmware in the devicetree. A recent change snuck into v6.8-rc7 and incorrectly started marking the default (non-unique) address as valid. This specifically also breaks the Bluetooth setup for some user of the Lenovo ThinkPad X13s. Note that this is the second time Qualcomm breaks the driver this way and that this was fixed last year by commit 6945795bc81a ("Bluetooth: fix use-bdaddr-property quirk"), which also has some further details. Fixes: 7dcd3e014aa7 ("Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT") Cc: stable@vger.kernel.org # 6.8 Cc: Janaki Ramaiah Thota Signed-off-by: Johan Hovold Reported-by: Clayton Craft Tested-by: Clayton Craft Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_qca.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 8a60ad7acd705..4ecbcb1644cc6 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -7,7 +7,6 @@ * * Copyright (C) 2007 Texas Instruments, Inc. * Copyright (c) 2010, 2012, 2018 The Linux Foundation. All rights reserved. - * Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved. * * Acknowledgements: * This file is based on hci_ll.c, which was... @@ -1904,17 +1903,7 @@ retry: case QCA_WCN6750: case QCA_WCN6855: case QCA_WCN7850: - - /* Set BDA quirk bit for reading BDA value from fwnode property - * only if that property exist in DT. - */ - if (fwnode_property_present(dev_fwnode(hdev->dev.parent), "local-bd-address")) { - set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); - bt_dev_info(hdev, "setting quirk bit to read BDA from fwnode later"); - } else { - bt_dev_dbg(hdev, "local-bd-address` is not present in the devicetree so not setting quirk bit for BDA"); - } - + set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); hci_set_aosp_capable(hdev); ret = qca_read_soc_version(hdev, &ver, soc_type); -- GitLab From 7003de8a226ea07d36e9461a30633af26dc79248 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 20 Mar 2024 08:55:51 +0100 Subject: [PATCH 139/989] dt-bindings: bluetooth: add 'qcom,local-bd-address-broken' Several Qualcomm Bluetooth controllers lack persistent storage for the device address and instead one can be provided by the boot firmware using the 'local-bd-address' devicetree property. The Bluetooth bindings clearly states that the address should be specified in little-endian order, but due to a long-standing bug in the Qualcomm driver which reversed the address some boot firmware has been providing the address in big-endian order instead. The only device out there that should be affected by this is the WCN3991 used in some Chromebooks. Add a 'qcom,local-bd-address-broken' property which can be set on these platforms to indicate that the boot firmware is using the wrong byte order. Note that ChromeOS always updates the kernel and devicetree in lockstep so that there is no need to handle backwards compatibility with older devicetrees. Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Reviewed-by: Rob Herring Signed-off-by: Luiz Augusto von Dentz --- .../devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml b/Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml index 528ef3572b621..055a3351880bc 100644 --- a/Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml +++ b/Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml @@ -94,6 +94,10 @@ properties: local-bd-address: true + qcom,local-bd-address-broken: + type: boolean + description: + boot firmware is incorrectly passing the address in big-endian order required: - compatible -- GitLab From e12e28009e584c8f8363439f6a928ec86278a106 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 20 Mar 2024 08:55:52 +0100 Subject: [PATCH 140/989] arm64: dts: qcom: sc7180-trogdor: mark bluetooth address as broken Several Qualcomm Bluetooth controllers lack persistent storage for the device address and instead one can be provided by the boot firmware using the 'local-bd-address' devicetree property. The Bluetooth bindings clearly states that the address should be specified in little-endian order, but due to a long-standing bug in the Qualcomm driver which reversed the address some boot firmware has been providing the address in big-endian order instead. The boot firmware in SC7180 Trogdor Chromebooks is known to be affected so mark the 'local-bd-address' property as broken to maintain backwards compatibility with older firmware when fixing the underlying driver bug. Note that ChromeOS always updates the kernel and devicetree in lockstep so that there is no need to handle backwards compatibility with older devicetrees. Fixes: 7ec3e67307f8 ("arm64: dts: qcom: sc7180-trogdor: add initial trogdor and lazor dt") Cc: stable@vger.kernel.org # 5.10 Cc: Rob Clark Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Acked-by: Bjorn Andersson Reviewed-by: Bjorn Andersson Signed-off-by: Luiz Augusto von Dentz --- arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi index f3a6da8b28901..5260c63db0078 100644 --- a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi @@ -944,6 +944,8 @@ ap_spi_fp: &spi10 { vddrf-supply = <&pp1300_l2c>; vddch0-supply = <&pp3300_l10c>; max-speed = <3200000>; + + qcom,local-bd-address-broken; }; }; -- GitLab From 39646f29b100566451d37abc4cc8cdd583756dfe Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 20 Mar 2024 08:55:53 +0100 Subject: [PATCH 141/989] Bluetooth: add quirk for broken address properties Some Bluetooth controllers lack persistent storage for the device address and instead one can be provided by the boot firmware using the 'local-bd-address' devicetree property. The Bluetooth devicetree bindings clearly states that the address should be specified in little-endian order, but due to a long-standing bug in the Qualcomm driver which reversed the address some boot firmware has been providing the address in big-endian order instead. Add a new quirk that can be set on platforms with broken firmware and use it to reverse the address when parsing the property so that the underlying driver bug can be fixed. Fixes: 5c0a1001c8be ("Bluetooth: hci_qca: Add helper to set device address") Cc: stable@vger.kernel.org # 5.1 Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 9 +++++++++ net/bluetooth/hci_sync.c | 5 ++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8701ca5f31eec..5c12761cbc0e2 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -176,6 +176,15 @@ enum { */ HCI_QUIRK_USE_BDADDR_PROPERTY, + /* When this quirk is set, the Bluetooth Device Address provided by + * the 'local-bd-address' fwnode property is incorrectly specified in + * big-endian order. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_BDADDR_PROPERTY_BROKEN, + /* When this quirk is set, the duplicate filtering during * scanning is based on Bluetooth devices addresses. To allow * RSSI based updates, restart scanning if needed. diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index f6b662369322b..639090b9f4b85 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3416,7 +3416,10 @@ static void hci_dev_get_bd_addr_from_property(struct hci_dev *hdev) if (ret < 0 || !bacmp(&ba, BDADDR_ANY)) return; - bacpy(&hdev->public_addr, &ba); + if (test_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks)) + baswap(&hdev->public_addr, &ba); + else + bacpy(&hdev->public_addr, &ba); } struct hci_init_stage { -- GitLab From 77f45cca8bc55d00520a192f5a7715133591c83e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 20 Mar 2024 08:55:54 +0100 Subject: [PATCH 142/989] Bluetooth: qca: fix device-address endianness The WCN6855 firmware on the Lenovo ThinkPad X13s expects the Bluetooth device address in big-endian order when setting it using the EDL_WRITE_BD_ADDR_OPCODE command. Presumably, this is the case for all non-ROME devices which all use the EDL_WRITE_BD_ADDR_OPCODE command for this (unlike the ROME devices which use a different command and expect the address in little-endian order). Reverse the little-endian address before setting it to make sure that the address can be configured using tools like btmgmt or using the 'local-bd-address' devicetree property. Note that this can potentially break systems with boot firmware which has started relying on the broken behaviour and is incorrectly passing the address via devicetree in big-endian order. The only device affected by this should be the WCN3991 used in some Chromebooks. As ChromeOS updates the kernel and devicetree in lockstep, the new 'qcom,local-bd-address-broken' property can be used to determine if the firmware is buggy so that the underlying driver bug can be fixed without breaking backwards compatibility. Set the HCI_QUIRK_BDADDR_PROPERTY_BROKEN quirk for such platforms so that the address is reversed when parsing the address property. Fixes: 5c0a1001c8be ("Bluetooth: hci_qca: Add helper to set device address") Cc: stable@vger.kernel.org # 5.1 Cc: Balakrishna Godavarthi Cc: Matthias Kaehlcke Tested-by: Nikita Travkin # sc7180 Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btqca.c | 8 ++++++-- drivers/bluetooth/hci_qca.c | 10 ++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index b40b32fa7f1c3..19cfc342fc7bb 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -826,11 +826,15 @@ EXPORT_SYMBOL_GPL(qca_uart_setup); int qca_set_bdaddr(struct hci_dev *hdev, const bdaddr_t *bdaddr) { + bdaddr_t bdaddr_swapped; struct sk_buff *skb; int err; - skb = __hci_cmd_sync_ev(hdev, EDL_WRITE_BD_ADDR_OPCODE, 6, bdaddr, - HCI_EV_VENDOR, HCI_INIT_TIMEOUT); + baswap(&bdaddr_swapped, bdaddr); + + skb = __hci_cmd_sync_ev(hdev, EDL_WRITE_BD_ADDR_OPCODE, 6, + &bdaddr_swapped, HCI_EV_VENDOR, + HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); bt_dev_err(hdev, "QCA Change address cmd failed (%d)", err); diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 4ecbcb1644cc6..ecbc52eaf1010 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -225,6 +225,7 @@ struct qca_serdev { struct qca_power *bt_power; u32 init_speed; u32 oper_speed; + bool bdaddr_property_broken; const char *firmware_name; }; @@ -1842,6 +1843,7 @@ static int qca_setup(struct hci_uart *hu) const char *firmware_name = qca_get_firmware_name(hu); int ret; struct qca_btsoc_version ver; + struct qca_serdev *qcadev; const char *soc_name; ret = qca_check_speeds(hu); @@ -1904,6 +1906,11 @@ retry: case QCA_WCN6855: case QCA_WCN7850: set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); + + qcadev = serdev_device_get_drvdata(hu->serdev); + if (qcadev->bdaddr_property_broken) + set_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks); + hci_set_aosp_capable(hdev); ret = qca_read_soc_version(hdev, &ver, soc_type); @@ -2284,6 +2291,9 @@ static int qca_serdev_probe(struct serdev_device *serdev) if (!qcadev->oper_speed) BT_DBG("UART will pick default operating speed"); + qcadev->bdaddr_property_broken = device_property_read_bool(&serdev->dev, + "qcom,local-bd-address-broken"); + if (data) qcadev->btsoc_type = data->soc_type; else -- GitLab From 6946b9c99bde45f3ba74e00a7af9a3458cc24bea Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 26 Mar 2024 12:43:17 -0400 Subject: [PATCH 143/989] Bluetooth: hci_sync: Fix not checking error on hci_cmd_sync_cancel_sync hci_cmd_sync_cancel_sync shall check the error passed to it since it will be propagated using req_result which is __u32 it needs to be properly set to a positive value if it was passed as negative othertise IS_ERR will not trigger as -(errno) would be converted to a positive value. Fixes: 63298d6e752f ("Bluetooth: hci_core: Cancel request on command timeout") Signed-off-by: Luiz Augusto von Dentz Reported-and-tested-by: Thorsten Leemhuis Closes: https://lore.kernel.org/all/08275279-7462-4f4a-a0ee-8aa015f829bc@leemhuis.info/ --- net/bluetooth/hci_core.c | 6 +++--- net/bluetooth/hci_sync.c | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 1690ae57a09db..a7028d38c1f5c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2874,7 +2874,7 @@ static void hci_cancel_cmd_sync(struct hci_dev *hdev, int err) cancel_delayed_work_sync(&hdev->ncmd_timer); atomic_set(&hdev->cmd_cnt, 1); - hci_cmd_sync_cancel_sync(hdev, -err); + hci_cmd_sync_cancel_sync(hdev, err); } /* Suspend HCI device */ @@ -2894,7 +2894,7 @@ int hci_suspend_dev(struct hci_dev *hdev) return 0; /* Cancel potentially blocking sync operation before suspend */ - hci_cancel_cmd_sync(hdev, -EHOSTDOWN); + hci_cancel_cmd_sync(hdev, EHOSTDOWN); hci_req_sync_lock(hdev); ret = hci_suspend_sync(hdev); @@ -4210,7 +4210,7 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) err = hci_send_frame(hdev, skb); if (err < 0) { - hci_cmd_sync_cancel_sync(hdev, err); + hci_cmd_sync_cancel_sync(hdev, -err); return; } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 639090b9f4b85..8fe02921adf15 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -617,7 +617,10 @@ void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err) bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hdev->req_status == HCI_REQ_PEND) { - hdev->req_result = err; + /* req_result is __u32 so error must be positive to be properly + * propagated. + */ + hdev->req_result = err < 0 ? -err : err; hdev->req_status = HCI_REQ_CANCELED; wake_up_interruptible(&hdev->req_wait_q); -- GitLab From c569242cd49287d53b73a94233db40097d838535 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Wed, 27 Mar 2024 12:30:30 +0800 Subject: [PATCH 144/989] Bluetooth: hci_event: set the conn encrypted before conn establishes We have a BT headset (Lenovo Thinkplus XT99), the pairing and connecting has no problem, once this headset is paired, bluez will remember this device and will auto re-connect it whenever the device is powered on. The auto re-connecting works well with Windows and Android, but with Linux, it always fails. Through debugging, we found at the rfcomm connection stage, the bluetooth stack reports "Connection refused - security block (0x0003)". For this device, the re-connecting negotiation process is different from other BT headsets, it sends the Link_KEY_REQUEST command before the CONNECT_REQUEST completes, and it doesn't send ENCRYPT_CHANGE command during the negotiation. When the device sends the "connect complete" to hci, the ev->encr_mode is 1. So here in the conn_complete_evt(), if ev->encr_mode is 1, link type is ACL and HCI_CONN_ENCRYPT is not set, we set HCI_CONN_ENCRYPT to this conn, and update conn->enc_key_size accordingly. After this change, this BT headset could re-connect with Linux successfully. This is the btmon log after applying the patch, after receiving the "Connect Complete" with "Encryption: Enabled", will send the command to read encryption key size: > HCI Event: Connect Request (0x04) plen 10 Address: 8C:3C:AA:D8:11:67 (OUI 8C-3C-AA) Class: 0x240404 Major class: Audio/Video (headset, speaker, stereo, video, vcr) Minor class: Wearable Headset Device Rendering (Printing, Speaker) Audio (Speaker, Microphone, Headset) Link type: ACL (0x01) ... > HCI Event: Link Key Request (0x17) plen 6 Address: 8C:3C:AA:D8:11:67 (OUI 8C-3C-AA) < HCI Command: Link Key Request Reply (0x01|0x000b) plen 22 Address: 8C:3C:AA:D8:11:67 (OUI 8C-3C-AA) Link key: ${32-hex-digits-key} ... > HCI Event: Connect Complete (0x03) plen 11 Status: Success (0x00) Handle: 256 Address: 8C:3C:AA:D8:11:67 (OUI 8C-3C-AA) Link type: ACL (0x01) Encryption: Enabled (0x01) < HCI Command: Read Encryption Key... (0x05|0x0008) plen 2 Handle: 256 < ACL Data TX: Handle 256 flags 0x00 dlen 10 L2CAP: Information Request (0x0a) ident 1 len 2 Type: Extended features supported (0x0002) > HCI Event: Command Complete (0x0e) plen 7 Read Encryption Key Size (0x05|0x0008) ncmd 1 Status: Success (0x00) Handle: 256 Key size: 16 Cc: stable@vger.kernel.org Link: https://github.com/bluez/bluez/issues/704 Reviewed-by: Paul Menzel Reviewed-by: Luiz Augusto von Dentz Signed-off-by: Hui Wang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 4ae2248240121..a8b8cfebe0180 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3208,6 +3208,31 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, if (test_bit(HCI_ENCRYPT, &hdev->flags)) set_bit(HCI_CONN_ENCRYPT, &conn->flags); + /* "Link key request" completed ahead of "connect request" completes */ + if (ev->encr_mode == 1 && !test_bit(HCI_CONN_ENCRYPT, &conn->flags) && + ev->link_type == ACL_LINK) { + struct link_key *key; + struct hci_cp_read_enc_key_size cp; + + key = hci_find_link_key(hdev, &ev->bdaddr); + if (key) { + set_bit(HCI_CONN_ENCRYPT, &conn->flags); + + if (!(hdev->commands[20] & 0x10)) { + conn->enc_key_size = HCI_LINK_KEY_SIZE; + } else { + cp.handle = cpu_to_le16(conn->handle); + if (hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, + sizeof(cp), &cp)) { + bt_dev_err(hdev, "sending read key size failed"); + conn->enc_key_size = HCI_LINK_KEY_SIZE; + } + } + + hci_encrypt_cfm(conn, ev->status); + } + } + /* Get remote features */ if (conn->type == ACL_LINK) { struct hci_cp_read_remote_features cp; -- GitLab From 7835fcfd132eb88b87e8eb901f88436f63ab60f7 Mon Sep 17 00:00:00 2001 From: Bastien Nocera Date: Wed, 27 Mar 2024 15:24:56 +0100 Subject: [PATCH 145/989] Bluetooth: Fix TOCTOU in HCI debugfs implementation struct hci_dev members conn_info_max_age, conn_info_min_age, le_conn_max_interval, le_conn_min_interval, le_adv_max_interval, and le_adv_min_interval can be modified from the HCI core code, as well through debugfs. The debugfs implementation, that's only available to privileged users, will check for boundaries, making sure that the minimum value being set is strictly above the maximum value that already exists, and vice-versa. However, as both minimum and maximum values can be changed concurrently to us modifying them, we need to make sure that the value we check is the value we end up using. For example, with ->conn_info_max_age set to 10, conn_info_min_age_set() gets called from vfs handlers to set conn_info_min_age to 8. In conn_info_min_age_set(), this goes through: if (val == 0 || val > hdev->conn_info_max_age) return -EINVAL; Concurrently, conn_info_max_age_set() gets called to set to set the conn_info_max_age to 7: if (val == 0 || val > hdev->conn_info_max_age) return -EINVAL; That check will also pass because we used the old value (10) for conn_info_max_age. After those checks that both passed, the struct hci_dev access is mutex-locked, disabling concurrent access, but that does not matter because the invalid value checks both passed, and we'll end up with conn_info_min_age = 8 and conn_info_max_age = 7 To fix this problem, we need to lock the structure access before so the check and assignment are not interrupted. This fix was originally devised by the BassCheck[1] team, and considered the problem to be an atomicity one. This isn't the case as there aren't any concerns about the variable changing while we check it, but rather after we check it parallel to another change. This patch fixes CVE-2024-24858 and CVE-2024-24857. [1] https://sites.google.com/view/basscheck/ Co-developed-by: Gui-Dong Han <2045gemini@gmail.com> Signed-off-by: Gui-Dong Han <2045gemini@gmail.com> Link: https://lore.kernel.org/linux-bluetooth/20231222161317.6255-1-2045gemini@gmail.com/ Link: https://nvd.nist.gov/vuln/detail/CVE-2024-24858 Link: https://lore.kernel.org/linux-bluetooth/20231222162931.6553-1-2045gemini@gmail.com/ Link: https://lore.kernel.org/linux-bluetooth/20231222162310.6461-1-2045gemini@gmail.com/ Link: https://nvd.nist.gov/vuln/detail/CVE-2024-24857 Fixes: 31ad169148df ("Bluetooth: Add conn info lifetime parameters to debugfs") Fixes: 729a1051da6f ("Bluetooth: Expose default LE advertising interval via debugfs") Fixes: 71c3b60ec6d2 ("Bluetooth: Move BR/EDR debugfs file creation into hci_debugfs.c") Signed-off-by: Bastien Nocera Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_debugfs.c | 48 ++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 233453807b509..ce3ff2fa72e58 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -218,10 +218,12 @@ static int conn_info_min_age_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val > hdev->conn_info_max_age) + hci_dev_lock(hdev); + if (val == 0 || val > hdev->conn_info_max_age) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->conn_info_min_age = val; hci_dev_unlock(hdev); @@ -246,10 +248,12 @@ static int conn_info_max_age_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val < hdev->conn_info_min_age) + hci_dev_lock(hdev); + if (val == 0 || val < hdev->conn_info_min_age) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->conn_info_max_age = val; hci_dev_unlock(hdev); @@ -567,10 +571,12 @@ static int sniff_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val % 2 || val > hdev->sniff_max_interval) + hci_dev_lock(hdev); + if (val == 0 || val % 2 || val > hdev->sniff_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->sniff_min_interval = val; hci_dev_unlock(hdev); @@ -595,10 +601,12 @@ static int sniff_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val % 2 || val < hdev->sniff_min_interval) + hci_dev_lock(hdev); + if (val == 0 || val % 2 || val < hdev->sniff_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->sniff_max_interval = val; hci_dev_unlock(hdev); @@ -850,10 +858,12 @@ static int conn_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval) + hci_dev_lock(hdev); + if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_conn_min_interval = val; hci_dev_unlock(hdev); @@ -878,10 +888,12 @@ static int conn_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval) + hci_dev_lock(hdev); + if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_conn_max_interval = val; hci_dev_unlock(hdev); @@ -990,10 +1002,12 @@ static int adv_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval) + hci_dev_lock(hdev); + if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_adv_min_interval = val; hci_dev_unlock(hdev); @@ -1018,10 +1032,12 @@ static int adv_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval) + hci_dev_lock(hdev); + if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_adv_max_interval = val; hci_dev_unlock(hdev); -- GitLab From 2c603a4947a1247102ccb008d5eb6f37a4043c98 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Fri, 29 Mar 2024 11:08:12 +0530 Subject: [PATCH 146/989] ASoC: amd: acp: fix for acp_init function error handling If acp_init() fails, acp pci driver probe should return error. Add acp_init() function return value check logic. Fixes: e61b415515d3 ("ASoC: amd: acp: refactor the acp init and de-init sequence") Signed-off-by: Vijendar Mukunda Link: https://lore.kernel.org/r/20240329053815.2373979-1-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-pci.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sound/soc/amd/acp/acp-pci.c b/sound/soc/amd/acp/acp-pci.c index 440b91d4f261c..5f35b90eab8d3 100644 --- a/sound/soc/amd/acp/acp-pci.c +++ b/sound/soc/amd/acp/acp-pci.c @@ -115,7 +115,10 @@ static int acp_pci_probe(struct pci_dev *pci, const struct pci_device_id *pci_id goto unregister_dmic_dev; } - acp_init(chip); + ret = acp_init(chip); + if (ret) + goto unregister_dmic_dev; + res = devm_kcalloc(&pci->dev, num_res, sizeof(struct resource), GFP_KERNEL); if (!res) { ret = -ENOMEM; -- GitLab From 42fb9cfd5b186fe2e615564f0a1bdd424aa1b151 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 28 Mar 2024 12:49:47 +0000 Subject: [PATCH 147/989] Documentation: dev-tools: Add link to RV docs I could not remember the name of this system and it's pretty hard to find without the right keywords. I had to ask an LLM! Drop a breadcrumb to help people find it in the future. Signed-off-by: Brendan Jackman Acked-by: Daniel Bristot de Oliveira Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240328124947.2107524-1-jackmanb@google.com --- Documentation/dev-tools/testing-overview.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/dev-tools/testing-overview.rst b/Documentation/dev-tools/testing-overview.rst index 0aaf6ea53608f..1619e5e5cc9c4 100644 --- a/Documentation/dev-tools/testing-overview.rst +++ b/Documentation/dev-tools/testing-overview.rst @@ -104,6 +104,8 @@ Some of these tools are listed below: KASAN and can be used in production. See Documentation/dev-tools/kfence.rst * lockdep is a locking correctness validator. See Documentation/locking/lockdep-design.rst +* Runtime Verification (RV) supports checking specific behaviours for a given + subsystem. See Documentation/trace/rv/runtime-verification.rst * There are several other pieces of debug instrumentation in the kernel, many of which can be found in lib/Kconfig.debug -- GitLab From 0ec69b3bed23a4a5a88b4261afeee44ade709ed3 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Tue, 26 Mar 2024 17:38:25 +0000 Subject: [PATCH 148/989] docs: Fix bitfield handling in kernel-doc kernel-doc doesn't handle bitfields that are specified with symbolic name, e.g. u32 cs_index_mask : SPI_CS_CNT_MAX This results in the following warnings when running `make htmldocs`: include/linux/spi/spi.h:246: warning: Function parameter or struct member 'cs_index_mask:SPI_CS_CNT_MAX' not described in 'spi_device' include/linux/spi/spi.h:246: warning: Excess struct member 'cs_index_mask' description in 'spi_device' Update the regexp for bitfields to accept all word chars, not just digits. Signed-off-by: Donald Hunter Acked-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240326173825.99190-1-donald.hunter@gmail.com --- scripts/kernel-doc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 967f1abb0edbd..cb1be22afc65f 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -1541,7 +1541,7 @@ sub create_parameterlist($$$$) { save_struct_actual($2); push_parameter($2, "$type $1", $arg, $file, $declaration_name); - } elsif ($param =~ m/(.*?):(\d+)/) { + } elsif ($param =~ m/(.*?):(\w+)/) { if ($type ne "") { # skip unnamed bit-fields save_struct_actual($1); push_parameter($1, "$type:$2", $arg, $file, $declaration_name) -- GitLab From b75d85218fdfd8774f2f8397d1f6092ed06bd311 Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Sun, 24 Mar 2024 02:17:04 +0300 Subject: [PATCH 149/989] tracing: Fix documentation on tp_printk cmdline option kernel-parameters.txt incorrectly states that workings of kernel.tracepoint_printk sysctl depends on "tracepoint_printk kernel cmdline option", this is a bit misleading for new users since the actual cmdline option name is tp_printk. Fixes: 0daa2302968c ("tracing: Add tp_printk cmdline to have tracepoints go to printk()") Signed-off-by: Vitaly Chikunov Reviewed-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240323231704.1217926-1-vt@altlinux.org --- Documentation/admin-guide/kernel-parameters.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bb884c14b2f67..623fce7d5fcd0 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6599,7 +6599,7 @@ To turn off having tracepoints sent to printk, echo 0 > /proc/sys/kernel/tracepoint_printk Note, echoing 1 into this file without the - tracepoint_printk kernel cmdline option has no effect. + tp_printk kernel cmdline option has no effect. The tp_printk_stop_on_boot (see below) can also be used to stop the printing of events to console at -- GitLab From e9c44c1beaba623b12201d2028bc20f535464d9b Mon Sep 17 00:00:00 2001 From: Weiji Wang Date: Tue, 19 Mar 2024 19:42:15 +0800 Subject: [PATCH 150/989] docs: zswap: fix shell command format Format the shell commands as code block to keep the documentation in the same style Signed-off-by: Weiji Wang Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240319114253.2647-1-nebclllo0444@gmail.com --- Documentation/admin-guide/mm/zswap.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index b42132969e315..13632671adaea 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -155,7 +155,7 @@ Setting this parameter to 100 will disable the hysteresis. Some users cannot tolerate the swapping that comes with zswap store failures and zswap writebacks. Swapping can be disabled entirely (without disabling -zswap itself) on a cgroup-basis as follows: +zswap itself) on a cgroup-basis as follows:: echo 0 > /sys/fs/cgroup//memory.zswap.writeback @@ -166,7 +166,7 @@ writeback (because the same pages might be rejected again and again). When there is a sizable amount of cold memory residing in the zswap pool, it can be advantageous to proactively write these cold pages to swap and reclaim the memory for other use cases. By default, the zswap shrinker is disabled. -User can enable it as follows: +User can enable it as follows:: echo Y > /sys/module/zswap/parameters/shrinker_enabled -- GitLab From 09ba28e1cd3cf715daab1fca6e1623e22fd754a6 Mon Sep 17 00:00:00 2001 From: David Thompson Date: Mon, 25 Mar 2024 17:09:29 -0400 Subject: [PATCH 151/989] mlxbf_gige: stop interface during shutdown The mlxbf_gige driver intermittantly encounters a NULL pointer exception while the system is shutting down via "reboot" command. The mlxbf_driver will experience an exception right after executing its shutdown() method. One example of this exception is: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000070 Mem abort info: ESR = 0x0000000096000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=000000011d373000 [0000000000000070] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 96000004 [#1] SMP CPU: 0 PID: 13 Comm: ksoftirqd/0 Tainted: G S OE 5.15.0-bf.6.gef6992a #1 Hardware name: https://www.mellanox.com BlueField SoC/BlueField SoC, BIOS 4.0.2.12669 Apr 21 2023 pstate: 20400009 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : mlxbf_gige_handle_tx_complete+0xc8/0x170 [mlxbf_gige] lr : mlxbf_gige_poll+0x54/0x160 [mlxbf_gige] sp : ffff8000080d3c10 x29: ffff8000080d3c10 x28: ffffcce72cbb7000 x27: ffff8000080d3d58 x26: ffff0000814e7340 x25: ffff331cd1a05000 x24: ffffcce72c4ea008 x23: ffff0000814e4b40 x22: ffff0000814e4d10 x21: ffff0000814e4128 x20: 0000000000000000 x19: ffff0000814e4a80 x18: ffffffffffffffff x17: 000000000000001c x16: ffffcce72b4553f4 x15: ffff80008805b8a7 x14: 0000000000000000 x13: 0000000000000030 x12: 0101010101010101 x11: 7f7f7f7f7f7f7f7f x10: c2ac898b17576267 x9 : ffffcce720fa5404 x8 : ffff000080812138 x7 : 0000000000002e9a x6 : 0000000000000080 x5 : ffff00008de3b000 x4 : 0000000000000000 x3 : 0000000000000001 x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000000 Call trace: mlxbf_gige_handle_tx_complete+0xc8/0x170 [mlxbf_gige] mlxbf_gige_poll+0x54/0x160 [mlxbf_gige] __napi_poll+0x40/0x1c8 net_rx_action+0x314/0x3a0 __do_softirq+0x128/0x334 run_ksoftirqd+0x54/0x6c smpboot_thread_fn+0x14c/0x190 kthread+0x10c/0x110 ret_from_fork+0x10/0x20 Code: 8b070000 f9000ea0 f95056c0 f86178a1 (b9407002) ---[ end trace 7cc3941aa0d8e6a4 ]--- Kernel panic - not syncing: Oops: Fatal exception in interrupt Kernel Offset: 0x4ce722520000 from 0xffff800008000000 PHYS_OFFSET: 0x80000000 CPU features: 0x000005c1,a3330e5a Memory Limit: none ---[ end Kernel panic - not syncing: Oops: Fatal exception in interrupt ]--- During system shutdown, the mlxbf_gige driver's shutdown() is always executed. However, the driver's stop() method will only execute if networking interface configuration logic within the Linux distribution has been setup to do so. If shutdown() executes but stop() does not execute, NAPI remains enabled and this can lead to an exception if NAPI is scheduled while the hardware interface has only been partially deinitialized. The networking interface managed by the mlxbf_gige driver must be properly stopped during system shutdown so that IFF_UP is cleared, the hardware interface is put into a clean state, and NAPI is fully deinitialized. Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver") Signed-off-by: David Thompson Link: https://lore.kernel.org/r/20240325210929.25362-1-davthompson@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c index 77134ca929382..ba303868686a7 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "mlxbf_gige.h" @@ -492,8 +493,13 @@ static void mlxbf_gige_shutdown(struct platform_device *pdev) { struct mlxbf_gige *priv = platform_get_drvdata(pdev); - writeq(0, priv->base + MLXBF_GIGE_INT_EN); - mlxbf_gige_clean_port(priv); + rtnl_lock(); + netif_device_detach(priv->netdev); + + if (netif_running(priv->netdev)) + dev_close(priv->netdev); + + rtnl_unlock(); } static const struct acpi_device_id __maybe_unused mlxbf_gige_acpi_match[] = { -- GitLab From 6dae957c8eef6eae5b386462767de97303235d5c Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Fri, 29 Mar 2024 07:11:06 +0000 Subject: [PATCH 152/989] bpf: fix possible file descriptor leaks in verifier The resolve_pseudo_ldimm64() function might have leaked file descriptors when BPF_MAP_TYPE_ARENA was used in a program (some error paths missed a corresponding fdput). Add missing fdputs. v2: remove unrelated changes from the fix Fixes: 6082b6c328b5 ("bpf: Recognize addr_space_cast instruction in the verifier.") Signed-off-by: Anton Protopopov Acked-by: Yonghong Song Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20240329071106.67968-1-aspsk@isovalent.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 353985b2b6a27..98188379d5c77 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18379,15 +18379,18 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } if (!env->prog->jit_requested) { verbose(env, "JIT is required to use arena\n"); + fdput(f); return -EOPNOTSUPP; } if (!bpf_jit_supports_arena()) { verbose(env, "JIT doesn't support arena\n"); + fdput(f); return -EOPNOTSUPP; } env->prog->aux->arena = (void *)map; if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) { verbose(env, "arena's user address must be set via map_extra or mmap()\n"); + fdput(f); return -EINVAL; } } -- GitLab From eaa03486d932572dfd1c5f64f9dfebe572ad88c0 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 29 Mar 2024 14:46:30 +0000 Subject: [PATCH 153/989] regmap: maple: Fix uninitialized symbol 'ret' warnings Fix warnings reported by smatch by initializing local 'ret' variable to 0. drivers/base/regmap/regcache-maple.c:186 regcache_maple_drop() error: uninitialized symbol 'ret'. drivers/base/regmap/regcache-maple.c:290 regcache_maple_sync() error: uninitialized symbol 'ret'. Signed-off-by: Richard Fitzgerald Fixes: f033c26de5a5 ("regmap: Add maple tree based register cache") Link: https://lore.kernel.org/r/20240329144630.1965159-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/base/regmap/regcache-maple.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/base/regmap/regcache-maple.c b/drivers/base/regmap/regcache-maple.c index c1776127a5724..55999a50ccc0b 100644 --- a/drivers/base/regmap/regcache-maple.c +++ b/drivers/base/regmap/regcache-maple.c @@ -112,7 +112,7 @@ static int regcache_maple_drop(struct regmap *map, unsigned int min, unsigned long *entry, *lower, *upper; unsigned long lower_index, lower_last; unsigned long upper_index, upper_last; - int ret; + int ret = 0; lower = NULL; upper = NULL; @@ -244,7 +244,7 @@ static int regcache_maple_sync(struct regmap *map, unsigned int min, unsigned long lmin = min; unsigned long lmax = max; unsigned int r, v, sync_start; - int ret; + int ret = 0; bool sync_needed = false; map->cache_bypass = true; -- GitLab From 302b84e84d108b878efc56ebfea09474159be56b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 28 Mar 2024 16:07:23 -0500 Subject: [PATCH 154/989] Revert "PCI: Mark LSI FW643 to avoid bus reset" This reverts commit 29a43dc130ce65d365a8ea9e1cc4bc51005a353e. 29a43dc130ce ("PCI: Mark LSI FW643 to avoid bus reset") by Edmund was based on the assumption that the LSI / Agere FW643 has a defect such that it can't recover after a Secondary Bus Reset (SBR). But Takashi Sakamoto reported that SBR works fine on this same FW643 device in an AMD Ryzen 5 2400G system, so apparently there is some other aspect of Edmund's system that accounts for the issue. The down side of 29a43dc130ce is that when the FW643 is assigned to a VM, avoiding the SBR means we leak data out of the VM. Revert 29a43dc130ce until we figure out a better solution. In the meantime, we can use the sysfs "reset_method" interface to restrict the available reset methods. Link: https://lore.kernel.org/r/20240328212302.1582483-1-helgaas@kernel.org Fixes: 29a43dc130ce ("PCI: Mark LSI FW643 to avoid bus reset") Reported-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20240325012135.36861-1-o-takashi@sakamocchi.jp Signed-off-by: Bjorn Helgaas Reviewed-by: Takashi Sakamoto --- drivers/pci/quirks.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index bf4833221816d..eff7f5df08e27 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3765,14 +3765,6 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003e, quirk_no_bus_reset); */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_CAVIUM, 0xa100, quirk_no_bus_reset); -/* - * Apparently the LSI / Agere FW643 can't recover after a Secondary Bus - * Reset and requires a power-off or suspend/resume and rescan. Prevent - * use of that reset. - */ -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATT, 0x5900, quirk_no_bus_reset); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATT, 0x5901, quirk_no_bus_reset); - /* * Some TI KeyStone C667X devices do not support bus/hot reset. The PCIESS * automatically disables LTSSM when Secondary Bus Reset is received and -- GitLab From 625aefac340f45a4fc60908da763f437599a0d6f Mon Sep 17 00:00:00 2001 From: Michael Krummsdorf Date: Tue, 26 Mar 2024 13:36:54 +0100 Subject: [PATCH 155/989] net: dsa: mv88e6xxx: fix usable ports on 88e6020 The switch has 4 ports with 2 internal PHYs, but ports are numbered up to 6, with ports 0, 1, 5 and 6 being usable. Fixes: 71d94a432a15 ("net: dsa: mv88e6xxx: add support for MV88E6020 switch") Signed-off-by: Michael Krummsdorf Signed-off-by: Matthias Schiffer Reviewed-by: Andrew Lunn Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240326123655.40666-1-matthias.schiffer@ew.tq-group.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 9ed1821184ece..c95787cb90867 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -5503,8 +5503,12 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .family = MV88E6XXX_FAMILY_6250, .name = "Marvell 88E6020", .num_databases = 64, - .num_ports = 4, + /* Ports 2-4 are not routed to pins + * => usable ports 0, 1, 5, 6 + */ + .num_ports = 7, .num_internal_phys = 2, + .invalid_port_mask = BIT(2) | BIT(3) | BIT(4), .max_vid = 4095, .port_base_addr = 0x8, .phy_base_addr = 0x0, -- GitLab From 62fc3357e079a07a22465b9b6ef71bb6ea75ee4b Mon Sep 17 00:00:00 2001 From: Mahmoud Adam Date: Tue, 26 Mar 2024 16:31:33 +0100 Subject: [PATCH 156/989] net/rds: fix possible cp null dereference cp might be null, calling cp->cp_conn would produce null dereference [Simon Horman adds:] Analysis: * cp is a parameter of __rds_rdma_map and is not reassigned. * The following call-sites pass a NULL cp argument to __rds_rdma_map() - rds_get_mr() - rds_get_mr_for_dest * Prior to the code above, the following assumes that cp may be NULL (which is indicative, but could itself be unnecessary) trans_private = rs->rs_transport->get_mr( sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, args->vec.addr, args->vec.bytes, need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); * The code modified by this patch is guarded by IS_ERR(trans_private), where trans_private is assigned as per the previous point in this analysis. The only implementation of get_mr that I could locate is rds_ib_get_mr() which can return an ERR_PTR if the conn (4th) argument is NULL. * ret is set to PTR_ERR(trans_private). rds_ib_get_mr can return ERR_PTR(-ENODEV) if the conn (4th) argument is NULL. Thus ret may be -ENODEV in which case the code in question will execute. Conclusion: * cp may be NULL at the point where this patch adds a check; this patch does seem to address a possible bug Fixes: c055fc00c07b ("net/rds: fix WARNING in rds_conn_connect_if_down") Cc: stable@vger.kernel.org # v4.19+ Signed-off-by: Mahmoud Adam Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240326153132.55580-1-mngyadam@amazon.com Signed-off-by: Jakub Kicinski --- net/rds/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index a4e3c5de998be..00dbcd4d28e68 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -302,7 +302,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, } ret = PTR_ERR(trans_private); /* Trigger connection so that its ready for the next retry */ - if (ret == -ENODEV) + if (ret == -ENODEV && cp) rds_conn_connect_if_down(cp->cp_conn); goto out; } -- GitLab From b1f532a3b1e6d2e5559c7ace49322922637a28aa Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 12 Feb 2024 13:58:33 +0100 Subject: [PATCH 157/989] batman-adv: Avoid infinite loop trying to resize local TT If the MTU of one of an attached interface becomes too small to transmit the local translation table then it must be resized to fit inside all fragments (when enabled) or a single packet. But if the MTU becomes too low to transmit even the header + the VLAN specific part then the resizing of the local TT will never succeed. This can for example happen when the usable space is 110 bytes and 11 VLANs are on top of batman-adv. In this case, at least 116 byte would be needed. There will just be an endless spam of batman_adv: batadv0: Forced to purge local tt entries to fit new maximum fragment MTU (110) in the log but the function will never finish. Problem here is that the timeout will be halved all the time and will then stagnate at 0 and therefore never be able to reduce the table even more. There are other scenarios possible with a similar result. The number of BATADV_TT_CLIENT_NOPURGE entries in the local TT can for example be too high to fit inside a packet. Such a scenario can therefore happen also with only a single VLAN + 7 non-purgable addresses - requiring at least 120 bytes. While this should be handled proactively when: * interface with too low MTU is added * VLAN is added * non-purgeable local mac is added * MTU of an attached interface is reduced * fragmentation setting gets disabled (which most likely requires dropping attached interfaces) not all of these scenarios can be prevented because batman-adv is only consuming events without the the possibility to prevent these actions (non-purgable MAC address added, MTU of an attached interface is reduced). It is therefore necessary to also make sure that the code is able to handle also the situations when there were already incompatible system configuration are present. Cc: stable@vger.kernel.org Fixes: a19d3d85e1b8 ("batman-adv: limit local translation table max size") Reported-by: syzbot+a6a4b5bb3da165594cff@syzkaller.appspotmail.com Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index b95c36765d045..2243cec18ecc8 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -3948,7 +3948,7 @@ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface) spin_lock_bh(&bat_priv->tt.commit_lock); - while (true) { + while (timeout) { table_size = batadv_tt_local_table_transmit_size(bat_priv); if (packet_size_max >= table_size) break; -- GitLab From 5086f0fe46dcf8687c8c2e41e1f07826affebbba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Mar 2024 17:34:48 +0000 Subject: [PATCH 158/989] net: do not consume a cacheline for system_page_pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no reason to consume a full cacheline to store system_page_pool. We can eventually move it to softnet_data later for full locality control. Fixes: 2b0cfa6e4956 ("net: add generic percpu page_pool allocator") Signed-off-by: Eric Dumazet Cc: Lorenzo Bianconi Cc: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20240328173448.2262593-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 9a67003e49db8..984ff8b9d0e1a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -429,7 +429,7 @@ EXPORT_PER_CPU_SYMBOL(softnet_data); * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ -static DEFINE_PER_CPU_ALIGNED(struct page_pool *, system_page_pool); +static DEFINE_PER_CPU(struct page_pool *, system_page_pool); #ifdef CONFIG_LOCKDEP /* -- GitLab From e709acbd84fb6ef32736331b0147f027a3ef4c20 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Thu, 28 Mar 2024 10:06:21 +0800 Subject: [PATCH 159/989] octeontx2-pf: check negative error code in otx2_open() otx2_rxtx_enable() return negative error code such as -EIO, check -EIO rather than EIO to fix this problem. Fixes: c926252205c4 ("octeontx2-pf: Disable packet I/O for graceful exit") Signed-off-by: Su Hui Reviewed-by: Subbaraya Sundeep Reviewed-by: Simon Horman Reviewed-by: Kalesh AP Link: https://lore.kernel.org/r/20240328020620.4054692-1-suhui@nfschina.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index b40bd0e467514..3f46d5e0fb2ec 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1933,7 +1933,7 @@ int otx2_open(struct net_device *netdev) * mcam entries are enabled to receive the packets. Hence disable the * packet I/O. */ - if (err == EIO) + if (err == -EIO) goto err_disable_rxtx; else if (err) goto err_tx_stop_queues; -- GitLab From 5e864d90b20803edf6bd44a99fb9afa7171785f2 Mon Sep 17 00:00:00 2001 From: Atlas Yu Date: Thu, 28 Mar 2024 13:51:52 +0800 Subject: [PATCH 160/989] r8169: skip DASH fw status checks when DASH is disabled On devices that support DASH, the current code in the "rtl_loop_wait" function raises false alarms when DASH is disabled. This occurs because the function attempts to wait for the DASH firmware to be ready, even though it's not relevant in this case. r8169 0000:0c:00.0 eth0: RTL8168ep/8111ep, 38:7c:76:49:08:d9, XID 502, IRQ 86 r8169 0000:0c:00.0 eth0: jumbo features [frames: 9194 bytes, tx checksumming: ko] r8169 0000:0c:00.0 eth0: DASH disabled ... r8169 0000:0c:00.0 eth0: rtl_ep_ocp_read_cond == 0 (loop: 30, delay: 10000). This patch modifies the driver start/stop functions to skip checking the DASH firmware status when DASH is explicitly disabled. This prevents unnecessary delays and false alarms. The patch has been tested on several ThinkStation P8/PX workstations. Fixes: 0ab0c45d8aae ("r8169: add handling DASH when DASH is disabled") Signed-off-by: Atlas Yu Reviewed-by: Heiner Kallweit Link: https://lore.kernel.org/r/20240328055152.18443-1-atlas.yu@canonical.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 31 ++++++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 5c879a5c86d70..4ac444eb269ff 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1314,17 +1314,40 @@ static void rtl8168ep_stop_cmac(struct rtl8169_private *tp) RTL_W8(tp, IBCR0, RTL_R8(tp, IBCR0) & ~0x01); } +static void rtl_dash_loop_wait(struct rtl8169_private *tp, + const struct rtl_cond *c, + unsigned long usecs, int n, bool high) +{ + if (!tp->dash_enabled) + return; + rtl_loop_wait(tp, c, usecs, n, high); +} + +static void rtl_dash_loop_wait_high(struct rtl8169_private *tp, + const struct rtl_cond *c, + unsigned long d, int n) +{ + rtl_dash_loop_wait(tp, c, d, n, true); +} + +static void rtl_dash_loop_wait_low(struct rtl8169_private *tp, + const struct rtl_cond *c, + unsigned long d, int n) +{ + rtl_dash_loop_wait(tp, c, d, n, false); +} + static void rtl8168dp_driver_start(struct rtl8169_private *tp) { r8168dp_oob_notify(tp, OOB_CMD_DRIVER_START); - rtl_loop_wait_high(tp, &rtl_dp_ocp_read_cond, 10000, 10); + rtl_dash_loop_wait_high(tp, &rtl_dp_ocp_read_cond, 10000, 10); } static void rtl8168ep_driver_start(struct rtl8169_private *tp) { r8168ep_ocp_write(tp, 0x01, 0x180, OOB_CMD_DRIVER_START); r8168ep_ocp_write(tp, 0x01, 0x30, r8168ep_ocp_read(tp, 0x30) | 0x01); - rtl_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 30); + rtl_dash_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 30); } static void rtl8168_driver_start(struct rtl8169_private *tp) @@ -1338,7 +1361,7 @@ static void rtl8168_driver_start(struct rtl8169_private *tp) static void rtl8168dp_driver_stop(struct rtl8169_private *tp) { r8168dp_oob_notify(tp, OOB_CMD_DRIVER_STOP); - rtl_loop_wait_low(tp, &rtl_dp_ocp_read_cond, 10000, 10); + rtl_dash_loop_wait_low(tp, &rtl_dp_ocp_read_cond, 10000, 10); } static void rtl8168ep_driver_stop(struct rtl8169_private *tp) @@ -1346,7 +1369,7 @@ static void rtl8168ep_driver_stop(struct rtl8169_private *tp) rtl8168ep_stop_cmac(tp); r8168ep_ocp_write(tp, 0x01, 0x180, OOB_CMD_DRIVER_STOP); r8168ep_ocp_write(tp, 0x01, 0x30, r8168ep_ocp_read(tp, 0x30) | 0x01); - rtl_loop_wait_low(tp, &rtl_ep_ocp_read_cond, 10000, 10); + rtl_dash_loop_wait_low(tp, &rtl_ep_ocp_read_cond, 10000, 10); } static void rtl8168_driver_stop(struct rtl8169_private *tp) -- GitLab From 17af420545a750f763025149fa7b833a4fc8b8f0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Mar 2024 11:22:48 +0000 Subject: [PATCH 161/989] erspan: make sure erspan_base_hdr is present in skb->head syzbot reported a problem in ip6erspan_rcv() [1] Issue is that ip6erspan_rcv() (and erspan_rcv()) no longer make sure erspan_base_hdr is present in skb linear part (skb->head) before getting @ver field from it. Add the missing pskb_may_pull() calls. v2: Reload iph pointer in erspan_rcv() after pskb_may_pull() because skb->head might have changed. [1] BUG: KMSAN: uninit-value in pskb_may_pull_reason include/linux/skbuff.h:2742 [inline] BUG: KMSAN: uninit-value in pskb_may_pull include/linux/skbuff.h:2756 [inline] BUG: KMSAN: uninit-value in ip6erspan_rcv net/ipv6/ip6_gre.c:541 [inline] BUG: KMSAN: uninit-value in gre_rcv+0x11f8/0x1930 net/ipv6/ip6_gre.c:610 pskb_may_pull_reason include/linux/skbuff.h:2742 [inline] pskb_may_pull include/linux/skbuff.h:2756 [inline] ip6erspan_rcv net/ipv6/ip6_gre.c:541 [inline] gre_rcv+0x11f8/0x1930 net/ipv6/ip6_gre.c:610 ip6_protocol_deliver_rcu+0x1d4c/0x2ca0 net/ipv6/ip6_input.c:438 ip6_input_finish net/ipv6/ip6_input.c:483 [inline] NF_HOOK include/linux/netfilter.h:314 [inline] ip6_input+0x15d/0x430 net/ipv6/ip6_input.c:492 ip6_mc_input+0xa7e/0xc80 net/ipv6/ip6_input.c:586 dst_input include/net/dst.h:460 [inline] ip6_rcv_finish+0x955/0x970 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:314 [inline] ipv6_rcv+0xde/0x390 net/ipv6/ip6_input.c:310 __netif_receive_skb_one_core net/core/dev.c:5538 [inline] __netif_receive_skb+0x1da/0xa00 net/core/dev.c:5652 netif_receive_skb_internal net/core/dev.c:5738 [inline] netif_receive_skb+0x58/0x660 net/core/dev.c:5798 tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1549 tun_get_user+0x5566/0x69e0 drivers/net/tun.c:2002 tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:2108 [inline] new_sync_write fs/read_write.c:497 [inline] vfs_write+0xb63/0x1520 fs/read_write.c:590 ksys_write+0x20f/0x4c0 fs/read_write.c:643 __do_sys_write fs/read_write.c:655 [inline] __se_sys_write fs/read_write.c:652 [inline] __x64_sys_write+0x93/0xe0 fs/read_write.c:652 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was created at: slab_post_alloc_hook mm/slub.c:3804 [inline] slab_alloc_node mm/slub.c:3845 [inline] kmem_cache_alloc_node+0x613/0xc50 mm/slub.c:3888 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:577 __alloc_skb+0x35b/0x7a0 net/core/skbuff.c:668 alloc_skb include/linux/skbuff.h:1318 [inline] alloc_skb_with_frags+0xc8/0xbf0 net/core/skbuff.c:6504 sock_alloc_send_pskb+0xa81/0xbf0 net/core/sock.c:2795 tun_alloc_skb drivers/net/tun.c:1525 [inline] tun_get_user+0x209a/0x69e0 drivers/net/tun.c:1846 tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:2108 [inline] new_sync_write fs/read_write.c:497 [inline] vfs_write+0xb63/0x1520 fs/read_write.c:590 ksys_write+0x20f/0x4c0 fs/read_write.c:643 __do_sys_write fs/read_write.c:655 [inline] __se_sys_write fs/read_write.c:652 [inline] __x64_sys_write+0x93/0xe0 fs/read_write.c:652 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 CPU: 1 PID: 5045 Comm: syz-executor114 Not tainted 6.9.0-rc1-syzkaller-00021-g962490525cff #0 Fixes: cb73ee40b1b3 ("net: ip_gre: use erspan key field for tunnel lookup") Reported-by: syzbot+1c1cf138518bf0c53d68@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/000000000000772f2c0614b66ef7@google.com/ Signed-off-by: Eric Dumazet Cc: Lorenzo Bianconi Link: https://lore.kernel.org/r/20240328112248.1101491-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_gre.c | 5 +++++ net/ipv6/ip6_gre.c | 3 +++ 2 files changed, 8 insertions(+) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7b16c211b9044..57ddcd8c62f67 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -280,8 +280,13 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, tpi->flags | TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); } else { + if (unlikely(!pskb_may_pull(skb, + gre_hdr_len + sizeof(*ershdr)))) + return PACKET_REJECT; + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); ver = ershdr->ver; + iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, iph->saddr, iph->daddr, tpi->key); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index ca7e77e842835..c89aef524df9a 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -528,6 +528,9 @@ static int ip6erspan_rcv(struct sk_buff *skb, struct ip6_tnl *tunnel; u8 ver; + if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) + return PACKET_REJECT; + ipv6h = ipv6_hdr(skb); ershdr = (struct erspan_base_hdr *)skb->data; ver = ershdr->ver; -- GitLab From 0640f47b742667fca6aac174f7cd62b6c2c7532c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 13 Mar 2024 17:43:05 +0100 Subject: [PATCH 162/989] drm/msm/dp: fix runtime PM leak on disconnect Make sure to put the runtime PM usage count (and suspend) also when receiving a disconnect event while in the ST_MAINLINK_READY state. This specifically avoids leaking a runtime PM usage count on every disconnect with display servers that do not automatically enable external displays when receiving a hotplug notification. Fixes: 5814b8bf086a ("drm/msm/dp: incorporate pm_runtime framework into DP driver") Cc: stable@vger.kernel.org # 6.8 Cc: Kuogee Hsieh Signed-off-by: Johan Hovold Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/582744/ Link: https://lore.kernel.org/r/20240313164306.23133-2-johan+linaro@kernel.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dp/dp_display.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index d80f89581760d..ffa785ec641aa 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -629,6 +629,7 @@ static int dp_hpd_unplug_handle(struct dp_display_private *dp, u32 data) dp_display_host_phy_exit(dp); dp->hpd_state = ST_DISCONNECTED; dp_display_notify_disconnect(&dp->dp_display.pdev->dev); + pm_runtime_put_sync(&pdev->dev); mutex_unlock(&dp->event_mutex); return 0; } -- GitLab From e86750b01a1560f198e4b3e21bb3f78bfd5bb2c3 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 13 Mar 2024 17:43:06 +0100 Subject: [PATCH 163/989] drm/msm/dp: fix runtime PM leak on connect failure Make sure to balance the runtime PM usage counter (and suspend) before returning on connect failures (e.g. DPCD read failures after a spurious connect event or if link training fails). Fixes: 5814b8bf086a ("drm/msm/dp: incorporate pm_runtime framework into DP driver") Cc: stable@vger.kernel.org # 6.8 Cc: Kuogee Hsieh Signed-off-by: Johan Hovold Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/582746/ Link: https://lore.kernel.org/r/20240313164306.23133-3-johan+linaro@kernel.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dp/dp_display.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index ffa785ec641aa..2ed40bd0acb60 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -572,6 +572,7 @@ static int dp_hpd_plug_handle(struct dp_display_private *dp, u32 data) ret = dp_display_usbpd_configure_cb(&pdev->dev); if (ret) { /* link train failed */ dp->hpd_state = ST_DISCONNECTED; + pm_runtime_put_sync(&pdev->dev); } else { dp->hpd_state = ST_MAINLINK_READY; } -- GitLab From c588f7d67044d6d59ef92d75a970b64929984d89 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2024 14:08:09 -0700 Subject: [PATCH 164/989] drm/msm: Add newlines to some debug prints These debug prints are missing newlines, leading to multiple messages being printed on one line and hard to read logs. Add newlines to have the debug prints on separate lines. The DBG macro used to add a newline, but I missed that while migrating to drm_dbg wrappers. Fixes: 7cb017db1896 ("drm/msm: Move FB debug prints to drm_dbg_state()") Fixes: 721c6e0c6aed ("drm/msm: Move vblank debug prints to drm_dbg_vbl()") Signed-off-by: Stephen Boyd Reviewed-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/584769/ Link: https://lore.kernel.org/r/20240325210810.1340820-1-swboyd@chromium.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/msm_fb.c | 6 +++--- drivers/gpu/drm/msm/msm_kms.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_fb.c b/drivers/gpu/drm/msm/msm_fb.c index e3f61c39df69b..80166f702a0db 100644 --- a/drivers/gpu/drm/msm/msm_fb.c +++ b/drivers/gpu/drm/msm/msm_fb.c @@ -89,7 +89,7 @@ int msm_framebuffer_prepare(struct drm_framebuffer *fb, for (i = 0; i < n; i++) { ret = msm_gem_get_and_pin_iova(fb->obj[i], aspace, &msm_fb->iova[i]); - drm_dbg_state(fb->dev, "FB[%u]: iova[%d]: %08llx (%d)", + drm_dbg_state(fb->dev, "FB[%u]: iova[%d]: %08llx (%d)\n", fb->base.id, i, msm_fb->iova[i], ret); if (ret) return ret; @@ -176,7 +176,7 @@ static struct drm_framebuffer *msm_framebuffer_init(struct drm_device *dev, const struct msm_format *format; int ret, i, n; - drm_dbg_state(dev, "create framebuffer: mode_cmd=%p (%dx%d@%4.4s)", + drm_dbg_state(dev, "create framebuffer: mode_cmd=%p (%dx%d@%4.4s)\n", mode_cmd, mode_cmd->width, mode_cmd->height, (char *)&mode_cmd->pixel_format); @@ -232,7 +232,7 @@ static struct drm_framebuffer *msm_framebuffer_init(struct drm_device *dev, refcount_set(&msm_fb->dirtyfb, 1); - drm_dbg_state(dev, "create: FB ID: %d (%p)", fb->base.id, fb); + drm_dbg_state(dev, "create: FB ID: %d (%p)\n", fb->base.id, fb); return fb; diff --git a/drivers/gpu/drm/msm/msm_kms.c b/drivers/gpu/drm/msm/msm_kms.c index 84c21ec2ceeae..af6a6fcb11736 100644 --- a/drivers/gpu/drm/msm/msm_kms.c +++ b/drivers/gpu/drm/msm/msm_kms.c @@ -149,7 +149,7 @@ int msm_crtc_enable_vblank(struct drm_crtc *crtc) struct msm_kms *kms = priv->kms; if (!kms) return -ENXIO; - drm_dbg_vbl(dev, "crtc=%u", crtc->base.id); + drm_dbg_vbl(dev, "crtc=%u\n", crtc->base.id); return vblank_ctrl_queue_work(priv, crtc, true); } @@ -160,7 +160,7 @@ void msm_crtc_disable_vblank(struct drm_crtc *crtc) struct msm_kms *kms = priv->kms; if (!kms) return; - drm_dbg_vbl(dev, "crtc=%u", crtc->base.id); + drm_dbg_vbl(dev, "crtc=%u\n", crtc->base.id); vblank_ctrl_queue_work(priv, crtc, false); } -- GitLab From 4f3b77ae5ff5b5ba9d99c5d5450db388dbee5107 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 14 Mar 2024 03:10:41 +0200 Subject: [PATCH 165/989] drm/msm/dpu: don't allow overriding data from catalog The data from catalog is marked as const, so it is a part of the RO segment. Allowing userspace to write to it through debugfs can cause protection faults. Set debugfs file mode to read-only for debug entries corresponding to perf_cfg coming from catalog. Fixes: abda0d925f9c ("drm/msm/dpu: Mark various data tables as const") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/582844/ Link: https://lore.kernel.org/r/20240314-dpu-perf-rework-v3-1-79fa4e065574@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c index ef871239adb2a..68fae048a9a83 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c @@ -459,15 +459,15 @@ int dpu_core_perf_debugfs_init(struct dpu_kms *dpu_kms, struct dentry *parent) &perf->core_clk_rate); debugfs_create_u32("enable_bw_release", 0600, entry, (u32 *)&perf->enable_bw_release); - debugfs_create_u32("threshold_low", 0600, entry, + debugfs_create_u32("threshold_low", 0400, entry, (u32 *)&perf->perf_cfg->max_bw_low); - debugfs_create_u32("threshold_high", 0600, entry, + debugfs_create_u32("threshold_high", 0400, entry, (u32 *)&perf->perf_cfg->max_bw_high); - debugfs_create_u32("min_core_ib", 0600, entry, + debugfs_create_u32("min_core_ib", 0400, entry, (u32 *)&perf->perf_cfg->min_core_ib); - debugfs_create_u32("min_llcc_ib", 0600, entry, + debugfs_create_u32("min_llcc_ib", 0400, entry, (u32 *)&perf->perf_cfg->min_llcc_ib); - debugfs_create_u32("min_dram_ib", 0600, entry, + debugfs_create_u32("min_dram_ib", 0400, entry, (u32 *)&perf->perf_cfg->min_dram_ib); debugfs_create_file("perf_mode", 0600, entry, (u32 *)perf, &dpu_core_perf_mode_fops); -- GitLab From ee15c8bf5d77a306614bdefe33828310662dee05 Mon Sep 17 00:00:00 2001 From: Kuogee Hsieh Date: Fri, 29 Mar 2024 12:46:26 -0700 Subject: [PATCH 166/989] drm/msm/dp: assign correct DP controller ID to x1e80100 interface table At current x1e80100 interface table, interface #3 is wrongly connected to DP controller #0 and interface #4 wrongly connected to DP controller #2. Fix this problem by connect Interface #3 to DP controller #0 and interface #4 connect to DP controller #1. Also add interface #6, #7 and #8 connections to DP controller to complete x1e80100 interface table. Changs in V3: -- add v2 changes log Changs in V2: -- add x1e80100 to subject -- add Fixes Fixes: e3b1f369db5a ("drm/msm/dpu: Add X1E80100 support") Signed-off-by: Kuogee Hsieh Reviewed-by: Abhinav Kumar Reviewed-by: Abel Vesa Patchwork: https://patchwork.freedesktop.org/patch/585549/ Link: https://lore.kernel.org/r/1711741586-9037-1-git-send-email-quic_khsieh@quicinc.com Signed-off-by: Abhinav Kumar --- .../msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h index 9a9f7092c526a..a3e60ac70689e 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h @@ -324,6 +324,7 @@ static const struct dpu_wb_cfg x1e80100_wb[] = { }, }; +/* TODO: INTF 3, 8 and 7 are used for MST, marked as INTF_NONE for now */ static const struct dpu_intf_cfg x1e80100_intf[] = { { .name = "intf_0", .id = INTF_0, @@ -358,8 +359,8 @@ static const struct dpu_intf_cfg x1e80100_intf[] = { .name = "intf_3", .id = INTF_3, .base = 0x37000, .len = 0x280, .features = INTF_SC7280_MASK, - .type = INTF_DP, - .controller_id = MSM_DP_CONTROLLER_1, + .type = INTF_NONE, + .controller_id = MSM_DP_CONTROLLER_0, /* pair with intf_0 for DP MST */ .prog_fetch_lines_worst_case = 24, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 30), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 31), @@ -368,7 +369,7 @@ static const struct dpu_intf_cfg x1e80100_intf[] = { .base = 0x38000, .len = 0x280, .features = INTF_SC7280_MASK, .type = INTF_DP, - .controller_id = MSM_DP_CONTROLLER_2, + .controller_id = MSM_DP_CONTROLLER_1, .prog_fetch_lines_worst_case = 24, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 20), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 21), @@ -381,6 +382,33 @@ static const struct dpu_intf_cfg x1e80100_intf[] = { .prog_fetch_lines_worst_case = 24, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 22), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 23), + }, { + .name = "intf_6", .id = INTF_6, + .base = 0x3A000, .len = 0x280, + .features = INTF_SC7280_MASK, + .type = INTF_DP, + .controller_id = MSM_DP_CONTROLLER_2, + .prog_fetch_lines_worst_case = 24, + .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 17), + .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 16), + }, { + .name = "intf_7", .id = INTF_7, + .base = 0x3b000, .len = 0x280, + .features = INTF_SC7280_MASK, + .type = INTF_NONE, + .controller_id = MSM_DP_CONTROLLER_2, /* pair with intf_6 for DP MST */ + .prog_fetch_lines_worst_case = 24, + .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 18), + .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 19), + }, { + .name = "intf_8", .id = INTF_8, + .base = 0x3c000, .len = 0x280, + .features = INTF_SC7280_MASK, + .type = INTF_NONE, + .controller_id = MSM_DP_CONTROLLER_1, /* pair with intf_4 for DP MST */ + .prog_fetch_lines_worst_case = 24, + .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 12), + .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 13), }, }; -- GitLab From ea111449501ea32bf6da82750de860243691efc7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:44 -0700 Subject: [PATCH 167/989] tcp: Fix bind() regression for v6-only wildcard and v4-mapped-v6 non-wildcard addresses. Commit 5e07e672412b ("tcp: Use bhash2 for v4-mapped-v6 non-wildcard address.") introduced bind() regression for v4-mapped-v6 address. When we bind() the following two addresses on the same port, the 2nd bind() should succeed but fails now. 1. [::] w/ IPV6_ONLY 2. ::ffff:127.0.0.1 After the chagne, v4-mapped-v6 uses bhash2 instead of bhash to detect conflict faster, but I forgot to add a necessary change. During the 2nd bind(), inet_bind2_bucket_match_addr_any() returns the tb2 bucket of [::], and inet_bhash2_conflict() finally calls inet_bind_conflict(), which returns true, meaning conflict. inet_bhash2_addr_any_conflict |- inet_bind2_bucket_match_addr_any <-- return [::] bucket `- inet_bhash2_conflict `- __inet_bhash2_conflict <-- checks IPV6_ONLY for AF_INET | but not for v4-mapped-v6 address `- inet_bind_conflict <-- does not check address inet_bind_conflict() does not check socket addresses because __inet_bhash2_conflict() is expected to do so. However, it checks IPV6_V6ONLY attribute only against AF_INET socket, and not for v4-mapped-v6 address. As a result, v4-mapped-v6 address conflicts with v6-only wildcard address. To avoid that, let's add the missing test to use bhash2 for v4-mapped-v6 address. Fixes: 5e07e672412b ("tcp: Use bhash2 for v4-mapped-v6 non-wildcard address.") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index c038e28e2f1e6..4184d45f890c8 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -203,8 +203,15 @@ static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { - if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) - return false; + if (ipv6_only_sock(sk2)) { + if (sk->sk_family == AF_INET) + return false; + +#if IS_ENABLED(CONFIG_IPV6) + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return false; +#endif + } return inet_bind_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok); -- GitLab From d91ef1e1b55f730bee8ce286b02b7bdccbc42973 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:45 -0700 Subject: [PATCH 168/989] tcp: Fix bind() regression for v6-only wildcard and v4(-mapped-v6) non-wildcard addresses. Jianguo Wu reported another bind() regression introduced by bhash2. Calling bind() for the following 3 addresses on the same port, the 3rd one should fail but now succeeds. 1. 0.0.0.0 or ::ffff:0.0.0.0 2. [::] w/ IPV6_V6ONLY 3. IPv4 non-wildcard address or v4-mapped-v6 non-wildcard address The first two bind() create tb2 like this: bhash2 -> tb2(:: w/ IPV6_V6ONLY) -> tb2(0.0.0.0) The 3rd bind() will match with the IPv6 only wildcard address bucket in inet_bind2_bucket_match_addr_any(), however, no conflicting socket exists in the bucket. So, inet_bhash2_conflict() will returns false, and thus, inet_bhash2_addr_any_conflict() returns false consequently. As a result, the 3rd bind() bypasses conflict check, which should be done against the IPv4 wildcard address bucket. So, in inet_bhash2_addr_any_conflict(), we must iterate over all buckets. Note that we cannot add ipv6_only flag for inet_bind2_bucket as it would confuse the following patetrn. 1. [::] w/ SO_REUSE{ADDR,PORT} and IPV6_V6ONLY 2. [::] w/ SO_REUSE{ADDR,PORT} 3. IPv4 non-wildcard address or v4-mapped-v6 non-wildcard address The first bind() would create a bucket with ipv6_only flag true, the second bind() would add the [::] socket into the same bucket, and the third bind() could succeed based on the wrong assumption that ipv6_only bucket would not conflict with v4(-mapped-v6) address. Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Diagnosed-by: Jianguo Wu Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4184d45f890c8..3b38610958ee4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -294,6 +294,7 @@ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l struct sock_reuseport *reuseport_cb; struct inet_bind_hashbucket *head2; struct inet_bind2_bucket *tb2; + bool conflict = false; bool reuseport_cb_ok; rcu_read_lock(); @@ -306,18 +307,20 @@ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l spin_lock(&head2->lock); - inet_bind_bucket_for_each(tb2, &head2->chain) - if (inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) - break; + inet_bind_bucket_for_each(tb2, &head2->chain) { + if (!inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) + continue; - if (tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, - reuseport_ok)) { - spin_unlock(&head2->lock); - return true; + if (!inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok)) + continue; + + conflict = true; + break; } spin_unlock(&head2->lock); - return false; + + return conflict; } /* -- GitLab From c48baf567dedbba731d66f5a2cd46f1b6def50aa Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:46 -0700 Subject: [PATCH 169/989] selftest: tcp: Make bind() selftest flexible. Currently, bind_wildcard.c tests only (IPv4, IPv6) pairs, but we will add more tests for the same protocol pairs. This patch makes it possible by changing the address pointer to void. No functional changes are intended. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 92 +++++++++++++-------- 1 file changed, 58 insertions(+), 34 deletions(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index a2662348cdb1a..d65c3bb6ba138 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -6,7 +6,9 @@ #include "../kselftest_harness.h" -struct in6_addr in6addr_v4mapped_any = { +static const __u32 in4addr_any = INADDR_ANY; +static const __u32 in4addr_loopback = INADDR_LOOPBACK; +static const struct in6_addr in6addr_v4mapped_any = { .s6_addr = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -14,8 +16,7 @@ struct in6_addr in6addr_v4mapped_any = { 0, 0, 0, 0 } }; - -struct in6_addr in6addr_v4mapped_loopback = { +static const struct in6_addr in6addr_v4mapped_loopback = { .s6_addr = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -26,82 +27,105 @@ struct in6_addr in6addr_v4mapped_loopback = { FIXTURE(bind_wildcard) { - struct sockaddr_in addr4; - struct sockaddr_in6 addr6; + socklen_t addrlen[2]; + union { + struct sockaddr addr; + struct sockaddr_in addr4; + struct sockaddr_in6 addr6; + } addr[2]; }; FIXTURE_VARIANT(bind_wildcard) { - const __u32 addr4_const; - const struct in6_addr *addr6_const; + sa_family_t family[2]; + const void *addr[2]; int expected_errno; }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_any, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_any}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_loopback, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_loopback}, .expected_errno = 0, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_v4mapped_any, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_v4mapped_any}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_v4mapped_loopback, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_v4mapped_loopback}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_any, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_any}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_loopback, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_loopback}, .expected_errno = 0, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_v4mapped_any, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_v4mapped_any}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_v4mapped_loopback, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_v4mapped_loopback}, .expected_errno = EADDRINUSE, }; -FIXTURE_SETUP(bind_wildcard) +static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, + int family, const void *addr_const) { - self->addr4.sin_family = AF_INET; - self->addr4.sin_port = htons(0); - self->addr4.sin_addr.s_addr = htonl(variant->addr4_const); + if (family == AF_INET) { + struct sockaddr_in *addr4 = &self->addr[i].addr4; + const __u32 *addr4_const = addr_const; + + addr4->sin_family = AF_INET; + addr4->sin_port = htons(0); + addr4->sin_addr.s_addr = htonl(*addr4_const); + + self->addrlen[i] = sizeof(struct sockaddr_in); + } else { + struct sockaddr_in6 *addr6 = &self->addr[i].addr6; + const struct in6_addr *addr6_const = addr_const; + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(0); + addr6->sin6_addr = *addr6_const; - self->addr6.sin6_family = AF_INET6; - self->addr6.sin6_port = htons(0); - self->addr6.sin6_addr = *variant->addr6_const; + self->addrlen[i] = sizeof(struct sockaddr_in6); + } +} + +FIXTURE_SETUP(bind_wildcard) +{ + setup_addr(self, 0, variant->family[0], variant->addr[0]); + setup_addr(self, 1, variant->family[1], variant->addr[1]); } FIXTURE_TEARDOWN(bind_wildcard) @@ -146,15 +170,15 @@ void bind_sockets(struct __test_metadata *_metadata, TEST_F(bind_wildcard, v4_v6) { bind_sockets(_metadata, self, variant->expected_errno, - (struct sockaddr *)&self->addr4, sizeof(self->addr4), - (struct sockaddr *)&self->addr6, sizeof(self->addr6)); + &self->addr[0].addr, self->addrlen[0], + &self->addr[1].addr, self->addrlen[1]); } TEST_F(bind_wildcard, v6_v4) { bind_sockets(_metadata, self, variant->expected_errno, - (struct sockaddr *)&self->addr6, sizeof(self->addr6), - (struct sockaddr *)&self->addr4, sizeof(self->addr4)); + &self->addr[1].addr, self->addrlen[1], + &self->addr[0].addr, self->addrlen[0]); } TEST_HARNESS_MAIN -- GitLab From 6f9bc755c0215501c45897aa5c8b8b56fb65724e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:47 -0700 Subject: [PATCH 170/989] selftest: tcp: Define the reverse order bind() tests explicitly. Currently, bind_wildcard.c calls bind() twice for two addresses and checks the pre-defined errno against the 2nd call. Also, the two bind() calls are swapped to cover various patterns how bind buckets are created. However, only testing two addresses is insufficient to detect regression. So, we will add more bind() calls, and then, we need to define different errno for each bind() per test case. As a prepartion, let's define the reverse order bind() test cases as fixtures. No functional changes are intended. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 67 ++++++++++++++++++--- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index d65c3bb6ba138..143aae383da39 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -42,6 +42,7 @@ FIXTURE_VARIANT(bind_wildcard) int expected_errno; }; +/* (IPv4, IPv6) */ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { .family = {AF_INET, AF_INET6}, @@ -98,6 +99,63 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) .expected_errno = EADDRINUSE, }; +/* (IPv6, IPv4) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_loopback, &in4addr_any}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_loopback, &in4addr_loopback}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_any, &in4addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_any, &in4addr_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_loopback, &in4addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_loopback, &in4addr_loopback}, + .expected_errno = EADDRINUSE, +}; + static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, int family, const void *addr_const) { @@ -167,18 +225,11 @@ void bind_sockets(struct __test_metadata *_metadata, close(fd[0]); } -TEST_F(bind_wildcard, v4_v6) +TEST_F(bind_wildcard, plain) { bind_sockets(_metadata, self, variant->expected_errno, &self->addr[0].addr, self->addrlen[0], &self->addr[1].addr, self->addrlen[1]); } -TEST_F(bind_wildcard, v6_v4) -{ - bind_sockets(_metadata, self, variant->expected_errno, - &self->addr[1].addr, self->addrlen[1], - &self->addr[0].addr, self->addrlen[0]); -} - TEST_HARNESS_MAIN -- GitLab From 5e9e9afdb50449f35d3e65dd6b1cdf87e8ce185e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:48 -0700 Subject: [PATCH 171/989] selftest: tcp: Add v4-v4 and v6-v6 bind() conflict tests. We don't have bind() conflict tests for the same protocol pairs. Let's add them except for the same address pair, which will be covered by the following patch adding 6 more bind() calls for each test case. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 100 ++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 143aae383da39..5100dd713a494 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -42,6 +42,21 @@ FIXTURE_VARIANT(bind_wildcard) int expected_errno; }; +/* (IPv4, IPv4) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v4_local) +{ + .family = {AF_INET, AF_INET}, + .addr = {&in4addr_any, &in4addr_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any) +{ + .family = {AF_INET, AF_INET}, + .addr = {&in4addr_loopback, &in4addr_any}, + .expected_errno = EADDRINUSE, +}; + /* (IPv4, IPv6) */ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { @@ -156,6 +171,91 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) .expected_errno = EADDRINUSE, }; +/* (IPv6, IPv6) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_v4mapped_any}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_v4mapped_loopback}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_loopback}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_v4mapped_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_loopback}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_v4mapped_any}, + .expected_errno = EADDRINUSE, +}; + static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, int family, const void *addr_const) { -- GitLab From f40742c22a6e9ffb53bf02f22ea5eda55fbcfcc5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:49 -0700 Subject: [PATCH 172/989] selftest: tcp: Add more bind() calls. In addtition to the two addresses defined in the fixtures, this patch add 6 more bind calls(): * 0.0.0.0 * 127.0.0.1 * :: * ::1 * ::ffff:0.0.0.0 * ::ffff:127.0.0.1 The first two per-fixture bind() calls control how inet_bind2_bucket is created, and the rest 6 bind() calls cover as many conflicting patterns as possible. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 225 +++++++++++++++----- 1 file changed, 166 insertions(+), 59 deletions(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 5100dd713a494..4ecd3835f33ca 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -25,21 +25,34 @@ static const struct in6_addr in6addr_v4mapped_loopback = { } }; +#define NR_SOCKETS 8 + FIXTURE(bind_wildcard) { - socklen_t addrlen[2]; + int fd[NR_SOCKETS]; + socklen_t addrlen[NR_SOCKETS]; union { struct sockaddr addr; struct sockaddr_in addr4; struct sockaddr_in6 addr6; - } addr[2]; + } addr[NR_SOCKETS]; }; FIXTURE_VARIANT(bind_wildcard) { sa_family_t family[2]; const void *addr[2]; - int expected_errno; + + /* 6 bind() calls below follow two bind() for the defined 2 addresses: + * + * 0.0.0.0 + * 127.0.0.1 + * :: + * ::1 + * ::ffff:0.0.0.0 + * ::ffff:127.0.0.1 + */ + int expected_errno[NR_SOCKETS]; }; /* (IPv4, IPv4) */ @@ -47,14 +60,20 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v4_local) { .family = {AF_INET, AF_INET}, .addr = {&in4addr_any, &in4addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any) { .family = {AF_INET, AF_INET}, .addr = {&in4addr_loopback, &in4addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv4, IPv6) */ @@ -62,56 +81,80 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_any, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_any, &in6addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_any, &in6addr_v4mapped_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_any, &in6addr_v4mapped_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_loopback, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_loopback, &in6addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_loopback, &in6addr_v4mapped_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_loopback, &in6addr_v4mapped_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv6, IPv4) */ @@ -119,56 +162,80 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_any, &in4addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_any, &in4addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_loopback, &in4addr_any}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_loopback, &in4addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_v4mapped_any, &in4addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_v4mapped_any, &in4addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_v4mapped_loopback, &in4addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_v4mapped_loopback, &in4addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv6, IPv6) */ @@ -176,84 +243,120 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_any, &in6addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_any, &in6addr_v4mapped_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_any, &in6addr_v4mapped_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_loopback, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_loopback, &in6addr_v4mapped_any}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_loopback, &in6addr_v4mapped_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_any, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_any, &in6addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_v4mapped_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_any, &in6addr_v4mapped_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_loopback, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_loopback, &in6addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_loopback, &in6addr_v4mapped_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, @@ -284,52 +387,56 @@ FIXTURE_SETUP(bind_wildcard) { setup_addr(self, 0, variant->family[0], variant->addr[0]); setup_addr(self, 1, variant->family[1], variant->addr[1]); + + setup_addr(self, 2, AF_INET, &in4addr_any); + setup_addr(self, 3, AF_INET, &in4addr_loopback); + + setup_addr(self, 4, AF_INET6, &in6addr_any); + setup_addr(self, 5, AF_INET6, &in6addr_loopback); + setup_addr(self, 6, AF_INET6, &in6addr_v4mapped_any); + setup_addr(self, 7, AF_INET6, &in6addr_v4mapped_loopback); } FIXTURE_TEARDOWN(bind_wildcard) { + int i; + + for (i = 0; i < NR_SOCKETS; i++) + close(self->fd[i]); } -void bind_sockets(struct __test_metadata *_metadata, - FIXTURE_DATA(bind_wildcard) *self, - int expected_errno, - struct sockaddr *addr1, socklen_t addrlen1, - struct sockaddr *addr2, socklen_t addrlen2) +void bind_socket(struct __test_metadata *_metadata, + FIXTURE_DATA(bind_wildcard) *self, + const FIXTURE_VARIANT(bind_wildcard) *variant, + int i) { - int fd[2]; int ret; - fd[0] = socket(addr1->sa_family, SOCK_STREAM, 0); - ASSERT_GT(fd[0], 0); - - ret = bind(fd[0], addr1, addrlen1); - ASSERT_EQ(ret, 0); + self->fd[i] = socket(self->addr[i].addr.sa_family, SOCK_STREAM, 0); + ASSERT_GT(self->fd[i], 0); - ret = getsockname(fd[0], addr1, &addrlen1); - ASSERT_EQ(ret, 0); + self->addr[i].addr4.sin_port = self->addr[0].addr4.sin_port; - ((struct sockaddr_in *)addr2)->sin_port = ((struct sockaddr_in *)addr1)->sin_port; - - fd[1] = socket(addr2->sa_family, SOCK_STREAM, 0); - ASSERT_GT(fd[1], 0); - - ret = bind(fd[1], addr2, addrlen2); - if (expected_errno) { + ret = bind(self->fd[i], &self->addr[i].addr, self->addrlen[i]); + if (variant->expected_errno[i]) { ASSERT_EQ(ret, -1); - ASSERT_EQ(errno, expected_errno); + ASSERT_EQ(errno, variant->expected_errno[i]); } else { ASSERT_EQ(ret, 0); } - close(fd[1]); - close(fd[0]); + if (i == 0) { + ret = getsockname(self->fd[0], &self->addr[0].addr, &self->addrlen[0]); + ASSERT_EQ(ret, 0); + } } TEST_F(bind_wildcard, plain) { - bind_sockets(_metadata, self, variant->expected_errno, - &self->addr[0].addr, self->addrlen[0], - &self->addr[1].addr, self->addrlen[1]); + int i; + + for (i = 0; i < NR_SOCKETS; i++) + bind_socket(_metadata, self, variant, i); } TEST_HARNESS_MAIN -- GitLab From d37f2f72c91f2c5b61db7e6685c8b4bfdff85cb8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:50 -0700 Subject: [PATCH 173/989] selftest: tcp: Add bind() tests for IPV6_V6ONLY. bhash2 was not well tested for IPv6-only sockets. This patch adds test cases where we set IPV6_V6ONLY for per-fixture bind() calls if variant->ipv6_only[i] is true. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 116 ++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 4ecd3835f33ca..6d7f02441b9d7 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -42,6 +42,7 @@ FIXTURE_VARIANT(bind_wildcard) { sa_family_t family[2]; const void *addr[2]; + bool ipv6_only[2]; /* 6 bind() calls below follow two bind() for the defined 2 addresses: * @@ -87,6 +88,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any_only) +{ + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) { .family = {AF_INET, AF_INET6}, @@ -127,6 +139,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any_only) +{ + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) { .family = {AF_INET, AF_INET6}, @@ -168,6 +191,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_any}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) { .family = {AF_INET6, AF_INET}, @@ -178,6 +212,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_loopback}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) { .family = {AF_INET6, AF_INET}, @@ -249,6 +294,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_loopback}, + .ipv6_only = {true, false}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, @@ -259,6 +315,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_any}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) { .family = {AF_INET6, AF_INET6}, @@ -269,6 +336,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_loopback}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) { .family = {AF_INET6, AF_INET6}, @@ -279,6 +357,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, @@ -309,6 +398,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) { .family = {AF_INET6, AF_INET6}, @@ -339,6 +439,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) { .family = {AF_INET6, AF_INET6}, @@ -415,6 +526,11 @@ void bind_socket(struct __test_metadata *_metadata, self->fd[i] = socket(self->addr[i].addr.sa_family, SOCK_STREAM, 0); ASSERT_GT(self->fd[i], 0); + if (i < 2 && variant->ipv6_only[i]) { + ret = setsockopt(self->fd[i], SOL_IPV6, IPV6_V6ONLY, &(int){1}, sizeof(int)); + ASSERT_EQ(ret, 0); + } + self->addr[i].addr4.sin_port = self->addr[0].addr4.sin_port; ret = bind(self->fd[i], &self->addr[i].addr, self->addrlen[i]); -- GitLab From 7679f0968d01878b8da80c5078eebe23231a19e8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:51 -0700 Subject: [PATCH 174/989] selftest: tcp: Add bind() tests for SO_REUSEADDR/SO_REUSEPORT. This patch adds two tests using SO_REUSEADDR and SO_REUSEPORT and defines errno for each test case. SO_REUSEADDR/SO_REUSEPORT is set for the per-fixture two bind() calls. The notable pattern is the pair of v6only [::] and plain [::]. The two sockets are put into the same tb2, where per-bucket v6only flag would be useless to detect bind() conflict. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 263 +++++++++++++++++++- 1 file changed, 257 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 6d7f02441b9d7..b7b54d646b937 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -54,6 +54,7 @@ FIXTURE_VARIANT(bind_wildcard) * ::ffff:127.0.0.1 */ int expected_errno[NR_SOCKETS]; + int expected_reuse_errno[NR_SOCKETS]; }; /* (IPv4, IPv4) */ @@ -65,6 +66,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any) @@ -75,6 +80,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv4, IPv6) */ @@ -86,6 +95,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any_only) @@ -97,6 +110,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any_only) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) @@ -107,6 +124,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) @@ -117,6 +138,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) @@ -127,6 +152,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) @@ -137,6 +166,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any_only) @@ -148,6 +181,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any_only) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) @@ -158,6 +195,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) @@ -168,6 +209,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) @@ -178,6 +223,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv6, IPv4) */ @@ -189,6 +238,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_any) @@ -200,6 +253,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) @@ -210,6 +267,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_local) @@ -221,6 +282,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) @@ -231,6 +296,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local) @@ -241,6 +310,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any) @@ -251,6 +324,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local) @@ -261,6 +338,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any) @@ -271,6 +352,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) @@ -281,9 +366,72 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv6, IPv6) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .ipv6_only = {true, false}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .ipv6_only = {true, true}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) { .family = {AF_INET6, AF_INET6}, @@ -292,6 +440,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_local) @@ -303,6 +455,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_local) 0, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) @@ -313,6 +469,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_any) @@ -324,6 +484,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) @@ -334,6 +498,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_local) @@ -345,6 +513,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) @@ -355,6 +527,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) 0, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any_only) @@ -366,6 +542,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any_only) 0, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) @@ -376,6 +556,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local) @@ -386,6 +570,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) @@ -396,6 +584,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any_only) @@ -407,6 +599,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any_only) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) @@ -417,6 +613,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_v4mapped_local) @@ -427,6 +627,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) @@ -437,6 +641,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any_only) @@ -448,6 +656,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any_only) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) @@ -458,6 +670,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any) @@ -468,6 +684,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, @@ -519,7 +739,7 @@ FIXTURE_TEARDOWN(bind_wildcard) void bind_socket(struct __test_metadata *_metadata, FIXTURE_DATA(bind_wildcard) *self, const FIXTURE_VARIANT(bind_wildcard) *variant, - int i) + int i, int reuse) { int ret; @@ -531,14 +751,29 @@ void bind_socket(struct __test_metadata *_metadata, ASSERT_EQ(ret, 0); } + if (i < 2 && reuse) { + ret = setsockopt(self->fd[i], SOL_SOCKET, reuse, &(int){1}, sizeof(int)); + ASSERT_EQ(ret, 0); + } + self->addr[i].addr4.sin_port = self->addr[0].addr4.sin_port; ret = bind(self->fd[i], &self->addr[i].addr, self->addrlen[i]); - if (variant->expected_errno[i]) { - ASSERT_EQ(ret, -1); - ASSERT_EQ(errno, variant->expected_errno[i]); + + if (reuse) { + if (variant->expected_reuse_errno[i]) { + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, variant->expected_reuse_errno[i]); + } else { + ASSERT_EQ(ret, 0); + } } else { - ASSERT_EQ(ret, 0); + if (variant->expected_errno[i]) { + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, variant->expected_errno[i]); + } else { + ASSERT_EQ(ret, 0); + } } if (i == 0) { @@ -552,7 +787,23 @@ TEST_F(bind_wildcard, plain) int i; for (i = 0; i < NR_SOCKETS; i++) - bind_socket(_metadata, self, variant, i); + bind_socket(_metadata, self, variant, i, 0); +} + +TEST_F(bind_wildcard, reuseaddr) +{ + int i; + + for (i = 0; i < NR_SOCKETS; i++) + bind_socket(_metadata, self, variant, i, SO_REUSEADDR); +} + +TEST_F(bind_wildcard, reuseport) +{ + int i; + + for (i = 0; i < NR_SOCKETS; i++) + bind_socket(_metadata, self, variant, i, SO_REUSEPORT); } TEST_HARNESS_MAIN -- GitLab From c33f0d4fcfe072adbbb7f3cf93f1b146e181bf3b Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Fri, 29 Mar 2024 11:28:03 +0000 Subject: [PATCH 175/989] ALSA: hda/realtek: Add quirks for ASUS Laptops using CS35L56 These ASUS laptops use the Realtek HDA codec combined with a number of CS35L56 amplifiers. The SSID of the GA403U matches a previous ASUS laptop - we can tell them apart because they use different codecs. Signed-off-by: Simon Trimmer Message-ID: <20240329112803.23897-1-simont@opensource.cirrus.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 53 ++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c31e9be257a9b..e866b8a75cda3 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6875,11 +6875,38 @@ static void alc287_fixup_legion_16ithg6_speakers(struct hda_codec *cdc, const st comp_generic_fixup(cdc, action, "i2c", "CLSA0101", "-%s:00-cs35l41-hda.%d", 2); } +static void cs35l56_fixup_i2c_two(struct hda_codec *cdc, const struct hda_fixup *fix, int action) +{ + comp_generic_fixup(cdc, action, "i2c", "CSC3556", "-%s:00-cs35l56-hda.%d", 2); +} + +static void cs35l56_fixup_i2c_four(struct hda_codec *cdc, const struct hda_fixup *fix, int action) +{ + comp_generic_fixup(cdc, action, "i2c", "CSC3556", "-%s:00-cs35l56-hda.%d", 4); +} + +static void cs35l56_fixup_spi_two(struct hda_codec *cdc, const struct hda_fixup *fix, int action) +{ + comp_generic_fixup(cdc, action, "spi", "CSC3556", "-%s:00-cs35l56-hda.%d", 2); +} + static void cs35l56_fixup_spi_four(struct hda_codec *cdc, const struct hda_fixup *fix, int action) { comp_generic_fixup(cdc, action, "spi", "CSC3556", "-%s:00-cs35l56-hda.%d", 4); } +static void alc285_fixup_asus_ga403u(struct hda_codec *cdc, const struct hda_fixup *fix, int action) +{ + /* + * The same SSID has been re-used in different hardware, they have + * different codecs and the newer GA403U has a ALC285. + */ + if (cdc->core.vendor_id == 0x10ec0285) + cs35l56_fixup_i2c_two(cdc, fix, action); + else + alc_fixup_inv_dmic(cdc, fix, action); +} + static void tas2781_fixup_i2c(struct hda_codec *cdc, const struct hda_fixup *fix, int action) { @@ -7436,6 +7463,10 @@ enum { ALC256_FIXUP_ACER_SFG16_MICMUTE_LED, ALC256_FIXUP_HEADPHONE_AMP_VOL, ALC245_FIXUP_HP_SPECTRE_X360_EU0XXX, + ALC285_FIXUP_CS35L56_SPI_2, + ALC285_FIXUP_CS35L56_I2C_2, + ALC285_FIXUP_CS35L56_I2C_4, + ALC285_FIXUP_ASUS_GA403U, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -9643,6 +9674,22 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc245_fixup_hp_spectre_x360_eu0xxx, }, + [ALC285_FIXUP_CS35L56_SPI_2] = { + .type = HDA_FIXUP_FUNC, + .v.func = cs35l56_fixup_spi_two, + }, + [ALC285_FIXUP_CS35L56_I2C_2] = { + .type = HDA_FIXUP_FUNC, + .v.func = cs35l56_fixup_i2c_two, + }, + [ALC285_FIXUP_CS35L56_I2C_4] = { + .type = HDA_FIXUP_FUNC, + .v.func = cs35l56_fixup_i2c_four, + }, + [ALC285_FIXUP_ASUS_GA403U] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_asus_ga403u, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -10096,7 +10143,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1a83, "ASUS UM5302LA", ALC294_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1a8f, "ASUS UX582ZS", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1b11, "ASUS UX431DA", ALC294_FIXUP_ASUS_COEF_1B), - SND_PCI_QUIRK(0x1043, 0x1b13, "Asus U41SV", ALC269_FIXUP_INV_DMIC), + SND_PCI_QUIRK(0x1043, 0x1b13, "ASUS U41SV/GA403U", ALC285_FIXUP_ASUS_GA403U), SND_PCI_QUIRK(0x1043, 0x1b93, "ASUS G614JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1bbd, "ASUS Z550MA", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1c03, "ASUS UM3406HA", ALC287_FIXUP_CS35L41_I2C_2), @@ -10104,6 +10151,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1c33, "ASUS UX5304MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1c43, "ASUS UX8406MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1c62, "ASUS GU603", ALC289_FIXUP_ASUS_GA401), + SND_PCI_QUIRK(0x1043, 0x1c63, "ASUS GU605M", ALC285_FIXUP_CS35L56_SPI_2), SND_PCI_QUIRK(0x1043, 0x1c92, "ASUS ROG Strix G15", ALC285_FIXUP_ASUS_G533Z_PINS), SND_PCI_QUIRK(0x1043, 0x1c9f, "ASUS G614JU/JV/JI", ALC285_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1caf, "ASUS G634JY/JZ/JI/JG", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), @@ -10115,11 +10163,14 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1d42, "ASUS Zephyrus G14 2022", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1d4e, "ASUS TM420", ALC256_FIXUP_ASUS_HPE), SND_PCI_QUIRK(0x1043, 0x1da2, "ASUS UP6502ZA/ZD", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x1df3, "ASUS UM5606", ALC285_FIXUP_CS35L56_I2C_4), SND_PCI_QUIRK(0x1043, 0x1e02, "ASUS UX3402ZA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1e11, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA502), SND_PCI_QUIRK(0x1043, 0x1e12, "ASUS UM3402", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1e51, "ASUS Zephyrus M15", ALC294_FIXUP_ASUS_GU502_PINS), SND_PCI_QUIRK(0x1043, 0x1e5e, "ASUS ROG Strix G513", ALC294_FIXUP_ASUS_G513_PINS), + SND_PCI_QUIRK(0x1043, 0x1e63, "ASUS H7606W", ALC285_FIXUP_CS35L56_I2C_2), + SND_PCI_QUIRK(0x1043, 0x1e83, "ASUS GA605W", ALC285_FIXUP_CS35L56_I2C_2), SND_PCI_QUIRK(0x1043, 0x1e8e, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1ee2, "ASUS UM6702RA/RC", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1c52, "ASUS Zephyrus G15 2022", ALC289_FIXUP_ASUS_GA401), -- GitLab From ebd9779683aaf089ad0173862553cdd3288ad9b4 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Thu, 28 Mar 2024 21:44:48 +0000 Subject: [PATCH 176/989] smb: client: replace deprecated strncpy with strscpy strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. In cifssmb.c: Using strncpy with a length argument equal to strlen(src) is generally dangerous because it can cause string buffers to not be NUL-terminated. In this case, however, there was extra effort made to ensure the buffer was NUL-terminated via a manual NUL-byte assignment. In an effort to rid the kernel of strncpy() use, let's swap over to using strscpy() which guarantees NUL-termination on the destination buffer. To handle the case where ea_name is NULL, let's use the ?: operator to substitute in an empty string, thereby allowing strscpy to still NUL-terminate the destintation string. Interesting note: this flex array buffer may go on to also have some value encoded after the NUL-termination: | if (ea_value_len) | memcpy(parm_data->list.name + name_len + 1, | ea_value, ea_value_len); Now for smb2ops.c and smb2transport.c: Both of these cases are simple, strncpy() is used to copy string literals which have a length less than the destination buffer's size. We can simply swap in the new 2-argument version of strscpy() introduced in Commit e6584c3964f2f ("string: Allow 2-argument strscpy()"). Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html [2] Link: https://github.com/KSPP/linux/issues/90 Cc: linux-hardening@vger.kernel.org Signed-off-by: Justin Stitt Reviewed-by: Kees Cook Signed-off-by: Steve French --- fs/smb/client/cifssmb.c | 6 ++---- fs/smb/client/smb2ops.c | 2 +- fs/smb/client/smb2transport.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 5aee555515730..23b5709ddc311 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -5854,10 +5854,8 @@ SetEARetry: parm_data->list.EA_flags = 0; /* we checked above that name len is less than 255 */ parm_data->list.name_len = (__u8)name_len; - /* EA names are always ASCII */ - if (ea_name) - strncpy(parm_data->list.name, ea_name, name_len); - parm_data->list.name[name_len] = '\0'; + /* EA names are always ASCII and NUL-terminated */ + strscpy(parm_data->list.name, ea_name ?: "", name_len + 1); parm_data->list.value_len = cpu_to_le16(ea_value_len); /* caller ensures that ea_value_len is less than 64K but we need to ensure that it fits within the smb */ diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 2ed456948f34c..35bf7eb315cdf 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -3913,7 +3913,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, strcat(message, "W"); } if (!new_oplock) - strncpy(message, "None", sizeof(message)); + strscpy(message, "None"); cinode->oplock = new_oplock; cifs_dbg(FYI, "%s Lease granted on inode %p\n", message, diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 5a3ca62d2f07f..1d6e54f7879e6 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -659,7 +659,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) } spin_unlock(&server->srv_lock); if (!is_binding && !server->session_estab) { - strncpy(shdr->Signature, "BSRSPYL", 8); + strscpy(shdr->Signature, "BSRSPYL"); return 0; } -- GitLab From 52f80bb181a9a1530ade30bc18991900bbb9697f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 26 Mar 2024 15:53:37 +0100 Subject: [PATCH 177/989] ata: sata_sx4: fix pdc20621_get_from_dimm() on 64-bit gcc warns about a memcpy() with overlapping pointers because of an incorrect size calculation: In file included from include/linux/string.h:369, from drivers/ata/sata_sx4.c:66: In function 'memcpy_fromio', inlined from 'pdc20621_get_from_dimm.constprop' at drivers/ata/sata_sx4.c:962:2: include/linux/fortify-string.h:97:33: error: '__builtin_memcpy' accessing 4294934464 bytes at offsets 0 and [16, 16400] overlaps 6442385281 bytes at offset -2147450817 [-Werror=restrict] 97 | #define __underlying_memcpy __builtin_memcpy | ^ include/linux/fortify-string.h:620:9: note: in expansion of macro '__underlying_memcpy' 620 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ include/linux/fortify-string.h:665:26: note: in expansion of macro '__fortify_memcpy_chk' 665 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ include/asm-generic/io.h:1184:9: note: in expansion of macro 'memcpy' 1184 | memcpy(buffer, __io_virt(addr), size); | ^~~~~~ The problem here is the overflow of an unsigned 32-bit number to a negative that gets converted into a signed 'long', keeping a large positive number. Replace the complex calculation with a more readable min() variant that avoids the warning. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Arnd Bergmann Signed-off-by: Damien Le Moal --- drivers/ata/sata_sx4.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/ata/sata_sx4.c b/drivers/ata/sata_sx4.c index b51d7a9d0d90c..a482741eb181f 100644 --- a/drivers/ata/sata_sx4.c +++ b/drivers/ata/sata_sx4.c @@ -957,8 +957,7 @@ static void pdc20621_get_from_dimm(struct ata_host *host, void *psource, offset -= (idx * window_size); idx++; - dist = ((long) (window_size - (offset + size))) >= 0 ? size : - (long) (window_size - offset); + dist = min(size, window_size - offset); memcpy_fromio(psource, dimm_mmio + offset / 4, dist); psource += dist; @@ -1005,8 +1004,7 @@ static void pdc20621_put_to_dimm(struct ata_host *host, void *psource, readl(mmio + PDC_DIMM_WINDOW_CTLR); offset -= (idx * window_size); idx++; - dist = ((long)(s32)(window_size - (offset + size))) >= 0 ? size : - (long) (window_size - offset); + dist = min(size, window_size - offset); memcpy_toio(dimm_mmio + offset / 4, psource, dist); writel(0x01, mmio + PDC_GENERAL_CTLR); readl(mmio + PDC_GENERAL_CTLR); -- GitLab From 7d899947bca5e1dc2447d9cffb2b31c989e0ceb4 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 27 Mar 2024 18:49:36 +0100 Subject: [PATCH 178/989] ata: pata_macio: drop driver owner assignment PCI core in pci_register_driver() already sets the .owner, so driver does not need to. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Sergey Shtylyov Signed-off-by: Damien Le Moal --- drivers/ata/pata_macio.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c index 4ac854f6b0577..88b2e9817f49d 100644 --- a/drivers/ata/pata_macio.c +++ b/drivers/ata/pata_macio.c @@ -1371,9 +1371,6 @@ static struct pci_driver pata_macio_pci_driver = { .suspend = pata_macio_pci_suspend, .resume = pata_macio_pci_resume, #endif - .driver = { - .owner = THIS_MODULE, - }, }; MODULE_DEVICE_TABLE(pci, pata_macio_pci_match); -- GitLab From a5e3dce493d4b12b74000b6a99b6712afa5d1a4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Mar 2024 00:15:41 -0400 Subject: [PATCH 179/989] bcachefs: Fix assert in bch2_backpointer_invalid() Backpointers that point to invalid devices are caught by fsck, not .key_invalid; so .key_invalid needs to check for them instead of hitting asserts. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 8cb35ea572cb9..21b3ae7df8240 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -44,6 +44,11 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + + /* these will be caught by fsck */ + if (!bch2_dev_exists2(c, bp.k->p.inode)) + return 0; + struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); int ret = 0; -- GitLab From 8aad8e1f659fcea1b24072e816e434e4cd12382d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Mar 2024 04:01:27 -0400 Subject: [PATCH 180/989] bcachefs: Fix journal pins in btree write buffer btree write buffer flush has two phases - in natural key order, which is more efficient but may fail - then in journal order The journal order flush was assuming that keys were still correctly ordered by journal sequence number - but due to coalescing by the previous phase, we need an additional sort. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 5cbad8445782c..baf63e2fddb64 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -11,6 +11,7 @@ #include "journal_reclaim.h" #include +#include static int bch2_btree_write_buffer_journal_flush(struct journal *, struct journal_entry_pin *, u64); @@ -46,6 +47,14 @@ static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_ke #endif } +static int wb_key_seq_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->journal_seq, r->journal_seq); +} + /* Compare excluding idx, the low 24 bits: */ static inline bool wb_key_eq(const void *_l, const void *_r) { @@ -357,6 +366,11 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) */ trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); + sort(wb->flushing.keys.data, + wb->flushing.keys.nr, + sizeof(wb->flushing.keys.data[0]), + wb_key_seq_cmp, NULL); + darray_for_each(wb->flushing.keys, i) { if (!i->journal_seq) continue; -- GitLab From 688d750d10aa9c4fb71c5154521c775f94c887e0 Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Wed, 20 Mar 2024 21:42:42 -0600 Subject: [PATCH 181/989] bcachefs: fix misplaced newline in __bch2_inode_unpacked_to_text() before: u64s 18 type inode_v3 0:1879048192:U32_MAX len 0 ver 0: mode=40700 flags= (15300000) journal_seq=4 bi_size=0 bi_sectors=0 bi_version=0bi_atime=227064388944 ... after: u64s 18 type inode_v3 0:1879048192:U32_MAX len 0 ver 0: mode=40700 flags= (15300000) journal_seq=4 bi_size=0 bi_sectors=0 bi_version=0 bi_atime=227064388944 ... Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 2b5e06770ab39..ca4a066e9a542 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -552,8 +552,8 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, prt_printf(out, "bi_sectors=%llu", inode->bi_sectors); prt_newline(out); - prt_newline(out); prt_printf(out, "bi_version=%llu", inode->bi_version); + prt_newline(out); #define x(_name, _bits) \ prt_printf(out, #_name "=%llu", (u64) inode->_name); \ -- GitLab From 4bd02d3fb33d8a46e73085b8d47d21c0ccb3de9d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2024 01:20:36 -0400 Subject: [PATCH 182/989] bcachefs: fix mount error path Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 0ccee05f6887b..b5ea9fa1259d1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1997,6 +1997,7 @@ out: return dget(sb->s_root); err_put_super: + __bch2_fs_stop(c); deactivate_locked_super(sb); return ERR_PTR(bch2_err_class(ret)); } -- GitLab From aa6e130e3c2965a5c26a4033ff63b5dc9549bd76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Mar 2024 19:52:03 -0400 Subject: [PATCH 183/989] bcachefs: Add an assertion for trying to evict btree root Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 562561a9a510e..8ff21b2e14636 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1134,6 +1134,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) b = btree_cache_find(bc, k); if (!b) return; + + BUG_ON(b == btree_node_root(trans->c, b)); wait_on_io: /* not allowed to wait on io with btree locks held: */ -- GitLab From 63332394c7e1f4f26e8e5b1387212016aaa7eae2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Mar 2024 20:16:23 -0400 Subject: [PATCH 184/989] bcachefs: Move snapshot table size to struct snapshot_table We need to add bounds checking for snapshot table accesses - it turns out there are cases where we do need to use the snapshots table before fsck checks have completed (and indeed, fsck may not have been run). Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/snapshot.c | 25 ++++++++++++++----------- fs/bcachefs/snapshot.h | 6 +++++- fs/bcachefs/subvolume_types.h | 2 ++ 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 799aa32b6b4d9..a9ade0b1f78cf 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -810,7 +810,6 @@ struct bch_fs { /* snapshot.c: */ struct snapshot_table __rcu *snapshots; - size_t snapshot_table_size; struct mutex snapshot_table_lock; struct rw_semaphore snapshot_create_lock; diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 39debe814bf39..9cd71e613dc9b 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -151,36 +151,39 @@ out: static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) { size_t idx = U32_MAX - id; - size_t new_size; struct snapshot_table *new, *old; - new_size = max(16UL, roundup_pow_of_two(idx + 1)); + size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); + size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); - new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); + new = kvzalloc(new_bytes, GFP_KERNEL); if (!new) return NULL; + new->nr = new_size; + old = rcu_dereference_protected(c->snapshots, true); if (old) - memcpy(new->s, - rcu_dereference_protected(c->snapshots, true)->s, - sizeof(new->s[0]) * c->snapshot_table_size); + memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr); rcu_assign_pointer(c->snapshots, new); - c->snapshot_table_size = new_size; - kvfree_rcu_mightsleep(old); + kvfree_rcu(old, rcu); - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + return &rcu_dereference_protected(c->snapshots, + lockdep_is_held(&c->snapshot_table_lock))->s[idx]; } static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) { size_t idx = U32_MAX - id; + struct snapshot_table *table = + rcu_dereference_protected(c->snapshots, + lockdep_is_held(&c->snapshot_table_lock)); lockdep_assert_held(&c->snapshot_table_lock); - if (likely(idx < c->snapshot_table_size)) - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + if (likely(table && idx < table->nr)) + return &table->s[idx]; return __snapshot_t_mut(c, id); } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 7c66ffc06385d..8538b7fddfed3 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -33,7 +33,11 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) { - return &t->s[U32_MAX - id]; + u32 idx = U32_MAX - id; + + return likely(t && idx < t->nr) + ? &t->s[idx] + : NULL; } static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index ae644adfc3916..9b10c8947828e 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -20,6 +20,8 @@ struct snapshot_t { }; struct snapshot_table { + struct rcu_head rcu; + size_t nr; #ifndef RUST_BINDGEN DECLARE_FLEX_ARRAY(struct snapshot_t, s); #else -- GitLab From ec9cc18fc2e65b08c588e01f24aaeb71551a7132 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Mar 2024 16:29:23 -0400 Subject: [PATCH 185/989] bcachefs: Add checks for invalid snapshot IDs Previously, we assumed that keys were consistent with the snapshots btree - but that's not correct as fsck may not have been run or may not be complete. This adds checks and error handling when using the in-memory snapshots table (that mirrors the snapshots btree). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 2 +- fs/bcachefs/data_update.c | 9 +++++ fs/bcachefs/errcode.h | 3 +- fs/bcachefs/snapshot.c | 8 +++-- fs/bcachefs/snapshot.h | 57 ++++++++++++-------------------- 5 files changed, 40 insertions(+), 39 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 30d69a6d133ee..96669fede7d34 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -318,7 +318,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && i->k->k.p.snapshot && - bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); + bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0); } static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 4150feca42a2e..b564404dad711 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -14,6 +14,7 @@ #include "move.h" #include "nocow_locking.h" #include "rebalance.h" +#include "snapshot.h" #include "subvolume.h" #include "trace.h" @@ -509,6 +510,14 @@ int bch2_data_update_init(struct btree_trans *trans, unsigned ptrs_locked = 0; int ret = 0; + /* + * fs is corrupt we have a key for a snapshot node that doesn't exist, + * and we have to check for this because we go rw before repairing the + * snapshots table - just skip it, we can move it later. + */ + if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) + return -BCH_ERR_data_update_done; + bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); m->btree_id = btree_id; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index af25d8ec60f22..01a79fa3eacb2 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -252,7 +252,8 @@ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ - x(0, need_inode_lock) + x(0, need_inode_lock) \ + x(0, invalid_snapshot_node) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 9cd71e613dc9b..4e074136c4907 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -93,8 +93,10 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans, static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor) { - while (id && id < ancestor) - id = __snapshot_t(t, id)->parent; + while (id && id < ancestor) { + const struct snapshot_t *s = __snapshot_t(t, id); + id = s ? s->parent : 0; + } return id == ancestor; } @@ -110,6 +112,8 @@ static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancest static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) { const struct snapshot_t *s = __snapshot_t(t, id); + if (!s) + return 0; if (s->skip[2] <= ancestor) return s->skip[2]; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 8538b7fddfed3..331f20fd8d03d 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -48,7 +48,8 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) { rcu_read_lock(); - id = snapshot_t(c, id)->tree; + const struct snapshot_t *s = snapshot_t(c, id); + id = s ? s->tree : 0; rcu_read_unlock(); return id; @@ -56,7 +57,8 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) { - return snapshot_t(c, id)->parent; + const struct snapshot_t *s = snapshot_t(c, id); + return s ? s->parent : 0; } static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) @@ -70,19 +72,19 @@ static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) { -#ifdef CONFIG_BCACHEFS_DEBUG - u32 parent = snapshot_t(c, id)->parent; + const struct snapshot_t *s = snapshot_t(c, id); + if (!s) + return 0; - if (parent && - snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) + u32 parent = s->parent; + if (IS_ENABLED(CONFIG_BCACHEFS_DEBU) && + parent && + s->depth != snapshot_t(c, parent)->depth + 1) panic("id %u depth=%u parent %u depth=%u\n", id, snapshot_t(c, id)->depth, parent, snapshot_t(c, parent)->depth); return parent; -#else - return snapshot_t(c, id)->parent; -#endif } static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) @@ -120,7 +122,8 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) { - return snapshot_t(c, id)->equiv; + const struct snapshot_t *s = snapshot_t(c, id); + return s ? s->equiv : 0; } static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) @@ -137,38 +140,22 @@ static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) return id == bch2_snapshot_equiv(c, id); } -static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) +static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { - const struct snapshot_t *s; - bool ret; - rcu_read_lock(); - s = snapshot_t(c, id); - ret = s->children[0]; + const struct snapshot_t *s = snapshot_t(c, id); + int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node; rcu_read_unlock(); return ret; } -static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) +static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) { - return !bch2_snapshot_is_internal_node(c, id); -} - -static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s; - u32 parent = __bch2_snapshot_parent(c, id); - - if (!parent) - return 0; - - s = snapshot_t(c, __bch2_snapshot_parent(c, id)); - if (id == s->children[0]) - return s->children[1]; - if (id == s->children[1]) - return s->children[0]; - return 0; + int ret = bch2_snapshot_is_internal_node(c, id); + if (ret < 0) + return ret; + return !ret; } static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) @@ -253,7 +240,7 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, struct bpos pos) { if (!btree_type_has_snapshots(id) || - bch2_snapshot_is_leaf(trans->c, pos.snapshot)) + bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0) return 0; return __bch2_key_has_snapshot_overwrites(trans, id, pos); -- GitLab From 57339b24a0eda5433751e7e0f4a8ea1e23315f60 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Mar 2024 19:29:55 -0400 Subject: [PATCH 186/989] bcachefs: Don't do extent merging before journal replay is finished We don't normally do extent updates this early in recovery, but some of the repair paths have to and when we do, we don't want to do anything that requires the snapshots table. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index a4b40c1656a54..8e47e260eba59 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -38,6 +38,9 @@ static noinline int extent_front_merge(struct btree_trans *trans, struct bkey_i *update; int ret; + if (unlikely(trans->journal_replay_not_finished)) + return 0; + update = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(update); if (ret) @@ -69,6 +72,9 @@ static noinline int extent_back_merge(struct btree_trans *trans, struct bch_fs *c = trans->c; int ret; + if (unlikely(trans->journal_replay_not_finished)) + return 0; + ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); if (ret < 0) -- GitLab From 36f9ef109b1c6935928d09a3e73d744291f71545 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Mon, 25 Mar 2024 10:50:48 +0800 Subject: [PATCH 187/989] bcachefs: fix trans->mem realloc in __bch2_trans_kmalloc The old code doesn't consider the mem alloced from mempool when call krealloc on trans->mem. Also in bch2_trans_put, using mempool_free to free trans->mem by condition "trans->mem_bytes == BTREE_TRANS_MEM_MAX" is inaccurate when trans->mem was allocated by krealloc function. Instead, we use used_mempool stuff to record the situation, and realloc or free the trans->mem in elegant way. Also, after krealloc failed in __bch2_trans_kmalloc, the old data should be copied to the new buffer when alloc from mempool_alloc. Fixes: 31403dca5bb1 ("bcachefs: optimize __bch2_trans_get(), kill DEBUG_TRANSACTIONS") Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 51bcdc6c6d1cd..ae4a28a33ad1a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2790,6 +2790,31 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) struct btree_transaction_stats *s = btree_trans_stats(trans); s->max_mem = max(s->max_mem, new_bytes); + if (trans->used_mempool) { + if (trans->mem_bytes >= new_bytes) + goto out_change_top; + + /* No more space from mempool item, need malloc new one */ + new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); + if (unlikely(!new_mem)) { + bch2_trans_unlock(trans); + + new_mem = kmalloc(new_bytes, GFP_KERNEL); + if (!new_mem) + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + + ret = bch2_trans_relock(trans); + if (ret) { + kfree(new_mem); + return ERR_PTR(ret); + } + } + memcpy(new_mem, trans->mem, trans->mem_top); + trans->used_mempool = false; + mempool_free(trans->mem, &c->btree_trans_mem_pool); + goto out_new_mem; + } + new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_mem)) { bch2_trans_unlock(trans); @@ -2798,6 +2823,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; + memcpy(new_mem, trans->mem, trans->mem_top); + trans->used_mempool = true; kfree(trans->mem); } @@ -2811,7 +2838,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (ret) return ERR_PTR(ret); } - +out_new_mem: trans->mem = new_mem; trans->mem_bytes = new_bytes; @@ -2819,7 +2846,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } - +out_change_top: p = trans->mem + trans->mem_top; trans->mem_top += size; memset(p, 0, size); @@ -3093,7 +3120,7 @@ void bch2_trans_put(struct btree_trans *trans) if (paths_allocated != trans->_paths_allocated) kvfree_rcu_mightsleep(paths_allocated); - if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) + if (trans->used_mempool) mempool_free(trans->mem, &c->btree_trans_mem_pool); else kfree(trans->mem); -- GitLab From 048f47e83fc315499dc1943176b3ebe1a55574fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Mar 2024 13:51:29 -0400 Subject: [PATCH 188/989] bcachefs: btree_and_journal_iter now respects trans->journal_replay_not_finished btree_and_journal_iter is now safe to use at runtime, not just during recovery before journal keys have been freed. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 50e04356d72c8..0272ef0d73109 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -363,7 +363,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { - struct bkey_s_c btree_k, journal_k, ret; + struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; if (iter->prefetch && iter->journal.level) btree_and_journal_iter_prefetch(iter); @@ -375,9 +375,10 @@ again: bpos_lt(btree_k.k->p, iter->pos)) bch2_journal_iter_advance_btree(iter); - while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && - bpos_lt(journal_k.k->p, iter->pos)) - bch2_journal_iter_advance(&iter->journal); + if (iter->trans->journal_replay_not_finished) + while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && + bpos_lt(journal_k.k->p, iter->pos)) + bch2_journal_iter_advance(&iter->journal); ret = journal_k.k && (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) @@ -435,7 +436,9 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, bch2_btree_node_iter_init_from_start(&node_iter, b); __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); - list_add(&iter->journal.list, &trans->c->journal_iters); + if (trans->journal_replay_not_finished && + !test_bit(BCH_FS_may_go_rw, &trans->c->flags)) + list_add(&iter->journal.list, &trans->c->journal_iters); } /* sort and dedup all keys in the journal: */ -- GitLab From 40cb26233a060aeb936de7ea1f6ac2659ed9951c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Mar 2024 17:14:43 -0400 Subject: [PATCH 189/989] bcachefs: Be careful about btree node splits during journal replay Don't pick a pivot that's going to be deleted. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 16 ++++++++++++++++ fs/bcachefs/btree_journal_iter.h | 4 ++-- fs/bcachefs/btree_update_interior.c | 9 ++++++++- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 0272ef0d73109..1f588264575db 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -261,6 +261,22 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, return bch2_journal_key_insert(c, id, level, &whiteout); } +bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &trans->c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (!trans->journal_replay_not_finished) + return false; + + return (idx < keys->size && + keys->data[idx].btree_id == btree && + keys->data[idx].level == level && + bpos_eq(keys->data[idx].k->k.p, pos) && + bkey_deleted(&keys->data[idx].k->k)); +} + void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos pos) { diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index c9d19da3ea048..09cd53091a05b 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -40,8 +40,8 @@ int bch2_journal_key_insert(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_delete(struct bch_fs *, enum btree_id, unsigned, struct bpos); -void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, - unsigned, struct bpos); +bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos); +void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos); void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b2f5f2e50f7e1..b45c7ced15d8a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1380,9 +1380,16 @@ static void __btree_split_node(struct btree_update *as, if (bkey_deleted(k)) continue; + uk = bkey_unpack_key(b, k); + + if (b->c.level && + u64s < n1_u64s && + u64s + k->u64s >= n1_u64s && + bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p)) + n1_u64s += k->u64s; + i = u64s >= n1_u64s; u64s += k->u64s; - uk = bkey_unpack_key(b, k); if (!i) n1_pos = uk.p; bch2_bkey_format_add_key(&format[i], &uk); -- GitLab From 79032b078173f87a13f8618cdab710798be67314 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Mar 2024 19:29:19 -0400 Subject: [PATCH 190/989] bcachefs: Improved topology repair checks Consolidate bch2_gc_check_topology() and btree_node_interior_verify(), and replace them with an improved version, bch2_btree_node_check_topology(). This checks that children of an interior node correctly span the full range of the parent node with no overlaps. Also, ensure that topology repairs at runtime are always a fatal error; in particular, this adds a check in btree_iter_down() - if we don't find a key while walking down the btree that's indicative of a topology error and should be flagged as such, not a null ptr deref. Some checks in btree_update_interior.c remaining BUG_ONS(), because we already checked the node for topology errors when starting the update, and the assertions indicate that we _just_ corrupted the btree node - i.e. the problem can't be that existing on disk corruption, they indicate an actual algorithmic bug. In the future, we'll be annotating the fsck errors list with which recovery pass corrects them; the open coded "run explicit recovery pass or fatal error" in bch2_btree_node_check_topology() will in the future be done for every fsck_err() call. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 3 +- fs/bcachefs/btree_gc.c | 133 +++------------------------ fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_iter.c | 19 +++- fs/bcachefs/btree_update_interior.c | 134 ++++++++++++++++++++-------- fs/bcachefs/btree_update_interior.h | 2 + fs/bcachefs/error.h | 6 ++ fs/bcachefs/sb-errors_types.h | 3 +- 8 files changed, 137 insertions(+), 165 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 8ff21b2e14636..84474324dba9b 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -808,7 +808,8 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) prt_printf(&buf, "\nmax "); bch2_bpos_to_text(&buf, b->data->max_key); - bch2_fs_inconsistent(c, "%s", buf.buf); + bch2_fs_topology_error(c, "%s", buf.buf); + printbuf_exit(&buf); } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index bdaed29f084a4..6d32ac9be2438 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -70,90 +70,6 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) __gc_pos_set(c, new_pos); } -/* - * Missing: if an interior btree node is empty, we need to do something - - * perhaps just kill it - */ -static int bch2_gc_check_topology(struct bch_fs *c, - struct btree *b, - struct bkey_buf *prev, - struct bkey_buf cur, - bool is_last) -{ - struct bpos node_start = b->data->min_key; - struct bpos node_end = b->data->max_key; - struct bpos expected_start = bkey_deleted(&prev->k->k) - ? node_start - : bpos_successor(prev->k->k.p); - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - int ret = 0; - - if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { - struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); - - if (!bpos_eq(expected_start, bp->v.min_key)) { - bch2_topology_error(c); - - if (bkey_deleted(&prev->k->k)) { - prt_printf(&buf1, "start of node: "); - bch2_bpos_to_text(&buf1, node_start); - } else { - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); - } - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); - - if (__fsck_err(c, - FSCK_CAN_FIX| - FSCK_CAN_IGNORE| - FSCK_NO_RATELIMIT, - btree_node_topology_bad_min_key, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " cur %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto err; - } else { - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - } - } - } - - if (is_last && !bpos_eq(cur.k->k.p, node_end)) { - bch2_topology_error(c); - - printbuf_reset(&buf1); - printbuf_reset(&buf2); - - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); - bch2_bpos_to_text(&buf2, node_end); - - if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " %s\n" - " expected %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf) && - should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto err; - } else { - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - } - } - - bch2_bkey_buf_copy(prev, c, cur.k); -err: -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) { switch (b->key.k.type) { @@ -445,6 +361,7 @@ again: prev = NULL; if (ret == DROP_PREV_NODE) { + bch_info(c, "dropped prev node"); bch2_btree_node_evict(trans, prev_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, prev_k.k->k.p); @@ -841,42 +758,30 @@ err: static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) { - struct bch_fs *c = trans->c; struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; - struct bkey_buf prev, cur; int ret = 0; + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + if (!btree_node_type_needs_gc(btree_node_type(b))) return 0; bch2_btree_node_iter_init_from_start(&iter, b); - bch2_bkey_buf_init(&prev); - bch2_bkey_buf_init(&cur); - bkey_init(&prev.k->k); while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, &k, initial); if (ret) - break; + return ret; bch2_btree_node_iter_advance(&iter, b); - - if (b->c.level) { - bch2_bkey_buf_reassemble(&cur, c, k); - - ret = bch2_gc_check_topology(c, b, &prev, cur, - bch2_btree_node_iter_end(&iter)); - if (ret) - break; - } } - bch2_bkey_buf_exit(&cur, c); - bch2_bkey_buf_exit(&prev, c); - return ret; + return 0; } static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, @@ -925,14 +830,16 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bkey_buf cur, prev; + struct bkey_buf cur; struct printbuf buf = PRINTBUF; int ret = 0; + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - bch2_bkey_buf_init(&prev); bch2_bkey_buf_init(&cur); - bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); @@ -943,20 +850,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b if (ret) goto fsck_err; - if (b->c.level) { - bch2_bkey_buf_reassemble(&cur, c, k); - k = bkey_i_to_s_c(cur.k); - - bch2_btree_and_journal_iter_advance(&iter); - - ret = bch2_gc_check_topology(c, b, - &prev, cur, - !bch2_btree_and_journal_iter_peek(&iter).k); - if (ret) - goto fsck_err; - } else { - bch2_btree_and_journal_iter_advance(&iter); - } + bch2_btree_and_journal_iter_advance(&iter); } if (b->c.level > target_depth) { @@ -1015,7 +909,6 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b } fsck_err: bch2_bkey_buf_exit(&cur, c); - bch2_bkey_buf_exit(&prev, c); bch2_btree_and_journal_iter_exit(&iter); printbuf_exit(&buf); return ret; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 34df8ccc5fecc..b85e2cd61c381 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1657,7 +1657,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, prt_str(&buf, "btree node read error: no device to read from\n at "); bch2_btree_pos_to_text(&buf, c, b); - bch_err(c, "%s", buf.buf); + bch_err_ratelimited(c, "%s", buf.buf); if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ae4a28a33ad1a..2a211a4bebd15 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -927,8 +927,22 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (ret) goto err; } else { - bch2_bkey_buf_unpack(&tmp, c, l->b, - bch2_btree_node_iter_peek(&l->iter, l->b)); + struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b); + if (!k) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, " within parent node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key)); + + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + ret = -BCH_ERR_btree_need_topology_repair; + goto err; + } + + bch2_bkey_buf_unpack(&tmp, c, l->b, k); if ((flags & BTREE_ITER_PREFETCH) && c->opts.btree_node_prefetch) { @@ -962,7 +976,6 @@ err: return ret; } - static int bch2_btree_path_traverse_all(struct btree_trans *trans) { struct bch_fs *c = trans->c; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b45c7ced15d8a..aae7a2687eee7 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_buf.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_gc.h" @@ -18,6 +19,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -44,56 +46,103 @@ static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, return path_idx; } -/* Debug code: */ - /* * Verify that child nodes correctly span parent node's range: */ -static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) +int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) { -#ifdef CONFIG_BCACHEFS_DEBUG - struct bpos next_node = b->data->min_key; - struct btree_node_iter iter; + struct bch_fs *c = trans->c; + struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key + : b->data->min_key; + struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bkey_s_c_btree_ptr_v2 bp; - struct bkey unpacked; - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; + struct bkey_buf prev; + int ret = 0; - BUG_ON(!b->c.level); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; + if (!b->c.level) + return 0; - bch2_btree_node_iter_init_from_start(&iter, b); + bch2_bkey_buf_init(&prev); + bkey_init(&prev.k->k); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - while (1) { - k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { if (k.k->type != KEY_TYPE_btree_ptr_v2) - break; - bp = bkey_s_c_to_btree_ptr_v2(k); + goto out; - if (!bpos_eq(next_node, bp.v->min_key)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, next_node); - bch2_bpos_to_text(&buf2, bp.v->min_key); - panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); - } + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - bch2_btree_node_iter_advance(&iter, b); + struct bpos expected_min = bkey_deleted(&prev.k->k) + ? node_min + : bpos_successor(prev.k->k.p); - if (bch2_btree_node_iter_end(&iter)) { - if (!bpos_eq(k.k->p, b->key.k.p)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, b->key.k.p); - bch2_bpos_to_text(&buf2, k.k->p); - panic("expected end %s got %s\n", buf1.buf, buf2.buf); - } - break; + if (!bpos_eq(expected_min, bp.v->min_key)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "end of prev node doesn't match start of next node\n"), + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n prev "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_str(&buf, "\n next "); + bch2_bkey_val_to_text(&buf, c, k); + + need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf); + goto topology_repair; } - next_node = bpos_successor(k.k->p); + bch2_bkey_buf_reassemble(&prev, c, k); + bch2_btree_and_journal_iter_advance(&iter); + } + + if (bkey_deleted(&prev.k->k)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "empty interior node\n"); + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf); + goto topology_repair; + } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "last child node doesn't end at end of parent node\n"); + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n last key "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + + need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf); + goto topology_repair; + } +out: +fsck_err: + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&prev, c); + printbuf_exit(&buf); + return ret; +topology_repair: + if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) { + bch2_inconsistent_error(c); + ret = -BCH_ERR_btree_need_topology_repair; + } else { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); } -#endif + goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -1448,8 +1497,7 @@ static void __btree_split_node(struct btree_update *as, bch2_verify_btree_nr_keys(n[i]); - if (b->c.level) - btree_node_interior_verify(as->c, n[i]); + BUG_ON(bch2_btree_node_check_topology(trans, n[i])); } } @@ -1480,7 +1528,7 @@ static void btree_split_insert_keys(struct btree_update *as, __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); - btree_node_interior_verify(as->c, b); + BUG_ON(bch2_btree_node_check_topology(trans, b)); } } @@ -1498,6 +1546,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, BUG_ON(!parent && (b != btree_node_root(c, b))); BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + bch2_btree_interior_update_will_free_node(as, b); if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { @@ -1717,7 +1769,11 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t goto split; } - btree_node_interior_verify(c, b); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) { + bch2_btree_node_unlock_write(trans, path, b); + return ret; + } bch2_btree_insert_keys_interior(as, trans, path, b, keys); @@ -1735,7 +1791,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_btree_node_unlock_write(trans, path, b); - btree_node_interior_verify(c, b); + BUG_ON(bch2_btree_node_check_topology(trans, b)); return 0; split: /* diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index f651dd48aaa04..7ac3e17b84fde 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -10,6 +10,8 @@ #define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) +int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); + /* * Tracks an in progress split/rewrite of a btree node and the update to the * parent node: diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index ae1d6674c512d..36caedf72d89a 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -32,6 +32,12 @@ bool bch2_inconsistent_error(struct bch_fs *); int bch2_topology_error(struct bch_fs *); +#define bch2_fs_topology_error(c, ...) \ +({ \ + bch_err(c, "btree topology error: " __VA_ARGS__); \ + bch2_topology_error(c); \ +}) + #define bch2_fs_inconsistent(c, ...) \ ({ \ bch_err(c, __VA_ARGS__); \ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 5178bf579f7c5..8edbd9ef994c4 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -265,7 +265,8 @@ x(subvol_children_bad, 257) \ x(subvol_loop, 258) \ x(subvol_unreachable, 259) \ - x(btree_node_bkey_bad_u64s, 260) + x(btree_node_bkey_bad_u64s, 260) \ + x(btree_node_topology_empty_interior_node, 261) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, -- GitLab From bb66009958b277a9baffaa53d835661852550151 Mon Sep 17 00:00:00 2001 From: zhuxiaohui Date: Tue, 26 Mar 2024 20:03:45 +0800 Subject: [PATCH 191/989] bcachefs: add REQ_SYNC and REQ_IDLE in write dio when writing file with direct_IO on bcachefs, then performance is much lower than other fs due to write back throttle in block layer: wbt_wait+1 __rq_qos_throttle+32 blk_mq_submit_bio+394 submit_bio_noacct_nocheck+649 bch2_submit_wbio_replicas+538 __bch2_write+2539 bch2_direct_write+1663 bch2_write_iter+318 aio_write+355 io_submit_one+1224 __x64_sys_io_submit+169 do_syscall_64+134 entry_SYSCALL_64_after_hwframe+110 add set REQ_SYNC and REQ_IDLE in bio->bi_opf as standard dirct-io Signed-off-by: zhuxiaohui Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-direct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 33cb6da3a5ad2..f49e6c0f0f683 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -536,7 +536,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) if (likely(!dio->iter.count) || dio->op.error) break; - bio_reset(bio, NULL, REQ_OP_WRITE); + bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); } out: return bch2_dio_write_done(dio); @@ -618,7 +618,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) bio = bio_alloc_bioset(NULL, bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_WRITE, + REQ_OP_WRITE | REQ_SYNC | REQ_IDLE, GFP_KERNEL, &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); -- GitLab From 805b535a8afbcd8073a03eb25aafd82cb816bff6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Mar 2024 21:58:07 -0400 Subject: [PATCH 192/989] bcachefs: Check btree ptr min_key in .invalid Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 9 +++++++-- fs/bcachefs/sb-errors_types.h | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 61395b113df9b..b2432d88cda64 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -189,13 +189,18 @@ int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err, - btree_ptr_v2_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, + c, err, btree_ptr_v2_val_too_big, "value too big (%zu > %zu)", bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); + bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), + c, err, btree_ptr_v2_min_key_bad, + "min_key > key"); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); fsck_err: return ret; diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 8edbd9ef994c4..73e9634df8ffb 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -266,7 +266,8 @@ x(subvol_loop, 258) \ x(subvol_unreachable, 259) \ x(btree_node_bkey_bad_u64s, 260) \ - x(btree_node_topology_empty_interior_node, 261) + x(btree_node_topology_empty_interior_node, 261) \ + x(btree_ptr_v2_min_key_bad, 262) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, -- GitLab From 812a9297936a959c98a2e9e44a9a622bbe30b162 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Mar 2024 16:39:54 -0400 Subject: [PATCH 193/989] bcachefs: Fix btree node keys accounting in topology repair path When dropping keys now outside a now because we're changing the node min/max, we need to redo the node's accounting as well. Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 14 ++++++++++---- fs/bcachefs/bset.h | 2 ++ fs/bcachefs/btree_io.c | 1 + fs/bcachefs/btree_update_interior.c | 1 + 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 3fd1085b6c61e..3bb477840eab6 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -134,18 +134,24 @@ void bch2_dump_btree_node_iter(struct btree *b, printbuf_exit(&buf); } -#ifdef CONFIG_BCACHEFS_DEBUG - -void __bch2_verify_btree_nr_keys(struct btree *b) +struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) { struct bset_tree *t; struct bkey_packed *k; - struct btree_nr_keys nr = { 0 }; + struct btree_nr_keys nr = {}; for_each_bset(b, t) bset_tree_for_each_key(b, t, k) if (!bkey_deleted(k)) btree_keys_account_key_add(&nr, t - b->set, k); + return nr; +} + +#ifdef CONFIG_BCACHEFS_DEBUG + +void __bch2_verify_btree_nr_keys(struct btree *b) +{ + struct btree_nr_keys nr = bch2_btree_node_count_keys(b); BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 79c77baaa3838..120a79fd456bd 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -458,6 +458,8 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, /* Accounting: */ +struct btree_nr_keys bch2_btree_node_count_keys(struct btree *); + static inline void btree_keys_account_key(struct btree_nr_keys *n, unsigned bset, struct bkey_packed *k, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b85e2cd61c381..9c71e6fb9c41b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -654,6 +654,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) */ bch2_bset_set_no_aux_tree(b, b->set); bch2_btree_build_aux_trees(b); + b->nr = bch2_btree_node_count_keys(b); struct bkey_s_c k; struct bkey unpacked; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index aae7a2687eee7..826cebe4bce07 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1543,6 +1543,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; + bch2_verify_btree_nr_keys(b); BUG_ON(!parent && (b != btree_node_root(c, b))); BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); -- GitLab From 6f5869ffd9f111b81b95b73c6e54f07406591911 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Mar 2024 17:38:22 -0400 Subject: [PATCH 194/989] bcachefs: Fix use after free in bch2_check_fix_ptrs() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6d32ac9be2438..26e51af9acdcf 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -691,12 +691,6 @@ found: } } - ret = bch2_journal_key_insert_take(c, btree_id, level, new); - if (ret) { - kfree(new); - goto err; - } - if (level) bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); @@ -710,6 +704,12 @@ found: bch_info(c, "new key %s", buf.buf); } + ret = bch2_journal_key_insert_take(c, btree_id, level, new); + if (ret) { + kfree(new); + goto err; + } + *k = bkey_i_to_s_c(new); } err: -- GitLab From 83bb58539045b15653b61c6e8eb65f3f9c671cdf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Mar 2024 18:46:20 -0400 Subject: [PATCH 195/989] bcachefs: Fix repair path for missing indirect extents Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index c47c66c2b394d..ff7864731a073 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -185,8 +185,7 @@ not_found: } else { bkey_error_init(update); update->k.p = p.k->p; - update->k.p.offset = next_idx; - update->k.size = next_idx - *idx; + update->k.size = p.k->size; set_bkey_val_u64s(&update->k, 0); } -- GitLab From dcc1c04587aa9bc3515153f4c89cff73f2cb45b2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Mar 2024 18:46:38 -0400 Subject: [PATCH 196/989] bcachefs: Fix use after free in check_root_trans() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 47d4eefaba7ba..6d8367ab5ddda 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2098,17 +2098,21 @@ static int check_root_trans(struct btree_trans *trans) if (mustfix_fsck_err_on(ret, c, root_subvol_missing, "root subvol missing")) { - struct bkey_i_subvolume root_subvol; + struct bkey_i_subvolume *root_subvol = + bch2_trans_kmalloc(trans, sizeof(*root_subvol)); + ret = PTR_ERR_OR_ZERO(root_subvol); + if (ret) + goto err; snapshot = U32_MAX; inum = BCACHEFS_ROOT_INO; - bkey_subvolume_init(&root_subvol.k_i); - root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_subvol.v.flags = 0; - root_subvol.v.snapshot = cpu_to_le32(snapshot); - root_subvol.v.inode = cpu_to_le64(inum); - ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0); + bkey_subvolume_init(&root_subvol->k_i); + root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_subvol->v.flags = 0; + root_subvol->v.snapshot = cpu_to_le32(snapshot); + root_subvol->v.inode = cpu_to_le64(inum); + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); bch_err_msg(c, ret, "writing root subvol"); if (ret) goto err; -- GitLab From 47d2080e30b0b9fc636eba4e74f9e4bdc01543d7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Mar 2024 19:26:05 -0400 Subject: [PATCH 197/989] bcachefs: Kill bch2_bkey_ptr_data_type() Remove some duplication, and inconsistency between check_fix_ptrs and the main ptr marking paths Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 6 ++-- fs/bcachefs/backpointers.h | 32 +++++++++++++++++---- fs/bcachefs/btree_gc.c | 59 +++++++++++++++++++++----------------- fs/bcachefs/buckets.c | 12 ++++---- fs/bcachefs/extents.h | 24 ---------------- 5 files changed, 67 insertions(+), 66 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 21b3ae7df8240..7537d78aa9db6 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -29,8 +29,7 @@ static bool extent_matches_bp(struct bch_fs *c, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, - &bucket2, &bp2); + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bucket2, &bp2); if (bpos_eq(bucket, bucket2) && !memcmp(&bp, &bp2, sizeof(bp))) return true; @@ -507,8 +506,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree, level, - k, p, &bucket_pos, &bp); + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bucket_pos, &bp); ret = check_bp_exists(trans, s, bucket_pos, bp, k); if (ret) diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 327365a9feac4..da012ca7daee5 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -90,20 +90,40 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); } -static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p) +static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, + struct extent_ptr_decoded p, + const union bch_extent_entry *entry) { - return level ? BCH_DATA_btree : - p.has_ec ? BCH_DATA_stripe : - BCH_DATA_user; + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + return BCH_DATA_btree; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user; + case KEY_TYPE_stripe: { + const struct bch_extent_ptr *ptr = &entry->ptr; + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + BUG_ON(ptr < s.v->ptrs || + ptr >= s.v->ptrs + s.v->nr_blocks); + + return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity + : BCH_DATA_user; + } + default: + BUG(); + } } static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, + const union bch_extent_entry *entry, struct bpos *bucket_pos, struct bch_backpointer *bp) { - enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); s64 sectors = level ? btree_sectors(c) : k.k->size; u32 bucket_offset; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 26e51af9acdcf..0afefccf4e520 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -7,6 +7,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "bkey_methods.h" #include "bkey_buf.h" #include "btree_journal_iter.h" @@ -508,7 +509,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, p, entry_c); if (fsck_err_on(!g->gen_valid, c, ptr_to_missing_alloc_key, @@ -574,7 +575,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id continue; if (fsck_err_on(bucket_data_type(g->data_type) && - bucket_data_type(g->data_type) != data_type, c, + bucket_data_type(g->data_type) != + bucket_data_type(data_type), c, ptr_bucket_data_type_mismatch, "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", @@ -615,18 +617,13 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } if (do_update) { - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; - struct bch_extent_ptr *ptr; - struct bkey_i *new; - if (is_root) { bch_err(c, "cannot update btree roots yet"); ret = -EINVAL; goto err; } - new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + struct bkey_i *new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); if (!new) { ret = -BCH_ERR_ENOMEM_gc_repair_key; bch_err_msg(c, ret, "allocating new key"); @@ -641,7 +638,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id * btree node isn't there anymore, the read path will * sort it out: */ - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_GC_BUCKET(ca, ptr); @@ -649,19 +646,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id ptr->gen = g->gen; } } else { - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); - - (ptr->cached && - (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || - (!ptr->cached && - gen_cmp(ptr->gen, g->gen) < 0) || - gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || - (g->data_type && - g->data_type != data_type); - })); + struct bkey_ptrs ptrs; + union bch_extent_entry *entry; +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); + + if ((p.ptr.cached && + (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || + (!p.ptr.cached && + gen_cmp(p.ptr.gen, g->gen) < 0) || + gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type)) { + bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); + goto restart_drop_ptrs; + } + } again: ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_extent_entry_for_each(ptrs, entry) { @@ -736,10 +740,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, BUG_ON(bch2_journal_seq_verify && k->k->version.lo > atomic64_read(&c->journal.seq)); - ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); - if (ret) - goto err; - if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, bkey_version_in_future, "key version number higher than recorded: %llu > %llu", @@ -748,8 +748,13 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, atomic64_set(&c->key_version, k->k->version.lo); } + ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); + if (ret) + goto err; + ret = commit_do(trans, NULL, NULL, 0, - bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); + bch2_key_trigger(trans, btree_id, level, old, + unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); fsck_err: err: bch_err_fn(c, ret); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 96edf2c34d433..941401a210f56 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -525,6 +525,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, "different types of data in same bucket: %s, %s", bch2_data_type_str(g->data_type), bch2_data_type_str(data_type))) { + BUG(); ret = -EIO; goto err; } @@ -628,6 +629,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, bch2_data_type_str(ptr_data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + BUG(); ret = -EIO; goto err; } @@ -815,14 +817,14 @@ static int __mark_pointer(struct btree_trans *trans, static int bch2_trigger_pointer(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, - s64 *sectors, - unsigned flags) + const union bch_extent_entry *entry, + s64 *sectors, unsigned flags) { bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); struct bpos bucket; struct bch_backpointer bp; - bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); + bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, entry, &bucket, &bp); *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len); if (flags & BTREE_TRIGGER_TRANSACTIONAL) { @@ -851,7 +853,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (flags & BTREE_TRIGGER_GC) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); percpu_down_read(&c->mark_lock); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); @@ -979,7 +981,7 @@ static int __trigger_extent(struct btree_trans *trans, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors; - ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags); + ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); if (ret < 0) return ret; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index fd2669cdd76f3..3fd0169b98c18 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -596,30 +596,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) return ret; } -static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - return BCH_DATA_btree; - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return BCH_DATA_user; - case KEY_TYPE_stripe: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - BUG_ON(ptr < s.v->ptrs || - ptr >= s.v->ptrs + s.v->nr_blocks); - - return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity - : BCH_DATA_user; - } - default: - BUG(); - } -} - unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); -- GitLab From 7f9e5080366726084eb765a5d689bdf502e7e2ed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Mar 2024 19:39:26 -0400 Subject: [PATCH 198/989] bcachefs: Fix bch2_btree_increase_depth() When we haven't yet allocated any btree nodes for a given btree, we first need to call the regular split path to allocate one. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 826cebe4bce07..9c4325786ba8a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1882,9 +1882,12 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, { struct bch_fs *c = trans->c; struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; + + if (btree_node_fake(b)) + return bch2_btree_split_leaf(trans, path, flags); + struct btree_update *as = - bch2_btree_update_start(trans, trans->paths + path, - b->c.level, true, flags); + bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); if (IS_ERR(as)) return PTR_ERR(as); -- GitLab From 11d5568d3e04a2e6734d1eccc394cfcf5ca8523c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2024 01:41:03 -0400 Subject: [PATCH 199/989] bcachefs: fix backpointer for missing alloc key msg Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 7537d78aa9db6..762c8ddfc5e73 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -382,7 +382,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ backpointer_to_missing_alloc, "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", alloc_iter.pos.inode, alloc_iter.pos.offset, - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, bp_iter, 0); goto out; } -- GitLab From d2554263adcb4041f3608cb7476f102fda036ccc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Mar 2024 20:07:46 -0400 Subject: [PATCH 200/989] bcachefs: Split out recovery_passes.c We've grown a fair amount of code for managing recovery passes; tracking which ones we're running, which ones need to be run, and flagging in the superblock which ones need to be run on the next recovery. So it's worth splitting out into its own file, this code is pretty different from the code in recovery.c. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/chardev.c | 2 +- fs/bcachefs/error.c | 2 +- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/recovery.c | 249 +----------------- fs/bcachefs/recovery.h | 32 +-- fs/bcachefs/recovery_passes.c | 203 ++++++++++++++ fs/bcachefs/recovery_passes.h | 16 ++ ...covery_types.h => recovery_passes_types.h} | 7 +- fs/bcachefs/sb-downgrade.c | 2 +- fs/bcachefs/subvolume.c | 72 +++++ fs/bcachefs/subvolume.h | 3 + fs/bcachefs/super-io.c | 2 +- 16 files changed, 313 insertions(+), 286 deletions(-) create mode 100644 fs/bcachefs/recovery_passes.c create mode 100644 fs/bcachefs/recovery_passes.h rename fs/bcachefs/{recovery_types.h => recovery_passes_types.h} (94%) diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index b02796c8a5953..f6d86eb4b9765 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -67,6 +67,7 @@ bcachefs-y := \ quota.o \ rebalance.o \ recovery.o \ + recovery_passes.o \ reflink.o \ replicas.o \ sb-clean.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a9ade0b1f78cf..963162a627cd6 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -209,7 +209,7 @@ #include "fifo.h" #include "nocow_locking_types.h" #include "opts.h" -#include "recovery_types.h" +#include "recovery_passes_types.h" #include "sb-errors_types.h" #include "seqmutex.h" #include "time_stats.h" diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0afefccf4e520..e5d2c6daa663d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -25,7 +25,7 @@ #include "journal.h" #include "keylist.h" #include "move.h" -#include "recovery.h" +#include "recovery_passes.h" #include "reflink.h" #include "replicas.h" #include "super-io.h" diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 9c4325786ba8a..983bb27298cc2 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -19,7 +19,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" -#include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "super-io.h" #include "trace.h" diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 38defa19d52d7..cbfa6459bdbce 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -7,7 +7,7 @@ #include "chardev.h" #include "journal.h" #include "move.h" -#include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "super.h" #include "super-io.h" diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 043431206799d..f942a39474962 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" -#include "recovery.h" +#include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6d8367ab5ddda..aca6fae409cdf 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -12,7 +12,7 @@ #include "fsck.h" #include "inode.h" #include "keylist.h" -#include "recovery.h" +#include "recovery_passes.h" #include "snapshot.h" #include "super.h" #include "xattr.h" diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 03f9d6afe4678..f9e2b011f6d4f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1,35 +1,30 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "backpointers.h" -#include "bkey_buf.h" #include "alloc_background.h" -#include "btree_gc.h" +#include "bkey_buf.h" #include "btree_journal_iter.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" #include "buckets.h" #include "dirent.h" -#include "ec.h" #include "errcode.h" #include "error.h" #include "fs-common.h" -#include "fsck.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" -#include "lru.h" #include "logged_ops.h" #include "move.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "sb-clean.h" #include "sb-downgrade.h" #include "snapshot.h" -#include "subvolume.h" #include "super-io.h" #include @@ -186,7 +181,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) return cmp_int(l->journal_seq, r->journal_seq); } -static int bch2_journal_replay(struct bch_fs *c) +int bch2_journal_replay(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; DARRAY(struct journal_key *) keys_sorted = { 0 }; @@ -471,150 +466,6 @@ fsck_err: return ret; } -static int bch2_initialize_subvolumes(struct bch_fs *c) -{ - struct bkey_i_snapshot_tree root_tree; - struct bkey_i_snapshot root_snapshot; - struct bkey_i_subvolume root_volume; - int ret; - - bkey_snapshot_tree_init(&root_tree.k_i); - root_tree.k.p.offset = 1; - root_tree.v.master_subvol = cpu_to_le32(1); - root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); - - bkey_snapshot_init(&root_snapshot.k_i); - root_snapshot.k.p.offset = U32_MAX; - root_snapshot.v.flags = 0; - root_snapshot.v.parent = 0; - root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); - root_snapshot.v.tree = cpu_to_le32(1); - SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); - - bkey_subvolume_init(&root_volume.k_i); - root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_volume.v.flags = 0; - root_volume.v.snapshot = cpu_to_le32(U32_MAX); - root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - - ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); - bch_err_fn(c, ret); - return ret; -} - -static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bch_inode_unpacked inode; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); - ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_is_inode(k.k)) { - bch_err(trans->c, "root inode not found"); - ret = -BCH_ERR_ENOENT_inode; - goto err; - } - - ret = bch2_inode_unpack(k, &inode); - BUG_ON(ret); - - inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - - ret = bch2_inode_write(trans, &iter, &inode); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* set bi_subvol on root inode */ -noinline_for_stack -static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) -{ - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, - __bch2_fs_upgrade_for_subvolumes(trans)); - bch_err_fn(c, ret); - return ret; -} - -const char * const bch2_recovery_passes[] = { -#define x(_fn, ...) #_fn, - BCH_RECOVERY_PASSES() -#undef x - NULL -}; - -static int bch2_check_allocations(struct bch_fs *c) -{ - return bch2_gc(c, true, c->opts.norecovery); -} - -static int bch2_set_may_go_rw(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - /* - * After we go RW, the journal keys buffer can't be modified (except for - * setting journal_key->overwritten: it will be accessed by multiple - * threads - */ - move_gap(keys, keys->nr); - - set_bit(BCH_FS_may_go_rw, &c->flags); - - if (keys->nr || c->opts.fsck || !c->sb.clean) - return bch2_fs_read_write_early(c); - return 0; -} - -struct recovery_pass_fn { - int (*fn)(struct bch_fs *); - unsigned when; -}; - -static struct recovery_pass_fn recovery_pass_fns[] = { -#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, - BCH_RECOVERY_PASSES() -#undef x -}; - -u64 bch2_recovery_passes_to_stable(u64 v) -{ - static const u8 map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, - BCH_RECOVERY_PASSES() -#undef x - }; - - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); - return ret; -} - -u64 bch2_recovery_passes_from_stable(u64 v) -{ - static const u8 map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - }; - - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); - return ret; -} - static bool check_version_upgrade(struct bch_fs *c) { unsigned latest_version = bcachefs_metadata_version_current; @@ -687,96 +538,6 @@ static bool check_version_upgrade(struct bch_fs *c) return false; } -u64 bch2_fsck_recovery_passes(void) -{ - u64 ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) - if (recovery_pass_fns[i].when & PASS_FSCK) - ret |= BIT_ULL(i); - return ret; -} - -static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct recovery_pass_fn *p = recovery_pass_fns + pass; - - if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) - return false; - if (c->recovery_passes_explicit & BIT_ULL(pass)) - return true; - if ((p->when & PASS_FSCK) && c->opts.fsck) - return true; - if ((p->when & PASS_UNCLEAN) && !c->sb.clean) - return true; - if (p->when & PASS_ALWAYS) - return true; - return false; -} - -static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct recovery_pass_fn *p = recovery_pass_fns + pass; - int ret; - - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); - ret = p->fn(c); - if (ret) - return ret; - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_CONT " done\n"); - - return 0; -} - -static int bch2_run_recovery_passes(struct bch_fs *c) -{ - int ret = 0; - - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { - if (should_run_recovery_pass(c, c->curr_recovery_pass)) { - unsigned pass = c->curr_recovery_pass; - - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || - (ret && c->curr_recovery_pass < pass)) - continue; - if (ret) - break; - - c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); - } - c->curr_recovery_pass++; - c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); - } - - return ret; -} - -int bch2_run_online_recovery_passes(struct bch_fs *c) -{ - int ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { - struct recovery_pass_fn *p = recovery_pass_fns + i; - - if (!(p->when & PASS_ONLINE)) - continue; - - ret = bch2_run_recovery_pass(c, i); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { - i = c->curr_recovery_pass; - continue; - } - if (ret) - break; - } - - return ret; -} - int bch2_fs_recovery(struct bch_fs *c) { struct bch_sb_field_clean *clean = NULL; @@ -1155,7 +916,7 @@ int bch2_fs_initialize(struct bch_fs *c) } mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); + c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; set_bit(BCH_FS_may_go_rw, &c->flags); for (unsigned i = 0; i < BTREE_ID_NR; i++) @@ -1230,7 +991,7 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1; + c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 4e9d24719b2e8..3962fd87b50d8 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,37 +2,7 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -extern const char * const bch2_recovery_passes[]; - -u64 bch2_recovery_passes_to_stable(u64 v); -u64 bch2_recovery_passes_from_stable(u64 v); - -/* - * For when we need to rewind recovery passes and run a pass we skipped: - */ -static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - if (c->recovery_passes_explicit & BIT_ULL(pass)) - return 0; - - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); - - c->recovery_passes_explicit |= BIT_ULL(pass); - - if (c->curr_recovery_pass >= pass) { - c->curr_recovery_pass = pass; - c->recovery_passes_complete &= (1ULL << pass) >> 1; - return -BCH_ERR_restart_recovery; - } else { - return 0; - } -} - -int bch2_run_online_recovery_passes(struct bch_fs *); -u64 bch2_fsck_recovery_passes(void); +int bch2_journal_replay(struct bch_fs *); int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c new file mode 100644 index 0000000000000..bab586dc395a6 --- /dev/null +++ b/fs/bcachefs/recovery_passes.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "btree_gc.h" +#include "ec.h" +#include "fsck.h" +#include "inode.h" +#include "journal.h" +#include "lru.h" +#include "logged_ops.h" +#include "rebalance.h" +#include "recovery.h" +#include "recovery_passes.h" +#include "snapshot.h" +#include "subvolume.h" +#include "super.h" + +const char * const bch2_recovery_passes[] = { +#define x(_fn, ...) #_fn, + BCH_RECOVERY_PASSES() +#undef x + NULL +}; + +static int bch2_check_allocations(struct bch_fs *c) +{ + return bch2_gc(c, true, c->opts.norecovery); +} + +static int bch2_set_may_go_rw(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + + /* + * After we go RW, the journal keys buffer can't be modified (except for + * setting journal_key->overwritten: it will be accessed by multiple + * threads + */ + move_gap(keys, keys->nr); + + set_bit(BCH_FS_may_go_rw, &c->flags); + + if (keys->nr || c->opts.fsck || !c->sb.clean) + return bch2_fs_read_write_early(c); + return 0; +} + +struct recovery_pass_fn { + int (*fn)(struct bch_fs *); + unsigned when; +}; + +static struct recovery_pass_fn recovery_pass_fns[] = { +#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, + BCH_RECOVERY_PASSES() +#undef x +}; + +u64 bch2_recovery_passes_to_stable(u64 v) +{ + static const u8 map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, + BCH_RECOVERY_PASSES() +#undef x + }; + + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(map[i]); + return ret; +} + +u64 bch2_recovery_passes_from_stable(u64 v) +{ + static const u8 map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x + }; + + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(map[i]); + return ret; +} + +/* + * For when we need to rewind recovery passes and run a pass we skipped: + */ +int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return 0; + + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + + c->recovery_passes_explicit |= BIT_ULL(pass); + + if (c->curr_recovery_pass >= pass) { + c->curr_recovery_pass = pass; + c->recovery_passes_complete &= (1ULL << pass) >> 1; + return -BCH_ERR_restart_recovery; + } else { + return 0; + } +} + +u64 bch2_fsck_recovery_passes(void) +{ + u64 ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & PASS_FSCK) + ret |= BIT_ULL(i); + return ret; +} + +static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_pass_fns + pass; + + if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) + return false; + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return true; + if ((p->when & PASS_FSCK) && c->opts.fsck) + return true; + if ((p->when & PASS_UNCLEAN) && !c->sb.clean) + return true; + if (p->when & PASS_ALWAYS) + return true; + return false; +} + +static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_pass_fns + pass; + int ret; + + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_CONT " done\n"); + + return 0; +} + +int bch2_run_online_recovery_passes(struct bch_fs *c) +{ + int ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { + struct recovery_pass_fn *p = recovery_pass_fns + i; + + if (!(p->when & PASS_ONLINE)) + continue; + + ret = bch2_run_recovery_pass(c, i); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { + i = c->curr_recovery_pass; + continue; + } + if (ret) + break; + } + + return ret; +} + +int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; + + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + if (should_run_recovery_pass(c, c->curr_recovery_pass)) { + unsigned pass = c->curr_recovery_pass; + + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || + (ret && c->curr_recovery_pass < pass)) + continue; + if (ret) + break; + + c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); + } + c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); + } + + return ret; +} diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h new file mode 100644 index 0000000000000..abefa67749ebb --- /dev/null +++ b/fs/bcachefs/recovery_passes.h @@ -0,0 +1,16 @@ +#ifndef _BCACHEFS_RECOVERY_PASSES_H +#define _BCACHEFS_RECOVERY_PASSES_H + +extern const char * const bch2_recovery_passes[]; + +u64 bch2_recovery_passes_to_stable(u64 v); +u64 bch2_recovery_passes_from_stable(u64 v); + +u64 bch2_fsck_recovery_passes(void); + +int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); + +int bch2_run_online_recovery_passes(struct bch_fs *); +int bch2_run_recovery_passes(struct bch_fs *); + +#endif /* _BCACHEFS_RECOVERY_PASSES_H */ diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_passes_types.h similarity index 94% rename from fs/bcachefs/recovery_types.h rename to fs/bcachefs/recovery_passes_types.h index 4959e95e7c746..09c1ab032a675 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_TYPES_H -#define _BCACHEFS_RECOVERY_TYPES_H +#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H +#define _BCACHEFS_RECOVERY_PASSES_TYPES_H #define PASS_SILENT BIT(0) #define PASS_FSCK BIT(1) @@ -56,6 +56,7 @@ enum bch_recovery_pass { #define x(n, id, when) BCH_RECOVERY_PASS_##n, BCH_RECOVERY_PASSES() #undef x + BCH_RECOVERY_PASS_NR }; /* But we also need stable identifiers that can be used in the superblock */ @@ -65,4 +66,4 @@ enum bch_recovery_pass_stable { #undef x }; -#endif /* _BCACHEFS_RECOVERY_TYPES_H */ +#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index e4396cb0bacb0..d6f81179c3a29 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -7,7 +7,7 @@ #include "bcachefs.h" #include "darray.h" -#include "recovery.h" +#include "recovery_passes.h" #include "sb-downgrade.h" #include "sb-errors.h" #include "super-io.h" diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index ce7aed1219423..88a79c8232768 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -595,6 +595,78 @@ err: return ret; } +int bch2_initialize_subvolumes(struct bch_fs *c) +{ + struct bkey_i_snapshot_tree root_tree; + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; + int ret; + + bkey_snapshot_tree_init(&root_tree.k_i); + root_tree.k.p.offset = 1; + root_tree.v.master_subvol = cpu_to_le32(1); + root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); + + bkey_snapshot_init(&root_snapshot.k_i); + root_snapshot.k.p.offset = U32_MAX; + root_snapshot.v.flags = 0; + root_snapshot.v.parent = 0; + root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); + root_snapshot.v.tree = cpu_to_le32(1); + SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); + + bkey_subvolume_init(&root_volume.k_i); + root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_volume.v.flags = 0; + root_volume.v.snapshot = cpu_to_le32(U32_MAX); + root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); + + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); + bch_err_fn(c, ret); + return ret; +} + +static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); + ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_is_inode(k.k)) { + bch_err(trans->c, "root inode not found"); + ret = -BCH_ERR_ENOENT_inode; + goto err; + } + + ret = bch2_inode_unpack(k, &inode); + BUG_ON(ret); + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + + ret = bch2_inode_write(trans, &iter, &inode); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* set bi_subvol on root inode */ +int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) +{ + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + __bch2_fs_upgrade_for_subvolumes(trans)); + bch_err_fn(c, ret); + return ret; +} + int bch2_fs_subvolumes_init(struct bch_fs *c) { INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 903c05162c068..d2015d549bd2a 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -37,6 +37,9 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *); int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); +int bch2_initialize_subvolumes(struct bch_fs *); +int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); + int bch2_fs_subvolumes_init(struct bch_fs *); #endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ad28e370b6404..e15f8b1f30c2a 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -8,7 +8,7 @@ #include "journal.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" -#include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "quota.h" #include "sb-clean.h" -- GitLab From e5aa80464155287cc309d18c1c93962357e3e393 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Mar 2024 19:30:58 -0400 Subject: [PATCH 201/989] bcachefs: Add error messages to logged ops fns Signed-off-by: Kent Overstreet --- fs/bcachefs/io_misc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 1baf78594ccaf..82f9170dab3fd 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -264,6 +264,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, ret = 0; err: bch2_logged_op_finish(trans, op_k); + bch_err_fn(c, ret); return ret; } @@ -476,6 +477,7 @@ case LOGGED_OP_FINSERT_finish: break; } err: + bch_err_fn(c, ret); bch2_logged_op_finish(trans, op_k); bch2_trans_iter_exit(trans, &iter); return ret; -- GitLab From af855a5f5e74cf0ef1166759fca937ce692b4aac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Mar 2024 19:31:15 -0400 Subject: [PATCH 202/989] bcachefs: Resume logged ops after fsck Finishing logged ops requires the filesystem to be in a reasonably consistent state - and other fsck passes don't require it to have completed, so just run it last. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 09c1ab032a675..f305212857068 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -37,7 +37,6 @@ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 22, 0) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ x(check_inodes, 24, PASS_FSCK) \ x(check_extents, 25, PASS_FSCK) \ x(check_indirect_extents, 26, PASS_FSCK) \ @@ -47,6 +46,7 @@ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ + x(resume_logged_ops, 23, PASS_ALWAYS) \ x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ x(fix_reflink_p, 33, 0) \ x(set_fs_needs_rebalance, 34, 0) \ -- GitLab From 4fe0eeeae477328cbd26af1e6f81a94e2080ffa8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2024 02:36:10 -0400 Subject: [PATCH 203/989] bcachefs: Flush journal immediately after replay if we did early repair Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f9e2b011f6d4f..dbedb5fd1dde5 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -189,6 +189,7 @@ int bch2_journal_replay(struct bch_fs *c) u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; struct btree_trans *trans = bch2_trans_get(c); + bool immediate_flush = false; int ret = 0; if (keys->nr) { @@ -210,6 +211,13 @@ int bch2_journal_replay(struct bch_fs *c) darray_for_each(*keys, k) { cond_resched(); + /* + * k->allocated means the key wasn't read in from the journal, + * rather it was from early repair code + */ + if (k->allocated) + immediate_flush = true; + /* Skip fastpath if we're low on space in the journal */ ret = c->journal.watermark ? -1 : commit_do(trans, NULL, NULL, @@ -269,6 +277,12 @@ int bch2_journal_replay(struct bch_fs *c) bch2_journal_set_replay_done(j); + /* if we did any repair, flush it immediately */ + if (immediate_flush) { + bch2_journal_flush_all_pins(&c->journal); + ret = bch2_journal_meta(&c->journal); + } + if (keys->nr) bch2_journal_log_msg(c, "journal replay finished"); err: @@ -778,6 +792,12 @@ use_clean: clear_bit(BCH_FS_fsck_running, &c->flags); + /* fsync if we fixed errors */ + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_meta(&c->journal); + } + /* If we fixed errors, verify that fs is actually clean now: */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && test_bit(BCH_FS_errors_fixed, &c->flags) && -- GitLab From 0a34c058fca84b10002228a1724e2e613e4dc3cc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Mar 2024 18:57:53 -0400 Subject: [PATCH 204/989] bcachefs: Ensure bch_sb_field_ext always exists This makes bch_sb_field_ext more consistent with the rest of -o nochanges - we don't want to be varying other codepaths based on -o nochanges, since it's used for testing in dry run mode; also fixes some potential null ptr derefs. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 25 ++++++++----------------- fs/bcachefs/super.c | 8 ++++++++ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index dbedb5fd1dde5..e3b06430d6067 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -592,16 +592,9 @@ int bch2_fs_recovery(struct bch_fs *c) if (!c->opts.nochanges) { mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; - struct bch_sb_field_ext *ext = - bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); - if (!ext) { - ret = -BCH_ERR_ENOSPC_sb; - mutex_unlock(&c->sb_lock); - goto err; - } - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { ext->recovery_passes_required[0] |= cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); @@ -832,6 +825,7 @@ use_clean: } mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { @@ -845,15 +839,12 @@ use_clean: write_sb = true; } - if (!test_bit(BCH_FS_error, &c->flags)) { - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - if (ext && - (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || - !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) { - memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); - memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); - write_sb = true; - } + if (!test_bit(BCH_FS_error, &c->flags) && + (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || + !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) { + memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); + memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); + write_sb = true; } if (c->opts.fsck && diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 1ad6e5cd9476c..89ee5d3ec1196 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1015,8 +1015,16 @@ int bch2_fs_start(struct bch_fs *c) for_each_online_member(c, ca) bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); + struct bch_sb_field_ext *ext = + bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); mutex_unlock(&c->sb_lock); + if (!ext) { + bch_err(c, "insufficient space in superblock for sb_field_ext"); + ret = -BCH_ERR_ENOSPC_sb; + goto err; + } + for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); -- GitLab From 060ff30a8596b649a80c19935758000dde7855fe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Mar 2024 20:43:39 -0400 Subject: [PATCH 205/989] bcachefs: bch2_run_explicit_recovery_pass_persistent() Flag that we need to run a recovery pass and run it - persistenly, so if we crash it'll still get run. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 18 ++++++++++++++++++ fs/bcachefs/recovery_passes.h | 1 + 2 files changed, 19 insertions(+) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index bab586dc395a6..fb22cce10f668 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -16,6 +16,7 @@ #include "snapshot.h" #include "subvolume.h" #include "super.h" +#include "super-io.h" const char * const bch2_recovery_passes[] = { #define x(_fn, ...) #_fn, @@ -112,6 +113,23 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, } } +int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + __le64 s = cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(pass))); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (!(ext->recovery_passes_required[0] & s)) { + ext->recovery_passes_required[0] |= s; + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + + return bch2_run_explicit_recovery_pass(c, pass); +} + u64 bch2_fsck_recovery_passes(void) { u64 ret = 0; diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index abefa67749ebb..99b464e127b80 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -9,6 +9,7 @@ u64 bch2_recovery_passes_from_stable(u64 v); u64 bch2_fsck_recovery_passes(void); int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); +int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); int bch2_run_online_recovery_passes(struct bch_fs *); int bch2_run_recovery_passes(struct bch_fs *); -- GitLab From 13c1e583f9179ad7953dc71ebb2f12e613b9d052 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2024 21:34:14 -0400 Subject: [PATCH 206/989] bcachefs: Improve -o norecovery; opts.recovery_pass_limit This adds opts.recovery_pass_limit, and redoes -o norecovery to make use of it; this fixes some issues with -o norecovery so it can be safely used for data recovery. Norecovery means "don't do journal replay"; it's an important data recovery tool when we're getting stuck in journal replay. When using it this way we need to make sure we don't free journal keys after startup, so we continue to overlay them: thus it needs to imply retain_recovery_info, as well as nochanges. recovery_pass_limit is an explicit option for telling recovery to exit after a specific recovery pass; this is a much cleaner way of implementing -o norecovery, as well as being a useful debug feature in its own right. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 4 ++++ fs/bcachefs/opts.h | 7 ++++++- fs/bcachefs/recovery.c | 10 ++++------ fs/bcachefs/recovery_passes.c | 12 ++++++++---- fs/bcachefs/snapshot.c | 2 +- fs/bcachefs/super.c | 5 +++-- 6 files changed, 26 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 08ea0cfc4aef0..e1800c4119b5f 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -7,6 +7,7 @@ #include "disk_groups.h" #include "error.h" #include "opts.h" +#include "recovery_passes.h" #include "super-io.h" #include "util.h" @@ -205,6 +206,9 @@ const struct bch_option bch2_opt_table[] = { #define OPT_STR(_choices) .type = BCH_OPT_STR, \ .min = 0, .max = ARRAY_SIZE(_choices), \ .choices = _choices +#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ + .min = 0, .max = U64_MAX, \ + .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 136083c11f3a3..084247c13bd8f 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -362,7 +362,12 @@ enum fsck_err_opts { OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ - NULL, "Don't replay the journal") \ + NULL, "Exit recovery immediately prior to journal replay")\ + x(recovery_pass_last, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_STR_NOLIMIT(bch2_recovery_passes), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Exit recovery after specified pass") \ x(keep_journal, u8, \ 0, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e3b06430d6067..e8b4340092936 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -269,7 +269,8 @@ int bch2_journal_replay(struct bch_fs *c) bch2_trans_put(trans); trans = NULL; - if (!c->opts.keep_journal) + if (!c->opts.keep_journal && + c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) bch2_journal_keys_put_initial(c); replay_now_at(j, j->replay_journal_seq_end); @@ -584,11 +585,8 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (c->opts.fsck && c->opts.norecovery) { - bch_err(c, "cannot select both norecovery and fsck"); - ret = -EINVAL; - goto err; - } + if (c->opts.norecovery) + c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1; if (!c->opts.nochanges) { mutex_lock(&c->sb_lock); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index fb22cce10f668..0089d9456b100 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -27,7 +27,7 @@ const char * const bch2_recovery_passes[] = { static int bch2_check_allocations(struct bch_fs *c) { - return bch2_gc(c, true, c->opts.norecovery); + return bch2_gc(c, true, false); } static int bch2_set_may_go_rw(struct bch_fs *c) @@ -144,8 +144,6 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa { struct recovery_pass_fn *p = recovery_pass_fns + pass; - if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) - return false; if (c->recovery_passes_explicit & BIT_ULL(pass)) return true; if ((p->when & PASS_FSCK) && c->opts.fsck) @@ -201,6 +199,10 @@ int bch2_run_recovery_passes(struct bch_fs *c) int ret = 0; while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + if (c->opts.recovery_pass_last && + c->curr_recovery_pass > c->opts.recovery_pass_last) + break; + if (should_run_recovery_pass(c, c->curr_recovery_pass)) { unsigned pass = c->curr_recovery_pass; @@ -213,8 +215,10 @@ int bch2_run_recovery_passes(struct bch_fs *c) c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); } - c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); + + c->curr_recovery_pass++; } return ret; diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 4e074136c4907..4577ee7939a2a 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -131,7 +131,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) rcu_read_lock(); struct snapshot_table *t = rcu_dereference(c->snapshots); - if (unlikely(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots)) { + if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) { ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); goto out; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 89ee5d3ec1196..bc026a77eb99d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -365,7 +365,7 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_emergency_ro, &c->flags) && test_bit(BCH_FS_started, &c->flags) && test_bit(BCH_FS_clean_shutdown, &c->flags) && - !c->opts.norecovery) { + c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_read(&c->btree_cache.dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); @@ -510,7 +510,8 @@ err: int bch2_fs_read_write(struct bch_fs *c) { - if (c->opts.norecovery) + if (c->opts.recovery_pass_last && + c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay) return -BCH_ERR_erofs_norecovery; if (c->opts.nochanges) -- GitLab From cecfed9b446da5fba9d73e6448c9f0d1ff5d95ff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 31 Mar 2024 22:34:45 -0400 Subject: [PATCH 207/989] bcachefs: Logged op errors should be ignored If something is wrong with a logged op, we just want to delete it - there's nothing to repair. Signed-off-by: Kent Overstreet --- fs/bcachefs/logged_ops.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 9fac838d123e8..b82f8209041ff 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -37,7 +37,6 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type); struct bkey_buf sk; u32 restart_count = trans->restart_count; - int ret; if (!fn) return 0; @@ -45,11 +44,11 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_buf_init(&sk); bch2_bkey_buf_reassemble(&sk, c, k); - ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?: - fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count); + fn->resume(trans, sk.k); bch2_bkey_buf_exit(&sk, c); - return ret; + + return trans_was_restarted(trans, restart_count); } int bch2_resume_logged_ops(struct bch_fs *c) -- GitLab From 8ce1db8091b23f5d2a0dd1dabe8007954114cb68 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 1 Apr 2024 00:00:32 -0400 Subject: [PATCH 208/989] bcachefs: Fix remove_dirent() We were missing an iter_traverse(). Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index aca6fae409cdf..0a47b0a473d8d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -158,9 +158,10 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); - ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); bch2_trans_iter_exit(trans, &iter); err: bch_err_fn(c, ret); -- GitLab From eab3a3ce2dea1a4013a3a553722b85f55a76ac2d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Mar 2024 01:00:50 -0400 Subject: [PATCH 209/989] bcachefs: Fix overlapping extent repair overlapping extent repair was colliding with extent past end of inode checks - don't update "extent ends at" until we know we have an extent. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0a47b0a473d8d..cbb8b43e419fe 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1372,10 +1372,6 @@ static int check_overlapping_extents(struct btree_trans *trans, goto err; } - ret = extent_ends_at(c, extent_ends, seen, k); - if (ret) - goto err; - extent_ends->last_pos = k.k->p; err: return ret; @@ -1505,6 +1501,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, i->seen_this_pos = true; } + + if (k.k->type != KEY_TYPE_whiteout) { + ret = extent_ends_at(c, extent_ends, s, k); + if (ret) + goto err; + } out: err: fsck_err: -- GitLab From b3c7fd35c03c17a950737fb56a06b730a7962d28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Mar 2024 15:59:57 -0400 Subject: [PATCH 210/989] bcachefs: On emergency shutdown, print out current journal sequence number Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index f942a39474962..82a6656c941c5 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" +#include "journal.h" #include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" @@ -16,7 +17,8 @@ bool bch2_inconsistent_error(struct bch_fs *c) return false; case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) - bch_err(c, "inconsistency detected - emergency read only"); + bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", + journal_cur_seq(&c->journal)); return true; case BCH_ON_ERROR_panic: panic(bch2_fmt(c, "panic after error")); -- GitLab From e23d7e82b707d1d0a627e334fb46370e4f772c11 Mon Sep 17 00:00:00 2001 From: Andrey Albershteyn Date: Thu, 14 Mar 2024 18:07:02 +0100 Subject: [PATCH 211/989] xfs: allow cross-linking special files without project quota There's an issue that if special files is created before quota project is enabled, then it's not possible to link this file. This works fine for normal files. This happens because xfs_quota skips special files (no ioctls to set necessary flags). The check for having the same project ID for source and destination then fails as source file doesn't have any ID. mkfs.xfs -f /dev/sda mount -o prjquota /dev/sda /mnt/test mkdir /mnt/test/foo mkfifo /mnt/test/foo/fifo1 xfs_quota -xc "project -sp /mnt/test/foo 9" /mnt/test > Setting up project 9 (path /mnt/test/foo)... > xfs_quota: skipping special file /mnt/test/foo/fifo1 > Processed 1 (/etc/projects and cmdline) paths for project 9 with recursion depth infinite (-1). ln /mnt/test/foo/fifo1 /mnt/test/foo/fifo1_link > ln: failed to create hard link '/mnt/test/testdir/fifo1_link' => '/mnt/test/testdir/fifo1': Invalid cross-device link mkfifo /mnt/test/foo/fifo2 ln /mnt/test/foo/fifo2 /mnt/test/foo/fifo2_link Fix this by allowing linking of special files to the project quota if special files doesn't have any ID set (ID = 0). Signed-off-by: Andrey Albershteyn Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_inode.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ea48774f6b76d..d55b42b2480d6 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1301,8 +1301,19 @@ xfs_link( */ if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && tdp->i_projid != sip->i_projid)) { - error = -EXDEV; - goto error_return; + /* + * Project quota setup skips special files which can + * leave inodes in a PROJINHERIT directory without a + * project ID set. We need to allow links to be made + * to these "project-less" inodes because userspace + * expects them to succeed after project ID setup, + * but everything else should be rejected. + */ + if (!special_file(VFS_I(sip)->i_mode) || + sip->i_projid != 0) { + error = -EXDEV; + goto error_return; + } } if (!resblks) { -- GitLab From f62d4c3eb687d87b616b4279acec7862553bda77 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 27 Mar 2024 12:48:50 +0000 Subject: [PATCH 212/989] KVM: arm64: Don't defer TLB invalidation when zapping table entries Commit 7657ea920c54 ("KVM: arm64: Use TLBI range-based instructions for unmap") introduced deferred TLB invalidation for the stage-2 page-table so that range-based invalidation can be used for the accumulated addresses. This works fine if the structure of the page-tables remains unchanged, but if entire tables are zapped and subsequently freed then we transiently leave the hardware page-table walker with a reference to freed memory thanks to the translation walk caches. For example, stage2_unmap_walker() will free page-table pages: if (childp) mm_ops->put_page(childp); and issue the TLB invalidation later in kvm_pgtable_stage2_unmap(): if (stage2_unmap_defer_tlb_flush(pgt)) /* Perform the deferred TLB invalidations */ kvm_tlb_flush_vmid_range(pgt->mmu, addr, size); For now, take the conservative approach and invalidate the TLB eagerly when we clear a table entry. Note, however, that the existing level hint passed to __kvm_tlb_flush_vmid_ipa() is incorrect and will be fixed in a subsequent patch. Cc: Raghavendra Rao Ananta Cc: Shaoqin Huang Cc: Marc Zyngier Cc: Oliver Upton Signed-off-by: Will Deacon Reviewed-by: Shaoqin Huang Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240327124853.11206-2-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/pgtable.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 3fae5830f8d2c..de0b667ba2962 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -896,9 +896,11 @@ static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, if (kvm_pte_valid(ctx->old)) { kvm_clear_pte(ctx->ptep); - if (!stage2_unmap_defer_tlb_flush(pgt)) + if (!stage2_unmap_defer_tlb_flush(pgt) || + kvm_pte_table(ctx->old, ctx->level)) { kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); + } } mm_ops->put_page(ctx->ptep); -- GitLab From 36e008323926036650299cfbb2dca704c7aba849 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 27 Mar 2024 12:48:51 +0000 Subject: [PATCH 213/989] KVM: arm64: Don't pass a TLBI level hint when zapping table entries The TLBI level hints are for leaf entries only, so take care not to pass them incorrectly after clearing a table entry. Cc: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Fixes: 82bb02445de5 ("KVM: arm64: Implement kvm_pgtable_hyp_unmap() at EL2") Fixes: 6d9d2115c480 ("KVM: arm64: Add support for stage-2 map()/unmap() in generic page-table") Signed-off-by: Will Deacon Reviewed-by: Shaoqin Huang Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240327124853.11206-3-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/pgtable.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index de0b667ba2962..a40dafc43bb64 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -528,7 +528,7 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); + __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN); } else { if (ctx->end - ctx->addr < granule) return -EINVAL; @@ -896,10 +896,12 @@ static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, if (kvm_pte_valid(ctx->old)) { kvm_clear_pte(ctx->ptep); - if (!stage2_unmap_defer_tlb_flush(pgt) || - kvm_pte_table(ctx->old, ctx->level)) { - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, - ctx->addr, ctx->level); + if (kvm_pte_table(ctx->old, ctx->level)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, + TLBI_TTL_UNKNOWN); + } else if (!stage2_unmap_defer_tlb_flush(pgt)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, + ctx->level); } } -- GitLab From 0f0ff097bf77663b8d2692e33d56119947611bb0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 27 Mar 2024 12:48:52 +0000 Subject: [PATCH 214/989] KVM: arm64: Use TLBI_TTL_UNKNOWN in __kvm_tlb_flush_vmid_range() Commit c910f2b65518 ("arm64/mm: Update tlb invalidation routines for FEAT_LPA2") updated the __tlbi_level() macro to take the target level as an argument, with TLBI_TTL_UNKNOWN (rather than 0) indicating that the caller cannot provide level information. Unfortunately, the two implementations of __kvm_tlb_flush_vmid_range() were not updated and so now ask for an level 0 invalidation if FEAT_LPA2 is implemented. Fix the problem by passing TLBI_TTL_UNKNOWN instead of 0 as the level argument to __flush_s2_tlb_range_op() in __kvm_tlb_flush_vmid_range(). Cc: Catalin Marinas Cc: Oliver Upton Cc: Marc Zyngier Reviewed-by: Ryan Roberts Fixes: c910f2b65518 ("arm64/mm: Update tlb invalidation routines for FEAT_LPA2") Signed-off-by: Will Deacon Reviewed-by: Shaoqin Huang Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240327124853.11206-4-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/tlb.c | 3 ++- arch/arm64/kvm/hyp/vhe/tlb.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index a60fb13e21924..2fc68da4036d9 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -154,7 +154,8 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, /* Switch to requested VMID */ __tlb_switch_to_guest(mmu, &cxt, false); - __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0); + __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, + TLBI_TTL_UNKNOWN); dsb(ish); __tlbi(vmalle1is); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index b32e2940df7dc..1a60b95381e8e 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -171,7 +171,8 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, /* Switch to requested VMID */ __tlb_switch_to_guest(mmu, &cxt); - __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0); + __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, + TLBI_TTL_UNKNOWN); dsb(ish); __tlbi(vmalle1is); -- GitLab From 4c36a156738887c1edd78589fe192d757989bcde Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 27 Mar 2024 12:48:53 +0000 Subject: [PATCH 215/989] KVM: arm64: Ensure target address is granule-aligned for range TLBI When zapping a table entry in stage2_try_break_pte(), we issue range TLB invalidation for the region that was mapped by the table. However, we neglect to align the base address down to the granule size and so if we ended up reaching the table entry via a misaligned address then we will accidentally skip invalidation for some prefix of the affected address range. Align 'ctx->addr' down to the granule size when performing TLB invalidation for an unmapped table in stage2_try_break_pte(). Cc: Raghavendra Rao Ananta Cc: Gavin Shan Cc: Shaoqin Huang Cc: Quentin Perret Fixes: defc8cc7abf0 ("KVM: arm64: Invalidate the table entries upon a range") Signed-off-by: Will Deacon Reviewed-by: Shaoqin Huang Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240327124853.11206-5-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/pgtable.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index a40dafc43bb64..5a59ef88b646f 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -843,12 +843,15 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, * Perform the appropriate TLB invalidation based on the * evicted pte value (if any). */ - if (kvm_pte_table(ctx->old, ctx->level)) - kvm_tlb_flush_vmid_range(mmu, ctx->addr, - kvm_granule_size(ctx->level)); - else if (kvm_pte_valid(ctx->old)) + if (kvm_pte_table(ctx->old, ctx->level)) { + u64 size = kvm_granule_size(ctx->level); + u64 addr = ALIGN_DOWN(ctx->addr, size); + + kvm_tlb_flush_vmid_range(mmu, addr, size); + } else if (kvm_pte_valid(ctx->old)) { kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); + } } if (stage2_pte_is_counted(ctx->old)) -- GitLab From b3320142f3db9b3f2a23460abd3e22292e1530a5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 21 Mar 2024 11:54:14 +0000 Subject: [PATCH 216/989] arm64: Fix early handling of FEAT_E2H0 not being implemented Commit 3944382fa6f2 introduced checks for the FEAT_E2H0 not being implemented. However, the check is absolutely wrong and makes a point it testing a bit that is guaranteed to be zero. On top of that, the detection happens way too late, after the init_el2_state has done its job. This went undetected because the HW this was tested on has E2H being RAO/WI, and not RES1. However, the bug shows up when run as a nested guest, where HCR_EL2.E2H is not necessarily set to 1. As a result, booting the kernel in hVHE mode fails with timer accesses being cought in a trap loop (which was fun to debug). Fix the check for ID_AA64MMFR4_EL1.E2H0, and set the HCR_EL2.E2H bit early so that it can be checked by the rest of the init sequence. With this, hVHE works again in a NV environment that doesn't have FEAT_E2H0. Fixes: 3944382fa6f2 ("arm64: Treat HCR_EL2.E2H as RES1 when ID_AA64MMFR4_EL1.E2H0 is negative") Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20240321115414.3169115-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kernel/head.S | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index ce08b744aaab2..06234c3a15f3d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -291,6 +291,21 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) blr x2 0: mov_q x0, HCR_HOST_NVHE_FLAGS + + /* + * Compliant CPUs advertise their VHE-onlyness with + * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be + * RES1 in that case. Publish the E2H bit early so that + * it can be picked up by the init_el2_state macro. + * + * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but + * don't advertise it (they predate this relaxation). + */ + mrs_s x1, SYS_ID_AA64MMFR4_EL1 + tbz x1, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f + + orr x0, x0, #HCR_E2H +1: msr hcr_el2, x0 isb @@ -303,22 +318,10 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) mov_q x1, INIT_SCTLR_EL1_MMU_OFF - /* - * Compliant CPUs advertise their VHE-onlyness with - * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be - * RES1 in that case. - * - * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, but - * don't advertise it (they predate this relaxation). - */ - mrs_s x0, SYS_ID_AA64MMFR4_EL1 - ubfx x0, x0, #ID_AA64MMFR4_EL1_E2H0_SHIFT, #ID_AA64MMFR4_EL1_E2H0_WIDTH - tbnz x0, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f - mrs x0, hcr_el2 and x0, x0, #HCR_E2H cbz x0, 2f -1: + /* Set a sane SCTLR_EL1, the VHE way */ pre_disable_mmu_workaround msr_s SYS_SCTLR_EL12, x1 -- GitLab From d96c66ab9fb3ad8b243669cf6b41e68d0f7f9ecd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 21 Mar 2024 17:37:06 +0000 Subject: [PATCH 217/989] KVM: arm64: Rationalise KVM banner output We are not very consistent when it comes to displaying which mode we're in (VHE, {n,h}VHE, protected or not). For example, booting in protected mode with hVHE results in: [ 0.969545] kvm [1]: Protected nVHE mode initialized successfully which is mildly amusing considering that the machine is VHE only. We already cleaned this up a bit with commit 1f3ca7023fe6 ("KVM: arm64: print Hyp mode"), but that's still unsatisfactory. Unify the three strings into one and use a mess of conditional statements to sort it out (yes, it's a slow day). Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240321173706.3280796-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3dee5490eea94..c4a0a35e02c72 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2597,14 +2597,11 @@ static __init int kvm_arm_init(void) if (err) goto out_hyp; - if (is_protected_kvm_enabled()) { - kvm_info("Protected nVHE mode initialized successfully\n"); - } else if (in_hyp_mode) { - kvm_info("VHE mode initialized successfully\n"); - } else { - char mode = cpus_have_final_cap(ARM64_KVM_HVHE) ? 'h' : 'n'; - kvm_info("Hyp mode (%cVHE) initialized successfully\n", mode); - } + kvm_info("%s%sVHE mode initialized successfully\n", + in_hyp_mode ? "" : (is_protected_kvm_enabled() ? + "Protected " : "Hyp "), + in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ? + "h" : "n")); /* * FIXME: Do something reasonable if kvm_init() fails after pKVM -- GitLab From aa7cbefe65e455178c33eca308349e687d262ea7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:47 -0700 Subject: [PATCH 218/989] time/timecounter: Fix inline documentation Fix kernel-doc warnings, text punctuation, and a kernel-doc marker (change '%' to '&' to indicate a struct): timecounter.h:72: warning: No description found for return value of 'cyclecounter_cyc2ns' timecounter.h:85: warning: Function parameter or member 'tc' not described in 'timecounter_adjtime' timecounter.h:111: warning: No description found for return value of 'timecounter_read' timecounter.h:128: warning: No description found for return value of 'timecounter_cyc2time' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-2-rdunlap@infradead.org --- include/linux/timecounter.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h index c6540ceea1430..0982d1d52b24d 100644 --- a/include/linux/timecounter.h +++ b/include/linux/timecounter.h @@ -22,7 +22,7 @@ * * @read: returns the current cycle value * @mask: bitmask for two's complement - * subtraction of non 64 bit counters, + * subtraction of non-64-bit counters, * see CYCLECOUNTER_MASK() helper macro * @mult: cycle to nanosecond multiplier * @shift: cycle to nanosecond divisor (power of two) @@ -35,7 +35,7 @@ struct cyclecounter { }; /** - * struct timecounter - layer above a %struct cyclecounter which counts nanoseconds + * struct timecounter - layer above a &struct cyclecounter which counts nanoseconds * Contains the state needed by timecounter_read() to detect * cycle counter wrap around. Initialize with * timecounter_init(). Also used to convert cycle counts into the @@ -66,6 +66,8 @@ struct timecounter { * @cycles: Cycles * @mask: bit mask for maintaining the 'frac' field * @frac: pointer to storage for the fractional nanoseconds. + * + * Returns: cycle counter cycles converted to nanoseconds */ static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, u64 cycles, u64 mask, u64 *frac) @@ -79,6 +81,7 @@ static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, /** * timecounter_adjtime - Shifts the time of the clock. + * @tc: The &struct timecounter to adjust * @delta: Desired change in nanoseconds. */ static inline void timecounter_adjtime(struct timecounter *tc, s64 delta) @@ -107,6 +110,8 @@ extern void timecounter_init(struct timecounter *tc, * * In other words, keeps track of time since the same epoch as * the function which generated the initial time stamp. + * + * Returns: nanoseconds since the initial time stamp */ extern u64 timecounter_read(struct timecounter *tc); @@ -123,6 +128,8 @@ extern u64 timecounter_read(struct timecounter *tc); * * This allows conversion of cycle counter values which were generated * in the past. + * + * Returns: cycle counter converted to nanoseconds since the initial time stamp */ extern u64 timecounter_cyc2time(const struct timecounter *tc, u64 cycle_tstamp); -- GitLab From 76f788ee4a7d9f826738a034f9d2ee0bc4cd291b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:48 -0700 Subject: [PATCH 219/989] time/timekeeping: Fix kernel-doc warnings and typos Fix punctuation, spellos, and kernel-doc warnings: timekeeping.h:79: warning: No description found for return value of 'ktime_get_real' timekeeping.h:95: warning: No description found for return value of 'ktime_get_boottime' timekeeping.h:108: warning: No description found for return value of 'ktime_get_clocktai' timekeeping.h:149: warning: Function parameter or struct member 'mono' not described in 'ktime_mono_to_real' timekeeping.h:149: warning: No description found for return value of 'ktime_mono_to_real' timekeeping.h:255: warning: Function parameter or struct member 'cs_id' not described in 'system_time_snapshot' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-3-rdunlap@infradead.org --- include/linux/timekeeping.h | 49 +++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7e50cbd97f86e..0ea7823b7f31f 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -22,14 +22,14 @@ extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); /* - * ktime_get() family: read the current time in a multitude of ways, + * ktime_get() family - read the current time in a multitude of ways. * * The default time reference is CLOCK_MONOTONIC, starting at * boot time but not counting the time spent in suspend. * For other references, use the functions with "real", "clocktai", * "boottime" and "raw" suffixes. * - * To get the time in a different format, use the ones wit + * To get the time in a different format, use the ones with * "ns", "ts64" and "seconds" suffix. * * See Documentation/core-api/timekeeping.rst for more details. @@ -74,6 +74,8 @@ extern u32 ktime_get_resolution_ns(void); /** * ktime_get_real - get the real (wall-) time in ktime_t format + * + * Returns: real (wall) time in ktime_t format */ static inline ktime_t ktime_get_real(void) { @@ -86,10 +88,12 @@ static inline ktime_t ktime_get_coarse_real(void) } /** - * ktime_get_boottime - Returns monotonic time since boot in ktime_t format + * ktime_get_boottime - Get monotonic time since boot in ktime_t format * * This is similar to CLOCK_MONTONIC/ktime_get, but also includes the * time spent in suspend. + * + * Returns: monotonic time since boot in ktime_t format */ static inline ktime_t ktime_get_boottime(void) { @@ -102,7 +106,9 @@ static inline ktime_t ktime_get_coarse_boottime(void) } /** - * ktime_get_clocktai - Returns the TAI time of day in ktime_t format + * ktime_get_clocktai - Get the TAI time of day in ktime_t format + * + * Returns: the TAI time of day in ktime_t format */ static inline ktime_t ktime_get_clocktai(void) { @@ -144,32 +150,60 @@ static inline u64 ktime_get_coarse_clocktai_ns(void) /** * ktime_mono_to_real - Convert monotonic time to clock realtime + * @mono: monotonic time to convert + * + * Returns: time converted to realtime clock */ static inline ktime_t ktime_mono_to_real(ktime_t mono) { return ktime_mono_to_any(mono, TK_OFFS_REAL); } +/** + * ktime_get_ns - Get the current time in nanoseconds + * + * Returns: current time converted to nanoseconds + */ static inline u64 ktime_get_ns(void) { return ktime_to_ns(ktime_get()); } +/** + * ktime_get_real_ns - Get the current real/wall time in nanoseconds + * + * Returns: current real time converted to nanoseconds + */ static inline u64 ktime_get_real_ns(void) { return ktime_to_ns(ktime_get_real()); } +/** + * ktime_get_boottime_ns - Get the monotonic time since boot in nanoseconds + * + * Returns: current boottime converted to nanoseconds + */ static inline u64 ktime_get_boottime_ns(void) { return ktime_to_ns(ktime_get_boottime()); } +/** + * ktime_get_clocktai_ns - Get the current TAI time of day in nanoseconds + * + * Returns: current TAI time converted to nanoseconds + */ static inline u64 ktime_get_clocktai_ns(void) { return ktime_to_ns(ktime_get_clocktai()); } +/** + * ktime_get_raw_ns - Get the raw monotonic time in nanoseconds + * + * Returns: current raw monotonic time converted to nanoseconds + */ static inline u64 ktime_get_raw_ns(void) { return ktime_to_ns(ktime_get_raw()); @@ -224,8 +258,8 @@ extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta); -/* - * struct ktime_timestanps - Simultaneous mono/boot/real timestamps +/** + * struct ktime_timestamps - Simultaneous mono/boot/real timestamps * @mono: Monotonic timestamp * @boot: Boottime timestamp * @real: Realtime timestamp @@ -242,7 +276,8 @@ struct ktime_timestamps { * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @raw: Monotonic raw system time - * @clock_was_set_seq: The sequence number of clock was set events + * @cs_id: Clocksource ID + * @clock_was_set_seq: The sequence number of clock-was-set events * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { -- GitLab From b87752ef5cc15b0bae04583d599e873d92dc0618 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:49 -0700 Subject: [PATCH 220/989] timers: Fix kernel-doc format and add Return values Fix kernel-doc format and warnings: timer.h:26: warning: Cannot understand * @TIMER_DEFERRABLE: A deferrable timer will work normally when the on line 26 - I thought it was a doc line timer.h:146: warning: No description found for return value of 'timer_pending' timer.h:180: warning: No description found for return value of 'del_timer_sync' timer.h:193: warning: No description found for return value of 'del_timer' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-4-rdunlap@infradead.org --- include/linux/timer.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 14a633ba61d64..e67ecd1cbc97d 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -22,7 +22,7 @@ #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) #endif -/** +/* * @TIMER_DEFERRABLE: A deferrable timer will work normally when the * system is busy, but will not cause a CPU to come out of idle just * to service it; instead, the timer will be serviced when the CPU @@ -140,7 +140,7 @@ static inline void destroy_timer_on_stack(struct timer_list *timer) { } * or not. Callers must ensure serialization wrt. other operations done * to this timer, eg. interrupt contexts, or other CPUs on SMP. * - * return value: 1 if the timer is pending, 0 if not. + * Returns: 1 if the timer is pending, 0 if not. */ static inline int timer_pending(const struct timer_list * timer) { @@ -175,6 +175,10 @@ extern int timer_shutdown(struct timer_list *timer); * See timer_delete_sync() for detailed explanation. * * Do not use in new code. Use timer_delete_sync() instead. + * + * Returns: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated */ static inline int del_timer_sync(struct timer_list *timer) { @@ -188,6 +192,10 @@ static inline int del_timer_sync(struct timer_list *timer) * See timer_delete() for detailed explanation. * * Do not use in new code. Use timer_delete() instead. + * + * Returns: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated */ static inline int del_timer(struct timer_list *timer) { -- GitLab From f29536bf1721802d2ebdc7893ed2991d4da0a4b6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:50 -0700 Subject: [PATCH 221/989] tick/sched: Fix various kernel-doc warnings Fix a slew of kernel-doc warnings in tick-sched.c: tick-sched.c:650: warning: Function parameter or struct member 'now' not described in 'tick_nohz_update_jiffies' tick-sched.c:741: warning: No description found for return value of 'get_cpu_idle_time_us' tick-sched.c:767: warning: No description found for return value of 'get_cpu_iowait_time_us' tick-sched.c:1210: warning: No description found for return value of 'tick_nohz_idle_got_tick' tick-sched.c:1228: warning: No description found for return value of 'tick_nohz_get_next_hrtimer' tick-sched.c:1243: warning: No description found for return value of 'tick_nohz_get_sleep_length' tick-sched.c:1282: warning: Function parameter or struct member 'cpu' not described in 'tick_nohz_get_idle_calls_cpu' tick-sched.c:1282: warning: No description found for return value of 'tick_nohz_get_idle_calls_cpu' tick-sched.c:1294: warning: No description found for return value of 'tick_nohz_get_idle_calls' tick-sched.c:1577: warning: Function parameter or struct member 'hrtimer' not described in 'tick_setup_sched_timer' tick-sched.c:1577: warning: Excess function parameter 'mode' description in 'tick_setup_sched_timer' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-5-rdunlap@infradead.org --- kernel/time/tick-sched.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 269e21590df53..1331216a9cae7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -697,6 +697,7 @@ bool tick_nohz_tick_stopped_cpu(int cpu) /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted + * @now: current ktime_t * * Called from interrupt entry when the CPU was idle * @@ -794,7 +795,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * - * This function returns -1 if NOHZ is not enabled. + * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { @@ -820,7 +821,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * - * This function returns -1 if NOHZ is not enabled. + * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { @@ -1287,6 +1288,8 @@ void tick_nohz_irq_exit(void) /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run + * + * Return: %true if the tick handler has run, otherwise %false */ bool tick_nohz_idle_got_tick(void) { @@ -1305,6 +1308,8 @@ bool tick_nohz_idle_got_tick(void) * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled + * + * Return: the next expiration time */ ktime_t tick_nohz_get_next_hrtimer(void) { @@ -1320,6 +1325,8 @@ ktime_t tick_nohz_get_next_hrtimer(void) * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. + * + * Return: the expected length of the current sleep */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { @@ -1357,8 +1364,11 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. + * @cpu: target CPU number * * Called from the schedutil frequency scaling governor in scheduler context. + * + * Return: the current idle calls counter value for @cpu */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { @@ -1371,6 +1381,8 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) * tick_nohz_get_idle_calls - return the current idle calls counter value * * Called from the schedutil frequency scaling governor in scheduler context. + * + * Return: the current idle calls counter value for the current CPU */ unsigned long tick_nohz_get_idle_calls(void) { @@ -1559,7 +1571,7 @@ early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer - * @mode: tick_nohz_mode to setup for + * @hrtimer: whether to use the hrtimer or not */ void tick_setup_sched_timer(bool hrtimer) { -- GitLab From ba6ad57b803e33ed509213a5e840427dbef501d6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:51 -0700 Subject: [PATCH 222/989] tick/sched: Fix struct tick_sched doc warnings Fix kernel-doc warnings in struct tick_sched: tick-sched.h:103: warning: Function parameter or struct member 'idle_sleeptime_seq' not described in 'tick_sched' tick-sched.h:104: warning: Excess struct member 'nohz_mode' description in 'tick_sched' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-6-rdunlap@infradead.org --- kernel/time/tick-sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index e11c4dc65bcb2..b4a7822f495d3 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -46,8 +46,8 @@ struct tick_device { * @next_tick: Next tick to be fired when in dynticks mode. * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_waketime: Time when the idle was interrupted + * @idle_sleeptime_seq: sequence counter for data consistency * @idle_entrytime: Time when the idle call was entered - * @nohz_mode: Mode - one state of tick_nohz_mode * @last_jiffies: Base jiffies snapshot when next event was last computed * @timer_expires_base: Base time clock monotonic for @timer_expires * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) -- GitLab From 9e643ab59d7ee4332994671720a9528bac62e9b7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:52 -0700 Subject: [PATCH 223/989] timers: Fix text inconsistencies and spelling Fix some text for consistency: s/lvl/level/ in a comment and use correct/full function names in comments. Correct spelling errors as reported by codespell. Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-7-rdunlap@infradead.org --- kernel/time/timer.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index dee29f1f5b75f..3baf2fbe6848f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -64,15 +64,15 @@ EXPORT_SYMBOL(jiffies_64); /* * The timer wheel has LVL_DEPTH array levels. Each level provides an array of - * LVL_SIZE buckets. Each level is driven by its own clock and therefor each + * LVL_SIZE buckets. Each level is driven by its own clock and therefore each * level has a different granularity. * - * The level granularity is: LVL_CLK_DIV ^ lvl + * The level granularity is: LVL_CLK_DIV ^ level * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) * * The array level of a newly armed timer depends on the relative expiry * time. The farther the expiry time is away the higher the array level and - * therefor the granularity becomes. + * therefore the granularity becomes. * * Contrary to the original timer wheel implementation, which aims for 'exact' * expiry of the timers, this implementation removes the need for recascading @@ -207,7 +207,7 @@ EXPORT_SYMBOL(jiffies_64); * struct timer_base - Per CPU timer base (number of base depends on config) * @lock: Lock protecting the timer_base * @running_timer: When expiring timers, the lock is dropped. To make - * sure not to race agains deleting/modifying a + * sure not to race against deleting/modifying a * currently running timer, the pointer is set to the * timer, which expires at the moment. If no timer is * running, the pointer is NULL. @@ -737,7 +737,7 @@ static bool timer_is_static_object(void *addr) } /* - * fixup_init is called when: + * timer_fixup_init is called when: * - an active object is initialized */ static bool timer_fixup_init(void *addr, enum debug_obj_state state) @@ -761,7 +761,7 @@ static void stub_timer(struct timer_list *unused) } /* - * fixup_activate is called when: + * timer_fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ @@ -783,7 +783,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) } /* - * fixup_free is called when: + * timer_fixup_free is called when: * - an active object is freed */ static bool timer_fixup_free(void *addr, enum debug_obj_state state) @@ -801,7 +801,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state) } /* - * fixup_assert_init is called when: + * timer_fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) @@ -914,7 +914,7 @@ static void do_init_timer(struct timer_list *timer, * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * - * init_timer_key() must be done to a timer prior calling *any* of the + * init_timer_key() must be done to a timer prior to calling *any* of the * other timer functions. */ void init_timer_key(struct timer_list *timer, @@ -1417,7 +1417,7 @@ static int __timer_delete(struct timer_list *timer, bool shutdown) * If @shutdown is set then the lock has to be taken whether the * timer is pending or not to protect against a concurrent rearm * which might hit between the lockless pending check and the lock - * aquisition. By taking the lock it is ensured that such a newly + * acquisition. By taking the lock it is ensured that such a newly * enqueued timer is dequeued and cannot end up with * timer->function == NULL in the expiry code. * @@ -2306,7 +2306,7 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, /* * When timer base is not set idle, undo the effect of - * tmigr_cpu_deactivate() to prevent inconsitent states - active + * tmigr_cpu_deactivate() to prevent inconsistent states - active * timer base but inactive timer migration hierarchy. * * When timer base was already marked idle, nothing will be -- GitLab From 481047d7e8391d3842ae59025806531cdad710d9 Mon Sep 17 00:00:00 2001 From: "Yanjun.Zhu" Date: Thu, 14 Mar 2024 07:51:40 +0100 Subject: [PATCH 224/989] RDMA/rxe: Fix the problem "mutex_destroy missing" When a mutex lock is not used any more, the function mutex_destroy should be called to mark the mutex lock uninitialized. Fixes: 8700e3e7c485 ("Soft RoCE driver") Signed-off-by: Yanjun.Zhu Link: https://lore.kernel.org/r/20240314065140.27468-1-yanjun.zhu@linux.dev Reviewed-by: Daisuke Matsuda Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index ae466e72fc43b..255677bc12b2a 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -33,6 +33,8 @@ void rxe_dealloc(struct ib_device *ib_dev) if (rxe->tfm) crypto_free_shash(rxe->tfm); + + mutex_destroy(&rxe->usdev_lock); } /* initialize rxe device parameters */ -- GitLab From 755795cd3da053b0565085d9950c44d7b6cba5c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 29 Mar 2024 22:54:42 +0100 Subject: [PATCH 225/989] OSS: dmasound/paula: Mark driver struct with __refdata to prevent section mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As described in the added code comment, a reference to .exit.text is ok for drivers registered via module_platform_driver_probe(). Make this explicit to prevent the following section mismatch warning WARNING: modpost: sound/oss/dmasound/dmasound_paula: section mismatch in reference: amiga_audio_driver+0x8 (section: .data) -> amiga_audio_remove (section: .exit.text) that triggers on an allmodconfig W=1 build. Signed-off-by: Uwe Kleine-König Message-ID: Signed-off-by: Takashi Iwai --- sound/oss/dmasound/dmasound_paula.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sound/oss/dmasound/dmasound_paula.c b/sound/oss/dmasound/dmasound_paula.c index 0ba8f0c4cd99a..3a593da09280d 100644 --- a/sound/oss/dmasound/dmasound_paula.c +++ b/sound/oss/dmasound/dmasound_paula.c @@ -725,7 +725,13 @@ static void __exit amiga_audio_remove(struct platform_device *pdev) dmasound_deinit(); } -static struct platform_driver amiga_audio_driver = { +/* + * amiga_audio_remove() lives in .exit.text. For drivers registered via + * module_platform_driver_probe() this is ok because they cannot get unbound at + * runtime. So mark the driver struct with __refdata to prevent modpost + * triggering a section mismatch warning. + */ +static struct platform_driver amiga_audio_driver __refdata = { .remove_new = __exit_p(amiga_audio_remove), .driver = { .name = "amiga-audio", -- GitLab From b68e1acb5834ed1a2ad42d9d002815a8bae7c0b6 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Fri, 22 Mar 2024 13:20:49 +0200 Subject: [PATCH 226/989] RDMA/cm: Print the old state when cm_destroy_id gets timeout The old state is helpful for debugging, as the current state is always IB_CM_IDLE when timeout happens. Fixes: 96d9cbe2f2ff ("RDMA/cm: add timeout to cm_destroy_id wait") Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/20240322112049.2022994-1-markzhang@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cm.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index bf0df6ee4f785..07fb8d3c037f0 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1026,23 +1026,26 @@ static void cm_reset_to_idle(struct cm_id_private *cm_id_priv) } } -static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id) +static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id, + enum ib_cm_state old_state) { struct cm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct cm_id_private, id); - pr_err("%s: cm_id=%p timed out. state=%d refcnt=%d\n", __func__, - cm_id, cm_id->state, refcount_read(&cm_id_priv->refcount)); + pr_err("%s: cm_id=%p timed out. state %d -> %d, refcnt=%d\n", __func__, + cm_id, old_state, cm_id->state, refcount_read(&cm_id_priv->refcount)); } static void cm_destroy_id(struct ib_cm_id *cm_id, int err) { struct cm_id_private *cm_id_priv; + enum ib_cm_state old_state; struct cm_work *work; int ret; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irq(&cm_id_priv->lock); + old_state = cm_id->state; retest: switch (cm_id->state) { case IB_CM_LISTEN: @@ -1151,7 +1154,7 @@ retest: msecs_to_jiffies( CM_DESTROY_ID_WAIT_TIMEOUT)); if (!ret) /* timeout happened */ - cm_destroy_id_wait_timeout(cm_id); + cm_destroy_id_wait_timeout(cm_id, old_state); } while (!ret); while ((work = cm_dequeue_work(cm_id_priv)) != NULL) -- GitLab From 2a975d426c82ff05ec1f0b773798d909fe4a3105 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Apr 2024 11:27:33 -0600 Subject: [PATCH 227/989] io_uring/rw: don't allow multishot reads without NOWAIT support Supporting multishot reads requires support for NOWAIT, as the alternative would be always having io-wq execute the work item whenever the poll readiness triggered. Any fast file type will have NOWAIT support (eg it understands both O_NONBLOCK and IOCB_NOWAIT). If the given file type does not, then simply resort to single shot execution. Cc: stable@vger.kernel.org Fixes: fc68fcda04910 ("io_uring/rw: add support for IORING_OP_READ_MULTISHOT") Signed-off-by: Jens Axboe --- io_uring/rw.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 0585ebcc9773d..c8d48287439e5 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -936,6 +936,13 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) ret = __io_read(req, issue_flags); + /* + * If the file doesn't support proper NOWAIT, then disable multishot + * and stay in single shot mode. + */ + if (!io_file_supports_nowait(req)) + req->flags &= ~REQ_F_APOLL_MULTISHOT; + /* * If we get -EAGAIN, recycle our buffer and just let normal poll * handling arm it. @@ -955,7 +962,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) /* * Any successful return value will keep the multishot read armed. */ - if (ret > 0) { + if (ret > 0 && req->flags & REQ_F_APOLL_MULTISHOT) { /* * Put our buffer and post a CQE. If we fail to post a CQE, then * jump to the termination path. This request is then done. -- GitLab From bee1d5becdf5bf23d4ca0cd9c6b60bdf3c61d72b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Apr 2024 11:30:06 -0600 Subject: [PATCH 228/989] io_uring: disable io-wq execution of multishot NOWAIT requests Do the same check for direct io-wq execution for multishot requests that commit 2a975d426c82 did for the inline execution, and disable multishot mode (and revert to single shot) if the file type doesn't support NOWAIT, and isn't opened in O_NONBLOCK mode. For multishot to work properly, it's a requirement that nonblocking read attempts can be done. Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5d4b448fdc503..8baf8afb79c2a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1982,10 +1982,15 @@ fail: err = -EBADFD; if (!io_file_can_poll(req)) goto fail; - err = -ECANCELED; - if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK) - goto fail; - return; + if (req->file->f_flags & O_NONBLOCK || + req->file->f_mode & FMODE_NOWAIT) { + err = -ECANCELED; + if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK) + goto fail; + return; + } else { + req->flags &= ~REQ_F_APOLL_MULTISHOT; + } } if (req->flags & REQ_F_FORCE_ASYNC) { -- GitLab From 24a9799aa8efecd0eb55a75e35f9d8e6400063aa Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 1 Apr 2024 14:13:10 -0300 Subject: [PATCH 229/989] smb: client: fix UAF in smb2_reconnect_server() The UAF bug is due to smb2_reconnect_server() accessing a session that is already being teared down by another thread that is executing __cifs_put_smb_ses(). This can happen when (a) the client has connection to the server but no session or (b) another thread ends up setting @ses->ses_status again to something different than SES_EXITING. To fix this, we need to make sure to unconditionally set @ses->ses_status to SES_EXITING and prevent any other threads from setting a new status while we're still tearing it down. The following can be reproduced by adding some delay to right after the ipc is freed in __cifs_put_smb_ses() - which will give smb2_reconnect_server() worker a chance to run and then accessing @ses->ipc: kinit ... mount.cifs //srv/share /mnt/1 -o sec=krb5,nohandlecache,echo_interval=10 [disconnect srv] ls /mnt/1 &>/dev/null sleep 30 kdestroy [reconnect srv] sleep 10 umount /mnt/1 ... CIFS: VFS: Verify user has a krb5 ticket and keyutils is installed CIFS: VFS: \\srv Send error in SessSetup = -126 CIFS: VFS: Verify user has a krb5 ticket and keyutils is installed CIFS: VFS: \\srv Send error in SessSetup = -126 general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6b6b: 0000 [#1] PREEMPT SMP NOPTI CPU: 3 PID: 50 Comm: kworker/3:1 Not tainted 6.9.0-rc2 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-1.fc39 04/01/2014 Workqueue: cifsiod smb2_reconnect_server [cifs] RIP: 0010:__list_del_entry_valid_or_report+0x33/0xf0 Code: 4f 08 48 85 d2 74 42 48 85 c9 74 59 48 b8 00 01 00 00 00 00 ad de 48 39 c2 74 61 48 b8 22 01 00 00 00 00 74 69 <48> 8b 01 48 39 f8 75 7b 48 8b 72 08 48 39 c6 0f 85 88 00 00 00 b8 RSP: 0018:ffffc900001bfd70 EFLAGS: 00010a83 RAX: dead000000000122 RBX: ffff88810da53838 RCX: 6b6b6b6b6b6b6b6b RDX: 6b6b6b6b6b6b6b6b RSI: ffffffffc02f6878 RDI: ffff88810da53800 RBP: ffff88810da53800 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff88810c064000 R13: 0000000000000001 R14: ffff88810c064000 R15: ffff8881039cc000 FS: 0000000000000000(0000) GS:ffff888157c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe3728b1000 CR3: 000000010caa4000 CR4: 0000000000750ef0 PKRU: 55555554 Call Trace: ? die_addr+0x36/0x90 ? exc_general_protection+0x1c1/0x3f0 ? asm_exc_general_protection+0x26/0x30 ? __list_del_entry_valid_or_report+0x33/0xf0 __cifs_put_smb_ses+0x1ae/0x500 [cifs] smb2_reconnect_server+0x4ed/0x710 [cifs] process_one_work+0x205/0x6b0 worker_thread+0x191/0x360 ? __pfx_worker_thread+0x10/0x10 kthread+0xe2/0x110 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x34/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 83 +++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 49 deletions(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 9b85b5341822e..ee29bc57300cd 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -232,7 +232,13 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, spin_lock(&cifs_tcp_ses_lock); list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) { - /* check if iface is still active */ + spin_lock(&ses->ses_lock); + if (ses->ses_status == SES_EXITING) { + spin_unlock(&ses->ses_lock); + continue; + } + spin_unlock(&ses->ses_lock); + spin_lock(&ses->chan_lock); if (cifs_ses_get_chan_index(ses, server) == CIFS_INVAL_CHAN_INDEX) { @@ -1963,31 +1969,6 @@ out: return rc; } -/** - * cifs_free_ipc - helper to release the session IPC tcon - * @ses: smb session to unmount the IPC from - * - * Needs to be called everytime a session is destroyed. - * - * On session close, the IPC is closed and the server must release all tcons of the session. - * No need to send a tree disconnect here. - * - * Besides, it will make the server to not close durable and resilient files on session close, as - * specified in MS-SMB2 3.3.5.6 Receiving an SMB2 LOGOFF Request. - */ -static int -cifs_free_ipc(struct cifs_ses *ses) -{ - struct cifs_tcon *tcon = ses->tcon_ipc; - - if (tcon == NULL) - return 0; - - tconInfoFree(tcon); - ses->tcon_ipc = NULL; - return 0; -} - static struct cifs_ses * cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) { @@ -2019,48 +2000,52 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) void __cifs_put_smb_ses(struct cifs_ses *ses) { struct TCP_Server_Info *server = ses->server; + struct cifs_tcon *tcon; unsigned int xid; size_t i; + bool do_logoff; int rc; + spin_lock(&cifs_tcp_ses_lock); spin_lock(&ses->ses_lock); - if (ses->ses_status == SES_EXITING) { + cifs_dbg(FYI, "%s: id=0x%llx ses_count=%d ses_status=%u ipc=%s\n", + __func__, ses->Suid, ses->ses_count, ses->ses_status, + ses->tcon_ipc ? ses->tcon_ipc->tree_name : "none"); + if (ses->ses_status == SES_EXITING || --ses->ses_count > 0) { spin_unlock(&ses->ses_lock); + spin_unlock(&cifs_tcp_ses_lock); return; } - spin_unlock(&ses->ses_lock); + /* ses_count can never go negative */ + WARN_ON(ses->ses_count < 0); - cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); - cifs_dbg(FYI, - "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->tree_name : "NONE"); + spin_lock(&ses->chan_lock); + cifs_chan_clear_need_reconnect(ses, server); + spin_unlock(&ses->chan_lock); - spin_lock(&cifs_tcp_ses_lock); - if (--ses->ses_count > 0) { - spin_unlock(&cifs_tcp_ses_lock); - return; - } - spin_lock(&ses->ses_lock); - if (ses->ses_status == SES_GOOD) - ses->ses_status = SES_EXITING; + do_logoff = ses->ses_status == SES_GOOD && server->ops->logoff; + ses->ses_status = SES_EXITING; + tcon = ses->tcon_ipc; + ses->tcon_ipc = NULL; spin_unlock(&ses->ses_lock); spin_unlock(&cifs_tcp_ses_lock); - /* ses_count can never go negative */ - WARN_ON(ses->ses_count < 0); - - spin_lock(&ses->ses_lock); - if (ses->ses_status == SES_EXITING && server->ops->logoff) { - spin_unlock(&ses->ses_lock); - cifs_free_ipc(ses); + /* + * On session close, the IPC is closed and the server must release all + * tcons of the session. No need to send a tree disconnect here. + * + * Besides, it will make the server to not close durable and resilient + * files on session close, as specified in MS-SMB2 3.3.5.6 Receiving an + * SMB2 LOGOFF Request. + */ + tconInfoFree(tcon); + if (do_logoff) { xid = get_xid(); rc = server->ops->logoff(xid, ses); if (rc) cifs_server_dbg(VFS, "%s: Session Logoff failure rc=%d\n", __func__, rc); _free_xid(xid); - } else { - spin_unlock(&ses->ses_lock); - cifs_free_ipc(ses); } spin_lock(&cifs_tcp_ses_lock); -- GitLab From fddf09273807bf6e51537823aaae896e05f147f9 Mon Sep 17 00:00:00 2001 From: Oleksandr Natalenko Date: Fri, 19 Jan 2024 11:22:15 +0100 Subject: [PATCH 230/989] drm/display: fix typo While studying the code I've bumped into a small typo within the kernel-doc for two functions, apparently, due to copy-paste. This commit fixes "sizo" word to be "size". Signed-off-by: Oleksandr Natalenko Acked-by: Randy Dunlap Fixes: b3daa5ef52c2 ("drm: Add helper for DP++ adaptors") Reviewed-by: Dmitry Baryshkov Signed-off-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20240119102215.201474-1-oleksandr@natalenko.name --- drivers/gpu/drm/display/drm_dp_dual_mode_helper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/display/drm_dp_dual_mode_helper.c b/drivers/gpu/drm/display/drm_dp_dual_mode_helper.c index bd61e20770a5b..14a2a8473682b 100644 --- a/drivers/gpu/drm/display/drm_dp_dual_mode_helper.c +++ b/drivers/gpu/drm/display/drm_dp_dual_mode_helper.c @@ -52,7 +52,7 @@ * @adapter: I2C adapter for the DDC bus * @offset: register offset * @buffer: buffer for return data - * @size: sizo of the buffer + * @size: size of the buffer * * Reads @size bytes from the DP dual mode adaptor registers * starting at @offset. @@ -116,7 +116,7 @@ EXPORT_SYMBOL(drm_dp_dual_mode_read); * @adapter: I2C adapter for the DDC bus * @offset: register offset * @buffer: buffer for write data - * @size: sizo of the buffer + * @size: size of the buffer * * Writes @size bytes to the DP dual mode adaptor registers * starting at @offset. -- GitLab From 8844f467d6a58dc915f241e81c46e0c126f8c070 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 30 Mar 2024 05:53:22 +0200 Subject: [PATCH 231/989] drm/msm/dpu: make error messages at dpu_core_irq_register_callback() more sensible There is little point in using %ps to print a value known to be NULL. On the other hand it makes sense to print the callback symbol in the 'invalid IRQ' message. Correct those two error messages to make more sense. Fixes: 6893199183f8 ("drm/msm/dpu: stop using raw IRQ indices in the kernel output") Signed-off-by: Dmitry Baryshkov Reviewed-by: Marijn Suijten Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/585565/ Link: https://lore.kernel.org/r/20240330-dpu-irq-messages-v1-1-9ce782ae35f9@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c index 946dd0135dffc..6a0a74832fb64 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c @@ -525,14 +525,14 @@ int dpu_core_irq_register_callback(struct dpu_kms *dpu_kms, int ret; if (!irq_cb) { - DPU_ERROR("invalid IRQ=[%d, %d] irq_cb:%ps\n", - DPU_IRQ_REG(irq_idx), DPU_IRQ_BIT(irq_idx), irq_cb); + DPU_ERROR("IRQ=[%d, %d] NULL callback\n", + DPU_IRQ_REG(irq_idx), DPU_IRQ_BIT(irq_idx)); return -EINVAL; } if (!dpu_core_irq_is_valid(irq_idx)) { - DPU_ERROR("invalid IRQ=[%d, %d]\n", - DPU_IRQ_REG(irq_idx), DPU_IRQ_BIT(irq_idx)); + DPU_ERROR("invalid IRQ=[%d, %d] irq_cb:%ps\n", + DPU_IRQ_REG(irq_idx), DPU_IRQ_BIT(irq_idx), irq_cb); return -EINVAL; } -- GitLab From 1197c5b2099f716b3de327437fb50900a0b936c9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 26 Mar 2024 23:38:06 +0100 Subject: [PATCH 232/989] scsi: mylex: Fix sysfs buffer lengths The myrb and myrs drivers use an odd way of implementing their sysfs files, calling snprintf() with a fixed length of 32 bytes to print into a page sized buffer. One of the strings is actually longer than 32 bytes, which clang can warn about: drivers/scsi/myrb.c:1906:10: error: 'snprintf' will always be truncated; specified size is 32, but format string expands to at least 34 [-Werror,-Wformat-truncation] drivers/scsi/myrs.c:1089:10: error: 'snprintf' will always be truncated; specified size is 32, but format string expands to at least 34 [-Werror,-Wformat-truncation] These could all be plain sprintf() without a length as the buffer is always long enough. On the other hand, sysfs files should not be overly long either, so just double the length to make sure the longest strings don't get truncated here. Fixes: 77266186397c ("scsi: myrs: Add Mylex RAID controller (SCSI interface)") Fixes: 081ff398c56c ("scsi: myrb: Add Mylex RAID controller (block interface)") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240326223825.4084412-8-arnd@kernel.org Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/myrb.c | 20 ++++++++++---------- drivers/scsi/myrs.c | 24 ++++++++++++------------ 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/scsi/myrb.c b/drivers/scsi/myrb.c index ca2e932dd9b70..f684eb5e04898 100644 --- a/drivers/scsi/myrb.c +++ b/drivers/scsi/myrb.c @@ -1775,9 +1775,9 @@ static ssize_t raid_state_show(struct device *dev, name = myrb_devstate_name(ldev_info->state); if (name) - ret = snprintf(buf, 32, "%s\n", name); + ret = snprintf(buf, 64, "%s\n", name); else - ret = snprintf(buf, 32, "Invalid (%02X)\n", + ret = snprintf(buf, 64, "Invalid (%02X)\n", ldev_info->state); } else { struct myrb_pdev_state *pdev_info = sdev->hostdata; @@ -1796,9 +1796,9 @@ static ssize_t raid_state_show(struct device *dev, else name = myrb_devstate_name(pdev_info->state); if (name) - ret = snprintf(buf, 32, "%s\n", name); + ret = snprintf(buf, 64, "%s\n", name); else - ret = snprintf(buf, 32, "Invalid (%02X)\n", + ret = snprintf(buf, 64, "Invalid (%02X)\n", pdev_info->state); } return ret; @@ -1886,11 +1886,11 @@ static ssize_t raid_level_show(struct device *dev, name = myrb_raidlevel_name(ldev_info->raid_level); if (!name) - return snprintf(buf, 32, "Invalid (%02X)\n", + return snprintf(buf, 64, "Invalid (%02X)\n", ldev_info->state); - return snprintf(buf, 32, "%s\n", name); + return snprintf(buf, 64, "%s\n", name); } - return snprintf(buf, 32, "Physical Drive\n"); + return snprintf(buf, 64, "Physical Drive\n"); } static DEVICE_ATTR_RO(raid_level); @@ -1903,15 +1903,15 @@ static ssize_t rebuild_show(struct device *dev, unsigned char status; if (sdev->channel < myrb_logical_channel(sdev->host)) - return snprintf(buf, 32, "physical device - not rebuilding\n"); + return snprintf(buf, 64, "physical device - not rebuilding\n"); status = myrb_get_rbld_progress(cb, &rbld_buf); if (rbld_buf.ldev_num != sdev->id || status != MYRB_STATUS_SUCCESS) - return snprintf(buf, 32, "not rebuilding\n"); + return snprintf(buf, 64, "not rebuilding\n"); - return snprintf(buf, 32, "rebuilding block %u of %u\n", + return snprintf(buf, 64, "rebuilding block %u of %u\n", rbld_buf.ldev_size - rbld_buf.blocks_left, rbld_buf.ldev_size); } diff --git a/drivers/scsi/myrs.c b/drivers/scsi/myrs.c index a1eec65a9713f..e824be9d9bbb9 100644 --- a/drivers/scsi/myrs.c +++ b/drivers/scsi/myrs.c @@ -947,9 +947,9 @@ static ssize_t raid_state_show(struct device *dev, name = myrs_devstate_name(ldev_info->dev_state); if (name) - ret = snprintf(buf, 32, "%s\n", name); + ret = snprintf(buf, 64, "%s\n", name); else - ret = snprintf(buf, 32, "Invalid (%02X)\n", + ret = snprintf(buf, 64, "Invalid (%02X)\n", ldev_info->dev_state); } else { struct myrs_pdev_info *pdev_info; @@ -958,9 +958,9 @@ static ssize_t raid_state_show(struct device *dev, pdev_info = sdev->hostdata; name = myrs_devstate_name(pdev_info->dev_state); if (name) - ret = snprintf(buf, 32, "%s\n", name); + ret = snprintf(buf, 64, "%s\n", name); else - ret = snprintf(buf, 32, "Invalid (%02X)\n", + ret = snprintf(buf, 64, "Invalid (%02X)\n", pdev_info->dev_state); } return ret; @@ -1066,13 +1066,13 @@ static ssize_t raid_level_show(struct device *dev, ldev_info = sdev->hostdata; name = myrs_raid_level_name(ldev_info->raid_level); if (!name) - return snprintf(buf, 32, "Invalid (%02X)\n", + return snprintf(buf, 64, "Invalid (%02X)\n", ldev_info->dev_state); } else name = myrs_raid_level_name(MYRS_RAID_PHYSICAL); - return snprintf(buf, 32, "%s\n", name); + return snprintf(buf, 64, "%s\n", name); } static DEVICE_ATTR_RO(raid_level); @@ -1086,7 +1086,7 @@ static ssize_t rebuild_show(struct device *dev, unsigned char status; if (sdev->channel < cs->ctlr_info->physchan_present) - return snprintf(buf, 32, "physical device - not rebuilding\n"); + return snprintf(buf, 64, "physical device - not rebuilding\n"); ldev_info = sdev->hostdata; ldev_num = ldev_info->ldev_num; @@ -1098,11 +1098,11 @@ static ssize_t rebuild_show(struct device *dev, return -EIO; } if (ldev_info->rbld_active) { - return snprintf(buf, 32, "rebuilding block %zu of %zu\n", + return snprintf(buf, 64, "rebuilding block %zu of %zu\n", (size_t)ldev_info->rbld_lba, (size_t)ldev_info->cfg_devsize); } else - return snprintf(buf, 32, "not rebuilding\n"); + return snprintf(buf, 64, "not rebuilding\n"); } static ssize_t rebuild_store(struct device *dev, @@ -1190,7 +1190,7 @@ static ssize_t consistency_check_show(struct device *dev, unsigned short ldev_num; if (sdev->channel < cs->ctlr_info->physchan_present) - return snprintf(buf, 32, "physical device - not checking\n"); + return snprintf(buf, 64, "physical device - not checking\n"); ldev_info = sdev->hostdata; if (!ldev_info) @@ -1198,11 +1198,11 @@ static ssize_t consistency_check_show(struct device *dev, ldev_num = ldev_info->ldev_num; myrs_get_ldev_info(cs, ldev_num, ldev_info); if (ldev_info->cc_active) - return snprintf(buf, 32, "checking block %zu of %zu\n", + return snprintf(buf, 64, "checking block %zu of %zu\n", (size_t)ldev_info->cc_lba, (size_t)ldev_info->cfg_devsize); else - return snprintf(buf, 32, "not checking\n"); + return snprintf(buf, 64, "not checking\n"); } static ssize_t consistency_check_store(struct device *dev, -- GitLab From ba947ecd39ea0e6a6f6f1101f99611fc30943bcb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 1 Apr 2024 19:16:19 -0400 Subject: [PATCH 233/989] bcachefs: Fix btree node reserve Sign error when checking the watermark - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 983bb27298cc2..29aee215384a6 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -303,7 +303,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct open_buckets obs = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim + unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim ? BTREE_NODE_RESERVE : 0; int ret; -- GitLab From e2a316b3cc45a1198f3feb18707403bb7f0cbc15 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 1 Apr 2024 19:20:36 -0400 Subject: [PATCH 234/989] bcachefs: BCH_WATERMARK_interior_updates This adds a new watermark, higher priority than BCH_WATERMARK_reclaim, for interior btree updates. We've seen a deadlock where journal replay triggers a ton of btree node merges, and these use up all available open buckets and then interior updates get stuck. One cause of this is that we're currently lacking btree node merging on write buffer btrees - that needs to be fixed as well. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 4 +++- fs/bcachefs/alloc_types.h | 3 ++- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_trans_commit.c | 3 ++- fs/bcachefs/btree_update_interior.c | 6 +++--- fs/bcachefs/buckets.h | 1 + 6 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 214b15c84d1f3..a1fc30adf9129 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) static inline unsigned open_buckets_reserved(enum bch_watermark watermark) { switch (watermark) { - case BCH_WATERMARK_reclaim: + case BCH_WATERMARK_interior_updates: return 0; + case BCH_WATERMARK_reclaim: + return OPEN_BUCKETS_COUNT / 6; case BCH_WATERMARK_btree: case BCH_WATERMARK_btree_copygc: return OPEN_BUCKETS_COUNT / 4; diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index b91b7a4610560..c2226e947c41f 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -22,7 +22,8 @@ struct bucket_alloc_state { x(copygc) \ x(btree) \ x(btree_copygc) \ - x(reclaim) + x(reclaim) \ + x(interior_updates) enum bch_watermark { #define x(name) BCH_WATERMARK_##name, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 9c71e6fb9c41b..f3f27bb85a5ba 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1861,7 +1861,7 @@ static void btree_node_write_work(struct work_struct *work) } else { ret = bch2_trans_do(c, NULL, NULL, 0, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, - BCH_WATERMARK_reclaim| + BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw, diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 96669fede7d34..aa9da49707404 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -887,6 +887,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, int ret, unsigned long trace_ip) { struct bch_fs *c = trans->c; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; switch (ret) { case -BCH_ERR_btree_insert_btree_node_full: @@ -905,7 +906,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, * flag */ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { + watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 29aee215384a6..9fd2dd0f46825 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -687,7 +687,7 @@ static void btree_update_nodes_written(struct btree_update *as) * which may require allocations as well. */ ret = commit_do(trans, &as->disk_res, &journal_seq, - BCH_WATERMARK_reclaim| + BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_journal_reclaim, @@ -1121,7 +1121,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK; if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark != BCH_WATERMARK_reclaim) + watermark < BCH_WATERMARK_reclaim) journal_flags |= JOURNAL_RES_GET_NONBLOCK; ret = drop_locks_do(trans, @@ -1217,7 +1217,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, */ if (bch2_err_matches(ret, ENOSPC) && (flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark != BCH_WATERMARK_reclaim) { + watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; goto err; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 6387e039f7897..00aaf4bb51397 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -226,6 +226,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma fallthrough; case BCH_WATERMARK_btree_copygc: case BCH_WATERMARK_reclaim: + case BCH_WATERMARK_interior_updates: break; } -- GitLab From 6bc5e70b1c792b31b497e48b4668a9a2909aca0d Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Fri, 29 Mar 2024 09:50:36 +0800 Subject: [PATCH 235/989] scsi: ufs: core: WLUN suspend dev/link state error recovery When wl suspend error occurs, for example BKOP or SSU timeout, the host triggers an error handler and returns -EBUSY to break the wl suspend process. However, it is possible for the runtime PM to enter wl suspend again before the error handler has finished, and return -EINVAL because the device is in an error state. To address this, ensure that the rumtime PM waits for the error handler to finish, or trigger the error handler in such cases, because returning -EINVAL can cause the I/O to hang. Signed-off-by: Peter Wang Link: https://lore.kernel.org/r/20240329015036.15707-1-peter.wang@mediatek.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index e30fd125988d7..292b06f361a26 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -9791,7 +9791,10 @@ static int __ufshcd_wl_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op) /* UFS device & link must be active before we enter in this function */ if (!ufshcd_is_ufs_dev_active(hba) || !ufshcd_is_link_active(hba)) { - ret = -EINVAL; + /* Wait err handler finish or trigger err recovery */ + if (!ufshcd_eh_in_progress(hba)) + ufshcd_force_error_recovery(hba); + ret = -EBUSY; goto enable_scaling; } -- GitLab From 0296bea01cfa6526be6bd2d16dc83b4e7f1af91f Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 8 Dec 2023 16:23:35 +0800 Subject: [PATCH 236/989] scsi: sd: Unregister device if device_add_disk() failed in sd_probe() "if device_add() succeeds, you should call device_del() when you want to get rid of it." In sd_probe(), device_add_disk() fails when device_add() has already succeeded, so change put_device() to device_unregister() to ensure device resources are released. Fixes: 2a7a891f4c40 ("scsi: sd: Add error handling support for add_disk()") Signed-off-by: Li Nan Link: https://lore.kernel.org/r/20231208082335.1754205-1-linan666@huaweicloud.com Reviewed-by: Bart Van Assche Reviewed-by: Yu Kuai Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 3cf8986702904..58fdf679341dc 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3920,7 +3920,7 @@ static int sd_probe(struct device *dev) error = device_add_disk(dev, gd, NULL); if (error) { - put_device(&sdkp->disk_dev); + device_unregister(&sdkp->disk_dev); put_disk(gd); goto out; } -- GitLab From e675a4fd6d1f8990d3bed5dada3d20edfa000423 Mon Sep 17 00:00:00 2001 From: Yihang Li Date: Thu, 28 Mar 2024 17:06:26 +0800 Subject: [PATCH 237/989] scsi: libsas: Align SMP request allocation to ARCH_DMA_MINALIGN This series [1] reduced the kmalloc() minimum alignment on arm64 to 8 bytes (from 128). In libsas, this will cause SMP requests to be 8-byte aligned through kmalloc() allocation. However, for hisi_sas hardware, all command addresses must be 16-byte-aligned. Otherwise, the commands fail to be executed. ARCH_DMA_MINALIGN represents the minimum (static) alignment for safe DMA operations, so use ARCH_DMA_MINALIGN as the alignment for SMP request. Link: https://lkml.kernel.org/r/20230612153201.554742-1-catalin.marinas@arm.com [1] Signed-off-by: Yihang Li Link: https://lore.kernel.org/r/20240328090626.621147-1-liyihang9@huawei.com Reviewed-by: Damien Le Moal Reviewed-by: John Garry Reviewed-by: Jason Yan Signed-off-by: Martin K. Petersen --- drivers/scsi/libsas/sas_expander.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 5c261005b74e4..f6e6db8b8aba9 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -135,7 +135,7 @@ static int smp_execute_task(struct domain_device *dev, void *req, int req_size, static inline void *alloc_smp_req(int size) { - u8 *p = kzalloc(size, GFP_KERNEL); + u8 *p = kzalloc(ALIGN(size, ARCH_DMA_MINALIGN), GFP_KERNEL); if (p) p[0] = SMP_REQUEST; return p; -- GitLab From 2a26a11e9c258b14be6fd98f8a85f20ac1fff66e Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Thu, 28 Mar 2024 19:12:44 +0800 Subject: [PATCH 238/989] scsi: ufs: core: Fix MCQ mode dev command timeout When a dev command times out in MCQ mode, a successfully cleared command should cause a retry. However, because we currently return 0, the caller considers the command a success which causes the following error to be logged: "Invalid offset 0x0 in descriptor IDN 0x9, length 0x0". Retry if clearing the command was successful. Signed-off-by: Peter Wang Link: https://lore.kernel.org/r/20240328111244.3599-1-peter.wang@mediatek.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 292b06f361a26..a0f8e930167d7 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -3217,7 +3217,9 @@ retry: /* MCQ mode */ if (is_mcq_enabled(hba)) { - err = ufshcd_clear_cmd(hba, lrbp->task_tag); + /* successfully cleared the command, retry if needed */ + if (ufshcd_clear_cmd(hba, lrbp->task_tag) == 0) + err = -EAGAIN; hba->dev_cmd.complete = NULL; return err; } -- GitLab From cd49cca222bc5532105a1f20bff6ae60d7bf7713 Mon Sep 17 00:00:00 2001 From: Abhinav Kumar Date: Wed, 6 Mar 2024 11:35:15 -0800 Subject: [PATCH 239/989] drm/msm/dp: fix typo in dp_display_handle_port_status_changed() Fix the typo in the name of dp_display_handle_port_status_changed(). Fixes: c58eb1b54fee ("drm/msm/dp: fix connect/disconnect handled at irq_hpd") Signed-off-by: Abhinav Kumar Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/581746/ Link: https://lore.kernel.org/r/20240306193515.455388-1-quic_abhinavk@quicinc.com --- drivers/gpu/drm/msm/dp/dp_display.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index 2ed40bd0acb60..9575d4b80c295 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -468,7 +468,7 @@ static void dp_display_handle_video_request(struct dp_display_private *dp) } } -static int dp_display_handle_port_ststus_changed(struct dp_display_private *dp) +static int dp_display_handle_port_status_changed(struct dp_display_private *dp) { int rc = 0; @@ -525,7 +525,7 @@ static int dp_display_usbpd_attention_cb(struct device *dev) drm_dbg_dp(dp->drm_dev, "hpd_state=%d sink_request=%d\n", dp->hpd_state, sink_request); if (sink_request & DS_PORT_STATUS_CHANGED) - rc = dp_display_handle_port_ststus_changed(dp); + rc = dp_display_handle_port_status_changed(dp); else rc = dp_display_handle_irq_hpd(dp); } -- GitLab From be1b7acb929137e3943fe380671242beb485190c Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 2 Apr 2024 05:57:15 +0300 Subject: [PATCH 240/989] dt-bindings: display/msm: sm8150-mdss: add DP node As Qualcomm SM8150 got support for the DisplayPort, add displayport@ node as a valid child to the MDSS node. Fixes: 88806318e2c2 ("dt-bindings: display: msm: dp: declare compatible string for sm8150") Reviewed-by: Krzysztof Kozlowski Signed-off-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/586156/ Link: https://lore.kernel.org/r/20240402-fd-fix-schema-v3-1-817ea6ddf775@linaro.org Signed-off-by: Abhinav Kumar --- .../bindings/display/msm/qcom,sm8150-mdss.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml index c0d6a4fdff97e..e6dc5494baee2 100644 --- a/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml +++ b/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml @@ -53,6 +53,15 @@ patternProperties: compatible: const: qcom,sm8150-dpu + "^displayport-controller@[0-9a-f]+$": + type: object + additionalProperties: true + + properties: + compatible: + contains: + const: qcom,sm8150-dp + "^dsi@[0-9a-f]+$": type: object additionalProperties: true -- GitLab From c88b50a12f962f520dfab0a53ab393f43df9bbd4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 1 Apr 2024 14:44:29 +0200 Subject: [PATCH 241/989] ata: ahci_st: Remove an unused field in struct st_ahci_drv_data In "struct st_ahci_drv_data", the 'ahci' field is unused. Remove it. Found with cppcheck, unusedStructMember. Signed-off-by: Christophe JAILLET Signed-off-by: Damien Le Moal --- drivers/ata/ahci_st.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/ata/ahci_st.c b/drivers/ata/ahci_st.c index d4a626f87963b..79a8b0aa37bf3 100644 --- a/drivers/ata/ahci_st.c +++ b/drivers/ata/ahci_st.c @@ -30,7 +30,6 @@ #define ST_AHCI_OOBR_CIMAX_SHIFT 0 struct st_ahci_drv_data { - struct platform_device *ahci; struct reset_control *pwr; struct reset_control *sw_rst; struct reset_control *pwr_rst; -- GitLab From 37801a36b4d68892ce807264f784d818f8d0d39b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Thu, 28 Mar 2024 20:16:58 +0100 Subject: [PATCH 242/989] selinux: avoid dereference of garbage after mount failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case kern_mount() fails and returns an error pointer return in the error branch instead of continuing and dereferencing the error pointer. While on it drop the never read static variable selinuxfs_mount. Cc: stable@vger.kernel.org Fixes: 0619f0f5e36f ("selinux: wrap selinuxfs state") Signed-off-by: Christian Göttsche Signed-off-by: Paul Moore --- security/selinux/selinuxfs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 0619a1cbbfbe4..074d6c2714eb5 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -2123,7 +2123,6 @@ static struct file_system_type sel_fs_type = { .kill_sb = sel_kill_sb, }; -static struct vfsmount *selinuxfs_mount __ro_after_init; struct path selinux_null __ro_after_init; static int __init init_sel_fs(void) @@ -2145,18 +2144,21 @@ static int __init init_sel_fs(void) return err; } - selinux_null.mnt = selinuxfs_mount = kern_mount(&sel_fs_type); - if (IS_ERR(selinuxfs_mount)) { + selinux_null.mnt = kern_mount(&sel_fs_type); + if (IS_ERR(selinux_null.mnt)) { pr_err("selinuxfs: could not mount!\n"); - err = PTR_ERR(selinuxfs_mount); - selinuxfs_mount = NULL; + err = PTR_ERR(selinux_null.mnt); + selinux_null.mnt = NULL; + return err; } + selinux_null.dentry = d_hash_and_lookup(selinux_null.mnt->mnt_root, &null_name); if (IS_ERR(selinux_null.dentry)) { pr_err("selinuxfs: could not lookup null!\n"); err = PTR_ERR(selinux_null.dentry); selinux_null.dentry = NULL; + return err; } return err; -- GitLab From 9d98aa088386aee3db1b7b60b800c0fde0654a4a Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 1 Apr 2024 20:55:29 +0200 Subject: [PATCH 243/989] x86/bpf: Fix IP after emitting call depth accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adjust the IP passed to `emit_patch` so it calculates the correct offset for the CALL instruction if `x86_call_depth_emit_accounting` emits code. Otherwise we will skip some instructions and most likely crash. Fixes: b2e9dfe54be4 ("x86/bpf: Emit call depth accounting if required") Link: https://lore.kernel.org/lkml/20230105214922.250473-1-joanbrugueram@gmail.com/ Co-developed-by: Joan Bruguera Micó Signed-off-by: Joan Bruguera Micó Signed-off-by: Uros Bizjak Cc: Alexei Starovoitov Cc: Daniel Borkmann Link: https://lore.kernel.org/r/20240401185821.224068-2-ubizjak@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a7ba8e1786452..e55745f512e19 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -480,7 +480,7 @@ static int emit_call(u8 **pprog, void *func, void *ip) static int emit_rsb_call(u8 **pprog, void *func, void *ip) { OPTIMIZER_HIDE_VAR(func); - x86_call_depth_emit_accounting(pprog, func); + ip += x86_call_depth_emit_accounting(pprog, func); return emit_patch(pprog, func, ip, 0xE8); } -- GitLab From 6a537453000a916392fcac1acb96c1d9d1e05b74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joan=20Bruguera=20Mic=C3=B3?= Date: Mon, 1 Apr 2024 20:55:30 +0200 Subject: [PATCH 244/989] x86/bpf: Fix IP for relocating call depth accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit: 59bec00ace28 ("x86/percpu: Introduce %rip-relative addressing to PER_CPU_VAR()") made PER_CPU_VAR() to use rip-relative addressing, hence INCREMENT_CALL_DEPTH macro and skl_call_thunk_template got rip-relative asm code inside of it. A follow up commit: 17bce3b2ae2d ("x86/callthunks: Handle %rip-relative relocations in call thunk template") changed x86_call_depth_emit_accounting() to use apply_relocation(), but mistakenly assumed that the code is being patched in-place (where the destination of the relocation matches the address of the code), using *pprog as the destination ip. This is not true for the call depth accounting, emitted by the BPF JIT, so the calculated address was wrong, JIT-ed BPF progs on kernels with call depth tracking got broken and usually caused a page fault. Pass the destination IP when the BPF JIT emits call depth accounting. Fixes: 17bce3b2ae2d ("x86/callthunks: Handle %rip-relative relocations in call thunk template") Signed-off-by: Joan Bruguera Micó Reviewed-by: Uros Bizjak Acked-by: Ingo Molnar Cc: Alexei Starovoitov Cc: Daniel Borkmann Link: https://lore.kernel.org/r/20240401185821.224068-3-ubizjak@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/include/asm/alternative.h | 4 ++-- arch/x86/kernel/callthunks.c | 4 ++-- arch/x86/net/bpf_jit_comp.c | 19 ++++++++----------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index fcd20c6dc7f90..67b68d0d17d1e 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -117,7 +117,7 @@ extern void callthunks_patch_builtin_calls(void); extern void callthunks_patch_module_calls(struct callthunk_sites *sites, struct module *mod); extern void *callthunks_translate_call_dest(void *dest); -extern int x86_call_depth_emit_accounting(u8 **pprog, void *func); +extern int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip); #else static __always_inline void callthunks_patch_builtin_calls(void) {} static __always_inline void @@ -128,7 +128,7 @@ static __always_inline void *callthunks_translate_call_dest(void *dest) return dest; } static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, - void *func) + void *func, void *ip) { return 0; } diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 30335182b6b0a..e92ff0c11db81 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -314,7 +314,7 @@ static bool is_callthunk(void *addr) return !bcmp(pad, insn_buff, tmpl_size); } -int x86_call_depth_emit_accounting(u8 **pprog, void *func) +int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip) { unsigned int tmpl_size = SKL_TMPL_SIZE; u8 insn_buff[MAX_PATCH_LEN]; @@ -327,7 +327,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func) return 0; memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, tmpl_size, *pprog, + apply_relocation(insn_buff, tmpl_size, ip, skl_call_thunk_template, tmpl_size); memcpy(*pprog, insn_buff, tmpl_size); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e55745f512e19..df5fac428408f 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -480,7 +480,7 @@ static int emit_call(u8 **pprog, void *func, void *ip) static int emit_rsb_call(u8 **pprog, void *func, void *ip) { OPTIMIZER_HIDE_VAR(func); - ip += x86_call_depth_emit_accounting(pprog, func); + ip += x86_call_depth_emit_accounting(pprog, func, ip); return emit_patch(pprog, func, ip, 0xE8); } @@ -1972,20 +1972,17 @@ populate_extable: /* call */ case BPF_JMP | BPF_CALL: { - int offs; + u8 *ip = image + addrs[i - 1]; func = (u8 *) __bpf_call_base + imm32; if (tail_call_reachable) { RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth); - if (!imm32) - return -EINVAL; - offs = 7 + x86_call_depth_emit_accounting(&prog, func); - } else { - if (!imm32) - return -EINVAL; - offs = x86_call_depth_emit_accounting(&prog, func); + ip += 7; } - if (emit_call(&prog, func, image + addrs[i - 1] + offs)) + if (!imm32) + return -EINVAL; + ip += x86_call_depth_emit_accounting(&prog, func, ip); + if (emit_call(&prog, func, ip)) return -EINVAL; break; } @@ -2835,7 +2832,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im * Direct-call fentry stub, as such it needs accounting for the * __fentry__ call. */ - x86_call_depth_emit_accounting(&prog, NULL); + x86_call_depth_emit_accounting(&prog, NULL, image); } EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ -- GitLab From 96c155943a703f0655c0c4cab540f67055960e91 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Fri, 29 Mar 2024 09:16:31 +0300 Subject: [PATCH 245/989] net: phy: micrel: Fix potential null pointer dereference In lan8814_get_sig_rx() and lan8814_get_sig_tx() ptp_parse_header() may return NULL as ptp_header due to abnormal packet type or corrupted packet. Fix this bug by adding ptp_header check. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy") Signed-off-by: Aleksandr Mishin Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240329061631.33199-1-amishin@t-argos.ru Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 8b8634600c519..0f8a8ad7ea0bd 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -2537,7 +2537,7 @@ static void lan8814_txtstamp(struct mii_timestamper *mii_ts, } } -static void lan8814_get_sig_rx(struct sk_buff *skb, u16 *sig) +static bool lan8814_get_sig_rx(struct sk_buff *skb, u16 *sig) { struct ptp_header *ptp_header; u32 type; @@ -2547,7 +2547,11 @@ static void lan8814_get_sig_rx(struct sk_buff *skb, u16 *sig) ptp_header = ptp_parse_header(skb, type); skb_pull_inline(skb, ETH_HLEN); + if (!ptp_header) + return false; + *sig = (__force u16)(ntohs(ptp_header->sequence_id)); + return true; } static bool lan8814_match_rx_skb(struct kszphy_ptp_priv *ptp_priv, @@ -2559,7 +2563,8 @@ static bool lan8814_match_rx_skb(struct kszphy_ptp_priv *ptp_priv, bool ret = false; u16 skb_sig; - lan8814_get_sig_rx(skb, &skb_sig); + if (!lan8814_get_sig_rx(skb, &skb_sig)) + return ret; /* Iterate over all RX timestamps and match it with the received skbs */ spin_lock_irqsave(&ptp_priv->rx_ts_lock, flags); @@ -2834,7 +2839,7 @@ static int lan8814_ptpci_adjfine(struct ptp_clock_info *ptpci, long scaled_ppm) return 0; } -static void lan8814_get_sig_tx(struct sk_buff *skb, u16 *sig) +static bool lan8814_get_sig_tx(struct sk_buff *skb, u16 *sig) { struct ptp_header *ptp_header; u32 type; @@ -2842,7 +2847,11 @@ static void lan8814_get_sig_tx(struct sk_buff *skb, u16 *sig) type = ptp_classify_raw(skb); ptp_header = ptp_parse_header(skb, type); + if (!ptp_header) + return false; + *sig = (__force u16)(ntohs(ptp_header->sequence_id)); + return true; } static void lan8814_match_tx_skb(struct kszphy_ptp_priv *ptp_priv, @@ -2856,7 +2865,8 @@ static void lan8814_match_tx_skb(struct kszphy_ptp_priv *ptp_priv, spin_lock_irqsave(&ptp_priv->tx_queue.lock, flags); skb_queue_walk_safe(&ptp_priv->tx_queue, skb, skb_tmp) { - lan8814_get_sig_tx(skb, &skb_sig); + if (!lan8814_get_sig_tx(skb, &skb_sig)) + continue; if (memcmp(&skb_sig, &seq_id, sizeof(seq_id))) continue; @@ -2910,7 +2920,8 @@ static bool lan8814_match_skb(struct kszphy_ptp_priv *ptp_priv, spin_lock_irqsave(&ptp_priv->rx_queue.lock, flags); skb_queue_walk_safe(&ptp_priv->rx_queue, skb, skb_tmp) { - lan8814_get_sig_rx(skb, &skb_sig); + if (!lan8814_get_sig_rx(skb, &skb_sig)) + continue; if (memcmp(&skb_sig, &rx_ts->seq_id, sizeof(rx_ts->seq_id))) continue; -- GitLab From 31974122cfdeaf56abc18d8ab740d580d9833e90 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2024 09:05:59 -0700 Subject: [PATCH 246/989] selftests: reuseaddr_conflict: add missing new line at the end of the output The netdev CI runs in a VM and captures serial, so stdout and stderr get combined. Because there's a missing new line in stderr the test ends up corrupting KTAP: # Successok 1 selftests: net: reuseaddr_conflict which should have been: # Success ok 1 selftests: net: reuseaddr_conflict Fixes: 422d8dc6fd3a ("selftest: add a reuseaddr test") Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240329160559.249476-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/reuseaddr_conflict.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/reuseaddr_conflict.c b/tools/testing/selftests/net/reuseaddr_conflict.c index 7c5b12664b03b..bfb07dc495186 100644 --- a/tools/testing/selftests/net/reuseaddr_conflict.c +++ b/tools/testing/selftests/net/reuseaddr_conflict.c @@ -109,6 +109,6 @@ int main(void) fd1 = open_port(0, 1); if (fd1 >= 0) error(1, 0, "Was allowed to create an ipv4 reuseport on an already bound non-reuseport socket with no ipv6"); - fprintf(stderr, "Success"); + fprintf(stderr, "Success\n"); return 0; } -- GitLab From fcf4692fa39e86a590c14a4af2de704e1d20a3b5 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 29 Mar 2024 19:50:36 +0100 Subject: [PATCH 247/989] mptcp: prevent BPF accessing lowat from a subflow socket. Alexei reported the following splat: WARNING: CPU: 32 PID: 3276 at net/mptcp/subflow.c:1430 subflow_data_ready+0x147/0x1c0 Modules linked in: dummy bpf_testmod(O) [last unloaded: bpf_test_no_cfi(O)] CPU: 32 PID: 3276 Comm: test_progs Tainted: GO 6.8.0-12873-g2c43c33bfd23 Call Trace: mptcp_set_rcvlowat+0x79/0x1d0 sk_setsockopt+0x6c0/0x1540 __bpf_setsockopt+0x6f/0x90 bpf_sock_ops_setsockopt+0x3c/0x90 bpf_prog_509ce5db2c7f9981_bpf_test_sockopt_int+0xb4/0x11b bpf_prog_dce07e362d941d2b_bpf_test_socket_sockopt+0x12b/0x132 bpf_prog_348c9b5faaf10092_skops_sockopt+0x954/0xe86 __cgroup_bpf_run_filter_sock_ops+0xbc/0x250 tcp_connect+0x879/0x1160 tcp_v6_connect+0x50c/0x870 mptcp_connect+0x129/0x280 __inet_stream_connect+0xce/0x370 inet_stream_connect+0x36/0x50 bpf_trampoline_6442491565+0x49/0xef inet_stream_connect+0x5/0x50 __sys_connect+0x63/0x90 __x64_sys_connect+0x14/0x20 The root cause of the issue is that bpf allows accessing mptcp-level proto_ops from a tcp subflow scope. Fix the issue detecting the problematic call and preventing any action. Reported-by: Alexei Starovoitov Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/482 Fixes: 5684ab1a0eff ("mptcp: give rcvlowat some love") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Reviewed-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/d8cb7d8476d66cb0812a6e29cd1e626869d9d53e.1711738080.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index dcd1c76d2a3ba..73fdf423de44e 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1493,6 +1493,10 @@ int mptcp_set_rcvlowat(struct sock *sk, int val) struct mptcp_subflow_context *subflow; int space, cap; + /* bpf can land here with a wrong sk type */ + if (sk->sk_protocol == IPPROTO_TCP) + return -EINVAL; + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else -- GitLab From 7a1b3490f47e88ec4cbde65f1a77a0f4bc972282 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 29 Mar 2024 13:08:52 +0100 Subject: [PATCH 248/989] mptcp: don't account accept() of non-MPC client as fallback to TCP Current MPTCP servers increment MPTcpExtMPCapableFallbackACK when they accept non-MPC connections. As reported by Christoph, this is "surprising" because the counter might become greater than MPTcpExtMPCapableSYNRX. MPTcpExtMPCapableFallbackACK counter's name suggests it should only be incremented when a connection was seen using MPTCP options, then a fallback to TCP has been done. Let's do that by incrementing it when the subflow context of an inbound MPC connection attempt is dropped. Also, update mptcp_connect.sh kselftest, to ensure that the above MIB does not increment in case a pure TCP client connects to a MPTCP server. Fixes: fc518953bc9c ("mptcp: add and use MIB counter infrastructure") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/449 Signed-off-by: Davide Caratti Reviewed-by: Mat Martineau Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240329-upstream-net-20240329-fallback-mib-v1-1-324a8981da48@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 2 -- net/mptcp/subflow.c | 2 ++ tools/testing/selftests/net/mptcp/mptcp_connect.sh | 9 +++++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3a1967bc7bad6..7e74b812e366a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3937,8 +3937,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, mptcp_set_state(newsk, TCP_CLOSE); } } else { - MPTCP_INC_STATS(sock_net(ssk), - MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); tcpfallback: newsk->sk_kern_sock = kern; lock_sock(newsk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 1626dd20c68f1..6042a47da61be 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -905,6 +905,8 @@ dispose_child: return child; fallback: + if (fallback) + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); mptcp_subflow_drop_ctx(child); return child; } diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 4c42485548262..4131f3263a482 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -383,12 +383,14 @@ do_transfer() local stat_cookierx_last local stat_csum_err_s local stat_csum_err_c + local stat_tcpfb_last_l stat_synrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") stat_ackrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") stat_cookietx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent") stat_cookierx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv") stat_csum_err_s=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtDataCsumErr") stat_csum_err_c=$(mptcp_lib_get_counter "${connector_ns}" "MPTcpExtDataCsumErr") + stat_tcpfb_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK") timeout ${timeout_test} \ ip netns exec ${listener_ns} \ @@ -457,11 +459,13 @@ do_transfer() local stat_cookietx_now local stat_cookierx_now local stat_ooo_now + local stat_tcpfb_now_l stat_synrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") stat_ackrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") stat_cookietx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent") stat_cookierx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv") stat_ooo_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtTCPOFOQueue") + stat_tcpfb_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK") expect_synrx=$((stat_synrx_last_l)) expect_ackrx=$((stat_ackrx_last_l)) @@ -508,6 +512,11 @@ do_transfer() fi fi + if [ ${stat_ooo_now} -eq 0 ] && [ ${stat_tcpfb_last_l} -ne ${stat_tcpfb_now_l} ]; then + mptcp_lib_pr_fail "unexpected fallback to TCP" + rets=1 + fi + if [ $cookies -eq 2 ];then if [ $stat_cookietx_last -ge $stat_cookietx_now ] ;then extra+=" WARN: CookieSent: did not advance" -- GitLab From 40061817d95bce6dd5634a61a65cd5922e6ccc92 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 29 Mar 2024 13:08:53 +0100 Subject: [PATCH 249/989] selftests: mptcp: join: fix dev in check_endpoint There's a bug in pm_nl_check_endpoint(), 'dev' didn't be parsed correctly. If calling it in the 2nd test of endpoint_tests() too, it fails with an error like this: creation [FAIL] expected '10.0.2.2 id 2 subflow dev dev' \ found '10.0.2.2 id 2 subflow dev ns2eth2' The reason is '$2' should be set to 'dev', not '$1'. This patch fixes it. Fixes: 69c6ce7b6eca ("selftests: mptcp: add implicit endpoint test case") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240329-upstream-net-20240329-fallback-mib-v1-2-324a8981da48@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 5e9211e898256..e4403236f6554 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -729,7 +729,7 @@ pm_nl_check_endpoint() [ -n "$_flags" ]; flags="flags $_flags" shift elif [ $1 = "dev" ]; then - [ -n "$2" ]; dev="dev $1" + [ -n "$2" ]; dev="dev $2" shift elif [ $1 = "id" ]; then _id=$2 @@ -3610,6 +3610,8 @@ endpoint_tests() local tests_pid=$! wait_mpj $ns2 + pm_nl_check_endpoint "creation" \ + $ns2 10.0.2.2 id 2 flags subflow dev ns2eth2 chk_subflow_nr "before delete" 2 chk_mptcp_info subflows 1 subflows 1 -- GitLab From ea2a1cfc3b2019bdea6324acd3c03606b60d71ad Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 29 Mar 2024 11:06:37 -0700 Subject: [PATCH 250/989] i40e: Fix VF MAC filter removal Commit 73d9629e1c8c ("i40e: Do not allow untrusted VF to remove administratively set MAC") fixed an issue where untrusted VF was allowed to remove its own MAC address although this was assigned administratively from PF. Unfortunately the introduced check is wrong because it causes that MAC filters for other MAC addresses including multi-cast ones are not removed. if (ether_addr_equal(addr, vf->default_lan_addr.addr) && i40e_can_vf_change_mac(vf)) was_unimac_deleted = true; else continue; if (i40e_del_mac_filter(vsi, al->list[i].addr)) { ... The else path with `continue` effectively skips any MAC filter removal except one for primary MAC addr when VF is allowed to do so. Fix the check condition so the `continue` is only done for primary MAC address. Fixes: 73d9629e1c8c ("i40e: Do not allow untrusted VF to remove administratively set MAC") Signed-off-by: Ivan Vecera Reviewed-by: Michal Schmidt Reviewed-by: Brett Creeley Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20240329180638.211412-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 4c13f78af6756..232b65b9c8eac 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3137,11 +3137,12 @@ static int i40e_vc_del_mac_addr_msg(struct i40e_vf *vf, u8 *msg) /* Allow to delete VF primary MAC only if it was not set * administratively by PF or if VF is trusted. */ - if (ether_addr_equal(addr, vf->default_lan_addr.addr) && - i40e_can_vf_change_mac(vf)) - was_unimac_deleted = true; - else - continue; + if (ether_addr_equal(addr, vf->default_lan_addr.addr)) { + if (i40e_can_vf_change_mac(vf)) + was_unimac_deleted = true; + else + continue; + } if (i40e_del_mac_filter(vsi, al->list[i].addr)) { ret = -EINVAL; -- GitLab From c42cd606e4f004e9ba36a05b9adb9e4eead5834a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Apr 2024 01:03:58 -0400 Subject: [PATCH 251/989] bcachefs: fix nocow lock deadlock Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index b564404dad711..34731ee0217f6 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -580,8 +580,7 @@ int bch2_data_update_init(struct btree_trans *trans, move_ctxt_wait_event(ctxt, (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, PTR_BUCKET_POS(c, &p.ptr), 0)) || - (!atomic_read(&ctxt->read_sectors) && - !atomic_read(&ctxt->write_sectors))); + list_empty(&ctxt->ios)); if (!locked) bch2_bucket_nocow_lock(&c->nocow_locks, -- GitLab From c032cdd48b29549e8283c2fea99e7d91ddefebf7 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 26 Mar 2024 10:58:15 +0200 Subject: [PATCH 252/989] thunderbolt: Do not create DisplayPort tunnels on adapters of the same router Probably due to a firmware bug Dell TB16 dock announces that one of its DisplayPort adapters is actually DP IN. Now this is possible and used with some external GPUs but not likely in this case as we are dealing with a dock. Anyways the problem is that the driver tries to create a DisplayPort tunnel between adapters of the same router which then shows to user that there is no picture on the display (because there are no available DP OUT adapters on the dock anymore). Fix this by not creating DisplayPort tunnels between adapters that are on the same router. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10265 Fixes: 274baf695b08 ("thunderbolt: Add DP IN added last in the head of the list of DP resources") Cc: Gil Fine Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/tb.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c index c5ce7a694b27d..360cb95f39aab 100644 --- a/drivers/thunderbolt/tb.c +++ b/drivers/thunderbolt/tb.c @@ -1801,6 +1801,12 @@ static struct tb_port *tb_find_dp_out(struct tb *tb, struct tb_port *in) continue; } + /* Needs to be on different routers */ + if (in->sw == port->sw) { + tb_port_dbg(port, "skipping DP OUT on same router\n"); + continue; + } + tb_port_dbg(port, "DP OUT available\n"); /* -- GitLab From 03f56ed4ead162551ac596c9e3076ff01f1c5836 Mon Sep 17 00:00:00 2001 From: Oswald Buddenhagen Date: Mon, 1 Apr 2024 16:58:05 +0200 Subject: [PATCH 253/989] Revert "ALSA: emu10k1: fix synthesizer sample playback position and caching" As already anticipated in the original commit, playback was broken for very short samples. I just didn't expect it to be an actual problem, because we're talking about less than 1.5 milliseconds here. But clearly such wavetable samples do actually exist. The problem was that for such short samples we'd set the current position beyond the end of the loop, so we'd run off the end of the sample and play garbage. This is a bigger (more audible) problem than the original one, which was that we'd start playback with garbage (whatever was still in the cache), which would be mostly masked by the note's attack phase. So revert to the old behavior for now. We'll subsequently fix it properly with a bigger patch series. Note that this isn't a full revert - the dead code is not re-introduced, because that would be silly. Fixes: df335e9a8bcb ("ALSA: emu10k1: fix synthesizer sample playback position and caching") Link: https://bugzilla.kernel.org/show_bug.cgi?id=218625 Signed-off-by: Oswald Buddenhagen Message-ID: <20240401145805.528794-1-oswald.buddenhagen@gmx.de> Signed-off-by: Takashi Iwai --- sound/pci/emu10k1/emu10k1_callback.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sound/pci/emu10k1/emu10k1_callback.c b/sound/pci/emu10k1/emu10k1_callback.c index d36234b88fb42..941bfbf812ed3 100644 --- a/sound/pci/emu10k1/emu10k1_callback.c +++ b/sound/pci/emu10k1/emu10k1_callback.c @@ -255,7 +255,7 @@ lookup_voices(struct snd_emux *emu, struct snd_emu10k1 *hw, /* check if sample is finished playing (non-looping only) */ if (bp != best + V_OFF && bp != best + V_FREE && (vp->reg.sample_mode & SNDRV_SFNT_SAMPLE_SINGLESHOT)) { - val = snd_emu10k1_ptr_read(hw, CCCA_CURRADDR, vp->ch) - 64; + val = snd_emu10k1_ptr_read(hw, CCCA_CURRADDR, vp->ch); if (val >= vp->reg.loopstart) bp = best + V_OFF; } @@ -362,7 +362,7 @@ start_voice(struct snd_emux_voice *vp) map = (hw->silent_page.addr << hw->address_mode) | (hw->address_mode ? MAP_PTI_MASK1 : MAP_PTI_MASK0); - addr = vp->reg.start + 64; + addr = vp->reg.start; temp = vp->reg.parm.filterQ; ccca = (temp << 28) | addr; if (vp->apitch < 0xe400) @@ -430,9 +430,6 @@ start_voice(struct snd_emux_voice *vp) /* Q & current address (Q 4bit value, MSB) */ CCCA, ccca, - /* cache */ - CCR, REG_VAL_PUT(CCR_CACHEINVALIDSIZE, 64), - /* reset volume */ VTFT, vtarget | vp->ftarget, CVCF, vtarget | CVCF_CURRENTFILTER_MASK, -- GitLab From b67a7dc418aabbddec41c752ac29b6fa0250d0a8 Mon Sep 17 00:00:00 2001 From: Christian Bendiksen Date: Mon, 1 Apr 2024 12:26:10 +0000 Subject: [PATCH 254/989] ALSA: hda/realtek: Add sound quirks for Lenovo Legion slim 7 16ARHA7 models This fixes the sound not working from internal speakers on Lenovo Legion Slim 7 16ARHA7 models. The correct subsystem ID have been added to cs35l41_hda_property.c and patch_realtek.c. Signed-off-by: Christian Bendiksen Cc: Message-ID: <20240401122603.6634-1-christian@bendiksen.me> Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda_property.c | 4 ++++ sound/pci/hda/patch_realtek.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index 72ec872afb8d2..d6ea3ab72f752 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -109,6 +109,8 @@ static const struct cs35l41_config cs35l41_config_table[] = { { "10431F1F", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 0, 0, 0 }, { "10431F62", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, { "17AA386F", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, -1, -1, 0, 0, 0 }, + { "17AA3877", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, + { "17AA3878", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA38A9", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, { "17AA38AB", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, { "17AA38B4", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, @@ -497,6 +499,8 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CSC3551", "10431F1F", generic_dsd_config }, { "CSC3551", "10431F62", generic_dsd_config }, { "CSC3551", "17AA386F", generic_dsd_config }, + { "CSC3551", "17AA3877", generic_dsd_config }, + { "CSC3551", "17AA3878", generic_dsd_config }, { "CSC3551", "17AA38A9", generic_dsd_config }, { "CSC3551", "17AA38AB", generic_dsd_config }, { "CSC3551", "17AA38B4", generic_dsd_config }, diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e866b8a75cda3..405620bd543c1 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10384,6 +10384,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3869, "Lenovo Yoga7 14IAL7", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), SND_PCI_QUIRK(0x17aa, 0x386f, "Legion 7i 16IAX7", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x3870, "Lenovo Yoga 7 14ARB7", ALC287_FIXUP_YOGA7_14ARB7_I2C), + SND_PCI_QUIRK(0x17aa, 0x3877, "Lenovo Legion 7 Slim 16ARHA7", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x3878, "Lenovo Legion 7 Slim 16ARHA7", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x387d, "Yoga S780-16 pro Quad AAC", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x387e, "Yoga S780-16 pro Quad YC", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x3881, "YB9 dual power mode2 YC", ALC287_FIXUP_TAS2781_I2C), -- GitLab From 1576f263ee2147dc395531476881058609ad3d38 Mon Sep 17 00:00:00 2001 From: I Gede Agastya Darma Laksana Date: Tue, 2 Apr 2024 00:46:02 +0700 Subject: [PATCH 255/989] ALSA: hda/realtek: Update Panasonic CF-SZ6 quirk to support headset with microphone This patch addresses an issue with the Panasonic CF-SZ6's existing quirk, specifically its headset microphone functionality. Previously, the quirk used ALC269_FIXUP_HEADSET_MODE, which does not support the CF-SZ6's design of a single 3.5mm jack for both mic and audio output effectively. The device uses pin 0x19 for the headset mic without jack detection. Following verification on the CF-SZ6 and discussions with the original patch author, i determined that the update to ALC269_FIXUP_ASPIRE_HEADSET_MIC is the appropriate solution. This change is custom-designed for the CF-SZ6's unique hardware setup, which includes a single 3.5mm jack for both mic and audio output, connecting the headset microphone to pin 0x19 without the use of jack detection. Fixes: 0fca97a29b83 ("ALSA: hda/realtek - Add Panasonic CF-SZ6 headset jack quirk") Signed-off-by: I Gede Agastya Darma Laksana Cc: Message-ID: <20240401174602.14133-1-gedeagas22@gmail.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 405620bd543c1..e1729d209922d 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10210,7 +10210,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x10ec, 0x1254, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0x10ec, 0x12cc, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0x10ec, 0x12f6, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), - SND_PCI_QUIRK(0x10f7, 0x8338, "Panasonic CF-SZ6", ALC269_FIXUP_HEADSET_MODE), + SND_PCI_QUIRK(0x10f7, 0x8338, "Panasonic CF-SZ6", ALC269_FIXUP_ASPIRE_HEADSET_MIC), SND_PCI_QUIRK(0x144d, 0xc109, "Samsung Ativ book 9 (NP900X3G)", ALC269_FIXUP_INV_DMIC), SND_PCI_QUIRK(0x144d, 0xc169, "Samsung Notebook 9 Pen (NP930SBE-K01US)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc176, "Samsung Notebook 9 Pro (NP930MBE-K04US)", ALC298_FIXUP_SAMSUNG_AMP), -- GitLab From 0bfe105018bd2d7b1e4373193d9b55b37cf4458b Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Tue, 2 Apr 2024 14:51:26 +1300 Subject: [PATCH 256/989] ALSA: hda/realtek: cs35l41: Support ASUS ROG G634JYR Fixes the realtek quirk to initialise the Cirrus amp correctly and adds related quirk for missing DSD properties. This model laptop has slightly updated internals compared to the previous version with Realtek Codec ID of 0x1caf. Signed-off-by: Luke D. Jones Cc: Message-ID: <20240402015126.21115-1-luke@ljones.dev> Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda_property.c | 2 ++ sound/pci/hda/patch_realtek.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index d6ea3ab72f752..8fb688e414148 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -108,6 +108,7 @@ static const struct cs35l41_config cs35l41_config_table[] = { { "10431F12", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, { "10431F1F", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 0, 0, 0 }, { "10431F62", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, + { "10433A60", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, { "17AA386F", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, -1, -1, 0, 0, 0 }, { "17AA3877", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA3878", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, @@ -498,6 +499,7 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CSC3551", "10431F12", generic_dsd_config }, { "CSC3551", "10431F1F", generic_dsd_config }, { "CSC3551", "10431F62", generic_dsd_config }, + { "CSC3551", "10433A60", generic_dsd_config }, { "CSC3551", "17AA386F", generic_dsd_config }, { "CSC3551", "17AA3877", generic_dsd_config }, { "CSC3551", "17AA3878", generic_dsd_config }, diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e1729d209922d..cdcb28aa9d7bf 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10184,7 +10184,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x3a30, "ASUS G814JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x3a40, "ASUS G814JZR", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x3a50, "ASUS G834JYR/JZR", ALC245_FIXUP_CS35L41_SPI_2), - SND_PCI_QUIRK(0x1043, 0x3a60, "ASUS G634JYR/JZR", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x3a60, "ASUS G634JYR/JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), SND_PCI_QUIRK(0x1043, 0x831a, "ASUS P901", ALC269_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x1043, 0x834a, "ASUS S101", ALC269_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x1043, 0x8398, "ASUS P1005", ALC269_FIXUP_STEREO_DMIC), -- GitLab From c6ddd6e7b166532a0816825442ff60f70aed9647 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 22 Mar 2024 12:47:05 -0400 Subject: [PATCH 257/989] arm64: dts: imx8-ss-conn: fix usdhc wrong lpcg clock order The actual clock show wrong frequency: echo on >/sys/devices/platform/bus\@5b000000/5b010000.mmc/power/control cat /sys/kernel/debug/mmc0/ios clock: 200000000 Hz actual clock: 166000000 Hz ^^^^^^^^^ ..... According to sdhc0_lpcg: clock-controller@5b200000 { compatible = "fsl,imx8qxp-lpcg"; reg = <0x5b200000 0x10000>; #clock-cells = <1>; clocks = <&clk IMX_SC_R_SDHC_0 IMX_SC_PM_CLK_PER>, <&conn_ipg_clk>, <&conn_axi_clk>; clock-indices = , , ; clock-output-names = "sdhc0_lpcg_per_clk", "sdhc0_lpcg_ipg_clk", "sdhc0_lpcg_ahb_clk"; power-domains = <&pd IMX_SC_R_SDHC_0>; } "per_clk" should be IMX_LPCG_CLK_0 instead of IMX_LPCG_CLK_5. After correct clocks order: echo on >/sys/devices/platform/bus\@5b000000/5b010000.mmc/power/control cat /sys/kernel/debug/mmc0/ios clock: 200000000 Hz actual clock: 198000000 Hz ^^^^^^^^ ... Fixes: 16c4ea7501b1 ("arm64: dts: imx8: switch to new lpcg clock binding") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi index 3c42240e78e24..af2259e997967 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi @@ -67,8 +67,8 @@ conn_subsys: bus@5b000000 { interrupts = ; reg = <0x5b010000 0x10000>; clocks = <&sdhc0_lpcg IMX_LPCG_CLK_4>, - <&sdhc0_lpcg IMX_LPCG_CLK_0>, - <&sdhc0_lpcg IMX_LPCG_CLK_5>; + <&sdhc0_lpcg IMX_LPCG_CLK_5>, + <&sdhc0_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "ahb", "per"; power-domains = <&pd IMX_SC_R_SDHC_0>; status = "disabled"; @@ -78,8 +78,8 @@ conn_subsys: bus@5b000000 { interrupts = ; reg = <0x5b020000 0x10000>; clocks = <&sdhc1_lpcg IMX_LPCG_CLK_4>, - <&sdhc1_lpcg IMX_LPCG_CLK_0>, - <&sdhc1_lpcg IMX_LPCG_CLK_5>; + <&sdhc1_lpcg IMX_LPCG_CLK_5>, + <&sdhc1_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "ahb", "per"; power-domains = <&pd IMX_SC_R_SDHC_1>; fsl,tuning-start-tap = <20>; @@ -91,8 +91,8 @@ conn_subsys: bus@5b000000 { interrupts = ; reg = <0x5b030000 0x10000>; clocks = <&sdhc2_lpcg IMX_LPCG_CLK_4>, - <&sdhc2_lpcg IMX_LPCG_CLK_0>, - <&sdhc2_lpcg IMX_LPCG_CLK_5>; + <&sdhc2_lpcg IMX_LPCG_CLK_5>, + <&sdhc2_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "ahb", "per"; power-domains = <&pd IMX_SC_R_SDHC_2>; status = "disabled"; -- GitLab From c4e51e424e2c772ce1836912a8b0b87cd61bc9d5 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 2 Apr 2024 08:36:25 +0200 Subject: [PATCH 258/989] ALSA: line6: Zero-initialize message buffers For shutting up spurious KMSAN uninit-value warnings, just replace kmalloc() calls with kzalloc() for the buffers used for communications. There should be no real issue with the original code, but it's still better to cover. Reported-by: syzbot+7fb05ccf7b3d2f9617b3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/00000000000084b18706150bcca5@google.com Message-ID: <20240402063628.26609-1-tiwai@suse.de> Signed-off-by: Takashi Iwai --- sound/usb/line6/driver.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/usb/line6/driver.c b/sound/usb/line6/driver.c index b67617b68e509..f4437015d43a7 100644 --- a/sound/usb/line6/driver.c +++ b/sound/usb/line6/driver.c @@ -202,7 +202,7 @@ int line6_send_raw_message_async(struct usb_line6 *line6, const char *buffer, struct urb *urb; /* create message: */ - msg = kmalloc(sizeof(struct message), GFP_ATOMIC); + msg = kzalloc(sizeof(struct message), GFP_ATOMIC); if (msg == NULL) return -ENOMEM; @@ -688,7 +688,7 @@ static int line6_init_cap_control(struct usb_line6 *line6) int ret; /* initialize USB buffers: */ - line6->buffer_listen = kmalloc(LINE6_BUFSIZE_LISTEN, GFP_KERNEL); + line6->buffer_listen = kzalloc(LINE6_BUFSIZE_LISTEN, GFP_KERNEL); if (!line6->buffer_listen) return -ENOMEM; @@ -697,7 +697,7 @@ static int line6_init_cap_control(struct usb_line6 *line6) return -ENOMEM; if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { - line6->buffer_message = kmalloc(LINE6_MIDI_MESSAGE_MAXLEN, GFP_KERNEL); + line6->buffer_message = kzalloc(LINE6_MIDI_MESSAGE_MAXLEN, GFP_KERNEL); if (!line6->buffer_message) return -ENOMEM; -- GitLab From 73eaa2b583493b680c6f426531d6736c39643bfb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Apr 2024 15:16:19 -0600 Subject: [PATCH 259/989] io_uring: use private workqueue for exit work Rather than use the system unbound event workqueue, use an io_uring specific one. This avoids dependencies with the tty, which also uses the system_unbound_wq, and issues flushes of said workqueue from inside its poll handling. Cc: stable@vger.kernel.org Reported-by: Rasmus Karlsson Tested-by: Rasmus Karlsson Tested-by: Iskren Chernev Link: https://github.com/axboe/liburing/issues/1113 Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8baf8afb79c2a..d1defb99b89ee 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -147,6 +147,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_queue_sqe(struct io_kiocb *req); struct kmem_cache *req_cachep; +static struct workqueue_struct *iou_wq __ro_after_init; static int __read_mostly sysctl_io_uring_disabled; static int __read_mostly sysctl_io_uring_group = -1; @@ -3166,7 +3167,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) * noise and overhead, there's no discernable change in runtime * over using system_wq. */ - queue_work(system_unbound_wq, &ctx->exit_work); + queue_work(iou_wq, &ctx->exit_work); } static int io_uring_release(struct inode *inode, struct file *file) @@ -4190,6 +4191,8 @@ static int __init io_uring_init(void) io_buf_cachep = KMEM_CACHE(io_buffer, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); + iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64); + #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); #endif -- GitLab From fbbd5d3ad9435748b8ae6451bc004ee9ac49b6b7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 30 Mar 2024 09:53:00 +0900 Subject: [PATCH 260/989] nullblk: Fix cleanup order in null_add_dev() error path In null_add_dev(), if an error happen after initializing the resources for a zoned null block device, we must free these resources before exiting the function. To ensure this, move the out_cleanup_zone label after out_cleanup_disk as we jump to this latter label if an error happens after calling null_init_zoned_dev(). Fixes: e440626b1caf ("null_blk: pass queue_limits to blk_mq_alloc_disk") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240330005300.1503252-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 71c39bcd872c7..ed33cf7192d21 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1965,10 +1965,10 @@ static int null_add_dev(struct nullb_device *dev) out_ida_free: ida_free(&nullb_indexes, nullb->index); -out_cleanup_zone: - null_free_zoned_dev(dev); out_cleanup_disk: put_disk(nullb->disk); +out_cleanup_zone: + null_free_zoned_dev(dev); out_cleanup_tags: if (nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); -- GitLab From 22d24a544b0d49bbcbd61c8c0eaf77d3c9297155 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 29 Mar 2024 09:23:19 +0800 Subject: [PATCH 261/989] block: fix overflow in blk_ioctl_discard() There is no check for overflow of 'start + len' in blk_ioctl_discard(). Hung task occurs if submit an discard ioctl with the following param: start = 0x80000000000ff000, len = 0x8000000000fff000; Add the overflow validation now. Signed-off-by: Li Nan Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240329012319.2034550-1-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- block/ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 0c76137adcaaa..a9028a2c2db57 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -96,7 +96,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2]; - uint64_t start, len; + uint64_t start, len, end; struct inode *inode = bdev->bd_inode; int err; @@ -117,7 +117,8 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, if (len & 511) return -EINVAL; - if (start + len > bdev_nr_bytes(bdev)) + if (check_add_overflow(start, len, &end) || + end > bdev_nr_bytes(bdev)) return -EINVAL; filemap_invalidate_lock(inode->i_mapping); -- GitLab From c1832f67035dc04fb89e6b591b64e4d515843cda Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 31 Mar 2024 21:58:26 +0900 Subject: [PATCH 262/989] ksmbd: don't send oplock break if rename fails Don't send oplock break if rename fails. This patch fix smb2.oplock.batch20 test. Cc: stable@vger.kernel.org Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index d478fa0c57abd..5723bbf372d7c 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -5857,8 +5857,9 @@ static int smb2_rename(struct ksmbd_work *work, if (!file_info->ReplaceIfExists) flags = RENAME_NOREPLACE; - smb_break_all_levII_oplock(work, fp, 0); rc = ksmbd_vfs_rename(work, &fp->filp->f_path, new_name, flags); + if (!rc) + smb_break_all_levII_oplock(work, fp, 0); out: kfree(new_name); return rc; -- GitLab From a677ebd8ca2f2632ccdecbad7b87641274e15aac Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 31 Mar 2024 21:59:10 +0900 Subject: [PATCH 263/989] ksmbd: validate payload size in ipc response If installing malicious ksmbd-tools, ksmbd.mountd can return invalid ipc response to ksmbd kernel server. ksmbd should validate payload size of ipc response from ksmbd.mountd to avoid memory overrun or slab-out-of-bounds. This patch validate 3 ipc response that has payload. Cc: stable@vger.kernel.org Reported-by: Chao Ma Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/ksmbd_netlink.h | 3 ++- fs/smb/server/mgmt/share_config.c | 7 +++++- fs/smb/server/transport_ipc.c | 37 +++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index 8ca8a45c4c621..686b321c5a8bb 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -167,7 +167,8 @@ struct ksmbd_share_config_response { __u16 force_uid; __u16 force_gid; __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME]; - __u32 reserved[112]; /* Reserved room */ + __u32 reserved[111]; /* Reserved room */ + __u32 payload_sz; __u32 veto_list_sz; __s8 ____payload[]; }; diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c index 328a412259dc1..a2f0a2edceb8a 100644 --- a/fs/smb/server/mgmt/share_config.c +++ b/fs/smb/server/mgmt/share_config.c @@ -158,7 +158,12 @@ static struct ksmbd_share_config *share_config_request(struct unicode_map *um, share->name = kstrdup(name, GFP_KERNEL); if (!test_share_config_flag(share, KSMBD_SHARE_FLAG_PIPE)) { - share->path = kstrdup(ksmbd_share_config_path(resp), + int path_len = PATH_MAX; + + if (resp->payload_sz) + path_len = resp->payload_sz - resp->veto_list_sz; + + share->path = kstrndup(ksmbd_share_config_path(resp), path_len, GFP_KERNEL); if (share->path) share->path_sz = strlen(share->path); diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index f29bb03f0dc47..8752ac82c557b 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -65,6 +65,7 @@ struct ipc_msg_table_entry { struct hlist_node ipc_table_hlist; void *response; + unsigned int msg_sz; }; static struct delayed_work ipc_timer_work; @@ -275,6 +276,7 @@ static int handle_response(int type, void *payload, size_t sz) } memcpy(entry->response, payload, sz); + entry->msg_sz = sz; wake_up_interruptible(&entry->wait); ret = 0; break; @@ -453,6 +455,34 @@ out: return ret; } +static int ipc_validate_msg(struct ipc_msg_table_entry *entry) +{ + unsigned int msg_sz = entry->msg_sz; + + if (entry->type == KSMBD_EVENT_RPC_REQUEST) { + struct ksmbd_rpc_command *resp = entry->response; + + msg_sz = sizeof(struct ksmbd_rpc_command) + resp->payload_sz; + } else if (entry->type == KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST) { + struct ksmbd_spnego_authen_response *resp = entry->response; + + msg_sz = sizeof(struct ksmbd_spnego_authen_response) + + resp->session_key_len + resp->spnego_blob_len; + } else if (entry->type == KSMBD_EVENT_SHARE_CONFIG_REQUEST) { + struct ksmbd_share_config_response *resp = entry->response; + + if (resp->payload_sz) { + if (resp->payload_sz < resp->veto_list_sz) + return -EINVAL; + + msg_sz = sizeof(struct ksmbd_share_config_response) + + resp->payload_sz; + } + } + + return entry->msg_sz != msg_sz ? -EINVAL : 0; +} + static void *ipc_msg_send_request(struct ksmbd_ipc_msg *msg, unsigned int handle) { struct ipc_msg_table_entry entry; @@ -477,6 +507,13 @@ static void *ipc_msg_send_request(struct ksmbd_ipc_msg *msg, unsigned int handle ret = wait_event_interruptible_timeout(entry.wait, entry.response != NULL, IPC_WAIT_TIMEOUT); + if (entry.response) { + ret = ipc_validate_msg(&entry); + if (ret) { + kvfree(entry.response); + entry.response = NULL; + } + } out: down_write(&ipc_msg_table_lock); hash_del(&entry.ipc_table_hlist); -- GitLab From 5ed11af19e56f0434ce0959376d136005745a936 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 2 Apr 2024 09:31:22 +0900 Subject: [PATCH 264/989] ksmbd: do not set SMB2_GLOBAL_CAP_ENCRYPTION for SMB 3.1.1 SMB2_GLOBAL_CAP_ENCRYPTION flag should be used only for 3.0 and 3.0.2 dialects. This flags set cause compatibility problems with other SMB clients. Reported-by: James Christopher Adduono Tested-by: James Christopher Adduono Cc: stable@vger.kernel.org Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2ops.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/smb/server/smb2ops.c b/fs/smb/server/smb2ops.c index a45f7dca482e0..606aa3c5189a2 100644 --- a/fs/smb/server/smb2ops.c +++ b/fs/smb/server/smb2ops.c @@ -228,6 +228,11 @@ void init_smb3_0_server(struct ksmbd_conn *conn) conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION) conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || + (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && + conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL; } @@ -278,11 +283,6 @@ int init_smb3_11_server(struct ksmbd_conn *conn) conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_DIRECTORY_LEASING; - if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || - (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && - conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)) - conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; - if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL; -- GitLab From d725ce9d7c78fb4e22c6c7676106e135ade14fa8 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Mon, 11 Mar 2024 16:56:26 +0200 Subject: [PATCH 265/989] drm/i915/dp: Fix DSC state HW readout for SST connectors Commit 0848814aa296 ("drm/i915/dp: Fix connector DSC HW state readout") moved the DSC HW state readout to a connector specific hook, however only added the hook for DP MST connectors, not for DP SST ones. Fix adding the hook for SST connectors as well. This fixes the following warn on platforms where BIOS enables DSC: [ 66.208601] i915 0000:00:02.0: drm_WARN_ON(!connector->dp.dsc_decompression_aux || !connector->dp.dsc_decompression_enabled) ... [ 66.209024] RIP: 0010:intel_dp_sink_disable_decompression+0x76/0x110 [i915] ... [ 66.209333] ? intel_dp_sink_disable_decompression+0x76/0x110 [i915] ... [ 66.210068] intel_disable_ddi+0x135/0x1d0 [i915] [ 66.210302] intel_encoders_disable+0x9b/0xc0 [i915] [ 66.210565] hsw_crtc_disable+0x153/0x170 [i915] [ 66.210823] intel_old_crtc_state_disables+0x52/0xb0 [i915] [ 66.211107] intel_atomic_commit_tail+0x5cf/0x1330 [i915] [ 66.211366] intel_atomic_commit+0x39d/0x3f0 [i915] [ 66.211612] ? intel_atomic_commit+0x39d/0x3f0 [i915] [ 66.211872] drm_atomic_commit+0x9d/0xd0 [drm] [ 66.211921] ? __pfx___drm_printfn_info+0x10/0x10 [drm] [ 66.211975] intel_initial_commit+0x1a8/0x260 [i915] [ 66.212234] intel_display_driver_probe+0x2a/0x80 [i915] [ 66.212479] i915_driver_probe+0x7c6/0xc60 [i915] [ 66.212664] ? drm_privacy_screen_get+0x168/0x190 [drm] [ 66.212711] i915_pci_probe+0xe2/0x1c0 [i915] Fixes: 0848814aa296 ("drm/i915/dp: Fix connector DSC HW state readout") Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10410 Cc: Ankit Nautiyal Reviewed-by: Ankit Nautiyal Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20240311145626.2454923-1-imre.deak@intel.com (cherry picked from commit 7a51a2aa2384ea8bee76698ae586a2bea5b8ddb5) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index f98ef4b42a448..af7ca00e9bc0a 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -6557,6 +6557,7 @@ intel_dp_init_connector(struct intel_digital_port *dig_port, intel_connector->get_hw_state = intel_ddi_connector_get_hw_state; else intel_connector->get_hw_state = intel_connector_get_hw_state; + intel_connector->sync_state = intel_dp_connector_sync_state; if (!intel_edp_init_connector(intel_dp, intel_connector)) { intel_dp_aux_fini(intel_dp); -- GitLab From caf3d748f646889425312897e81307441160d485 Mon Sep 17 00:00:00 2001 From: Arun R Murthy Date: Wed, 28 Feb 2024 20:13:50 +0530 Subject: [PATCH 266/989] drm/i915/dp: Remove support for UHBR13.5 UHBR13.5 is not supported in MTL and also the DP2.1 spec says UHBR13.5 is optional. Hence removing UHBR135 from the supported link rates. v2: Reframed the commit message and added link to the issue. Signed-off-by: Arun R Murthy Fixes: 62618c7f117e ("drm/i915/mtl: C20 PLL programming") Reviewed-by: Jani Nikula Signed-off-by: Animesh Manna Link: https://patchwork.freedesktop.org/patch/msgid/20240228144350.3184930-1-arun.r.murthy@intel.com (cherry picked from commit ddf8a8bbb5643265883bab0c59adf0648422c4bb) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index af7ca00e9bc0a..4016cf6b6c618 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -499,7 +499,7 @@ intel_dp_set_source_rates(struct intel_dp *intel_dp) /* The values must be in increasing order */ static const int mtl_rates[] = { 162000, 216000, 243000, 270000, 324000, 432000, 540000, 675000, - 810000, 1000000, 1350000, 2000000, + 810000, 1000000, 2000000, }; static const int icl_rates[] = { 162000, 216000, 270000, 324000, 432000, 540000, 648000, 810000, -- GitLab From e9e62243a3e2322cf639f653a0b0a88a76446ce7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Apr 2024 10:11:35 +0100 Subject: [PATCH 267/989] cifs: Fix caching to try to do open O_WRONLY as rdwr on server When we're engaged in local caching of a cifs filesystem, we cannot perform caching of a partially written cache granule unless we can read the rest of the granule. This can result in unexpected access errors being reported to the user. Fix this by the following: if a file is opened O_WRONLY locally, but the mount was given the "-o fsc" flag, try first opening the remote file with GENERIC_READ|GENERIC_WRITE and if that returns -EACCES, try dropping the GENERIC_READ and doing the open again. If that last succeeds, invalidate the cache for that file as for O_DIRECT. Fixes: 70431bfd825d ("cifs: Support fscache indexing rewrite") Signed-off-by: David Howells cc: Steve French cc: Shyam Prasad N cc: Rohith Surabattula cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/dir.c | 15 +++++++++++++ fs/smb/client/file.c | 48 ++++++++++++++++++++++++++++++++--------- fs/smb/client/fscache.h | 6 ++++++ 3 files changed, 59 insertions(+), 10 deletions(-) diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index d11dc3aa458ba..864b194dbaa0a 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -189,6 +189,7 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int disposition; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; + int rdwr_for_fscache = 0; *oplock = 0; if (tcon->ses->server->oplocks) @@ -200,6 +201,10 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned return PTR_ERR(full_path); } + /* If we're caching, we need to be able to fill in around partial writes. */ + if (cifs_fscache_enabled(inode) && (oflags & O_ACCMODE) == O_WRONLY) + rdwr_for_fscache = 1; + #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open && (CIFS_UNIX_POSIX_PATH_OPS_CAP & @@ -276,6 +281,8 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned desired_access |= GENERIC_READ; /* is this too little? */ if (OPEN_FMODE(oflags) & FMODE_WRITE) desired_access |= GENERIC_WRITE; + if (rdwr_for_fscache == 1) + desired_access |= GENERIC_READ; disposition = FILE_OVERWRITE_IF; if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) @@ -304,6 +311,7 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned if (!tcon->unix_ext && (mode & S_IWUGO) == 0) create_options |= CREATE_OPTION_READONLY; +retry_open: oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, @@ -317,8 +325,15 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned rc = server->ops->open(xid, &oparms, oplock, buf); if (rc) { cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc); + if (rc == -EACCES && rdwr_for_fscache == 1) { + desired_access &= ~GENERIC_READ; + rdwr_for_fscache = 2; + goto retry_open; + } goto out; } + if (rdwr_for_fscache == 2) + cifs_invalidate_cache(inode, FSCACHE_INVAL_DIO_WRITE); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 16aadce492b2e..ab536ce8a04a5 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -206,12 +206,12 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) */ } -static inline int cifs_convert_flags(unsigned int flags) +static inline int cifs_convert_flags(unsigned int flags, int rdwr_for_fscache) { if ((flags & O_ACCMODE) == O_RDONLY) return GENERIC_READ; else if ((flags & O_ACCMODE) == O_WRONLY) - return GENERIC_WRITE; + return rdwr_for_fscache == 1 ? (GENERIC_READ | GENERIC_WRITE) : GENERIC_WRITE; else if ((flags & O_ACCMODE) == O_RDWR) { /* GENERIC_ALL is too much permission to request can cause unnecessary access denied on create */ @@ -348,11 +348,16 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_ int create_options = CREATE_NOT_DIR; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; + int rdwr_for_fscache = 0; if (!server->ops->open) return -ENOSYS; - desired_access = cifs_convert_flags(f_flags); + /* If we're caching, we need to be able to fill in around partial writes. */ + if (cifs_fscache_enabled(inode) && (f_flags & O_ACCMODE) == O_WRONLY) + rdwr_for_fscache = 1; + + desired_access = cifs_convert_flags(f_flags, rdwr_for_fscache); /********************************************************************* * open flag mapping table: @@ -389,6 +394,7 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_ if (f_flags & O_DIRECT) create_options |= CREATE_NO_BUFFER; +retry_open: oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, @@ -400,8 +406,16 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_ }; rc = server->ops->open(xid, &oparms, oplock, buf); - if (rc) + if (rc) { + if (rc == -EACCES && rdwr_for_fscache == 1) { + desired_access = cifs_convert_flags(f_flags, 0); + rdwr_for_fscache = 2; + goto retry_open; + } return rc; + } + if (rdwr_for_fscache == 2) + cifs_invalidate_cache(inode, FSCACHE_INVAL_DIO_WRITE); /* TODO: Add support for calling posix query info but with passing in fid */ if (tcon->unix_ext) @@ -834,11 +848,11 @@ int cifs_open(struct inode *inode, struct file *file) use_cache: fscache_use_cookie(cifs_inode_cookie(file_inode(file)), file->f_mode & FMODE_WRITE); - if (file->f_flags & O_DIRECT && - (!((file->f_flags & O_ACCMODE) != O_RDONLY) || - file->f_flags & O_APPEND)) - cifs_invalidate_cache(file_inode(file), - FSCACHE_INVAL_DIO_WRITE); + if (!(file->f_flags & O_DIRECT)) + goto out; + if ((file->f_flags & (O_ACCMODE | O_APPEND)) == O_RDONLY) + goto out; + cifs_invalidate_cache(file_inode(file), FSCACHE_INVAL_DIO_WRITE); out: free_dentry_path(page); @@ -903,6 +917,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) int disposition = FILE_OPEN; int create_options = CREATE_NOT_DIR; struct cifs_open_parms oparms; + int rdwr_for_fscache = 0; xid = get_xid(); mutex_lock(&cfile->fh_mutex); @@ -966,7 +981,11 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ - desired_access = cifs_convert_flags(cfile->f_flags); + /* If we're caching, we need to be able to fill in around partial writes. */ + if (cifs_fscache_enabled(inode) && (cfile->f_flags & O_ACCMODE) == O_WRONLY) + rdwr_for_fscache = 1; + + desired_access = cifs_convert_flags(cfile->f_flags, rdwr_for_fscache); /* O_SYNC also has bit for O_DSYNC so following check picks up either */ if (cfile->f_flags & O_SYNC) @@ -978,6 +997,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &cfile->fid); +retry_open: oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, @@ -1003,6 +1023,11 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) /* indicate that we need to relock the file */ oparms.reconnect = true; } + if (rc == -EACCES && rdwr_for_fscache == 1) { + desired_access = cifs_convert_flags(cfile->f_flags, 0); + rdwr_for_fscache = 2; + goto retry_open; + } if (rc) { mutex_unlock(&cfile->fh_mutex); @@ -1011,6 +1036,9 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) goto reopen_error_exit; } + if (rdwr_for_fscache == 2) + cifs_invalidate_cache(inode, FSCACHE_INVAL_DIO_WRITE); + #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY reopen_success: #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h index a3d73720914f8..1f2ea9f5cc9a8 100644 --- a/fs/smb/client/fscache.h +++ b/fs/smb/client/fscache.h @@ -109,6 +109,11 @@ static inline void cifs_readahead_to_fscache(struct inode *inode, __cifs_readahead_to_fscache(inode, pos, len); } +static inline bool cifs_fscache_enabled(struct inode *inode) +{ + return fscache_cookie_enabled(cifs_inode_cookie(inode)); +} + #else /* CONFIG_CIFS_FSCACHE */ static inline void cifs_fscache_fill_coherency(struct inode *inode, @@ -124,6 +129,7 @@ static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} static inline void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) {} static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { return NULL; } static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) {} +static inline bool cifs_fscache_enabled(struct inode *inode) { return false; } static inline int cifs_fscache_query_occupancy(struct inode *inode, pgoff_t first, unsigned int nr_pages, -- GitLab From ff91059932401894e6c86341915615c5eb0eca48 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Tue, 2 Apr 2024 12:46:21 +0200 Subject: [PATCH 268/989] bpf, sockmap: Prevent lock inversion deadlock in map delete elem syzkaller started using corpuses where a BPF tracing program deletes elements from a sockmap/sockhash map. Because BPF tracing programs can be invoked from any interrupt context, locks taken during a map_delete_elem operation must be hardirq-safe. Otherwise a deadlock due to lock inversion is possible, as reported by lockdep: CPU0 CPU1 ---- ---- lock(&htab->buckets[i].lock); local_irq_disable(); lock(&host->lock); lock(&htab->buckets[i].lock); lock(&host->lock); Locks in sockmap are hardirq-unsafe by design. We expects elements to be deleted from sockmap/sockhash only in task (normal) context with interrupts enabled, or in softirq context. Detect when map_delete_elem operation is invoked from a context which is _not_ hardirq-unsafe, that is interrupts are disabled, and bail out with an error. Note that map updates are not affected by this issue. BPF verifier does not allow updating sockmap/sockhash from a BPF tracing program today. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Reported-by: xingwei lee Reported-by: yue sun Reported-by: syzbot+bc922f476bd65abbd466@syzkaller.appspotmail.com Reported-by: syzbot+d4066896495db380182e@syzkaller.appspotmail.com Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Tested-by: syzbot+d4066896495db380182e@syzkaller.appspotmail.com Acked-by: John Fastabend Closes: https://syzkaller.appspot.com/bug?extid=d4066896495db380182e Closes: https://syzkaller.appspot.com/bug?extid=bc922f476bd65abbd466 Link: https://lore.kernel.org/bpf/20240402104621.1050319-1-jakub@cloudflare.com --- net/core/sock_map.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 27d733c0f65e1..8598466a38057 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -411,6 +411,9 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock *sk; int err = 0; + if (irqs_disabled()) + return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ + spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) @@ -933,6 +936,9 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key) struct bpf_shtab_elem *elem; int ret = -ENOENT; + if (irqs_disabled()) + return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ + hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); -- GitLab From b91695b50d5b5c7c6a2cae08637b0a119cec0e12 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 25 Mar 2024 09:14:04 -0300 Subject: [PATCH 269/989] ARM: dts: imx7-mba7: Use 'no-mmc' property 'no-emmc' is not a valid property. The original intention was to use the 'no-mmc' property. Change it accordingly to fix the following dt-schema warning: imx7s-mba7.dtb: mmc@30b40000: Unevaluated properties are not allowed ('no-emmc' was unexpected) Fixes: d430a7e0e181 ("ARM: dts: imx7-mba7: restrict usdhc interface modes") Signed-off-by: Fabio Estevam Reviewed-by: Alexander Stein Signed-off-by: Shawn Guo --- arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi b/arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi index 1235a71c6abe9..52869e68f833c 100644 --- a/arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi @@ -666,7 +666,7 @@ bus-width = <4>; no-1-8-v; no-sdio; - no-emmc; + no-mmc; status = "okay"; }; -- GitLab From 8a655cee6c9d4588570ad0cb099c5660f9a44a12 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 2 Apr 2024 14:20:40 +0800 Subject: [PATCH 270/989] ASoC: codecs: ES8326: Solve error interruption issue We got an error report about headphone type detection and button detection. We fixed the headphone type detection error by adjusting the debounce timer configuration. And we fixed the button detection error by disabling the button detection feature when the headphone are unplugged and enabling it when headphone are plugged in. Signed-off-by: Zhang Yi Link: https://msgid.link/r/20240402062043.20608-2-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/es8326.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index 15289dadafea0..a6783fd6553de 100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -843,6 +843,7 @@ static void es8326_jack_detect_handler(struct work_struct *work) regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x03, 0x01); regmap_write(es8326->regmap, ES8326_SYS_BIAS, 0x0a); regmap_update_bits(es8326->regmap, ES8326_HP_DRIVER_REF, 0x0f, 0x03); + regmap_write(es8326->regmap, ES8326_INT_SOURCE, ES8326_INT_SRC_PIN9); /* * Inverted HPJACK_POL bit to trigger one IRQ to double check HP Removal event */ @@ -865,6 +866,8 @@ static void es8326_jack_detect_handler(struct work_struct *work) * set auto-check mode, then restart jack_detect_work after 400ms. * Don't report jack status. */ + regmap_write(es8326->regmap, ES8326_INT_SOURCE, + (ES8326_INT_SRC_PIN9 | ES8326_INT_SRC_BUTTON)); regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x03, 0x01); es8326_enable_micbias(es8326->component); usleep_range(50000, 70000); @@ -987,7 +990,7 @@ static int es8326_resume(struct snd_soc_component *component) regmap_write(es8326->regmap, ES8326_VMIDSEL, 0x0E); regmap_write(es8326->regmap, ES8326_ANA_LP, 0xf0); usleep_range(10000, 15000); - regmap_write(es8326->regmap, ES8326_HPJACK_TIMER, 0xe9); + regmap_write(es8326->regmap, ES8326_HPJACK_TIMER, 0xd9); regmap_write(es8326->regmap, ES8326_ANA_MICBIAS, 0xcb); /* set headphone default type and detect pin */ regmap_write(es8326->regmap, ES8326_HPDET_TYPE, 0x83); @@ -1038,8 +1041,7 @@ static int es8326_resume(struct snd_soc_component *component) es8326_enable_micbias(es8326->component); usleep_range(50000, 70000); regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x03, 0x00); - regmap_write(es8326->regmap, ES8326_INT_SOURCE, - (ES8326_INT_SRC_PIN9 | ES8326_INT_SRC_BUTTON)); + regmap_write(es8326->regmap, ES8326_INT_SOURCE, ES8326_INT_SRC_PIN9); regmap_write(es8326->regmap, ES8326_INTOUT_IO, es8326->interrupt_clk); regmap_write(es8326->regmap, ES8326_SDINOUT1_IO, -- GitLab From 4581468d071b64a2e3c2ae333fff82dc0391a306 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 2 Apr 2024 14:20:41 +0800 Subject: [PATCH 271/989] ASoC: codecs: ES8326: modify clock table We got a digital microphone feature issue. And we fixed it by modifying the clock table. Also, we changed the marco ES8326_CLK_ON declaration Signed-off-by: Zhang Yi Link: https://msgid.link/r/20240402062043.20608-3-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/es8326.c | 22 +++++++++++----------- sound/soc/codecs/es8326.h | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index a6783fd6553de..275db81d10d49 100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -412,9 +412,9 @@ static const struct _coeff_div coeff_div_v3[] = { {125, 48000, 6000000, 0x04, 0x04, 0x1F, 0x2D, 0x8A, 0x0A, 0x27, 0x27}, {128, 8000, 1024000, 0x60, 0x00, 0x05, 0x75, 0x8A, 0x1B, 0x1F, 0x7F}, - {128, 16000, 2048000, 0x20, 0x00, 0x31, 0x35, 0x8A, 0x1B, 0x1F, 0x3F}, - {128, 44100, 5644800, 0xE0, 0x00, 0x01, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, - {128, 48000, 6144000, 0xE0, 0x00, 0x01, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, + {128, 16000, 2048000, 0x20, 0x00, 0x31, 0x35, 0x08, 0x19, 0x1F, 0x3F}, + {128, 44100, 5644800, 0xE0, 0x00, 0x01, 0x2D, 0x48, 0x08, 0x1F, 0x1F}, + {128, 48000, 6144000, 0xE0, 0x00, 0x01, 0x2D, 0x48, 0x08, 0x1F, 0x1F}, {144, 8000, 1152000, 0x20, 0x00, 0x03, 0x35, 0x8A, 0x1B, 0x23, 0x47}, {144, 16000, 2304000, 0x20, 0x00, 0x11, 0x35, 0x8A, 0x1B, 0x23, 0x47}, {192, 8000, 1536000, 0x60, 0x02, 0x0D, 0x75, 0x8A, 0x1B, 0x1F, 0x7F}, @@ -423,10 +423,10 @@ static const struct _coeff_div coeff_div_v3[] = { {200, 48000, 9600000, 0x04, 0x04, 0x0F, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, {250, 48000, 12000000, 0x04, 0x04, 0x0F, 0x2D, 0xCA, 0x0A, 0x27, 0x27}, - {256, 8000, 2048000, 0x60, 0x00, 0x31, 0x35, 0x8A, 0x1B, 0x1F, 0x7F}, - {256, 16000, 4096000, 0x20, 0x00, 0x01, 0x35, 0x8A, 0x1B, 0x1F, 0x3F}, - {256, 44100, 11289600, 0xE0, 0x00, 0x30, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, - {256, 48000, 12288000, 0xE0, 0x00, 0x30, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, + {256, 8000, 2048000, 0x60, 0x00, 0x31, 0x35, 0x08, 0x19, 0x1F, 0x7F}, + {256, 16000, 4096000, 0x20, 0x00, 0x01, 0x35, 0x08, 0x19, 0x1F, 0x3F}, + {256, 44100, 11289600, 0xE0, 0x01, 0x01, 0x2D, 0x48, 0x08, 0x1F, 0x1F}, + {256, 48000, 12288000, 0xE0, 0x01, 0x01, 0x2D, 0x48, 0x08, 0x1F, 0x1F}, {288, 8000, 2304000, 0x20, 0x00, 0x01, 0x35, 0x8A, 0x1B, 0x23, 0x47}, {384, 8000, 3072000, 0x60, 0x02, 0x05, 0x75, 0x8A, 0x1B, 0x1F, 0x7F}, {384, 16000, 6144000, 0x20, 0x02, 0x03, 0x35, 0x8A, 0x1B, 0x1F, 0x3F}, @@ -435,10 +435,10 @@ static const struct _coeff_div coeff_div_v3[] = { {400, 48000, 19200000, 0xE4, 0x04, 0x35, 0x6d, 0xCA, 0x0A, 0x1F, 0x1F}, {500, 48000, 24000000, 0xF8, 0x04, 0x3F, 0x6D, 0xCA, 0x0A, 0x1F, 0x1F}, - {512, 8000, 4096000, 0x60, 0x00, 0x01, 0x35, 0x8A, 0x1B, 0x1F, 0x7F}, - {512, 16000, 8192000, 0x20, 0x00, 0x30, 0x35, 0x8A, 0x1B, 0x1F, 0x3F}, - {512, 44100, 22579200, 0xE0, 0x00, 0x00, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, - {512, 48000, 24576000, 0xE0, 0x00, 0x00, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, + {512, 8000, 4096000, 0x60, 0x00, 0x01, 0x08, 0x19, 0x1B, 0x1F, 0x7F}, + {512, 16000, 8192000, 0x20, 0x00, 0x30, 0x35, 0x08, 0x19, 0x1F, 0x3F}, + {512, 44100, 22579200, 0xE0, 0x00, 0x00, 0x2D, 0x48, 0x08, 0x1F, 0x1F}, + {512, 48000, 24576000, 0xE0, 0x00, 0x00, 0x2D, 0x48, 0x08, 0x1F, 0x1F}, {768, 8000, 6144000, 0x60, 0x02, 0x11, 0x35, 0x8A, 0x1B, 0x1F, 0x7F}, {768, 16000, 12288000, 0x20, 0x02, 0x01, 0x35, 0x8A, 0x1B, 0x1F, 0x3F}, {768, 32000, 24576000, 0xE0, 0x02, 0x30, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, diff --git a/sound/soc/codecs/es8326.h b/sound/soc/codecs/es8326.h index ee12caef81053..c3e52e7bdef57 100644 --- a/sound/soc/codecs/es8326.h +++ b/sound/soc/codecs/es8326.h @@ -104,7 +104,7 @@ #define ES8326_MUTE (3 << 0) /* ES8326_CLK_CTL */ -#define ES8326_CLK_ON (0x7e << 0) +#define ES8326_CLK_ON (0x7f << 0) #define ES8326_CLK_OFF (0 << 0) /* ES8326_CLK_INV */ -- GitLab From 6e5f5bf894eb9260f07ad0da4e2dd2efd616ed59 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 2 Apr 2024 14:20:42 +0800 Subject: [PATCH 272/989] ASoC: codecs: ES8326: Solve a headphone detection issue after suspend and resume We got a headphone detection issue after suspend and resume. And we fixed it by modifying the configuration at es8326_suspend and invoke es8326_irq at es8326_resume. Signed-off-by: Zhang Yi Link: https://msgid.link/r/20240402062043.20608-4-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/es8326.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index 275db81d10d49..fa809ab41a4ae 100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -1062,6 +1062,8 @@ static int es8326_resume(struct snd_soc_component *component) es8326->hp = 0; es8326->hpl_vol = 0x03; es8326->hpr_vol = 0x03; + + es8326_irq(es8326->irq, es8326); return 0; } @@ -1072,6 +1074,9 @@ static int es8326_suspend(struct snd_soc_component *component) cancel_delayed_work_sync(&es8326->jack_detect_work); es8326_disable_micbias(component); es8326->calibrated = false; + regmap_write(es8326->regmap, ES8326_CLK_MUX, 0x2d); + regmap_write(es8326->regmap, ES8326_DAC2HPMIX, 0x00); + regmap_write(es8326->regmap, ES8326_ANA_PDN, 0x3b); regmap_write(es8326->regmap, ES8326_CLK_CTL, ES8326_CLK_OFF); regcache_cache_only(es8326->regmap, true); regcache_mark_dirty(es8326->regmap); -- GitLab From fec9c7f668ac5dd107f4da5a3b18379e07ec1a41 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 2 Apr 2024 14:20:43 +0800 Subject: [PATCH 273/989] ASoC: codecs: ES8326: Removing the control of ADC_SCALE We removed the configuration of ES8326_ADC_SCALE in es8326_jack_detect_handler because user changed the configuration by snd_controls Signed-off-by: Zhang Yi Link: https://msgid.link/r/20240402062043.20608-5-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/es8326.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index fa809ab41a4ae..17bd6b5160772 100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -835,7 +835,6 @@ static void es8326_jack_detect_handler(struct work_struct *work) dev_dbg(comp->dev, "Report hp remove event\n"); snd_soc_jack_report(es8326->jack, 0, SND_JACK_HEADSET); /* mute adc when mic path switch */ - regmap_write(es8326->regmap, ES8326_ADC_SCALE, 0x33); regmap_write(es8326->regmap, ES8326_ADC1_SRC, 0x44); regmap_write(es8326->regmap, ES8326_ADC2_SRC, 0x66); es8326->hp = 0; @@ -894,7 +893,6 @@ static void es8326_jack_detect_handler(struct work_struct *work) snd_soc_jack_report(es8326->jack, SND_JACK_HEADSET, SND_JACK_HEADSET); - regmap_write(es8326->regmap, ES8326_ADC_SCALE, 0x33); regmap_update_bits(es8326->regmap, ES8326_PGA_PDN, 0x08, 0x08); regmap_update_bits(es8326->regmap, ES8326_PGAGAIN, -- GitLab From d619b0b70dc4f160f2b95d95ccfed2631ab7ac3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Tue, 2 Apr 2024 15:06:40 +0200 Subject: [PATCH 274/989] ASoC: Intel: avs: boards: Add modules description MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modpost warns about missing module description, add it. Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://msgid.link/r/20240402130640.3310999-1-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/avs/boards/da7219.c | 1 + sound/soc/intel/avs/boards/dmic.c | 1 + sound/soc/intel/avs/boards/es8336.c | 1 + sound/soc/intel/avs/boards/i2s_test.c | 1 + sound/soc/intel/avs/boards/max98357a.c | 1 + sound/soc/intel/avs/boards/max98373.c | 1 + sound/soc/intel/avs/boards/max98927.c | 1 + sound/soc/intel/avs/boards/nau8825.c | 1 + sound/soc/intel/avs/boards/probe.c | 1 + sound/soc/intel/avs/boards/rt274.c | 1 + sound/soc/intel/avs/boards/rt286.c | 1 + sound/soc/intel/avs/boards/rt298.c | 1 + sound/soc/intel/avs/boards/rt5514.c | 1 + sound/soc/intel/avs/boards/rt5663.c | 1 + sound/soc/intel/avs/boards/rt5682.c | 1 + sound/soc/intel/avs/boards/ssm4567.c | 1 + 16 files changed, 16 insertions(+) diff --git a/sound/soc/intel/avs/boards/da7219.c b/sound/soc/intel/avs/boards/da7219.c index c018f84fe0252..fc072dc58968c 100644 --- a/sound/soc/intel/avs/boards/da7219.c +++ b/sound/soc/intel/avs/boards/da7219.c @@ -296,5 +296,6 @@ static struct platform_driver avs_da7219_driver = { module_platform_driver(avs_da7219_driver); +MODULE_DESCRIPTION("Intel da7219 machine driver"); MODULE_AUTHOR("Cezary Rojewski "); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/dmic.c b/sound/soc/intel/avs/boards/dmic.c index ba2bc7f689eb6..d9e5e85f52335 100644 --- a/sound/soc/intel/avs/boards/dmic.c +++ b/sound/soc/intel/avs/boards/dmic.c @@ -96,4 +96,5 @@ static struct platform_driver avs_dmic_driver = { module_platform_driver(avs_dmic_driver); +MODULE_DESCRIPTION("Intel DMIC machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/es8336.c b/sound/soc/intel/avs/boards/es8336.c index 1090082e7d5bf..5c90a60075773 100644 --- a/sound/soc/intel/avs/boards/es8336.c +++ b/sound/soc/intel/avs/boards/es8336.c @@ -326,4 +326,5 @@ static struct platform_driver avs_es8336_driver = { module_platform_driver(avs_es8336_driver); +MODULE_DESCRIPTION("Intel es8336 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/i2s_test.c b/sound/soc/intel/avs/boards/i2s_test.c index 28f254eb0d03f..027373d6a16d6 100644 --- a/sound/soc/intel/avs/boards/i2s_test.c +++ b/sound/soc/intel/avs/boards/i2s_test.c @@ -204,4 +204,5 @@ static struct platform_driver avs_i2s_test_driver = { module_platform_driver(avs_i2s_test_driver); +MODULE_DESCRIPTION("Intel i2s test machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/max98357a.c b/sound/soc/intel/avs/boards/max98357a.c index a83b95f25129f..1ff85e4d8e160 100644 --- a/sound/soc/intel/avs/boards/max98357a.c +++ b/sound/soc/intel/avs/boards/max98357a.c @@ -154,4 +154,5 @@ static struct platform_driver avs_max98357a_driver = { module_platform_driver(avs_max98357a_driver) +MODULE_DESCRIPTION("Intel max98357a machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/max98373.c b/sound/soc/intel/avs/boards/max98373.c index 3b980a025e6f6..8d31586b73eae 100644 --- a/sound/soc/intel/avs/boards/max98373.c +++ b/sound/soc/intel/avs/boards/max98373.c @@ -211,4 +211,5 @@ static struct platform_driver avs_max98373_driver = { module_platform_driver(avs_max98373_driver) +MODULE_DESCRIPTION("Intel max98373 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/max98927.c b/sound/soc/intel/avs/boards/max98927.c index 86dd2b228df3a..572ec58073d06 100644 --- a/sound/soc/intel/avs/boards/max98927.c +++ b/sound/soc/intel/avs/boards/max98927.c @@ -208,4 +208,5 @@ static struct platform_driver avs_max98927_driver = { module_platform_driver(avs_max98927_driver) +MODULE_DESCRIPTION("Intel max98927 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/nau8825.c b/sound/soc/intel/avs/boards/nau8825.c index 1c1e2083f474d..55db75efae414 100644 --- a/sound/soc/intel/avs/boards/nau8825.c +++ b/sound/soc/intel/avs/boards/nau8825.c @@ -313,4 +313,5 @@ static struct platform_driver avs_nau8825_driver = { module_platform_driver(avs_nau8825_driver) +MODULE_DESCRIPTION("Intel nau8825 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/probe.c b/sound/soc/intel/avs/boards/probe.c index a9469b5ecb402..8be6887bbc6e8 100644 --- a/sound/soc/intel/avs/boards/probe.c +++ b/sound/soc/intel/avs/boards/probe.c @@ -69,4 +69,5 @@ static struct platform_driver avs_probe_mb_driver = { module_platform_driver(avs_probe_mb_driver); +MODULE_DESCRIPTION("Intel probe machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/rt274.c b/sound/soc/intel/avs/boards/rt274.c index bfcb8845fd15d..1cf5242160875 100644 --- a/sound/soc/intel/avs/boards/rt274.c +++ b/sound/soc/intel/avs/boards/rt274.c @@ -276,4 +276,5 @@ static struct platform_driver avs_rt274_driver = { module_platform_driver(avs_rt274_driver); +MODULE_DESCRIPTION("Intel rt274 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/rt286.c b/sound/soc/intel/avs/boards/rt286.c index 28d7d86b1cc99..4740bba105703 100644 --- a/sound/soc/intel/avs/boards/rt286.c +++ b/sound/soc/intel/avs/boards/rt286.c @@ -247,4 +247,5 @@ static struct platform_driver avs_rt286_driver = { module_platform_driver(avs_rt286_driver); +MODULE_DESCRIPTION("Intel rt286 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/rt298.c b/sound/soc/intel/avs/boards/rt298.c index 80f490b9e1184..6e409e29f6974 100644 --- a/sound/soc/intel/avs/boards/rt298.c +++ b/sound/soc/intel/avs/boards/rt298.c @@ -266,4 +266,5 @@ static struct platform_driver avs_rt298_driver = { module_platform_driver(avs_rt298_driver); +MODULE_DESCRIPTION("Intel rt298 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/rt5514.c b/sound/soc/intel/avs/boards/rt5514.c index 60105f453ae23..097ae5f73241e 100644 --- a/sound/soc/intel/avs/boards/rt5514.c +++ b/sound/soc/intel/avs/boards/rt5514.c @@ -192,4 +192,5 @@ static struct platform_driver avs_rt5514_driver = { module_platform_driver(avs_rt5514_driver); +MODULE_DESCRIPTION("Intel rt5514 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/rt5663.c b/sound/soc/intel/avs/boards/rt5663.c index b4762c2a7bf2d..1880c315cc4d1 100644 --- a/sound/soc/intel/avs/boards/rt5663.c +++ b/sound/soc/intel/avs/boards/rt5663.c @@ -265,4 +265,5 @@ static struct platform_driver avs_rt5663_driver = { module_platform_driver(avs_rt5663_driver); +MODULE_DESCRIPTION("Intel rt5663 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/rt5682.c b/sound/soc/intel/avs/boards/rt5682.c index 243f979fda98a..594a971ded9eb 100644 --- a/sound/soc/intel/avs/boards/rt5682.c +++ b/sound/soc/intel/avs/boards/rt5682.c @@ -341,5 +341,6 @@ static struct platform_driver avs_rt5682_driver = { module_platform_driver(avs_rt5682_driver) +MODULE_DESCRIPTION("Intel rt5682 machine driver"); MODULE_AUTHOR("Cezary Rojewski "); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/ssm4567.c b/sound/soc/intel/avs/boards/ssm4567.c index 4a0e136835ff5..d6f7f046c24e5 100644 --- a/sound/soc/intel/avs/boards/ssm4567.c +++ b/sound/soc/intel/avs/boards/ssm4567.c @@ -200,4 +200,5 @@ static struct platform_driver avs_ssm4567_driver = { module_platform_driver(avs_ssm4567_driver) +MODULE_DESCRIPTION("Intel ssm4567 machine driver"); MODULE_LICENSE("GPL"); -- GitLab From ac229a2d0939edfb469310c55b16d9321a858a46 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 Mar 2024 07:08:19 +1000 Subject: [PATCH 275/989] nvme-multipath: don't inherit LBA-related fields for the multipath node Linux 6.9 made the nvme multipath nodes not properly pick up changes when the LBA size goes smaller after an nvme format. This is because we now try to inherit the queue settings for the multipath node entirely from the individual paths. That is the right thing to do for I/O size limitations, which make up most of the queue limits, but it is wrong for changes to the namespace configuration, where we do want to pick up the new format, which will eventually show up on all paths once they are re-queried. Fix this by not inheriting the block size and related fields and always for updating them. Fixes: 8f03cfa117e0 ("nvme: don't use nvme_update_disk_info for the multipath disk") Reported-by: Nilay Shroff Tested-by: Nilay Shroff Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 943d72bdd794c..0cf46068f1d01 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2201,6 +2201,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) } if (!ret && nvme_ns_head_multipath(ns->head)) { + struct queue_limits *ns_lim = &ns->disk->queue->limits; struct queue_limits lim; blk_mq_freeze_queue(ns->head->disk->queue); @@ -2212,7 +2213,26 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); nvme_mpath_revalidate_paths(ns); + /* + * queue_limits mixes values that are the hardware limitations + * for bio splitting with what is the device configuration. + * + * For NVMe the device configuration can change after e.g. a + * Format command, and we really want to pick up the new format + * value here. But we must still stack the queue limits to the + * least common denominator for multipathing to split the bios + * properly. + * + * To work around this, we explicitly set the device + * configuration to those that we just queried, but only stack + * the splitting limits in to make sure we still obey possibly + * lower limitations of other controllers. + */ lim = queue_limits_start_update(ns->head->disk->queue); + lim.logical_block_size = ns_lim->logical_block_size; + lim.physical_block_size = ns_lim->physical_block_size; + lim.io_min = ns_lim->io_min; + lim.io_opt = ns_lim->io_opt; queue_limits_stack_bdev(&lim, ns->disk->part0, 0, ns->head->disk->disk_name); ret = queue_limits_commit_update(ns->head->disk->queue, &lim); -- GitLab From af133562d5aff41fcdbe51f1a504ae04788b5fc0 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 25 Mar 2024 09:31:04 +0100 Subject: [PATCH 276/989] swiotlb: extend buffer pre-padding to alloc_align_mask if necessary Allow a buffer pre-padding of up to alloc_align_mask, even if it requires allocating additional IO TLB slots. If the allocation alignment is bigger than IO_TLB_SIZE and min_align_mask covers any non-zero bits in the original address between IO_TLB_SIZE and alloc_align_mask, these bits are not preserved in the swiotlb buffer address. To fix this case, increase the allocation size and use a larger offset within the allocated buffer. As a result, extra padding slots may be allocated before the mapping start address. Leave orig_addr in these padding slots initialized to INVALID_PHYS_ADDR. These slots do not correspond to any CPU buffer, so attempts to sync the data should be ignored. The padding slots should be automatically released when the buffer is unmapped. However, swiotlb_tbl_unmap_single() takes only the address of the DMA buffer slot, not the first padding slot. Save the number of padding slots in struct io_tlb_slot and use it to adjust the slot index in swiotlb_release_slots(), so all allocated slots are properly freed. Fixes: 2fd4fa5d3fb5 ("swiotlb: Fix alignment checks when both allocation and DMA masks are present") Link: https://lore.kernel.org/linux-iommu/20240311210507.217daf8b@meshulam.tesarici.cz/ Signed-off-by: Petr Tesarik Reviewed-by: Michael Kelley Tested-by: Michael Kelley Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 59 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 86fe172b59582..d7a8cb93ef2d7 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -69,11 +69,14 @@ * @alloc_size: Size of the allocated buffer. * @list: The free list describing the number of free entries available * from each index. + * @pad_slots: Number of preceding padding slots. Valid only in the first + * allocated non-padding slot. */ struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; - unsigned int list; + unsigned short list; + unsigned short pad_slots; }; static bool swiotlb_force_bounce; @@ -287,6 +290,7 @@ static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start, mem->nslabs - i); mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; + mem->slots[i].pad_slots = 0; } memset(vaddr, 0, bytes); @@ -821,12 +825,30 @@ void swiotlb_dev_init(struct device *dev) #endif } -/* - * Return the offset into a iotlb slot required to keep the device happy. +/** + * swiotlb_align_offset() - Get required offset into an IO TLB allocation. + * @dev: Owning device. + * @align_mask: Allocation alignment mask. + * @addr: DMA address. + * + * Return the minimum offset from the start of an IO TLB allocation which is + * required for a given buffer address and allocation alignment to keep the + * device happy. + * + * First, the address bits covered by min_align_mask must be identical in the + * original address and the bounce buffer address. High bits are preserved by + * choosing a suitable IO TLB slot, but bits below IO_TLB_SHIFT require extra + * padding bytes before the bounce buffer. + * + * Second, @align_mask specifies which bits of the first allocated slot must + * be zero. This may require allocating additional padding slots, and then the + * offset (in bytes) from the first such padding slot is returned. */ -static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) +static unsigned int swiotlb_align_offset(struct device *dev, + unsigned int align_mask, u64 addr) { - return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); + return addr & dma_get_min_align_mask(dev) & + (align_mask | (IO_TLB_SIZE - 1)); } /* @@ -847,7 +869,7 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size return; tlb_offset = tlb_addr & (IO_TLB_SIZE - 1); - orig_addr_offset = swiotlb_align_offset(dev, orig_addr); + orig_addr_offset = swiotlb_align_offset(dev, 0, orig_addr); if (tlb_offset < orig_addr_offset) { dev_WARN_ONCE(dev, 1, "Access before mapping start detected. orig offset %u, requested offset %u.\n", @@ -1005,7 +1027,7 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool unsigned long max_slots = get_max_slots(boundary_mask); unsigned int iotlb_align_mask = dma_get_min_align_mask(dev); unsigned int nslots = nr_slots(alloc_size), stride; - unsigned int offset = swiotlb_align_offset(dev, orig_addr); + unsigned int offset = swiotlb_align_offset(dev, 0, orig_addr); unsigned int index, slots_checked, count = 0, i; unsigned long flags; unsigned int slot_base; @@ -1328,11 +1350,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, unsigned long attrs) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; - unsigned int offset = swiotlb_align_offset(dev, orig_addr); + unsigned int offset; struct io_tlb_pool *pool; unsigned int i; int index; phys_addr_t tlb_addr; + unsigned short pad_slots; if (!mem || !mem->nslabs) { dev_warn_ratelimited(dev, @@ -1349,6 +1372,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return (phys_addr_t)DMA_MAPPING_ERROR; } + offset = swiotlb_align_offset(dev, alloc_align_mask, orig_addr); index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset, alloc_align_mask, &pool); if (index == -1) { @@ -1364,6 +1388,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * This is needed when we sync the memory. Then we sync the buffer if * needed. */ + pad_slots = offset >> IO_TLB_SHIFT; + offset &= (IO_TLB_SIZE - 1); + index += pad_slots; + pool->slots[index].pad_slots = pad_slots; for (i = 0; i < nr_slots(alloc_size + offset); i++) pool->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(pool->start, index) + offset; @@ -1384,13 +1412,17 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) { struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr); unsigned long flags; - unsigned int offset = swiotlb_align_offset(dev, tlb_addr); - int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; - int nslots = nr_slots(mem->slots[index].alloc_size + offset); - int aindex = index / mem->area_nslabs; - struct io_tlb_area *area = &mem->areas[aindex]; + unsigned int offset = swiotlb_align_offset(dev, 0, tlb_addr); + int index, nslots, aindex; + struct io_tlb_area *area; int count, i; + index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; + index -= mem->slots[index].pad_slots; + nslots = nr_slots(mem->slots[index].alloc_size + offset); + aindex = index / mem->area_nslabs; + area = &mem->areas[aindex]; + /* * Return the buffer to the free list by setting the corresponding * entries to indicate the number of contiguous entries available. @@ -1413,6 +1445,7 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) mem->slots[i].list = ++count; mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; + mem->slots[i].pad_slots = 0; } /* -- GitLab From e8068f2d756d57a5206fa3180ade365a8c12ed85 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Tue, 26 Mar 2024 20:45:48 -0700 Subject: [PATCH 277/989] swiotlb: fix swiotlb_bounce() to do partial sync's correctly In current code, swiotlb_bounce() may do partial sync's correctly in some circumstances, but may incorrectly fail in other circumstances. The failure cases require both of these to be true: 1) swiotlb_align_offset() returns a non-zero "offset" value 2) the tlb_addr of the partial sync area points into the first "offset" bytes of the _second_ or subsequent swiotlb slot allocated for the mapping Code added in commit 868c9ddc182b ("swiotlb: add overflow checks to swiotlb_bounce") attempts to WARN on the invalid case where tlb_addr points into the first "offset" bytes of the _first_ allocated slot. But there's no way for swiotlb_bounce() to distinguish the first slot from the second and subsequent slots, so the WARN can be triggered incorrectly when #2 above is true. Related, current code calculates an adjustment to the orig_addr stored in the swiotlb slot. The adjustment compensates for the difference in the tlb_addr used for the partial sync vs. the tlb_addr for the full mapping. The adjustment is stored in the local variable tlb_offset. But when #1 and #2 above are true, it's valid for this adjustment to be negative. In such case the arithmetic to adjust orig_addr produces the wrong result due to tlb_offset being declared as unsigned. Fix these problems by removing the over-constraining validations added in 868c9ddc182b. Change the declaration of tlb_offset to be signed instead of unsigned so the adjustment arithmetic works correctly. Tested with a test-only hack to how swiotlb_tbl_map_single() calls swiotlb_bounce(). Instead of calling swiotlb_bounce() just once for the entire mapped area, do a loop with each iteration doing only a 128 byte partial sync until the entire mapped area is sync'ed. Then with swiotlb=force on the kernel boot line, run a variety of raw disk writes followed by read and verification of all bytes of the written data. The storage device has DMA min_align_mask set, and the writes are done with a variety of original buffer memory address alignments and overall buffer sizes. For many of the combinations, current code triggers the WARN statements, or the data verification fails. With the fixes, no WARNs occur and all verifications pass. Fixes: 5f89468e2f06 ("swiotlb: manipulate orig_addr when tlb_addr has offset") Fixes: 868c9ddc182b ("swiotlb: add overflow checks to swiotlb_bounce") Signed-off-by: Michael Kelley Dominique Martinet Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d7a8cb93ef2d7..d57c8837c8138 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -863,27 +863,23 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start; - unsigned int tlb_offset, orig_addr_offset; + int tlb_offset; if (orig_addr == INVALID_PHYS_ADDR) return; - tlb_offset = tlb_addr & (IO_TLB_SIZE - 1); - orig_addr_offset = swiotlb_align_offset(dev, 0, orig_addr); - if (tlb_offset < orig_addr_offset) { - dev_WARN_ONCE(dev, 1, - "Access before mapping start detected. orig offset %u, requested offset %u.\n", - orig_addr_offset, tlb_offset); - return; - } - - tlb_offset -= orig_addr_offset; - if (tlb_offset > alloc_size) { - dev_WARN_ONCE(dev, 1, - "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n", - alloc_size, size, tlb_offset); - return; - } + /* + * It's valid for tlb_offset to be negative. This can happen when the + * "offset" returned by swiotlb_align_offset() is non-zero, and the + * tlb_addr is pointing within the first "offset" bytes of the second + * or subsequent slots of the allocated swiotlb area. While it's not + * valid for tlb_addr to be pointing within the first "offset" bytes + * of the first slot, there's no way to check for such an error since + * this function can't distinguish the first slot from the second and + * subsequent slots. + */ + tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) - + swiotlb_align_offset(dev, 0, orig_addr); orig_addr += tlb_offset; alloc_size -= tlb_offset; -- GitLab From a1255ccab8ecee89905ddb12161139b0d878a7f2 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 29 Mar 2024 12:28:09 -0700 Subject: [PATCH 278/989] swiotlb: do not set total_used to 0 in swiotlb_create_debugfs_files() Sometimes the readout of /sys/kernel/debug/swiotlb/io_tlb_used and io_tlb_used_hiwater can be a huge number (e.g. 18446744073709551615), which is actually a negative number if we use "%ld" to print the number. When swiotlb_create_default_debugfs() is running from late_initcall, mem->total_used may already be non-zero, because the storage driver may have already started to perform I/O operations: if the storage driver is built-in, its probe() callback is called before late_initcall. swiotlb_create_debugfs_files() should not blindly set mem->total_used and mem->used_hiwater to 0; actually it doesn't have to initialize the fields at all, because the fields, as part of the global struct io_tlb_default_mem, have been implicitly initialized to zero. Also don't explicitly set mem->transient_nslabs to 0. Fixes: 8b0977ecc8b3 ("swiotlb: track and report io_tlb_used high water marks in debugfs") Fixes: 02e765697038 ("swiotlb: add debugfs to track swiotlb transient pool usage") Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Reviewed-by: ZhangPeng Reviewed-by: Petr Tesarik Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d57c8837c8138..a5e0dfc44d24e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1676,9 +1676,6 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get, static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { - atomic_long_set(&mem->total_used, 0); - atomic_long_set(&mem->used_hiwater, 0); - mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs); if (!mem->nslabs) return; @@ -1689,7 +1686,6 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem, &fops_io_tlb_hiwater); #ifdef CONFIG_SWIOTLB_DYNAMIC - atomic_long_set(&mem->transient_nslabs, 0); debugfs_create_file("io_tlb_transient_nslabs", 0400, mem->debugfs, mem, &fops_io_tlb_transient_used); #endif -- GitLab From 062a7f0ff46eb57aff526897bd2bebfdb1d3046a Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 1 Apr 2024 22:37:42 -0500 Subject: [PATCH 279/989] smb: client: guarantee refcounted children from parent session Avoid potential use-after-free bugs when walking DFS referrals, mounting and performing DFS failover by ensuring that all children from parent @tcon->ses are also refcounted. They're all needed across the entire DFS mount. Get rid of @tcon->dfs_ses_list while we're at it, too. Cc: stable@vger.kernel.org # 6.4+ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404021527.ZlRkIxgv-lkp@intel.com/ Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 2 -- fs/smb/client/cifsproto.h | 20 +++++++-------- fs/smb/client/connect.c | 25 +++++++++++++++---- fs/smb/client/dfs.c | 51 ++++++++++++++++++--------------------- fs/smb/client/dfs.h | 33 ++++++++++++++++--------- fs/smb/client/dfs_cache.c | 11 +-------- fs/smb/client/misc.c | 6 ----- 7 files changed, 76 insertions(+), 72 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 7ed9d05f6890b..286afbe346be0 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1281,7 +1281,6 @@ struct cifs_tcon { struct cached_fids *cfids; /* BB add field for back pointer to sb struct(s)? */ #ifdef CONFIG_CIFS_DFS_UPCALL - struct list_head dfs_ses_list; struct delayed_work dfs_cache_work; #endif struct delayed_work query_interfaces; /* query interfaces workqueue job */ @@ -1804,7 +1803,6 @@ struct cifs_mount_ctx { struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; - struct list_head dfs_ses_list; }; static inline void __free_dfs_info_param(struct dfs_info3_param *param) diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 0723e1b57256b..8e0a348f1f660 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -725,31 +725,31 @@ struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon); void cifs_put_tcon_super(struct super_block *sb); int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry); -/* Put references of @ses and @ses->dfs_root_ses */ +/* Put references of @ses and its children */ static inline void cifs_put_smb_ses(struct cifs_ses *ses) { - struct cifs_ses *rses = ses->dfs_root_ses; + struct cifs_ses *next; - __cifs_put_smb_ses(ses); - if (rses) - __cifs_put_smb_ses(rses); + do { + next = ses->dfs_root_ses; + __cifs_put_smb_ses(ses); + } while ((ses = next)); } -/* Get an active reference of @ses and @ses->dfs_root_ses. +/* Get an active reference of @ses and its children. * * NOTE: make sure to call this function when incrementing reference count of * @ses to ensure that any DFS root session attached to it (@ses->dfs_root_ses) * will also get its reference count incremented. * - * cifs_put_smb_ses() will put both references, so call it when you're done. + * cifs_put_smb_ses() will put all references, so call it when you're done. */ static inline void cifs_smb_ses_inc_refcount(struct cifs_ses *ses) { lockdep_assert_held(&cifs_tcp_ses_lock); - ses->ses_count++; - if (ses->dfs_root_ses) - ses->dfs_root_ses->ses_count++; + for (; ses; ses = ses->dfs_root_ses) + ses->ses_count++; } static inline bool dfs_src_pathname_equal(const char *s1, const char *s2) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index ee29bc57300cd..38b75cfa06e3e 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1866,6 +1866,9 @@ static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx) ctx->sectype != ses->sectype) return 0; + if (ctx->dfs_root_ses != ses->dfs_root_ses) + return 0; + /* * If an existing session is limited to less channels than * requested, it should not be reused @@ -2358,9 +2361,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) * need to lock before changing something in the session. */ spin_lock(&cifs_tcp_ses_lock); + if (ctx->dfs_root_ses) + cifs_smb_ses_inc_refcount(ctx->dfs_root_ses); ses->dfs_root_ses = ctx->dfs_root_ses; - if (ses->dfs_root_ses) - ses->dfs_root_ses->ses_count++; list_add(&ses->smb_ses_list, &server->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); @@ -3311,6 +3314,9 @@ void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx) cifs_put_smb_ses(mnt_ctx->ses); else if (mnt_ctx->server) cifs_put_tcp_session(mnt_ctx->server, 0); + mnt_ctx->ses = NULL; + mnt_ctx->tcon = NULL; + mnt_ctx->server = NULL; mnt_ctx->cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS; free_xid(mnt_ctx->xid); } @@ -3589,8 +3595,6 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) bool isdfs; int rc; - INIT_LIST_HEAD(&mnt_ctx.dfs_ses_list); - rc = dfs_mount_share(&mnt_ctx, &isdfs); if (rc) goto error; @@ -3621,7 +3625,6 @@ out: return rc; error: - dfs_put_root_smb_sessions(&mnt_ctx.dfs_ses_list); cifs_mount_put_conns(&mnt_ctx); return rc; } @@ -3636,6 +3639,18 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) goto error; rc = cifs_mount_get_tcon(&mnt_ctx); + if (!rc) { + /* + * Prevent superblock from being created with any missing + * connections. + */ + if (WARN_ON(!mnt_ctx.server)) + rc = -EHOSTDOWN; + else if (WARN_ON(!mnt_ctx.ses)) + rc = -EACCES; + else if (WARN_ON(!mnt_ctx.tcon)) + rc = -ENOENT; + } if (rc) goto error; diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index 449c59830039b..3ec965547e3d4 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -66,33 +66,20 @@ static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path) } /* - * Track individual DFS referral servers used by new DFS mount. - * - * On success, their lifetime will be shared by final tcon (dfs_ses_list). - * Otherwise, they will be put by dfs_put_root_smb_sessions() in cifs_mount(). + * Get an active reference of @ses so that next call to cifs_put_tcon() won't + * release it as any new DFS referrals must go through its IPC tcon. */ -static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx) +static void add_root_smb_session(struct cifs_mount_ctx *mnt_ctx) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; - struct dfs_root_ses *root_ses; struct cifs_ses *ses = mnt_ctx->ses; if (ses) { - root_ses = kmalloc(sizeof(*root_ses), GFP_KERNEL); - if (!root_ses) - return -ENOMEM; - - INIT_LIST_HEAD(&root_ses->list); - spin_lock(&cifs_tcp_ses_lock); cifs_smb_ses_inc_refcount(ses); spin_unlock(&cifs_tcp_ses_lock); - root_ses->ses = ses; - list_add_tail(&root_ses->list, &mnt_ctx->dfs_ses_list); } - /* Select new DFS referral server so that new referrals go through it */ ctx->dfs_root_ses = ses; - return 0; } static inline int parse_dfs_target(struct smb3_fs_context *ctx, @@ -185,11 +172,8 @@ again: continue; } - if (is_refsrv) { - rc = add_root_smb_session(mnt_ctx); - if (rc) - goto out; - } + if (is_refsrv) + add_root_smb_session(mnt_ctx); rc = ref_walk_advance(rw); if (!rc) { @@ -232,6 +216,7 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; struct cifs_tcon *tcon; char *origin_fullpath; + bool new_tcon = true; int rc; origin_fullpath = dfs_get_path(cifs_sb, ctx->source); @@ -239,6 +224,18 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) return PTR_ERR(origin_fullpath); rc = dfs_referral_walk(mnt_ctx); + if (!rc) { + /* + * Prevent superblock from being created with any missing + * connections. + */ + if (WARN_ON(!mnt_ctx->server)) + rc = -EHOSTDOWN; + else if (WARN_ON(!mnt_ctx->ses)) + rc = -EACCES; + else if (WARN_ON(!mnt_ctx->tcon)) + rc = -ENOENT; + } if (rc) goto out; @@ -247,15 +244,14 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) if (!tcon->origin_fullpath) { tcon->origin_fullpath = origin_fullpath; origin_fullpath = NULL; + } else { + new_tcon = false; } spin_unlock(&tcon->tc_lock); - if (list_empty(&tcon->dfs_ses_list)) { - list_replace_init(&mnt_ctx->dfs_ses_list, &tcon->dfs_ses_list); + if (new_tcon) { queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, dfs_cache_get_ttl() * HZ); - } else { - dfs_put_root_smb_sessions(&mnt_ctx->dfs_ses_list); } out: @@ -298,7 +294,6 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) if (rc) return rc; - ctx->dfs_root_ses = mnt_ctx->ses; /* * If called with 'nodfs' mount option, then skip DFS resolving. Otherwise unconditionally * try to get an DFS referral (even cached) to determine whether it is an DFS mount. @@ -324,7 +319,9 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) *isdfs = true; add_root_smb_session(mnt_ctx); - return __dfs_mount_share(mnt_ctx); + rc = __dfs_mount_share(mnt_ctx); + dfs_put_root_smb_sessions(mnt_ctx); + return rc; } /* Update dfs referral path of superblock */ diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h index 875ab7ae57fcd..e5c4dcf837503 100644 --- a/fs/smb/client/dfs.h +++ b/fs/smb/client/dfs.h @@ -7,7 +7,9 @@ #define _CIFS_DFS_H #include "cifsglob.h" +#include "cifsproto.h" #include "fs_context.h" +#include "dfs_cache.h" #include "cifs_unicode.h" #include @@ -114,11 +116,6 @@ static inline void ref_walk_set_tgt_hint(struct dfs_ref_walk *rw) ref_walk_tit(rw)); } -struct dfs_root_ses { - struct list_head list; - struct cifs_ses *ses; -}; - int dfs_parse_target_referral(const char *full_path, const struct dfs_info3_param *ref, struct smb3_fs_context *ctx); int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs); @@ -133,20 +130,32 @@ static inline int dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, const char *p { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb; + struct cifs_ses *rses = ctx->dfs_root_ses ?: mnt_ctx->ses; - return dfs_cache_find(mnt_ctx->xid, ctx->dfs_root_ses, cifs_sb->local_nls, + return dfs_cache_find(mnt_ctx->xid, rses, cifs_sb->local_nls, cifs_remap(cifs_sb), path, ref, tl); } -static inline void dfs_put_root_smb_sessions(struct list_head *head) +/* + * cifs_get_smb_ses() already guarantees an active reference of + * @ses->dfs_root_ses when a new session is created, so we need to put extra + * references of all DFS root sessions that were used across the mount process + * in dfs_mount_share(). + */ +static inline void dfs_put_root_smb_sessions(struct cifs_mount_ctx *mnt_ctx) { - struct dfs_root_ses *root, *tmp; + const struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; + struct cifs_ses *ses = ctx->dfs_root_ses; + struct cifs_ses *cur; + + if (!ses) + return; - list_for_each_entry_safe(root, tmp, head, list) { - list_del_init(&root->list); - cifs_put_smb_ses(root->ses); - kfree(root); + for (cur = ses; cur; cur = cur->dfs_root_ses) { + if (cur->dfs_root_ses) + cifs_put_smb_ses(cur->dfs_root_ses); } + cifs_put_smb_ses(ses); } #endif /* _CIFS_DFS_H */ diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 508d831fabe37..0552a864ff08f 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1278,21 +1278,12 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb) void dfs_cache_refresh(struct work_struct *work) { struct TCP_Server_Info *server; - struct dfs_root_ses *rses; struct cifs_tcon *tcon; struct cifs_ses *ses; tcon = container_of(work, struct cifs_tcon, dfs_cache_work.work); - ses = tcon->ses; - server = ses->server; - mutex_lock(&server->refpath_lock); - if (server->leaf_fullpath) - __refresh_tcon(server->leaf_fullpath + 1, ses, false); - mutex_unlock(&server->refpath_lock); - - list_for_each_entry(rses, &tcon->dfs_ses_list, list) { - ses = rses->ses; + for (ses = tcon->ses; ses; ses = ses->dfs_root_ses) { server = ses->server; mutex_lock(&server->refpath_lock); if (server->leaf_fullpath) diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index c3771fc81328f..1ea22b3955a2f 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -138,9 +138,6 @@ tcon_info_alloc(bool dir_leases_enabled) atomic_set(&ret_buf->num_local_opens, 0); atomic_set(&ret_buf->num_remote_opens, 0); ret_buf->stats_from_time = ktime_get_real_seconds(); -#ifdef CONFIG_CIFS_DFS_UPCALL - INIT_LIST_HEAD(&ret_buf->dfs_ses_list); -#endif return ret_buf; } @@ -156,9 +153,6 @@ tconInfoFree(struct cifs_tcon *tcon) atomic_dec(&tconInfoAllocCount); kfree(tcon->nativeFileSystem); kfree_sensitive(tcon->password); -#ifdef CONFIG_CIFS_DFS_UPCALL - dfs_put_root_smb_sessions(&tcon->dfs_ses_list); -#endif kfree(tcon->origin_fullpath); kfree(tcon); } -- GitLab From 0a05ad21d77a188d06481c36d6016805a881bcc0 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 1 Apr 2024 22:44:07 -0300 Subject: [PATCH 280/989] smb: client: refresh referral without acquiring refpath_lock Avoid refreshing DFS referral with refpath_lock acquired as the I/O could block for a while due to a potentially disconnected or slow DFS root server and then making other threads - that use same @server and don't require a DFS root server - unable to make any progress. Cc: stable@vger.kernel.org # 6.4+ Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/dfs_cache.c | 44 +++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 0552a864ff08f..11c8efecf7aa1 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1172,8 +1172,8 @@ static bool is_ses_good(struct cifs_ses *ses) return ret; } -/* Refresh dfs referral of tcon and mark it for reconnect if needed */ -static int __refresh_tcon(const char *path, struct cifs_ses *ses, bool force_refresh) +/* Refresh dfs referral of @ses and mark it for reconnect if needed */ +static void __refresh_ses_referral(struct cifs_ses *ses, bool force_refresh) { struct TCP_Server_Info *server = ses->server; DFS_CACHE_TGT_LIST(old_tl); @@ -1181,10 +1181,21 @@ static int __refresh_tcon(const char *path, struct cifs_ses *ses, bool force_ref bool needs_refresh = false; struct cache_entry *ce; unsigned int xid; + char *path = NULL; int rc = 0; xid = get_xid(); + mutex_lock(&server->refpath_lock); + if (server->leaf_fullpath) { + path = kstrdup(server->leaf_fullpath + 1, GFP_ATOMIC); + if (!path) + rc = -ENOMEM; + } + mutex_unlock(&server->refpath_lock); + if (!path) + goto out; + down_read(&htable_rw_lock); ce = lookup_cache_entry(path); needs_refresh = force_refresh || IS_ERR(ce) || cache_entry_expired(ce); @@ -1218,19 +1229,17 @@ out: free_xid(xid); dfs_cache_free_tgts(&old_tl); dfs_cache_free_tgts(&new_tl); - return rc; + kfree(path); } -static int refresh_tcon(struct cifs_tcon *tcon, bool force_refresh) +static inline void refresh_ses_referral(struct cifs_ses *ses) { - struct TCP_Server_Info *server = tcon->ses->server; - struct cifs_ses *ses = tcon->ses; + __refresh_ses_referral(ses, false); +} - mutex_lock(&server->refpath_lock); - if (server->leaf_fullpath) - __refresh_tcon(server->leaf_fullpath + 1, ses, force_refresh); - mutex_unlock(&server->refpath_lock); - return 0; +static inline void force_refresh_ses_referral(struct cifs_ses *ses) +{ + __refresh_ses_referral(ses, true); } /** @@ -1271,25 +1280,20 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb) */ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; - return refresh_tcon(tcon, true); + force_refresh_ses_referral(tcon->ses); + return 0; } /* Refresh all DFS referrals related to DFS tcon */ void dfs_cache_refresh(struct work_struct *work) { - struct TCP_Server_Info *server; struct cifs_tcon *tcon; struct cifs_ses *ses; tcon = container_of(work, struct cifs_tcon, dfs_cache_work.work); - for (ses = tcon->ses; ses; ses = ses->dfs_root_ses) { - server = ses->server; - mutex_lock(&server->refpath_lock); - if (server->leaf_fullpath) - __refresh_tcon(server->leaf_fullpath + 1, ses, false); - mutex_unlock(&server->refpath_lock); - } + for (ses = tcon->ses; ses; ses = ses->dfs_root_ses) + refresh_ses_referral(ses); queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, atomic_read(&dfs_cache_ttl) * HZ); -- GitLab From 4a5ba0e0bfe552ac7451f57e304f6343c3d87f89 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 1 Apr 2024 22:44:08 -0300 Subject: [PATCH 281/989] smb: client: handle DFS tcons in cifs_construct_tcon() The tcons created by cifs_construct_tcon() on multiuser mounts must also be able to failover and refresh DFS referrals, so set the appropriate fields in order to get a full DFS tcon. They could be shared among different superblocks later, too. Cc: stable@vger.kernel.org # 6.4+ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404021518.3Xu2VU4s-lkp@intel.com/ Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 38b75cfa06e3e..9a35788814961 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3995,6 +3995,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) struct cifs_ses *ses; struct cifs_tcon *tcon = NULL; struct smb3_fs_context *ctx; + char *origin_fullpath = NULL; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (ctx == NULL) @@ -4018,6 +4019,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ctx->sign = master_tcon->ses->sign; ctx->seal = master_tcon->seal; ctx->witness = master_tcon->use_witness; + ctx->dfs_root_ses = master_tcon->ses->dfs_root_ses; rc = cifs_set_vol_auth(ctx, master_tcon->ses); if (rc) { @@ -4037,12 +4039,39 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) goto out; } +#ifdef CONFIG_CIFS_DFS_UPCALL + spin_lock(&master_tcon->tc_lock); + if (master_tcon->origin_fullpath) { + spin_unlock(&master_tcon->tc_lock); + origin_fullpath = dfs_get_path(cifs_sb, cifs_sb->ctx->source); + if (IS_ERR(origin_fullpath)) { + tcon = ERR_CAST(origin_fullpath); + origin_fullpath = NULL; + cifs_put_smb_ses(ses); + goto out; + } + } else { + spin_unlock(&master_tcon->tc_lock); + } +#endif + tcon = cifs_get_tcon(ses, ctx); if (IS_ERR(tcon)) { cifs_put_smb_ses(ses); goto out; } +#ifdef CONFIG_CIFS_DFS_UPCALL + if (origin_fullpath) { + spin_lock(&tcon->tc_lock); + tcon->origin_fullpath = origin_fullpath; + spin_unlock(&tcon->tc_lock); + origin_fullpath = NULL; + queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, + dfs_cache_get_ttl() * HZ); + } +#endif + #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(ses)) reset_cifs_unix_caps(0, tcon, NULL, ctx); @@ -4051,6 +4080,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) out: kfree(ctx->username); kfree_sensitive(ctx->password); + kfree(origin_fullpath); kfree(ctx); return tcon; -- GitLab From 93cee45ccfebc62a3bb4cd622b89e00c8c7d8493 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 1 Apr 2024 22:44:09 -0300 Subject: [PATCH 282/989] smb: client: serialise cifs_construct_tcon() with cifs_mount_mutex Serialise cifs_construct_tcon() with cifs_mount_mutex to handle parallel mounts that may end up reusing the session and tcon created by it. Cc: stable@vger.kernel.org # 6.4+ Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 13 ++++++++++++- fs/smb/client/fs_context.c | 6 +++--- fs/smb/client/fs_context.h | 12 ++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 9a35788814961..95e4bda4fd517 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3988,7 +3988,7 @@ cifs_set_vol_auth(struct smb3_fs_context *ctx, struct cifs_ses *ses) } static struct cifs_tcon * -cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) +__cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) { int rc; struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb); @@ -4086,6 +4086,17 @@ out: return tcon; } +static struct cifs_tcon * +cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) +{ + struct cifs_tcon *ret; + + cifs_mount_lock(); + ret = __cifs_construct_tcon(cifs_sb, fsuid); + cifs_mount_unlock(); + return ret; +} + struct cifs_tcon * cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb) { diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index bdcbe6ff2739a..b7bfe705b2c49 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -37,7 +37,7 @@ #include "rfc1002pdu.h" #include "fs_context.h" -static DEFINE_MUTEX(cifs_mount_mutex); +DEFINE_MUTEX(cifs_mount_mutex); static const match_table_t cifs_smb_version_tokens = { { Smb_1, SMB1_VERSION_STRING }, @@ -783,9 +783,9 @@ static int smb3_get_tree(struct fs_context *fc) if (err) return err; - mutex_lock(&cifs_mount_mutex); + cifs_mount_lock(); ret = smb3_get_tree_common(fc); - mutex_unlock(&cifs_mount_mutex); + cifs_mount_unlock(); return ret; } diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 7863f2248c4df..8a35645e0b65b 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -304,4 +304,16 @@ extern void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb); #define MAX_CACHED_FIDS 16 extern char *cifs_sanitize_prepath(char *prepath, gfp_t gfp); +extern struct mutex cifs_mount_mutex; + +static inline void cifs_mount_lock(void) +{ + mutex_lock(&cifs_mount_mutex); +} + +static inline void cifs_mount_unlock(void) +{ + mutex_unlock(&cifs_mount_mutex); +} + #endif -- GitLab From c85c9ab926a592e2f59f7d9a6ca7d6562843d8fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Apr 2024 16:47:54 +0200 Subject: [PATCH 283/989] nvme: split nvme_update_zone_info nvme_update_zone_info does (admin queue) I/O to the device and can fail. We fail to abort the queue limits update if that happen, but really should avoid with the frozen I/O queue as much as possible anyway. Split the logic into a helper to query the information that can be called on an unfrozen queue and one to apply it to the queue limits. Fixes: 9b130d681443 ("nvme: use the atomic queue limits update API") Reported-by: Kanchan Joshi Signed-off-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 19 +++++++++++-------- drivers/nvme/host/nvme.h | 12 ++++++++++-- drivers/nvme/host/zns.c | 33 ++++++++++++++++++++------------- 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0cf46068f1d01..504dc352c458d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2076,6 +2076,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT; struct queue_limits lim; struct nvme_id_ns_nvm *nvm = NULL; + struct nvme_zone_info zi = {}; struct nvme_id_ns *id; sector_t capacity; unsigned lbaf; @@ -2091,6 +2092,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, ret = -ENODEV; goto out; } + lbaf = nvme_lbaf_index(id->flbas); if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) { ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm); @@ -2098,8 +2100,14 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, goto out; } + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + ns->head->ids.csi == NVME_CSI_ZNS) { + ret = nvme_query_zone_info(ns, lbaf, &zi); + if (ret < 0) + goto out; + } + blk_mq_freeze_queue(ns->disk->queue); - lbaf = nvme_lbaf_index(id->flbas); ns->head->lba_shift = id->lbaf[lbaf].ds; ns->head->nuse = le64_to_cpu(id->nuse); capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze)); @@ -2112,13 +2120,8 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, capacity = 0; nvme_config_discard(ns, &lim); if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - ns->head->ids.csi == NVME_CSI_ZNS) { - ret = nvme_update_zone_info(ns, lbaf, &lim); - if (ret) { - blk_mq_unfreeze_queue(ns->disk->queue); - goto out; - } - } + ns->head->ids.csi == NVME_CSI_ZNS) + nvme_update_zone_info(ns, &lim, &zi); ret = queue_limits_commit_update(ns->disk->queue, &lim); if (ret) { blk_mq_unfreeze_queue(ns->disk->queue); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 24193fcb8bd58..d0ed64dc7380e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1036,10 +1036,18 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk) } #endif /* CONFIG_NVME_MULTIPATH */ +struct nvme_zone_info { + u64 zone_size; + unsigned int max_open_zones; + unsigned int max_active_zones; +}; + int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); -int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf, - struct queue_limits *lim); +int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf, + struct nvme_zone_info *zi); +void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim, + struct nvme_zone_info *zi); #ifdef CONFIG_BLK_DEV_ZONED blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd, diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 722384bcc765c..77aa0f440a6d2 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -35,8 +35,8 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl) return 0; } -int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf, - struct queue_limits *lim) +int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf, + struct nvme_zone_info *zi) { struct nvme_effects_log *log = ns->head->effects; struct nvme_command c = { }; @@ -89,27 +89,34 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf, goto free_data; } - ns->head->zsze = - nvme_lba_to_sect(ns->head, le64_to_cpu(id->lbafe[lbaf].zsze)); - if (!is_power_of_2(ns->head->zsze)) { + zi->zone_size = le64_to_cpu(id->lbafe[lbaf].zsze); + if (!is_power_of_2(zi->zone_size)) { dev_warn(ns->ctrl->device, - "invalid zone size:%llu for namespace:%u\n", - ns->head->zsze, ns->head->ns_id); + "invalid zone size: %llu for namespace: %u\n", + zi->zone_size, ns->head->ns_id); status = -ENODEV; goto free_data; } + zi->max_open_zones = le32_to_cpu(id->mor) + 1; + zi->max_active_zones = le32_to_cpu(id->mar) + 1; - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue); - lim->zoned = 1; - lim->max_open_zones = le32_to_cpu(id->mor) + 1; - lim->max_active_zones = le32_to_cpu(id->mar) + 1; - lim->chunk_sectors = ns->head->zsze; - lim->max_zone_append_sectors = ns->ctrl->max_zone_append; free_data: kfree(id); return status; } +void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim, + struct nvme_zone_info *zi) +{ + lim->zoned = 1; + lim->max_open_zones = zi->max_open_zones; + lim->max_active_zones = zi->max_active_zones; + lim->max_zone_append_sectors = ns->ctrl->max_zone_append; + lim->chunk_sectors = ns->head->zsze = + nvme_lba_to_sect(ns->head, zi->zone_size); + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue); +} + static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, unsigned int nr_zones, size_t *buflen) { -- GitLab From 538d505fde20393bce1e6fb95cec82b56cdd22ef Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 22 Jan 2024 15:22:35 +0100 Subject: [PATCH 284/989] tools/power turbostat: Read base_hz and bclk from CPUID.16H if available If MSRs cannot be read, values can be obtained from cpuid. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a4a40a6e1b957..c35c48b6a99a7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5848,6 +5848,15 @@ void process_cpuid() base_mhz = max_mhz = bus_mhz = edx = 0; __cpuid(0x16, base_mhz, max_mhz, bus_mhz, edx); + + bclk = bus_mhz; + + base_hz = base_mhz * 1000000; + has_base_hz = 1; + + if (platform->enable_tsc_tweak) + tsc_tweak = base_hz / tsc_hz; + if (!quiet) fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n", base_mhz, max_mhz, bus_mhz); -- GitLab From b6fe938317eed58e8c687bd5965a956e15fb5828 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 19 Jan 2024 12:25:42 -0600 Subject: [PATCH 285/989] tools/power turbostat: Fix warning upon failed /dev/cpu_dma_latency read Previously a failed read of /dev/cpu_dma_latency erroneously complained turbostat: capget(CAP_SYS_ADMIN) failed, try "# setcap cap_sys_admin=ep ./turbostat This went unnoticed because this file is typically visible to root, and turbostat was typically run as root. Going forward, when a non-root user can run turbostat... Complain about failed read access to this file only if --debug is used. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c35c48b6a99a7..531f37e5f92ab 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5545,7 +5545,8 @@ void print_dev_latency(void) fd = open(path, O_RDONLY); if (fd < 0) { - warnx("capget(CAP_SYS_ADMIN) failed, try \"# setcap cap_sys_admin=ep %s\"", progname); + if (debug) + warnx("Read %s failed", path); return; } -- GitLab From 2d2ccd57338779469777d4319152151272994182 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 5 Feb 2024 15:56:25 -0600 Subject: [PATCH 286/989] tools/power turbostat: enhance -D (debug counter dump) output Eliminate redundant debug output for core and package scope counters. Include name and path for all "ADDED" counters. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 531f37e5f92ab..60432753fe6af 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1673,11 +1673,13 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "SMI: %d\n", t->smi_count); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "tADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, t->counter[i]); + outp += + sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + t->counter[i], mp->path); } } - if (c) { + if (c && is_cpu_first_thread_in_core(t, c, p)) { outp += sprintf(outp, "core: %d\n", c->core_id); outp += sprintf(outp, "c3: %016llX\n", c->c3); outp += sprintf(outp, "c6: %016llX\n", c->c6); @@ -1687,12 +1689,14 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "Joules: %0X\n", c->core_energy); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, c->counter[i]); + outp += + sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + c->counter[i], mp->path); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } - if (p) { + if (p && is_cpu_first_core_in_package(t, c, p)) { outp += sprintf(outp, "package: %d\n", p->package_id); outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0); @@ -1721,7 +1725,9 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "pADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, p->counter[i]); + outp += + sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + p->counter[i], mp->path); } } -- GitLab From 3e4048466c396cff52c6d435156dbcd0571e4381 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 11 Jan 2024 15:48:09 +0100 Subject: [PATCH 287/989] tools/power turbostat: Add --no-msr option Add --no-msr option to allow users to run turbostat without accessing MSRs via the MSR driver. Signed-off-by: Patryk Wlazlyn Reviewed-by: Len Brown Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 + tools/power/x86/turbostat/turbostat.c | 205 +++++++++++++++++++------- 2 files changed, 151 insertions(+), 56 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 1ba6340d3b3da..28be452fbfe2f 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -67,6 +67,8 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--quiet\fP Do not decode and print the system configuration header information. .PP ++\fB--no-msr\fP Disable all the uses of the MSR driver. ++.PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--num_iterations num\fP number of the measurement iterations. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 60432753fe6af..4d5437a9725bf 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -36,6 +36,7 @@ #include #include #include +#include #define UNUSED(x) (void)(x) @@ -265,6 +266,7 @@ unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int first_counter_read = 1; int ignore_stdin; +bool no_msr; int get_msr(int cpu, off_t offset, unsigned long long *msr); @@ -1282,13 +1284,36 @@ int get_msr_fd(int cpu) sprintf(pathname, "/dev/cpu/%d/msr", cpu); fd = open(pathname, O_RDONLY); if (fd < 0) - err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname); + err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, " + "or run with --no-msr, or run as root", pathname); fd_percpu[cpu] = fd; return fd; } +static void bic_disable_msr_access(void) +{ + const unsigned long bic_msrs = + BIC_Avg_MHz | + BIC_Busy | + BIC_Bzy_MHz | + BIC_SMI | + BIC_CPU_c1 | + BIC_CPU_c3 | + BIC_CPU_c6 | + BIC_CPU_c7 | + BIC_Mod_c6 | + BIC_CoreTmp | + BIC_Totl_c0 | + BIC_Any_c0 | + BIC_GFX_c0 | + BIC_CPUGFX | + BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; + + bic_enabled &= ~bic_msrs; +} + static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); @@ -1328,6 +1353,8 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) { ssize_t retval; + assert(!no_msr); + retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset); if (retval != sizeof *msr) @@ -1371,6 +1398,7 @@ void help(void) " Override default 5-second measurement interval\n" " -J, --Joules displays energy in Joules instead of Watts\n" " -l, --list list column headers only\n" + " -M, --no-msr Disable all uses of the MSR driver\n" " -n, --num_iterations num\n" " number of the measurement iterations\n" " -N, --header_iterations num\n" @@ -2597,6 +2625,7 @@ unsigned long long snapshot_sysfs_counter(char *path) int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) { if (mp->msr_num != 0) { + assert(!no_msr); if (get_msr(cpu, mp->msr_num, counterp)) return -1; } else { @@ -2646,6 +2675,9 @@ int get_epb(int cpu) return epb; msr_fallback: + if (no_msr) + return -1; + get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr); return msr & 0xf; @@ -2865,7 +2897,7 @@ retry: if (DO_BIC(BIC_CORE_THROT_CNT)) get_core_throt_cnt(cpu, &c->core_throt_cnt); - if (platform->rapl_msrs & RAPL_AMD_F17H) { + if ((platform->rapl_msrs & RAPL_AMD_F17H) && !no_msr) { if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr)) return -14; c->core_energy = msr & 0xFFFFFFFF; @@ -2930,41 +2962,44 @@ retry: if (DO_BIC(BIC_SYS_LPI)) p->sys_lpi = cpuidle_cur_sys_lpi_us; - if (platform->rapl_msrs & RAPL_PKG) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) - return -13; - p->energy_pkg = msr; - } - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { - if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) - return -14; - p->energy_cores = msr; - } - if (platform->rapl_msrs & RAPL_DRAM) { - if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) - return -15; - p->energy_dram = msr; - } - if (platform->rapl_msrs & RAPL_GFX) { - if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) - return -16; - p->energy_gfx = msr; - } - if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) - return -16; - p->rapl_pkg_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) - return -16; - p->rapl_dram_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_AMD_F17H) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) - return -13; - p->energy_pkg = msr; + if (!no_msr) { + if (platform->rapl_msrs & RAPL_PKG) { + if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) + return -13; + p->energy_pkg = msr; + } + if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { + if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) + return -14; + p->energy_cores = msr; + } + if (platform->rapl_msrs & RAPL_DRAM) { + if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) + return -15; + p->energy_dram = msr; + } + if (platform->rapl_msrs & RAPL_GFX) { + if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) + return -16; + p->energy_gfx = msr; + } + if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) { + if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) + return -16; + p->rapl_pkg_perf_status = msr; + } + if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { + if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) + return -16; + p->rapl_dram_perf_status = msr; + } + if (platform->rapl_msrs & RAPL_AMD_F17H) { + if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) + return -13; + p->energy_pkg = msr; + } } + if (DO_BIC(BIC_PkgTmp)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) return -17; @@ -3072,7 +3107,7 @@ void probe_cst_limit(void) unsigned long long msr; int *pkg_cstate_limits; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; switch (platform->cst_limit) { @@ -3116,7 +3151,7 @@ static void dump_platform_info(void) unsigned long long msr; unsigned int ratio; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); @@ -3134,7 +3169,7 @@ static void dump_power_ctl(void) { unsigned long long msr; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr); @@ -3340,7 +3375,7 @@ static void dump_cst_cfg(void) { unsigned long long msr; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); @@ -3412,7 +3447,7 @@ void print_irtl(void) { unsigned long long msr; - if (!platform->has_irtl_msrs) + if (!platform->has_irtl_msrs || no_msr) return; if (platform->supported_cstates & PC3) { @@ -4193,6 +4228,8 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr) int ret, idx; unsigned long long msr_cur, msr_last; + assert(!no_msr); + if (!per_cpu_msr_sum) return 1; @@ -4221,6 +4258,8 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg UNUSED(c); UNUSED(p); + assert(!no_msr); + for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) { unsigned long long msr_cur, msr_last; off_t offset; @@ -4465,7 +4504,7 @@ void check_permissions(void) sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (euidaccess(pathname, R_OK)) { do_exit++; - warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr"); + warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr, or run with --no-msr"); } /* if all else fails, thell them to be root */ @@ -4482,7 +4521,7 @@ void probe_bclk(void) unsigned long long msr; unsigned int base_ratio; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; if (platform->bclk_freq == BCLK_100MHZ) @@ -4522,7 +4561,7 @@ static void dump_turbo_ratio_info(void) if (!has_turbo) return; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; if (platform->trl_msrs & TRL_LIMIT2) @@ -4845,6 +4884,9 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + if (!has_hwp) return 0; @@ -4931,6 +4973,9 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + cpu = t->cpu_id; /* per-package */ @@ -5264,7 +5309,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) */ void probe_rapl(void) { - if (!platform->rapl_msrs) + if (!platform->rapl_msrs || no_msr) return; if (genuine_intel) @@ -5320,7 +5365,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk } /* Temperature Target MSR is Nehalem and newer only */ - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) goto guess; if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) @@ -5367,6 +5412,9 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + if (!(do_dts || do_ptm)) return 0; @@ -5464,6 +5512,9 @@ void decode_feature_control_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr)) fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n", base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : ""); @@ -5473,6 +5524,9 @@ void decode_misc_enable_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!genuine_intel) return; @@ -5490,6 +5544,9 @@ void decode_misc_feature_control(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_misc_feature_control) return; @@ -5511,6 +5568,9 @@ void decode_misc_pwr_mgmt_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_misc_pwr_mgmt) return; @@ -5530,6 +5590,9 @@ void decode_c6_demotion_policy_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_c6_demotion_policy_config) return; @@ -5626,7 +5689,7 @@ void probe_cstates(void) if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); - if (platform->has_ext_cst_msrs) { + if (platform->has_ext_cst_msrs && !no_msr) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); BIC_PRESENT(BIC_GFX_c0); @@ -5714,10 +5777,12 @@ void process_cpuid() ecx_flags = ecx; edx_flags = edx; - if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) - warnx("get_msr(UCODE)"); - else - ucode_patch_valid = true; + if (!no_msr) { + if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) + warnx("get_msr(UCODE)"); + else + ucode_patch_valid = true; + } /* * check max extended function levels of CPUID. @@ -5892,7 +5957,7 @@ void probe_pm_features(void) probe_thermal(); - if (platform->has_nhm_msrs) + if (platform->has_nhm_msrs && !no_msr) BIC_PRESENT(BIC_SMI); if (!quiet) @@ -6252,8 +6317,10 @@ void turbostat_init() { setup_all_buffers(true); set_base_cpu(); - check_dev_msr(); - check_permissions(); + if (!no_msr) { + check_dev_msr(); + check_permissions(); + } process_cpuid(); probe_pm_features(); linux_perf_init(); @@ -6370,6 +6437,9 @@ int add_counter(unsigned int msr_num, char *path, char *name, { struct msr_counter *msrp; + if (no_msr && msr_num) + errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); + msrp = calloc(1, sizeof(struct msr_counter)); if (msrp == NULL) { perror("calloc"); @@ -6674,6 +6744,7 @@ void cmdline(int argc, char **argv) { "list", no_argument, 0, 'l' }, { "out", required_argument, 0, 'o' }, { "quiet", no_argument, 0, 'q' }, + { "no-msr", no_argument, 0, 'M' }, { "show", required_argument, 0, 's' }, { "Summary", no_argument, 0, 'S' }, { "TCC", required_argument, 0, 'T' }, @@ -6683,7 +6754,22 @@ void cmdline(int argc, char **argv) progname = argv[0]; - while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", long_options, &option_index)) != -1) { + /* + * Parse some options early, because they may make other options invalid, + * like adding the MSR counter with --add and at the same time using --no-msr. + */ + while ((opt = getopt_long_only(argc, argv, "M", long_options, &option_index)) != -1) { + switch (opt) { + case 'M': + no_msr = 1; + break; + default: + break; + } + } + optind = 0; + + while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) { switch (opt) { case 'a': parse_add_command(optarg); @@ -6741,6 +6827,9 @@ void cmdline(int argc, char **argv) case 'q': quiet = 1; break; + case 'M': + /* Parsed earlier */ + break; case 'n': num_iterations = strtod(optarg, NULL); @@ -6817,6 +6906,9 @@ skip_cgroup_setting: outf = stderr; cmdline(argc, argv); + if (no_msr) + bic_disable_msr_access(); + if (!quiet) { print_version(); print_bootcmd(); @@ -6829,7 +6921,8 @@ skip_cgroup_setting: turbostat_init(); - msr_sum_record(); + if (!no_msr) + msr_sum_record(); /* dump counters and exit */ if (dump_only) -- GitLab From a0e86c90b83c118985260e36490583b5a38d4359 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 11 Jan 2024 15:58:02 +0100 Subject: [PATCH 288/989] tools/power turbostat: Add --no-perf option Add the --no-perf option to allow users to run turbostat without accessing perf. Signed-off-by: Patryk Wlazlyn Reviewed-by: Len Brown Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 ++ tools/power/x86/turbostat/turbostat.c | 25 ++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 28be452fbfe2f..567327b004e68 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -69,6 +69,8 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP +\fB--no-msr\fP Disable all the uses of the MSR driver. +.PP ++\fB--no-perf\fP Disable all the uses of the perf API. ++.PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--num_iterations num\fP number of the measurement iterations. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4d5437a9725bf..bad2fec7f3424 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -267,6 +267,7 @@ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int first_counter_read = 1; int ignore_stdin; bool no_msr; +bool no_perf; int get_msr(int cpu, off_t offset, unsigned long long *msr); @@ -1314,8 +1315,17 @@ static void bic_disable_msr_access(void) bic_enabled &= ~bic_msrs; } +static void bic_disable_perf_access(void) +{ + const unsigned long bic_perf = BIC_IPC; + + bic_enabled &= ~bic_perf; +} + static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { + assert(!no_perf); + return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } @@ -1332,8 +1342,8 @@ static int perf_instr_count_open(int cpu_num) /* counter for cpu_num, including user + kernel and all processes */ fd = perf_event_open(&pea, -1, cpu_num, -1, 0); if (fd == -1) { - warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"", progname); - BIC_NOT_PRESENT(BIC_IPC); + warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\" or use --no-perf", progname); + bic_disable_perf_access(); } return fd; @@ -1399,6 +1409,7 @@ void help(void) " -J, --Joules displays energy in Joules instead of Watts\n" " -l, --list list column headers only\n" " -M, --no-msr Disable all uses of the MSR driver\n" + " -P, --no-perf Disable all uses of the perf API\n" " -n, --num_iterations num\n" " number of the measurement iterations\n" " -N, --header_iterations num\n" @@ -6745,6 +6756,7 @@ void cmdline(int argc, char **argv) { "out", required_argument, 0, 'o' }, { "quiet", no_argument, 0, 'q' }, { "no-msr", no_argument, 0, 'M' }, + { "no-perf", no_argument, 0, 'P' }, { "show", required_argument, 0, 's' }, { "Summary", no_argument, 0, 'S' }, { "TCC", required_argument, 0, 'T' }, @@ -6758,11 +6770,14 @@ void cmdline(int argc, char **argv) * Parse some options early, because they may make other options invalid, * like adding the MSR counter with --add and at the same time using --no-msr. */ - while ((opt = getopt_long_only(argc, argv, "M", long_options, &option_index)) != -1) { + while ((opt = getopt_long_only(argc, argv, "MP", long_options, &option_index)) != -1) { switch (opt) { case 'M': no_msr = 1; break; + case 'P': + no_perf = 1; + break; default: break; } @@ -6828,6 +6843,7 @@ void cmdline(int argc, char **argv) quiet = 1; break; case 'M': + case 'P': /* Parsed earlier */ break; case 'n': @@ -6909,6 +6925,9 @@ skip_cgroup_setting: if (no_msr) bic_disable_msr_access(); + if (no_perf) + bic_disable_perf_access(); + if (!quiet) { print_version(); print_bootcmd(); -- GitLab From e48934c9f1048ed4640b60321baf1986d1a470e1 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 11 Jan 2024 16:42:22 +0100 Subject: [PATCH 289/989] tools/power turbostat: Add reading aperf and mperf via perf API By using the perf API we spend less time in between the reads of the counters, resulting in more accurate calculations of the dependent metrics. Using perf API is also usually faster overall, although cache miss, if we get one, is more costly when using perf vs MSR driver. We would fallback to the msr reads if the sysfs isn't there or when in --no-perf mode. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 374 +++++++++++++++++++++----- 1 file changed, 301 insertions(+), 73 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bad2fec7f3424..1294c46c2170a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -59,6 +59,7 @@ enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; +enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; struct msr_counter { unsigned int msr_num; @@ -207,10 +208,13 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) +struct amperf_group_fd; + char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; int *fd_instr_count_percpu; +struct amperf_group_fd *fd_amperf_percpu; /* File descriptors for perf group with APERF and MPERF counters. */ struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; @@ -268,6 +272,7 @@ unsigned int first_counter_read = 1; int ignore_stdin; bool no_msr; bool no_perf; +enum amperf_source amperf_source; int get_msr(int cpu, off_t offset, unsigned long long *msr); @@ -1296,9 +1301,6 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { const unsigned long bic_msrs = - BIC_Avg_MHz | - BIC_Busy | - BIC_Bzy_MHz | BIC_SMI | BIC_CPU_c1 | BIC_CPU_c3 | @@ -1329,21 +1331,30 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } -static int perf_instr_count_open(int cpu_num) +static long open_perf_counter_or_fail(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format) { - struct perf_event_attr pea; - int fd; + struct perf_event_attr attr; + const pid_t pid = -1; + const unsigned long flags = 0; + + memset(&attr, 0, sizeof(struct perf_event_attr)); - memset(&pea, 0, sizeof(struct perf_event_attr)); - pea.type = PERF_TYPE_HARDWARE; - pea.size = sizeof(struct perf_event_attr); - pea.config = PERF_COUNT_HW_INSTRUCTIONS; + attr.type = type; + attr.size = sizeof(struct perf_event_attr); + attr.config = config; + attr.disabled = 0; + attr.sample_type = PERF_SAMPLE_IDENTIFIER; + attr.read_format = read_format; - /* counter for cpu_num, including user + kernel and all processes */ - fd = perf_event_open(&pea, -1, cpu_num, -1, 0); + const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags); if (fd == -1) { - warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\" or use --no-perf", progname); - bic_disable_perf_access(); + if (errno == EACCES) { + errx(1, "capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"" + " or use --no-perf or run as root", progname); + } else { + perror("perf_event_open"); + errx(1, "use --no-perf or run as root"); + } } return fd; @@ -1354,7 +1365,8 @@ int get_instr_count_fd(int cpu) if (fd_instr_count_percpu[cpu]) return fd_instr_count_percpu[cpu]; - fd_instr_count_percpu[cpu] = perf_instr_count_open(cpu); + fd_instr_count_percpu[cpu] = + open_perf_counter_or_fail(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); return fd_instr_count_percpu[cpu]; } @@ -2762,6 +2774,175 @@ int get_core_throt_cnt(int cpu, unsigned long long *cnt) return 0; } +struct amperf_group_fd { + int aperf; /* Also the group descriptor */ + int mperf; +}; + +static unsigned int read_perf_counter_info(const char *const path, const char *const parse_format) +{ + int fdmt; + char buf[16]; + unsigned int v; + + fdmt = open(path, O_RDONLY, 0); + if (fdmt == -1) + errx(1, "Failed to read perf counter info %s\n", path); + + if (read(fdmt, buf, sizeof(buf)) <= 0) + return 0; + + buf[sizeof(buf) - 1] = '\0'; + + if (sscanf(buf, parse_format, &v) != 1) + errx(1, "Failed to parse perf counter info %s\n", path); + + close(fdmt); + + return v; +} + +static unsigned read_msr_type(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/type"; + const char *const format = "%u"; + + return read_perf_counter_info(path, format); +} + +static unsigned read_aperf_config(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; + const char *const format = "event=%x"; + + return read_perf_counter_info(path, format); +} + +static unsigned read_mperf_config(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; + const char *const format = "event=%x"; + + return read_perf_counter_info(path, format); +} + +static struct amperf_group_fd open_amperf_fd(int cpu) +{ + const unsigned int msr_type = read_msr_type(); + const unsigned int aperf_config = read_aperf_config(); + const unsigned int mperf_config = read_mperf_config(); + struct amperf_group_fd fds = {.aperf = -1,.mperf = -1 }; + + fds.aperf = open_perf_counter_or_fail(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); + fds.mperf = open_perf_counter_or_fail(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); + + return fds; +} + +static int get_amperf_fd(int cpu) +{ + assert(fd_amperf_percpu); + + if (fd_amperf_percpu[cpu].aperf) + return fd_amperf_percpu[cpu].aperf; + + fd_amperf_percpu[cpu] = open_amperf_fd(cpu); + + return fd_amperf_percpu[cpu].aperf; +} + +/* Read APERF, MPERF and TSC using the perf API. */ +static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu) +{ + union { + struct { + unsigned long nr_entries; + unsigned long aperf; + unsigned long mperf; + }; + + unsigned long as_array[3]; + } cnt; + + const int fd_amperf = get_amperf_fd(cpu); + + /* + * Read the TSC with rdtsc, because we want the absolute value and not + * the offset from the start of the counter. + */ + t->tsc = rdtsc(); + + const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array)); + if (n != sizeof(cnt.as_array)) + return -2; + + t->aperf = cnt.aperf * aperf_mperf_multiplier; + t->mperf = cnt.mperf * aperf_mperf_multiplier; + + return 0; +} + +/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */ +static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu) +{ + unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; + int aperf_mperf_retry_count = 0; + + /* + * The TSC, APERF and MPERF must be read together for + * APERF/MPERF and MPERF/TSC to give accurate results. + * + * Unfortunately, APERF and MPERF are read by + * individual system call, so delays may occur + * between them. If the time to read them + * varies by a large amount, we re-read them. + */ + + /* + * This initial dummy APERF read has been seen to + * reduce jitter in the subsequent reads. + */ + + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) + return -3; + +retry: + t->tsc = rdtsc(); /* re-read close to APERF */ + + tsc_before = t->tsc; + + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) + return -3; + + tsc_between = rdtsc(); + + if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) + return -4; + + tsc_after = rdtsc(); + + aperf_time = tsc_between - tsc_before; + mperf_time = tsc_after - tsc_between; + + /* + * If the system call latency to read APERF and MPERF + * differ by more than 2x, then try again. + */ + if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { + aperf_mperf_retry_count++; + if (aperf_mperf_retry_count < 5) + goto retry; + else + warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); + } + aperf_mperf_retry_count = 0; + + t->aperf = t->aperf * aperf_mperf_multiplier; + t->mperf = t->mperf * aperf_mperf_multiplier; + + return 0; +} + /* * get_counters(...) * migrate to cpu @@ -2771,7 +2952,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int cpu = t->cpu_id; unsigned long long msr; - int aperf_mperf_retry_count = 0; struct msr_counter *mp; int i; @@ -2784,63 +2964,26 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (first_counter_read) get_apic_id(t); -retry: + t->tsc = rdtsc(); /* we are running on local CPU of interest */ if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) || soft_c1_residency_display(BIC_Avg_MHz)) { - unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; - - /* - * The TSC, APERF and MPERF must be read together for - * APERF/MPERF and MPERF/TSC to give accurate results. - * - * Unfortunately, APERF and MPERF are read by - * individual system call, so delays may occur - * between them. If the time to read them - * varies by a large amount, we re-read them. - */ - - /* - * This initial dummy APERF read has been seen to - * reduce jitter in the subsequent reads. - */ - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - - t->tsc = rdtsc(); /* re-read close to APERF */ - - tsc_before = t->tsc; - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - - tsc_between = rdtsc(); - - if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) - return -4; + int status = -1; - tsc_after = rdtsc(); + assert(!no_perf || !no_msr); - aperf_time = tsc_between - tsc_before; - mperf_time = tsc_after - tsc_between; - - /* - * If the system call latency to read APERF and MPERF - * differ by more than 2x, then try again. - */ - if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { - aperf_mperf_retry_count++; - if (aperf_mperf_retry_count < 5) - goto retry; - else - warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); + switch (amperf_source) { + case AMPERF_SOURCE_PERF: + status = read_aperf_mperf_tsc_perf(t, cpu); + break; + case AMPERF_SOURCE_MSR: + status = read_aperf_mperf_tsc_msr(t, cpu); + break; } - aperf_mperf_retry_count = 0; - t->aperf = t->aperf * aperf_mperf_multiplier; - t->mperf = t->mperf * aperf_mperf_multiplier; + if (status != 0) + return status; } if (DO_BIC(BIC_IPC)) @@ -3516,6 +3659,21 @@ void free_fd_percpu(void) free(fd_percpu); } +void free_fd_amperf_percpu(void) +{ + int i; + + for (i = 0; i < topo.max_cpu_num + 1; ++i) { + if (fd_amperf_percpu[i].mperf != 0) + close(fd_amperf_percpu[i].mperf); + + if (fd_amperf_percpu[i].aperf != 0) + close(fd_amperf_percpu[i].aperf); + } + + free(fd_amperf_percpu); +} + void free_all_buffers(void) { int i; @@ -3557,6 +3715,7 @@ void free_all_buffers(void) outp = NULL; free_fd_percpu(); + free_fd_amperf_percpu(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -5647,17 +5806,62 @@ void print_dev_latency(void) */ void linux_perf_init(void) { - if (!BIC_IS_ENABLED(BIC_IPC)) - return; - if (access("/proc/sys/kernel/perf_event_paranoid", F_OK)) return; - fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); - if (fd_instr_count_percpu == NULL) - err(-1, "calloc fd_instr_count_percpu"); + if (BIC_IS_ENABLED(BIC_IPC) && has_aperf) { + fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); + if (fd_instr_count_percpu == NULL) + err(-1, "calloc fd_instr_count_percpu"); + } - BIC_PRESENT(BIC_IPC); + const bool aperf_required = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || + BIC_IS_ENABLED(BIC_Bzy_MHz) || BIC_IS_ENABLED(BIC_IPC); + if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { + fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); + if (fd_amperf_percpu == NULL) + err(-1, "calloc fd_amperf_percpu"); + } +} + +static int has_amperf_access_via_msr(void) +{ + const int cpu = sched_getcpu(); + unsigned long long dummy; + + if (get_msr(cpu, MSR_IA32_APERF, &dummy)) + return 0; + + if (get_msr(cpu, MSR_IA32_MPERF, &dummy)) + return 0; + + return 1; +} + +static int has_amperf_access_via_perf(void) +{ + if (access("/sys/bus/event_source/devices/msr/type", F_OK)) + return 0; + + if (access("/sys/bus/event_source/devices/msr/events/aperf", F_OK)) + return 0; + + if (access("/sys/bus/event_source/devices/msr/events/mperf", F_OK)) + return 0; + + return 1; +} + +/* Check if we can access APERF and MPERF */ +static int has_amperf_access(void) +{ + if (!no_msr && has_amperf_access_via_msr()) + return 1; + + if (!no_perf && has_amperf_access_via_perf()) + return 1; + + return 0; } void probe_cstates(void) @@ -5845,10 +6049,11 @@ void process_cpuid() __cpuid(0x6, eax, ebx, ecx, edx); has_aperf = ecx & (1 << 0); - if (has_aperf) { + if (has_aperf && has_amperf_access()) { BIC_PRESENT(BIC_Avg_MHz); BIC_PRESENT(BIC_Busy); BIC_PRESENT(BIC_Bzy_MHz); + BIC_PRESENT(BIC_IPC); } do_dts = eax & (1 << 0); if (do_dts) @@ -6324,6 +6529,19 @@ void set_base_cpu(void) err(-ENODEV, "No valid cpus found"); } +static void set_amperf_source(void) +{ + amperf_source = AMPERF_SOURCE_PERF; + + if (no_perf || !has_amperf_access_via_perf()) + amperf_source = AMPERF_SOURCE_MSR; + + if (quiet || !debug) + return; + + fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); +} + void turbostat_init() { setup_all_buffers(true); @@ -6334,6 +6552,7 @@ void turbostat_init() } process_cpuid(); probe_pm_features(); + set_amperf_source(); linux_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); @@ -6341,6 +6560,15 @@ void turbostat_init() if (DO_BIC(BIC_IPC)) (void)get_instr_count_fd(base_cpu); + + /* + * If TSC tweak is needed, but couldn't get it, + * disable more BICs, since it can't be reported accurately. + */ + if (platform->enable_tsc_tweak && !has_base_hz) { + bic_enabled &= ~BIC_Busy; + bic_enabled &= ~BIC_Bzy_MHz; + } } int fork_it(char **argv) -- GitLab From 5088741ec805cd249e27c7176ed09bdab164960e Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 15 Jan 2024 19:04:21 +0100 Subject: [PATCH 290/989] tools/power turbostat: detect and disable unavailable BICs at runtime To allow unprivileged user to run turbostat seamlessly. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 188 +++++++++++++++++--------- 1 file changed, 125 insertions(+), 63 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1294c46c2170a..30db6e92193a1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1317,13 +1317,6 @@ static void bic_disable_msr_access(void) bic_enabled &= ~bic_msrs; } -static void bic_disable_perf_access(void) -{ - const unsigned long bic_perf = BIC_IPC; - - bic_enabled &= ~bic_perf; -} - static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { assert(!no_perf); @@ -1331,12 +1324,14 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } -static long open_perf_counter_or_fail(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format) +static long open_perf_counter(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format) { struct perf_event_attr attr; const pid_t pid = -1; const unsigned long flags = 0; + assert(!no_perf); + memset(&attr, 0, sizeof(struct perf_event_attr)); attr.type = type; @@ -1347,15 +1342,6 @@ static long open_perf_counter_or_fail(int cpu, unsigned int type, unsigned int c attr.read_format = read_format; const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags); - if (fd == -1) { - if (errno == EACCES) { - errx(1, "capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"" - " or use --no-perf or run as root", progname); - } else { - perror("perf_event_open"); - errx(1, "use --no-perf or run as root"); - } - } return fd; } @@ -1365,8 +1351,7 @@ int get_instr_count_fd(int cpu) if (fd_instr_count_percpu[cpu]) return fd_instr_count_percpu[cpu]; - fd_instr_count_percpu[cpu] = - open_perf_counter_or_fail(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); + fd_instr_count_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); return fd_instr_count_percpu[cpu]; } @@ -2833,8 +2818,8 @@ static struct amperf_group_fd open_amperf_fd(int cpu) const unsigned int mperf_config = read_mperf_config(); struct amperf_group_fd fds = {.aperf = -1,.mperf = -1 }; - fds.aperf = open_perf_counter_or_fail(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); - fds.mperf = open_perf_counter_or_fail(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); + fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); + fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); return fds; } @@ -4509,7 +4494,8 @@ release_msr: /* * set_my_sched_priority(pri) - * return previous + * return previous priority on success + * return value < -20 on failure */ int set_my_sched_priority(int priority) { @@ -4519,16 +4505,16 @@ int set_my_sched_priority(int priority) errno = 0; original_priority = getpriority(PRIO_PROCESS, 0); if (errno && (original_priority == -1)) - err(errno, "getpriority"); + return -21; retval = setpriority(PRIO_PROCESS, 0, priority); if (retval) - errx(retval, "capget(CAP_SYS_NICE) failed,try \"# setcap cap_sys_nice=ep %s\"", progname); + return -21; errno = 0; retval = getpriority(PRIO_PROCESS, 0); if (retval != priority) - err(retval, "getpriority(%d) != setpriority(%d)", retval, priority); + return -21; return original_priority; } @@ -4543,6 +4529,9 @@ void turbostat_loop() /* * elevate own priority for interval mode + * + * ignore on error - we probably don't have permission to set it, but + * it's not a big deal */ set_my_sched_priority(-20); @@ -4628,10 +4617,13 @@ void check_dev_msr() struct stat sb; char pathname[32]; + if (no_msr) + return; + sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (stat(pathname, &sb)) if (system("/sbin/modprobe msr > /dev/null 2>&1")) - err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" "); + no_msr = 1; } /* @@ -4643,47 +4635,51 @@ int check_for_cap_sys_rawio(void) { cap_t caps; cap_flag_value_t cap_flag_value; + int ret = 0; caps = cap_get_proc(); if (caps == NULL) - err(-6, "cap_get_proc\n"); + return 1; - if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) - err(-6, "cap_get\n"); + if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) { + ret = 1; + goto free_and_exit; + } if (cap_flag_value != CAP_SET) { - warnx("capget(CAP_SYS_RAWIO) failed," " try \"# setcap cap_sys_rawio=ep %s\"", progname); - return 1; + ret = 1; + goto free_and_exit; } +free_and_exit: if (cap_free(caps) == -1) err(-6, "cap_free\n"); - return 0; + return ret; } -void check_permissions(void) +void check_msr_permission(void) { - int do_exit = 0; + int failed = 0; char pathname[32]; + if (no_msr) + return; + /* check for CAP_SYS_RAWIO */ - do_exit += check_for_cap_sys_rawio(); + failed += check_for_cap_sys_rawio(); /* test file permissions */ sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (euidaccess(pathname, R_OK)) { - do_exit++; - warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr, or run with --no-msr"); + failed++; } - /* if all else fails, thell them to be root */ - if (do_exit) - if (getuid() != 0) - warnx("... or simply run as root"); - - if (do_exit) - exit(-6); + if (failed) { + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", pathname, "--no-msr"); + no_msr = 1; + } } void probe_bclk(void) @@ -5800,6 +5796,28 @@ void print_dev_latency(void) close(fd); } +static int has_instr_count_access(void) +{ + int fd; + int has_access; + + if (no_perf) + return 0; + + fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); + has_access = fd != -1; + + if (fd != -1) + close(fd); + + if (!has_access) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "instructions retired perf counter", "--no-perf"); + + return has_access; +} + /* * Linux-perf manages the HW instructions-retired counter * by enabling when requested, and hiding rollover @@ -5826,13 +5844,15 @@ void linux_perf_init(void) static int has_amperf_access_via_msr(void) { - const int cpu = sched_getcpu(); unsigned long long dummy; - if (get_msr(cpu, MSR_IA32_APERF, &dummy)) + if (no_msr) + return 0; + + if (get_msr(base_cpu, MSR_IA32_APERF, &dummy)) return 0; - if (get_msr(cpu, MSR_IA32_MPERF, &dummy)) + if (get_msr(base_cpu, MSR_IA32_MPERF, &dummy)) return 0; return 1; @@ -5840,16 +5860,44 @@ static int has_amperf_access_via_msr(void) static int has_amperf_access_via_perf(void) { - if (access("/sys/bus/event_source/devices/msr/type", F_OK)) - return 0; + struct amperf_group_fd fds; - if (access("/sys/bus/event_source/devices/msr/events/aperf", F_OK)) - return 0; + /* + * Cache the last result, so we don't warn the user multiple times + * + * Negative means cached, no access + * Zero means not cached + * Positive means cached, has access + */ + static int has_access_cached; - if (access("/sys/bus/event_source/devices/msr/events/mperf", F_OK)) + if (no_perf) return 0; - return 1; + if (has_access_cached != 0) + return has_access_cached > 0; + + fds = open_amperf_fd(base_cpu); + has_access_cached = (fds.aperf != -1) && (fds.mperf != -1); + + if (fds.aperf == -1) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "APERF perf counter", "--no-perf"); + else + close(fds.aperf); + + if (fds.mperf == -1) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "MPERF perf counter", "--no-perf"); + else + close(fds.mperf); + + if (has_access_cached == 0) + has_access_cached = -1; + + return has_access_cached > 0; } /* Check if we can access APERF and MPERF */ @@ -6542,14 +6590,34 @@ static void set_amperf_source(void) fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); } +void check_msr_access(void) +{ + check_dev_msr(); + check_msr_permission(); + + if (no_msr) + bic_disable_msr_access(); +} + +void check_perf_access(void) +{ + if (no_perf || !has_instr_count_access()) + bic_enabled &= ~BIC_IPC; + + if (!has_amperf_access()) { + bic_enabled &= ~BIC_Avg_MHz; + bic_enabled &= ~BIC_Busy; + bic_enabled &= ~BIC_Bzy_MHz; + bic_enabled &= ~BIC_IPC; + } +} + void turbostat_init() { setup_all_buffers(true); set_base_cpu(); - if (!no_msr) { - check_dev_msr(); - check_permissions(); - } + check_msr_access(); + check_perf_access(); process_cpuid(); probe_pm_features(); set_amperf_source(); @@ -7150,12 +7218,6 @@ skip_cgroup_setting: outf = stderr; cmdline(argc, argv); - if (no_msr) - bic_disable_msr_access(); - - if (no_perf) - bic_disable_perf_access(); - if (!quiet) { print_version(); print_bootcmd(); -- GitLab From aed48c48fa65abdd584e14f7d0273711bc10d223 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 30 Jan 2024 23:57:07 +0100 Subject: [PATCH 291/989] tools/power turbostat: add early exits for permission checks Checking early if the permissions are even needed gets rid of the warnings about some of them missing. Earlier we issued a warning in case of missing MSR and/or perf permissions, even when user never asked for counters that require those. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 66 +++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 30db6e92193a1..e5e01b58992e3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5818,6 +5818,14 @@ static int has_instr_count_access(void) return has_access; } +bool is_aperf_access_required(void) +{ + return BIC_IS_ENABLED(BIC_Avg_MHz) + || BIC_IS_ENABLED(BIC_Busy) + || BIC_IS_ENABLED(BIC_Bzy_MHz) + || BIC_IS_ENABLED(BIC_IPC); +} + /* * Linux-perf manages the HW instructions-retired counter * by enabling when requested, and hiding rollover @@ -5833,8 +5841,7 @@ void linux_perf_init(void) err(-1, "calloc fd_instr_count_percpu"); } - const bool aperf_required = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || - BIC_IS_ENABLED(BIC_Bzy_MHz) || BIC_IS_ENABLED(BIC_IPC); + const bool aperf_required = is_aperf_access_required(); if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); if (fd_amperf_percpu == NULL) @@ -5903,6 +5910,9 @@ static int has_amperf_access_via_perf(void) /* Check if we can access APERF and MPERF */ static int has_amperf_access(void) { + if (!is_aperf_access_required()) + return 0; + if (!no_msr && has_amperf_access_via_msr()) return 1; @@ -6581,7 +6591,8 @@ static void set_amperf_source(void) { amperf_source = AMPERF_SOURCE_PERF; - if (no_perf || !has_amperf_access_via_perf()) + const bool aperf_required = is_aperf_access_required(); + if (no_perf || !aperf_required || !has_amperf_access_via_perf()) amperf_source = AMPERF_SOURCE_MSR; if (quiet || !debug) @@ -6590,8 +6601,51 @@ static void set_amperf_source(void) fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); } +bool is_msr_access_required(void) +{ + /* TODO: add detection for dynamic counters from add_counter() */ + if (no_msr) + return false; + + return BIC_IS_ENABLED(BIC_SMI) + || BIC_IS_ENABLED(BIC_CPU_c1) + || BIC_IS_ENABLED(BIC_CPU_c3) + || BIC_IS_ENABLED(BIC_CPU_c6) + || BIC_IS_ENABLED(BIC_CPU_c7) + || BIC_IS_ENABLED(BIC_Mod_c6) + || BIC_IS_ENABLED(BIC_CoreTmp) + || BIC_IS_ENABLED(BIC_Totl_c0) + || BIC_IS_ENABLED(BIC_Any_c0) + || BIC_IS_ENABLED(BIC_GFX_c0) + || BIC_IS_ENABLED(BIC_CPUGFX) + || BIC_IS_ENABLED(BIC_Pkgpc3) + || BIC_IS_ENABLED(BIC_Pkgpc6) + || BIC_IS_ENABLED(BIC_Pkgpc2) + || BIC_IS_ENABLED(BIC_Pkgpc7) + || BIC_IS_ENABLED(BIC_Pkgpc8) + || BIC_IS_ENABLED(BIC_Pkgpc9) + || BIC_IS_ENABLED(BIC_Pkgpc10) + || BIC_IS_ENABLED(BIC_CorWatt) + || BIC_IS_ENABLED(BIC_Cor_J) + || BIC_IS_ENABLED(BIC_PkgWatt) + || BIC_IS_ENABLED(BIC_CorWatt) + || BIC_IS_ENABLED(BIC_GFXWatt) + || BIC_IS_ENABLED(BIC_RAMWatt) + || BIC_IS_ENABLED(BIC_Pkg_J) + || BIC_IS_ENABLED(BIC_Cor_J) + || BIC_IS_ENABLED(BIC_GFX_J) + || BIC_IS_ENABLED(BIC_RAM_J) + || BIC_IS_ENABLED(BIC_PKG__) + || BIC_IS_ENABLED(BIC_RAM__) + || BIC_IS_ENABLED(BIC_PkgTmp) + || (is_aperf_access_required() && !has_amperf_access_via_perf()); +} + void check_msr_access(void) { + if (!is_msr_access_required()) + no_msr = 1; + check_dev_msr(); check_msr_permission(); @@ -6601,10 +6655,12 @@ void check_msr_access(void) void check_perf_access(void) { - if (no_perf || !has_instr_count_access()) + const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC); + if (no_perf || !intrcount_required || !has_instr_count_access()) bic_enabled &= ~BIC_IPC; - if (!has_amperf_access()) { + const bool aperf_required = is_aperf_access_required(); + if (!aperf_required || !has_amperf_access()) { bic_enabled &= ~BIC_Avg_MHz; bic_enabled &= ~BIC_Busy; bic_enabled &= ~BIC_Bzy_MHz; -- GitLab From 4a1bb4dad5d16669e841410944e7bc84ef7263fc Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 14 Mar 2024 11:36:55 +0100 Subject: [PATCH 292/989] tools/power turbostat: Clear added counters when in no-msr mode If user request --no-msr or is not able to access the MSRs, turbostat should clear all the counters added with --add. Because MSR access permission checks are done after the cmdline is parsed, the decision has to be defered up until the transition into no-msr mode happen. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 47 ++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e5e01b58992e3..b4a892bf22bfd 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1160,6 +1160,37 @@ struct sys_counters { struct msr_counter *pp; } sys; +void free_sys_counters(void) +{ + struct msr_counter *p = sys.tp, *pnext = NULL; + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + p = sys.cp, pnext = NULL; + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + p = sys.pp, pnext = NULL; + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + sys.added_thread_counters = 0; + sys.added_core_counters = 0; + sys.added_package_counters = 0; + sys.tp = NULL; + sys.cp = NULL; + sys.pp = NULL; +} + struct system_summary { struct thread_data threads; struct core_data cores; @@ -1315,6 +1346,8 @@ static void bic_disable_msr_access(void) BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; bic_enabled &= ~bic_msrs; + + free_sys_counters(); } static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) @@ -6601,12 +6634,24 @@ static void set_amperf_source(void) fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); } +bool has_added_counters(void) +{ + /* + * It only makes sense to call this after the command line is parsed, + * otherwise sys structure is not populated. + */ + + return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters; +} + bool is_msr_access_required(void) { - /* TODO: add detection for dynamic counters from add_counter() */ if (no_msr) return false; + if (has_added_counters()) + return true; + return BIC_IS_ENABLED(BIC_SMI) || BIC_IS_ENABLED(BIC_CPU_c1) || BIC_IS_ENABLED(BIC_CPU_c3) -- GitLab From ebf8449caba1df2eb6ba0b465fe15dc06d3b9135 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 15 Feb 2024 12:50:19 +0100 Subject: [PATCH 293/989] tools/power turbostat: Add proper re-initialization for perf file descriptors Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b4a892bf22bfd..a380829c58903 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3669,18 +3669,25 @@ void free_fd_percpu(void) { int i; + if (!fd_percpu) + return; + for (i = 0; i < topo.max_cpu_num + 1; ++i) { if (fd_percpu[i] != 0) close(fd_percpu[i]); } free(fd_percpu); + fd_percpu = NULL; } void free_fd_amperf_percpu(void) { int i; + if (!fd_amperf_percpu) + return; + for (i = 0; i < topo.max_cpu_num + 1; ++i) { if (fd_amperf_percpu[i].mperf != 0) close(fd_amperf_percpu[i].mperf); @@ -3690,6 +3697,21 @@ void free_fd_amperf_percpu(void) } free(fd_amperf_percpu); + fd_amperf_percpu = NULL; +} + +void free_fd_instr_count_percpu(void) +{ + if (!fd_instr_count_percpu) + return; + + for (int i = 0; i < topo.max_cpu_num + 1; ++i) { + if (fd_instr_count_percpu[i] != 0) + close(fd_instr_count_percpu[i]); + } + + free(fd_instr_count_percpu); + fd_instr_count_percpu = NULL; } void free_all_buffers(void) @@ -3733,6 +3755,7 @@ void free_all_buffers(void) outp = NULL; free_fd_percpu(); + free_fd_instr_count_percpu(); free_fd_amperf_percpu(); free(irq_column_2_cpu); @@ -4067,10 +4090,13 @@ static void update_effective_set(bool startup) err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str); } +void linux_perf_init(void); + void re_initialize(void) { free_all_buffers(); setup_all_buffers(false); + linux_perf_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } -- GitLab From 141fb8cd206ace23c02cd2791c6da52c1d77d42a Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 19 Mar 2024 10:54:22 -0700 Subject: [PATCH 294/989] btrfs: qgroup: correctly model root qgroup rsv in convert We use add_root_meta_rsv and sub_root_meta_rsv to track prealloc and pertrans reservations for subvolumes when quotas are enabled. The convert function does not properly increment pertrans after decrementing prealloc, so the count is not accurate. Note: we check that the fs is not read-only to mirror the logic in qgroup_convert_meta, which checks that before adding to the pertrans rsv. Fixes: 8287475a2055 ("btrfs: qgroup: Use root::qgroup_meta_rsv_* to record qgroup meta reserved space") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5f90f0605b12f..cf8820ce7aa29 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4495,6 +4495,8 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); + if (!sb_rdonly(fs_info->sb)) + add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); } /* -- GitLab From 74e97958121aa1f5854da6effba70143f051b0cd Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Mar 2024 10:02:04 -0700 Subject: [PATCH 295/989] btrfs: qgroup: fix qgroup prealloc rsv leak in subvolume operations Create subvolume, create snapshot and delete subvolume all use btrfs_subvolume_reserve_metadata() to reserve metadata for the changes done to the parent subvolume's fs tree, which cannot be mediated in the normal way via start_transaction. When quota groups (squota or qgroups) are enabled, this reserves qgroup metadata of type PREALLOC. Once the operation is associated to a transaction, we convert PREALLOC to PERTRANS, which gets cleared in bulk at the end of the transaction. However, the error paths of these three operations were not implementing this lifecycle correctly. They unconditionally converted the PREALLOC to PERTRANS in a generic cleanup step regardless of errors or whether the operation was fully associated to a transaction or not. This resulted in error paths occasionally converting this rsv to PERTRANS without calling record_root_in_trans successfully, which meant that unless that root got recorded in the transaction by some other thread, the end of the transaction would not free that root's PERTRANS, leaking it. Ultimately, this resulted in hitting a WARN in CONFIG_BTRFS_DEBUG builds at unmount for the leaked reservation. The fix is to ensure that every qgroup PREALLOC reservation observes the following properties: 1. any failure before record_root_in_trans is called successfully results in freeing the PREALLOC reservation. 2. after record_root_in_trans, we convert to PERTRANS, and now the transaction owns freeing the reservation. This patch enforces those properties on the three operations. Without it, generic/269 with squotas enabled at mkfs time would fail in ~5-10 runs on my system. With this patch, it ran successfully 1000 times in a row. Fixes: e85fde5162bf ("btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 13 ++++++++++++- fs/btrfs/ioctl.c | 37 ++++++++++++++++++++++++++++--------- fs/btrfs/root-tree.c | 10 ---------- fs/btrfs/root-tree.h | 2 -- 4 files changed, 40 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 37701531eeb1b..1f9e74c4161d3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4503,6 +4503,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct btrfs_block_rsv block_rsv; u64 root_flags; + u64 qgroup_reserved = 0; int ret; down_write(&fs_info->subvol_sem); @@ -4547,12 +4548,20 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) goto out_undead; + qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_release; } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -4611,7 +4620,9 @@ out_end_trans: ret = btrfs_end_transaction(trans); inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(root, &block_rsv); + btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_undead: if (ret) { spin_lock(&dest->root_item_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 38459a89b27c4..5a507237c4fa3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -613,6 +613,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, int ret; dev_t anon_dev; u64 objectid; + u64 qgroup_reserved = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -650,13 +651,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap, trans_num_items, false); if (ret) goto out_new_inode_args; + qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(root, &block_rsv); - goto out_new_inode_args; + goto out_release_rsv; } + ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); + if (ret) + goto out; + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; /* Tree log can't currently deal with an inode which is a new root. */ @@ -767,9 +773,11 @@ static noinline int create_subvol(struct mnt_idmap *idmap, out: trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(root, &block_rsv); - btrfs_end_transaction(trans); +out_release_rsv: + btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: @@ -791,6 +799,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_pending_snapshot *pending_snapshot; unsigned int trans_num_items; struct btrfs_trans_handle *trans; + struct btrfs_block_rsv *block_rsv; + u64 qgroup_reserved = 0; int ret; /* We do not support snapshotting right now. */ @@ -827,19 +837,19 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto free_pending; } - btrfs_init_block_rsv(&pending_snapshot->block_rsv, - BTRFS_BLOCK_RSV_TEMP); + block_rsv = &pending_snapshot->block_rsv; + btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * 1 to add dir item * 1 to add dir index * 1 to update parent inode item */ trans_num_items = create_subvol_num_items(inherit) + 3; - ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv, trans_num_items, false); if (ret) goto free_pending; + qgroup_reserved = block_rsv->qgroup_rsv_reserved; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -852,6 +862,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = PTR_ERR(trans); goto fail; } + ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); + if (ret) { + btrfs_end_transaction(trans); + goto fail; + } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->pending_snapshot = pending_snapshot; @@ -881,7 +898,9 @@ fail: if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); - btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); + btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 4bb538a372ce5..7007f9e0c9728 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -548,13 +548,3 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, } return ret; } - -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - u64 qgroup_to_release; - - btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); - btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); -} diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index 6f929cf3bd496..8f5739e732b9b 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -18,8 +18,6 @@ struct btrfs_trans_handle; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv); int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const struct fscrypt_str *name); -- GitLab From 71537e35c324ea6fbd68377a4f26bb93a831ae35 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Mar 2024 10:14:24 -0700 Subject: [PATCH 296/989] btrfs: record delayed inode root in transaction When running delayed inode updates, we do not record the inode's root in the transaction, but we do allocate PREALLOC and thus converted PERTRANS space for it. To be sure we free that PERTRANS meta rsv, we must ensure that we record the root in the transaction. Fixes: 4f5427ccce5d ("btrfs: delayed-inode: Use new qgroup meta rsv for delayed inode and item") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index dd6f566a383f0..121ab890bd055 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1133,6 +1133,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_record_root_in_trans(trans, node->root); + if (ret) + return ret; ret = btrfs_update_delayed_inode(trans, node->root, path, node); return ret; } -- GitLab From 211de93367304ab395357f8cb12568a4d1e20701 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Mar 2024 10:18:39 -0700 Subject: [PATCH 297/989] btrfs: qgroup: convert PREALLOC to PERTRANS after record_root_in_trans The transaction is only able to free PERTRANS reservations for a root once that root has been recorded with the TRANS tag on the roots radix tree. Therefore, until we are sure that this root will get tagged, it isn't safe to convert. Generally, this is not an issue as *some* transaction will likely tag the root before long and this reservation will get freed in that transaction, but technically it could stick around until unmount and result in a warning about leaked metadata reservation space. This path is most exercised by running the generic/269 fstest with CONFIG_BTRFS_DEBUG. Fixes: a6496849671a ("btrfs: fix start transaction qgroup rsv double free") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 46e8426adf4f1..1f10082d35d92 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -745,14 +745,6 @@ again: h->reloc_reserved = reloc_reserved; } - /* - * Now that we have found a transaction to be a part of, convert the - * qgroup reservation from prealloc to pertrans. A different transaction - * can't race in and free our pertrans out from under us. - */ - if (qgroup_reserved) - btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); - got_it: if (!current->journal_info) current->journal_info = h; @@ -786,8 +778,15 @@ got_it: * not just freed. */ btrfs_end_transaction(h); - return ERR_PTR(ret); + goto reserve_fail; } + /* + * Now that we have found a transaction to be a part of, convert the + * qgroup reservation from prealloc to pertrans. A different transaction + * can't race in and free our pertrans out from under us. + */ + if (qgroup_reserved) + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); return h; -- GitLab From 3c6f0c5ecc8910d4ffb0dfe85609ebc0c91c8f34 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 26 Mar 2024 11:55:22 -0700 Subject: [PATCH 298/989] btrfs: make btrfs_clear_delalloc_extent() free delalloc reserve Currently, this call site in btrfs_clear_delalloc_extent() only converts the reservation. We are marking it not delalloc, so I don't think it makes sense to keep the rsv around. This is a path where we are not sure to join a transaction, so it leads to incorrect free-ing during umount. Helps with the pass rate of generic/269 and generic/475. Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f9e74c4161d3..c65fe5de40220 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2533,7 +2533,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, */ if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) - btrfs_delalloc_release_metadata(inode, len, false); + btrfs_delalloc_release_metadata(inode, len, true); /* For sanity tests. */ if (btrfs_is_testing(fs_info)) -- GitLab From 6e68de0bb0ed59e0554a0c15ede7308c47351e2d Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 26 Mar 2024 12:01:28 -0700 Subject: [PATCH 299/989] btrfs: always clear PERTRANS metadata during commit It is possible to clear a root's IN_TRANS tag from the radix tree, but not clear its PERTRANS, if there is some error in between. Eliminate that possibility by moving the free up to where we clear the tag. Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1f10082d35d92..85f359e0e0a7f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1494,6 +1494,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); + btrfs_qgroup_free_meta_all_pertrans(root); spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); @@ -1518,7 +1519,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) if (ret2) return ret2; spin_lock(&fs_info->fs_roots_radix_lock); - btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); -- GitLab From 97ca7c1f93bbac6982717a7055cd727813c45e61 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 25 Feb 2024 08:29:25 -0800 Subject: [PATCH 300/989] mean_and_variance: Drop always failing tests mean_and_variance_test_2 and mean_and_variance_test_4 always fail. The input parameters to those tests are identical to the input parameters to tests 1 and 3, yet the expected result for tests 2 and 4 is different for the mean and stddev tests. That will always fail. Expected mean_and_variance_get_mean(mv) == mean[i], but mean_and_variance_get_mean(mv) == 22 (0x16) mean[i] == 10 (0xa) Drop the bad tests. Fixes: 65bc41090720 ("mean and variance: More tests") Closes: https://lore.kernel.org/lkml/065b94eb-6a24-4248-b7d7-d3212efb4787@roeck-us.net/ Cc: Kent Overstreet Signed-off-by: Guenter Roeck Signed-off-by: Kent Overstreet --- fs/bcachefs/mean_and_variance_test.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index db63b3f3b338a..4c298e74723db 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/fs/bcachefs/mean_and_variance_test.c @@ -136,20 +136,8 @@ static void mean_and_variance_test_1(struct kunit *test) d, mean, stddev, weighted_mean, weighted_stddev); } -static void mean_and_variance_test_2(struct kunit *test) -{ - s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; - s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 }; - s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 }; - s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; - s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - /* Test behaviour where we switch from one steady state to another: */ -static void mean_and_variance_test_3(struct kunit *test) +static void mean_and_variance_test_2(struct kunit *test) { s64 d[] = { 100, 100, 100, 100, 100 }; s64 mean[] = { 22, 32, 40, 46, 50 }; @@ -161,18 +149,6 @@ static void mean_and_variance_test_3(struct kunit *test) d, mean, stddev, weighted_mean, weighted_stddev); } -static void mean_and_variance_test_4(struct kunit *test) -{ - s64 d[] = { 100, 100, 100, 100, 100 }; - s64 mean[] = { 10, 11, 12, 13, 14 }; - s64 stddev[] = { 9, 13, 15, 17, 19 }; - s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; - s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - static void mean_and_variance_fast_divpow2(struct kunit *test) { s64 i; @@ -230,8 +206,6 @@ static struct kunit_case mean_and_variance_test_cases[] = { KUNIT_CASE(mean_and_variance_weighted_advanced_test), KUNIT_CASE(mean_and_variance_test_1), KUNIT_CASE(mean_and_variance_test_2), - KUNIT_CASE(mean_and_variance_test_3), - KUNIT_CASE(mean_and_variance_test_4), {} }; -- GitLab From 8a4ff5452dd0cdcc35940460bb777d836bece11c Mon Sep 17 00:00:00 2001 From: Stephen Horvath Date: Sun, 31 Mar 2024 18:37:06 +1000 Subject: [PATCH 301/989] ACPI: thermal: Register thermal zones without valid trip points Some laptops where the thermal control is handled by the EC may provide trip points that fail the kernels new validation, but still have working temperature sensors. An example of this is the Framework 13 AMD. This patch allows the thermal zone to still be registered without trip points if the trip points fail validation, allowing the temperature sensor to be viewed and used by the user. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218586 Fixes: 9c8647224e9f ("ACPI: thermal: Use library functions to obtain trip point temperature values") Signed-off-by: Stephen Horvath [ rjw: Subject edits, remove redundant braces ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/thermal.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 302dce0b2b504..d67881b50bca2 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -662,14 +662,15 @@ static int acpi_thermal_register_thermal_zone(struct acpi_thermal *tz, { int result; - tz->thermal_zone = thermal_zone_device_register_with_trips("acpitz", - trip_table, - trip_count, - tz, - &acpi_thermal_zone_ops, - NULL, - passive_delay, - tz->polling_frequency * 100); + if (trip_count) + tz->thermal_zone = thermal_zone_device_register_with_trips( + "acpitz", trip_table, trip_count, tz, + &acpi_thermal_zone_ops, NULL, passive_delay, + tz->polling_frequency * 100); + else + tz->thermal_zone = thermal_tripless_zone_device_register( + "acpitz", tz, &acpi_thermal_zone_ops, NULL); + if (IS_ERR(tz->thermal_zone)) return PTR_ERR(tz->thermal_zone); @@ -901,11 +902,8 @@ static int acpi_thermal_add(struct acpi_device *device) trip++; } - if (trip == trip_table) { + if (trip == trip_table) pr_warn(FW_BUG "No valid trip points!\n"); - result = -ENODEV; - goto free_memory; - } result = acpi_thermal_register_thermal_zone(tz, trip_table, trip - trip_table, -- GitLab From e0319af2b6cdfa7c39edf73dcb813b7ff1261fa5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Apr 2024 16:42:27 -0400 Subject: [PATCH 302/989] bcachefs: Improve bch2_btree_update_to_text() Print out the mode as a string, and also print out the btree and watermark. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 41 ++++++++++++++++++++--------- fs/bcachefs/btree_update_interior.h | 24 ++++++++++------- 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 9fd2dd0f46825..2b46e6ad248ac 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -26,6 +26,13 @@ #include +const char * const bch2_btree_update_modes[] = { +#define x(t) #t, + BCH_WATERMARKS() +#undef x + NULL +}; + static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); @@ -846,11 +853,11 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode != BTREE_UPDATE_none); BUG_ON(!btree_node_dirty(b)); BUG_ON(!b->c.level); - as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->mode = BTREE_UPDATE_node; as->b = b; set_btree_node_write_blocked(b); @@ -873,7 +880,7 @@ static void btree_update_reparent(struct btree_update *as, lockdep_assert_held(&c->btree_interior_update_lock); child->b = NULL; - child->mode = BTREE_INTERIOR_UPDATING_AS; + child->mode = BTREE_UPDATE_update; bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, bch2_update_reparent_journal_pin_flush); @@ -884,7 +891,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) struct bkey_i *insert = &b->key; struct bch_fs *c = as->c; - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode != BTREE_UPDATE_none); BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ARRAY_SIZE(as->journal_entries)); @@ -898,7 +905,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - as->mode = BTREE_INTERIOR_UPDATING_ROOT; + as->mode = BTREE_UPDATE_root; mutex_unlock(&c->btree_interior_update_lock); } @@ -1076,7 +1083,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * struct bch_fs *c = as->c; u64 start_time = as->start_time; - BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode == BTREE_UPDATE_none); if (as->took_gc_lock) up_read(&as->c->gc_lock); @@ -1172,7 +1179,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->c = c; as->start_time = start_time; as->ip_started = _RET_IP_; - as->mode = BTREE_INTERIOR_NO_UPDATE; + as->mode = BTREE_UPDATE_none; + as->watermark = watermark; as->took_gc_lock = true; as->btree_id = path->btree_id; as->update_level = update_level; @@ -2509,18 +2517,25 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); } +static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) +{ + prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + (void *) as->ip_started, + bch2_btree_id_str(as->btree_id), + bch2_watermarks[as->watermark], + bch2_btree_update_modes[as->mode], + as->nodes_written, + closure_nr_remaining(&as->cl), + as->journal.seq); +} + void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_update *as; mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n", - (void *) as->ip_started, - as->mode, - as->nodes_written, - closure_nr_remaining(&as->cl), - as->journal.seq); + bch2_btree_update_to_text(out, as); mutex_unlock(&c->btree_interior_update_lock); } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7ac3e17b84fde..1d41d3f4b107c 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -12,6 +12,18 @@ int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); +#define BTREE_UPDATE_MODES() \ + x(none) \ + x(node) \ + x(root) \ + x(update) + +enum btree_update_mode { +#define x(n) BTREE_UPDATE_##n, + BTREE_UPDATE_MODES() +#undef x +}; + /* * Tracks an in progress split/rewrite of a btree node and the update to the * parent node: @@ -39,14 +51,8 @@ struct btree_update { struct list_head list; struct list_head unwritten_list; - /* What kind of update are we doing? */ - enum { - BTREE_INTERIOR_NO_UPDATE, - BTREE_INTERIOR_UPDATING_NODE, - BTREE_INTERIOR_UPDATING_ROOT, - BTREE_INTERIOR_UPDATING_AS, - } mode; - + enum btree_update_mode mode; + enum bch_watermark watermark; unsigned nodes_written:1; unsigned took_gc_lock:1; @@ -56,7 +62,7 @@ struct btree_update { struct disk_reservation disk_res; /* - * BTREE_INTERIOR_UPDATING_NODE: + * BTREE_UPDATE_node: * The update that made the new nodes visible was a regular update to an * existing interior node - @b. We can't write out the update to @b * until the new nodes we created are finished writing, so we block @b -- GitLab From 7ee88737ab802ac832f978d6e6258571fe08d870 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Apr 2024 18:30:14 -0400 Subject: [PATCH 303/989] bcachefs: Check for bad needs_discard before doing discard In the discard worker, we were failing to validate the bucket state - meaning a corrupt needs_discard btree could cause us to discard a bucket that we shouldn't. If check_alloc_info hasn't run yet we just want to bail out, otherwise it's a filesystem inconsistent error. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 47 +++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 893e38f9db807..4ff56fa4d5392 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1713,34 +1713,37 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (ret) goto out; - if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { - a->v.gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); - goto write; - } - - if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - bch2_trans_inconsistent(trans, - "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" - "%s", - a->v.journal_seq, - c->journal.flushed_seq_ondisk, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (a->v.dirty_sectors) { + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, + trans, "attempting to discard bucket with dirty data\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = -EIO; - } goto out; } if (a->v.data_type != BCH_DATA_need_discard) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - bch2_trans_inconsistent(trans, - "bucket incorrectly set in need_discard btree\n" - "%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; + if (data_type_is_empty(a->v.data_type) && + BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { + a->v.gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + goto write; } + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, + trans, "bucket incorrectly set in need_discard btree\n" + "%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = -EIO; + goto out; + } + + if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, + trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", + a->v.journal_seq, + c->journal.flushed_seq_ondisk, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = -EIO; goto out; } @@ -1835,6 +1838,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo if (ret) goto err; + BUG_ON(a->v.dirty_sectors); SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); a->v.data_type = alloc_data_type(a->v, a->v.data_type); @@ -1942,6 +1946,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, goto out; BUG_ON(a->v.data_type != BCH_DATA_cached); + BUG_ON(a->v.dirty_sectors); if (!a->v.cached_sectors) bch_err(c, "invalidating empty bucket, confused"); -- GitLab From fa14b50460baba40c8b1138c517fb8cf04464292 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Apr 2024 18:57:05 -0400 Subject: [PATCH 304/989] bcachefs: ratelimit informational fsck errors Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index cbb8b43e419fe..b704fb0dda951 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1131,8 +1131,8 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal i->count = count2; if (i->count != count2) { - bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); return -BCH_ERR_internal_fsck_err; } @@ -1587,8 +1587,8 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ return count2; if (i->count != count2) { - bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", - i->count, count2); + bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); i->count = count2; if (i->inode.bi_nlink == i->count) continue; -- GitLab From 135f218255b28c5bbf71e9e32a49e5c734cabbe5 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 28 Mar 2024 12:19:54 -0300 Subject: [PATCH 305/989] ARM: dts: imx7s-warp: Pass OV2680 link-frequencies Since commit 63b0cd30b78e ("media: ov2680: Add bus-cfg / endpoint property verification") the ov2680 no longer probes on a imx7s-warp7: ov2680 1-0036: error -EINVAL: supported link freq 330000000 not found ov2680 1-0036: probe with driver ov2680 failed with error -22 Fix it by passing the required 'link-frequencies' property as recommended by: https://www.kernel.org/doc/html/v6.9-rc1/driver-api/media/camera-sensor.html#handling-clocks Cc: stable@vger.kernel.org Fixes: 63b0cd30b78e ("media: ov2680: Add bus-cfg / endpoint property verification") Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/boot/dts/nxp/imx/imx7s-warp.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/nxp/imx/imx7s-warp.dts b/arch/arm/boot/dts/nxp/imx/imx7s-warp.dts index ba7231b364bb8..7bab113ca6da7 100644 --- a/arch/arm/boot/dts/nxp/imx/imx7s-warp.dts +++ b/arch/arm/boot/dts/nxp/imx/imx7s-warp.dts @@ -210,6 +210,7 @@ remote-endpoint = <&mipi_from_sensor>; clock-lanes = <0>; data-lanes = <1>; + link-frequencies = /bits/ 64 <330000000>; }; }; }; -- GitLab From fd819ad3ecf6f3c232a06b27423ce9ed8c20da89 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 29 Mar 2024 09:50:23 +0800 Subject: [PATCH 306/989] ax25: fix use-after-free bugs caused by ax25_ds_del_timer When the ax25 device is detaching, the ax25_dev_device_down() calls ax25_ds_del_timer() to cleanup the slave_timer. When the timer handler is running, the ax25_ds_del_timer() that calls del_timer() in it will return directly. As a result, the use-after-free bugs could happen, one of the scenarios is shown below: (Thread 1) | (Thread 2) | ax25_ds_timeout() ax25_dev_device_down() | ax25_ds_del_timer() | del_timer() | ax25_dev_put() //FREE | | ax25_dev-> //USE In order to mitigate bugs, when the device is detaching, use timer_shutdown_sync() to stop the timer. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Duoming Zhou Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240329015023.9223-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski --- net/ax25/ax25_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index c5462486dbca1..282ec581c0720 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -105,7 +105,7 @@ void ax25_dev_device_down(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); #ifdef CONFIG_AX25_DAMA_SLAVE - ax25_ds_del_timer(ax25_dev); + timer_shutdown_sync(&ax25_dev->dama.slave_timer); #endif /* -- GitLab From b32a09ea7c38849ff925489a6bf5bd8914bc45df Mon Sep 17 00:00:00 2001 From: Marco Pinna Date: Fri, 29 Mar 2024 17:12:59 +0100 Subject: [PATCH 307/989] vsock/virtio: fix packet delivery to tap device Commit 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks") added virtio_transport_deliver_tap_pkt() for handing packets to the vsockmon device. However, in virtio_transport_send_pkt_work(), the function is called before actually sending the packet (i.e. before placing it in the virtqueue with virtqueue_add_sgs() and checking whether it returned successfully). Queuing the packet in the virtqueue can fail even multiple times. However, in virtio_transport_deliver_tap_pkt() we deliver the packet to the monitoring tap interface only the first time we call it. This certainly avoids seeing the same packet replicated multiple times in the monitoring interface, but it can show the packet sent with the wrong timestamp or even before we succeed to queue it in the virtqueue. Move virtio_transport_deliver_tap_pkt() after calling virtqueue_add_sgs() and making sure it returned successfully. Fixes: 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks") Cc: stable@vge.kernel.org Signed-off-by: Marco Pinna Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20240329161259.411751-1-marco.pinn95@gmail.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 1748268e0694f..ee5d306a96d0f 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -120,7 +120,6 @@ virtio_transport_send_pkt_work(struct work_struct *work) if (!skb) break; - virtio_transport_deliver_tap_pkt(skb); reply = virtio_vsock_skb_reply(skb); sgs = vsock->out_sgs; sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb), @@ -170,6 +169,8 @@ virtio_transport_send_pkt_work(struct work_struct *work) break; } + virtio_transport_deliver_tap_pkt(skb); + if (reply) { struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; int val; -- GitLab From 09ab7eff38202159271534d2f5ad45526168f2a5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Mar 2024 10:45:07 -0600 Subject: [PATCH 308/989] io_uring/kbuf: get rid of lower BGID lists Just rely on the xarray for any kind of bgid. This simplifies things, and it really doesn't bring us much, if anything. Cc: stable@vger.kernel.org # v6.4+ Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - io_uring/io_uring.c | 2 - io_uring/kbuf.c | 70 ++++------------------------------ 3 files changed, 8 insertions(+), 65 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e248936250852..05df0e399d7c0 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -294,7 +294,6 @@ struct io_ring_ctx { struct io_submit_state submit_state; - struct io_buffer_list *io_bl; struct xarray io_bl_xa; struct io_hash_table cancel_table_locked; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d1defb99b89ee..bc730f59265f3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -351,7 +351,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) err: kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); - kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; @@ -2932,7 +2931,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_napi_free(ctx); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); - kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); } diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 693c26da4ee1a..8bf0121f00af8 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -17,8 +17,6 @@ #define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) -#define BGID_ARRAY 64 - /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) @@ -40,13 +38,9 @@ struct io_buf_free { int inuse; }; -static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, - unsigned int bgid) +static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, + unsigned int bgid) { - if (bl && bgid < BGID_ARRAY) - return &bl[bgid]; - return xa_load(&ctx->io_bl_xa, bgid); } @@ -55,7 +49,7 @@ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, { lockdep_assert_held(&ctx->uring_lock); - return __io_buffer_get_list(ctx, ctx->io_bl, bgid); + return __io_buffer_get_list(ctx, bgid); } static int io_buffer_add_list(struct io_ring_ctx *ctx, @@ -68,10 +62,6 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, */ bl->bgid = bgid; smp_store_release(&bl->is_ready, 1); - - if (bgid < BGID_ARRAY) - return 0; - return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -208,24 +198,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, return ret; } -static __cold int io_init_bl_list(struct io_ring_ctx *ctx) -{ - struct io_buffer_list *bl; - int i; - - bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL); - if (!bl) - return -ENOMEM; - - for (i = 0; i < BGID_ARRAY; i++) { - INIT_LIST_HEAD(&bl[i].buf_list); - bl[i].bgid = i; - } - - smp_store_release(&ctx->io_bl, bl); - return 0; -} - /* * Mark the given mapped range as free for reuse */ @@ -300,13 +272,6 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) struct list_head *item, *tmp; struct io_buffer *buf; unsigned long index; - int i; - - for (i = 0; i < BGID_ARRAY; i++) { - if (!ctx->io_bl) - break; - __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); - } xa_for_each(&ctx->io_bl_xa, index, bl) { xa_erase(&ctx->io_bl_xa, bl->bgid); @@ -489,12 +454,6 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) io_ring_submit_lock(ctx, issue_flags); - if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { - ret = io_init_bl_list(ctx); - if (ret) - goto err; - } - bl = io_buffer_get_list(ctx, p->bgid); if (unlikely(!bl)) { bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); @@ -507,14 +466,9 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) if (ret) { /* * Doesn't need rcu free as it was never visible, but - * let's keep it consistent throughout. Also can't - * be a lower indexed array group, as adding one - * where lookup failed cannot happen. + * let's keep it consistent throughout. */ - if (p->bgid >= BGID_ARRAY) - kfree_rcu(bl, rcu); - else - WARN_ON_ONCE(1); + kfree_rcu(bl, rcu); goto err; } } @@ -679,12 +633,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.ring_entries >= 65536) return -EINVAL; - if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { - int ret = io_init_bl_list(ctx); - if (ret) - return ret; - } - bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ @@ -734,10 +682,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -EINVAL; __io_remove_buffers(ctx, bl, -1U); - if (bl->bgid >= BGID_ARRAY) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree_rcu(bl, rcu); - } + xa_erase(&ctx->io_bl_xa, bl->bgid); + kfree_rcu(bl, rcu); return 0; } @@ -771,7 +717,7 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) { struct io_buffer_list *bl; - bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid); + bl = __io_buffer_get_list(ctx, bgid); if (!bl || !bl->is_mmap) return NULL; -- GitLab From 3b80cff5a4d117c53d38ce805823084eaeffbde6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Mar 2024 10:46:40 -0600 Subject: [PATCH 309/989] io_uring/kbuf: get rid of bl->is_ready Now that xarray is being exclusively used for the buffer_list lookup, this check is no longer needed. Get rid of it and the is_ready member. Cc: stable@vger.kernel.org # v6.4+ Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 8 -------- io_uring/kbuf.h | 2 -- 2 files changed, 10 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 8bf0121f00af8..011280d873e7a 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -61,7 +61,6 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, * always under the ->uring_lock, but the RCU lookup from mmap does. */ bl->bgid = bgid; - smp_store_release(&bl->is_ready, 1); return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -721,13 +720,6 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) if (!bl || !bl->is_mmap) return NULL; - /* - * Ensure the list is fully setup. Only strictly needed for RCU lookup - * via mmap, and in that case only for the array indexed groups. For - * the xarray lookups, it's either visible and ready, or not at all. - */ - if (!smp_load_acquire(&bl->is_ready)) - return NULL; return bl->buf_ring; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 1c7b654ee7263..fdbb104495132 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -29,8 +29,6 @@ struct io_buffer_list { __u8 is_buf_ring; /* ring mapped provided buffers, but mmap'ed by application */ __u8 is_mmap; - /* bl is visible from an RCU point of view for lookup */ - __u8 is_ready; }; struct io_buffer { -- GitLab From 6b69c4ab4f685327d9e10caf0d84217ba23a8c4b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Mar 2024 16:12:51 -0600 Subject: [PATCH 310/989] io_uring/kbuf: protect io_buffer_list teardown with a reference No functional changes in this patch, just in preparation for being able to keep the buffer list alive outside of the ctx->uring_lock. Cc: stable@vger.kernel.org # v6.4+ Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 15 +++++++++++---- io_uring/kbuf.h | 2 ++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 011280d873e7a..2edc6854f6f3e 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -61,6 +61,7 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, * always under the ->uring_lock, but the RCU lookup from mmap does. */ bl->bgid = bgid; + atomic_set(&bl->refs, 1); return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -265,6 +266,14 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, return i; } +static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +{ + if (atomic_dec_and_test(&bl->refs)) { + __io_remove_buffers(ctx, bl, -1U); + kfree_rcu(bl, rcu); + } +} + void io_destroy_buffers(struct io_ring_ctx *ctx) { struct io_buffer_list *bl; @@ -274,8 +283,7 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) xa_for_each(&ctx->io_bl_xa, index, bl) { xa_erase(&ctx->io_bl_xa, bl->bgid); - __io_remove_buffers(ctx, bl, -1U); - kfree_rcu(bl, rcu); + io_put_bl(ctx, bl); } /* @@ -680,9 +688,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!bl->is_buf_ring) return -EINVAL; - __io_remove_buffers(ctx, bl, -1U); xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree_rcu(bl, rcu); + io_put_bl(ctx, bl); return 0; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index fdbb104495132..8b868a1744e2f 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -25,6 +25,8 @@ struct io_buffer_list { __u16 head; __u16 mask; + atomic_t refs; + /* ring mapped provided buffers */ __u8 is_buf_ring; /* ring mapped provided buffers, but mmap'ed by application */ -- GitLab From 561e4f9451d65fc2f7eef564e0064373e3019793 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 Apr 2024 16:16:03 -0600 Subject: [PATCH 311/989] io_uring/kbuf: hold io_buffer_list reference over mmap If we look up the kbuf, ensure that it doesn't get unregistered until after we're done with it. Since we're inside mmap, we cannot safely use the io_uring lock. Rely on the fact that we can lookup the buffer list under RCU now and grab a reference to it, preventing it from being unregistered until we're done with it. The lookup returns the io_buffer_list directly with it referenced. Cc: stable@vger.kernel.org # v6.4+ Fixes: 5cf4f52e6d8a ("io_uring: free io_buffer_list entries via RCU") Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 ++++++----- io_uring/kbuf.c | 35 +++++++++++++++++++++++++++-------- io_uring/kbuf.h | 4 +++- 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bc730f59265f3..4521c2b66b98d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3447,14 +3447,15 @@ static void *io_uring_validate_mmap_request(struct file *file, ptr = ctx->sq_sqes; break; case IORING_OFF_PBUF_RING: { + struct io_buffer_list *bl; unsigned int bgid; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - rcu_read_lock(); - ptr = io_pbuf_get_address(ctx, bgid); - rcu_read_unlock(); - if (!ptr) - return ERR_PTR(-EINVAL); + bl = io_pbuf_get_bl(ctx, bgid); + if (IS_ERR(bl)) + return bl; + ptr = bl->buf_ring; + io_put_bl(ctx, bl); break; } default: diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 2edc6854f6f3e..3aa16e27f5099 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -266,7 +266,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, return i; } -static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { if (atomic_dec_and_test(&bl->refs)) { __io_remove_buffers(ctx, bl, -1U); @@ -719,16 +719,35 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) return 0; } -void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) +struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, + unsigned long bgid) { struct io_buffer_list *bl; + bool ret; - bl = __io_buffer_get_list(ctx, bgid); - - if (!bl || !bl->is_mmap) - return NULL; - - return bl->buf_ring; + /* + * We have to be a bit careful here - we're inside mmap and cannot grab + * the uring_lock. This means the buffer_list could be simultaneously + * going away, if someone is trying to be sneaky. Look it up under rcu + * so we know it's not going away, and attempt to grab a reference to + * it. If the ref is already zero, then fail the mapping. If successful, + * the caller will call io_put_bl() to drop the the reference at at the + * end. This may then safely free the buffer_list (and drop the pages) + * at that point, vm_insert_pages() would've already grabbed the + * necessary vma references. + */ + rcu_read_lock(); + bl = xa_load(&ctx->io_bl_xa, bgid); + /* must be a mmap'able buffer ring and have pages */ + ret = false; + if (bl && bl->is_mmap) + ret = atomic_inc_not_zero(&bl->refs); + rcu_read_unlock(); + + if (ret) + return bl; + + return ERR_PTR(-EINVAL); } /* diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 8b868a1744e2f..df365b8860cf1 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -61,7 +61,9 @@ void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); -void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid); +void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); +struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, + unsigned long bgid); static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) { -- GitLab From 5d872c9f46bd2ea3524af3c2420a364a13667135 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 30 Mar 2024 12:49:02 +0100 Subject: [PATCH 312/989] r8169: fix issue caused by buggy BIOS on certain boards with RTL8168d On some boards with this chip version the BIOS is buggy and misses to reset the PHY page selector. This results in the PHY ID read accessing registers on a different page, returning a more or less random value. Fix this by resetting the page selector first. Fixes: f1e911d5d0df ("r8169: add basic phylib support") Cc: stable@vger.kernel.org Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/64f2055e-98b8-45ec-8568-665e3d54d4e6@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 4ac444eb269ff..6f1e6f386b7ba 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5164,6 +5164,15 @@ static int r8169_mdio_register(struct rtl8169_private *tp) struct mii_bus *new_bus; int ret; + /* On some boards with this chip version the BIOS is buggy and misses + * to reset the PHY page selector. This results in the PHY ID read + * accessing registers on a different page, returning a more or + * less random value. Fix this by resetting the page selector first. + */ + if (tp->mac_version == RTL_GIGA_MAC_VER_25 || + tp->mac_version == RTL_GIGA_MAC_VER_26) + r8169_mdio_write(tp, 0x1f, 0); + new_bus = devm_mdiobus_alloc(&pdev->dev); if (!new_bus) return -ENOMEM; -- GitLab From d21d40605bca7bd5fc23ef03d4c1ca1f48bc2cae Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 1 Apr 2024 14:10:04 -0700 Subject: [PATCH 313/989] ipv6: Fix infinite recursion in fib6_dump_done(). syzkaller reported infinite recursive calls of fib6_dump_done() during netlink socket destruction. [1] From the log, syzkaller sent an AF_UNSPEC RTM_GETROUTE message, and then the response was generated. The following recvmmsg() resumed the dump for IPv6, but the first call of inet6_dump_fib() failed at kzalloc() due to the fault injection. [0] 12:01:34 executing program 3: r0 = socket$nl_route(0x10, 0x3, 0x0) sendmsg$nl_route(r0, ... snip ...) recvmmsg(r0, ... snip ...) (fail_nth: 8) Here, fib6_dump_done() was set to nlk_sk(sk)->cb.done, and the next call of inet6_dump_fib() set it to nlk_sk(sk)->cb.args[3]. syzkaller stopped receiving the response halfway through, and finally netlink_sock_destruct() called nlk_sk(sk)->cb.done(). fib6_dump_done() calls fib6_dump_end() and nlk_sk(sk)->cb.done() if it is still not NULL. fib6_dump_end() rewrites nlk_sk(sk)->cb.done() by nlk_sk(sk)->cb.args[3], but it has the same function, not NULL, calling itself recursively and hitting the stack guard page. To avoid the issue, let's set the destructor after kzalloc(). [0]: FAULT_INJECTION: forcing a failure. name failslab, interval 1, probability 0, space 0, times 0 CPU: 1 PID: 432110 Comm: syz-executor.3 Not tainted 6.8.0-12821-g537c2e91d354-dirty #11 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:117) should_fail_ex (lib/fault-inject.c:52 lib/fault-inject.c:153) should_failslab (mm/slub.c:3733) kmalloc_trace (mm/slub.c:3748 mm/slub.c:3827 mm/slub.c:3992) inet6_dump_fib (./include/linux/slab.h:628 ./include/linux/slab.h:749 net/ipv6/ip6_fib.c:662) rtnl_dump_all (net/core/rtnetlink.c:4029) netlink_dump (net/netlink/af_netlink.c:2269) netlink_recvmsg (net/netlink/af_netlink.c:1988) ____sys_recvmsg (net/socket.c:1046 net/socket.c:2801) ___sys_recvmsg (net/socket.c:2846) do_recvmmsg (net/socket.c:2943) __x64_sys_recvmmsg (net/socket.c:3041 net/socket.c:3034 net/socket.c:3034) [1]: BUG: TASK stack guard page was hit at 00000000f2fa9af1 (stack is 00000000b7912430..000000009a436beb) stack guard page: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 223719 Comm: kworker/1:3 Not tainted 6.8.0-12821-g537c2e91d354-dirty #11 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Workqueue: events netlink_sock_destruct_work RIP: 0010:fib6_dump_done (net/ipv6/ip6_fib.c:570) Code: 3c 24 e8 f3 e9 51 fd e9 28 fd ff ff 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 41 57 41 56 41 55 41 54 55 48 89 fd <53> 48 8d 5d 60 e8 b6 4d 07 fd 48 89 da 48 b8 00 00 00 00 00 fc ff RSP: 0018:ffffc9000d980000 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffffffff84405990 RCX: ffffffff844059d3 RDX: ffff8881028e0000 RSI: ffffffff84405ac2 RDI: ffff88810c02f358 RBP: ffff88810c02f358 R08: 0000000000000007 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000224 R12: 0000000000000000 R13: ffff888007c82c78 R14: ffff888007c82c68 R15: ffff888007c82c68 FS: 0000000000000000(0000) GS:ffff88811b100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffc9000d97fff8 CR3: 0000000102309002 CR4: 0000000000770ef0 PKRU: 55555554 Call Trace: <#DF> fib6_dump_done (net/ipv6/ip6_fib.c:572 (discriminator 1)) fib6_dump_done (net/ipv6/ip6_fib.c:572 (discriminator 1)) ... fib6_dump_done (net/ipv6/ip6_fib.c:572 (discriminator 1)) fib6_dump_done (net/ipv6/ip6_fib.c:572 (discriminator 1)) netlink_sock_destruct (net/netlink/af_netlink.c:401) __sk_destruct (net/core/sock.c:2177 (discriminator 2)) sk_destruct (net/core/sock.c:2224) __sk_free (net/core/sock.c:2235) sk_free (net/core/sock.c:2246) process_one_work (kernel/workqueue.c:3259) worker_thread (kernel/workqueue.c:3329 kernel/workqueue.c:3416) kthread (kernel/kthread.c:388) ret_from_fork (arch/x86/kernel/process.c:153) ret_from_fork_asm (arch/x86/entry/entry_64.S:256) Modules linked in: Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240401211003.25274-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5c558dc1c6838..7209419cfb0e9 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -651,19 +651,19 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (!w) { /* New dump: * - * 1. hook callback destructor. - */ - cb->args[3] = (long)cb->done; - cb->done = fib6_dump_done; - - /* - * 2. allocate and initialize walker. + * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_dump_node; cb->args[2] = (long)w; + + /* 2. hook callback destructor. + */ + cb->args[3] = (long)cb->done; + cb->done = fib6_dump_done; + } arg.skb = skb; -- GitLab From c53fe72cb5fffd69f2fff104b0119d6e271759c5 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 1 Apr 2024 21:43:47 +0300 Subject: [PATCH 314/989] MAINTAINERS: mlx5: Add Tariq Toukan Add myself as mlx5 core and EN maintainer. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Acked-by: Saeed Mahameed Link: https://lore.kernel.org/r/20240401184347.53884-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7b8f97bf79751..de969a9c8ad2d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14014,6 +14014,7 @@ F: drivers/net/ethernet/mellanox/mlx4/en_* MELLANOX ETHERNET DRIVER (mlx5e) M: Saeed Mahameed +M: Tariq Toukan L: netdev@vger.kernel.org S: Supported W: http://www.mellanox.com @@ -14081,6 +14082,7 @@ F: include/uapi/rdma/mlx4-abi.h MELLANOX MLX5 core VPI driver M: Saeed Mahameed M: Leon Romanovsky +M: Tariq Toukan L: netdev@vger.kernel.org L: linux-rdma@vger.kernel.org S: Supported -- GitLab From 5bd31ab5f79eb6e3bdfa0ca0b57650f9d1604062 Mon Sep 17 00:00:00 2001 From: Shivaprasad G Bhat Date: Thu, 15 Feb 2024 07:52:32 -0600 Subject: [PATCH 315/989] powerpc/iommu: Refactor spapr_tce_platform_iommu_attach_dev() The patch makes the iommu_group_get() call only when using it thereby avoiding the unnecessary get & put for domain already being set case. Reviewed-by: Jason Gunthorpe Signed-off-by: Shivaprasad G Bhat Signed-off-by: Michael Ellerman Link: https://msgid.link/170800513841.2411.13524607664262048895.stgit@linux.ibm.com --- arch/powerpc/kernel/iommu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 1185efebf032b..29a8c8e185851 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1285,15 +1285,14 @@ spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain, struct device *dev) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct iommu_group *grp = iommu_group_get(dev); struct iommu_table_group *table_group; + struct iommu_group *grp; /* At first attach the ownership is already set */ - if (!domain) { - iommu_group_put(grp); + if (!domain) return 0; - } + grp = iommu_group_get(dev); table_group = iommu_group_get_iommudata(grp); /* * The domain being set to PLATFORM from earlier -- GitLab From c3eeb1ffc6a88af9b002e22be0f70851759be03a Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 1 Apr 2024 11:16:39 -0700 Subject: [PATCH 316/989] x86/resctrl: Fix uninitialized memory read when last CPU of domain goes offline Tony encountered this OOPS when the last CPU of a domain goes offline while running a kernel built with CONFIG_NO_HZ_FULL: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 Oops: 0000 [#1] PREEMPT SMP NOPTI ... RIP: 0010:__find_nth_andnot_bit+0x66/0x110 ... Call Trace: ? __die() ? page_fault_oops() ? exc_page_fault() ? asm_exc_page_fault() cpumask_any_housekeeping() mbm_setup_overflow_handler() resctrl_offline_cpu() resctrl_arch_offline_cpu() cpuhp_invoke_callback() cpuhp_thread_fun() smpboot_thread_fn() kthread() ret_from_fork() ret_from_fork_asm() The NULL pointer dereference is encountered while searching for another online CPU in the domain (of which there are none) that can be used to run the MBM overflow handler. Because the kernel is configured with CONFIG_NO_HZ_FULL the search for another CPU (in its effort to prefer those CPUs that aren't marked nohz_full) consults the mask representing the nohz_full CPUs, tick_nohz_full_mask. On a kernel with CONFIG_CPUMASK_OFFSTACK=y tick_nohz_full_mask is not allocated unless the kernel is booted with the "nohz_full=" parameter and because of that any access to tick_nohz_full_mask needs to be guarded with tick_nohz_full_enabled(). Replace the IS_ENABLED(CONFIG_NO_HZ_FULL) with tick_nohz_full_enabled(). The latter ensures tick_nohz_full_mask can be accessed safely and can be used whether kernel is built with CONFIG_NO_HZ_FULL enabled or not. [ Use Ingo's suggestion that combines the two NO_HZ checks into one. ] Fixes: a4846aaf3945 ("x86/resctrl: Add cpumask_any_housekeeping() for limbo/overflow") Reported-by: Tony Luck Signed-off-by: Reinette Chatre Signed-off-by: Ingo Molnar Reviewed-by: Babu Moger Link: https://lore.kernel.org/r/ff8dfc8d3dcb04b236d523d1e0de13d2ef585223.1711993956.git.reinette.chatre@intel.com Closes: https://lore.kernel.org/lkml/ZgIFT5gZgIQ9A9G7@agluck-desk3/ --- arch/x86/kernel/cpu/resctrl/internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index c99f26ebe7a65..1a8687f8073a8 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -78,7 +78,8 @@ cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) else cpu = cpumask_any_but(mask, exclude_cpu); - if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) + /* Only continue if tick_nohz_full_mask has been initialized. */ + if (!tick_nohz_full_enabled()) return cpu; /* If the CPU picked isn't marked nohz_full nothing more needs doing. */ -- GitLab From 312be9fc2234c8acfb8148a9f4c358b70d358dee Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 1 Apr 2024 06:33:20 -0700 Subject: [PATCH 317/989] perf/x86/intel/ds: Don't clear ->pebs_data_cfg for the last PEBS event The MSR_PEBS_DATA_CFG MSR register is used to configure which data groups should be generated into a PEBS record, and it's shared among all counters. If there are different configurations among counters, perf combines all the configurations. The first perf command as below requires a complete PEBS record (including memory info, GPRs, XMMs, and LBRs). The second perf command only requires a basic group. However, after the second perf command is running, the MSR_PEBS_DATA_CFG register is cleared. Only a basic group is generated in a PEBS record, which is wrong. The required information for the first perf command is missed. $ perf record --intr-regs=AX,SP,XMM0 -a -C 8 -b -W -d -c 100000003 -o /dev/null -e cpu/event=0xd0,umask=0x81/upp & $ sleep 5 $ perf record --per-thread -c 1 -e cycles:pp --no-timestamp --no-tid taskset -c 8 ./noploop 1000 The first PEBS event is a system-wide PEBS event. The second PEBS event is a per-thread event. When the thread is scheduled out, the intel_pmu_pebs_del() function is invoked to update the PEBS state. Since the system-wide event is still available, the cpuc->n_pebs is 1. The cpuc->pebs_data_cfg is cleared. The data configuration for the system-wide PEBS event is lost. The (cpuc->n_pebs == 1) check was introduced in commit: b6a32f023fcc ("perf/x86: Fix PEBS threshold initialization") At that time, it indeed didn't hurt whether the state was updated during the removal, because only the threshold is updated. The calculation of the threshold takes the last PEBS event into account. However, since commit: b752ea0c28e3 ("perf/x86/intel/ds: Flush PEBS DS when changing PEBS_DATA_CFG") we delay the threshold update, and clear the PEBS data config, which triggers the bug. The PEBS data config update scope should not be shrunk during removal. [ mingo: Improved the changelog & comments. ] Fixes: b752ea0c28e3 ("perf/x86/intel/ds: Flush PEBS DS when changing PEBS_DATA_CFG") Reported-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240401133320.703971-1-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 2641ba620f12a..e010bfed84170 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1237,11 +1237,11 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu = event->pmu; /* - * Make sure we get updated with the first PEBS - * event. It will trigger also during removal, but - * that does not hurt: + * Make sure we get updated with the first PEBS event. + * During removal, ->pebs_data_cfg is still valid for + * the last PEBS event. Don't clear it. */ - if (cpuc->n_pebs == 1) + if ((cpuc->n_pebs == 1) && add) cpuc->pebs_data_cfg = PEBS_UPDATE_DS_SW; if (needed_cb != pebs_needs_sched_cb(cpuc)) { -- GitLab From ef15ddeeb6bee87c044bf7754fac524545bf71e8 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Thu, 28 Mar 2024 19:55:05 +0300 Subject: [PATCH 318/989] octeontx2-af: Add array index check In rvu_map_cgx_lmac_pf() the 'iter', which is used as an array index, can reach value (up to 14) that exceed the size (MAX_LMAC_COUNT = 8) of the array. Fix this bug by adding 'iter' value check. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 91c6945ea1f9 ("octeontx2-af: cn10k: Add RPM MAC support") Signed-off-by: Aleksandr Mishin Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index 72e060cf6b618..e9bf9231b0185 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -160,6 +160,8 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu) continue; lmac_bmap = cgx_get_lmac_bmap(rvu_cgx_pdata(cgx, rvu)); for_each_set_bit(iter, &lmac_bmap, rvu->hw->lmac_per_cgx) { + if (iter >= MAX_LMAC_COUNT) + continue; lmac = cgx_get_lmacid(rvu_cgx_pdata(cgx, rvu), iter); rvu->pf2cgxlmac_map[pf] = cgxlmac_id_to_bmap(cgx, lmac); -- GitLab From bff892acf79cec531da6cb21c50980a584ce1476 Mon Sep 17 00:00:00 2001 From: Carlos Song Date: Wed, 3 Apr 2024 16:40:29 +0800 Subject: [PATCH 319/989] spi: spi-fsl-lpspi: remove redundant spi_controller_put call devm_spi_alloc_controller will allocate an SPI controller and automatically release a reference on it when dev is unbound from its driver. It doesn't need to call spi_controller_put explicitly to put the reference when lpspi driver failed initialization. Fixes: 2ae0ab0143fc ("spi: lpspi: Avoid potential use-after-free in probe()") Signed-off-by: Carlos Song Reviewed-by: Alexander Sverdlin Link: https://msgid.link/r/20240403084029.2000544-1-carlos.song@nxp.com Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-lpspi.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c index 079035db7dd85..92a662d1b55cf 100644 --- a/drivers/spi/spi-fsl-lpspi.c +++ b/drivers/spi/spi-fsl-lpspi.c @@ -852,39 +852,39 @@ static int fsl_lpspi_probe(struct platform_device *pdev) fsl_lpspi->base = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(fsl_lpspi->base)) { ret = PTR_ERR(fsl_lpspi->base); - goto out_controller_put; + return ret; } fsl_lpspi->base_phys = res->start; irq = platform_get_irq(pdev, 0); if (irq < 0) { ret = irq; - goto out_controller_put; + return ret; } ret = devm_request_irq(&pdev->dev, irq, fsl_lpspi_isr, 0, dev_name(&pdev->dev), fsl_lpspi); if (ret) { dev_err(&pdev->dev, "can't get irq%d: %d\n", irq, ret); - goto out_controller_put; + return ret; } fsl_lpspi->clk_per = devm_clk_get(&pdev->dev, "per"); if (IS_ERR(fsl_lpspi->clk_per)) { ret = PTR_ERR(fsl_lpspi->clk_per); - goto out_controller_put; + return ret; } fsl_lpspi->clk_ipg = devm_clk_get(&pdev->dev, "ipg"); if (IS_ERR(fsl_lpspi->clk_ipg)) { ret = PTR_ERR(fsl_lpspi->clk_ipg); - goto out_controller_put; + return ret; } /* enable the clock */ ret = fsl_lpspi_init_rpm(fsl_lpspi); if (ret) - goto out_controller_put; + return ret; ret = pm_runtime_get_sync(fsl_lpspi->dev); if (ret < 0) { @@ -945,8 +945,6 @@ out_pm_get: pm_runtime_dont_use_autosuspend(fsl_lpspi->dev); pm_runtime_put_sync(fsl_lpspi->dev); pm_runtime_disable(fsl_lpspi->dev); -out_controller_put: - spi_controller_put(controller); return ret; } -- GitLab From 1f886a7bfb3faf4c1021e73f045538008ce7634e Mon Sep 17 00:00:00 2001 From: Huai-Yuan Liu Date: Wed, 3 Apr 2024 09:42:21 +0800 Subject: [PATCH 320/989] spi: mchp-pci1xxx: Fix a possible null pointer dereference in pci1xxx_spi_probe In function pci1xxxx_spi_probe, there is a potential null pointer that may be caused by a failed memory allocation by the function devm_kzalloc. Hence, a null pointer check needs to be added to prevent null pointer dereferencing later in the code. To fix this issue, spi_bus->spi_int[iter] should be checked. The memory allocated by devm_kzalloc will be automatically released, so just directly return -ENOMEM without worrying about memory leaks. Fixes: 1cc0cbea7167 ("spi: microchip: pci1xxxx: Add driver for SPI controller of PCI1XXXX PCIe switch") Signed-off-by: Huai-Yuan Liu Link: https://msgid.link/r/20240403014221.969801-1-qq810974084@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-pci1xxxx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/spi/spi-pci1xxxx.c b/drivers/spi/spi-pci1xxxx.c index 969965d7bc98b..cc18d320370f9 100644 --- a/drivers/spi/spi-pci1xxxx.c +++ b/drivers/spi/spi-pci1xxxx.c @@ -725,6 +725,8 @@ static int pci1xxxx_spi_probe(struct pci_dev *pdev, const struct pci_device_id * spi_bus->spi_int[iter] = devm_kzalloc(&pdev->dev, sizeof(struct pci1xxxx_spi_internal), GFP_KERNEL); + if (!spi_bus->spi_int[iter]) + return -ENOMEM; spi_sub_ptr = spi_bus->spi_int[iter]; spi_sub_ptr->spi_host = devm_spi_alloc_host(dev, sizeof(struct spi_controller)); if (!spi_sub_ptr->spi_host) -- GitLab From 0a6380cb4c6b5c1d6dad226ba3130f9090f0ccea Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Mon, 1 Apr 2024 13:09:33 +0200 Subject: [PATCH 321/989] net: bcmgenet: Reset RBUF on first open If the RBUF logic is not reset when the kernel starts then there may be some data left over from any network boot loader. If the 64-byte packet headers are enabled then this can be fatal. Extend bcmgenet_dma_disable to do perform the reset, but not when called from bcmgenet_resume in order to preserve a wake packet. N.B. This different handling of resume is just based on a hunch - why else wouldn't one reset the RBUF as well as the TBUF? If this isn't the case then it's easy to change the patch to make the RBUF reset unconditional. See: https://github.com/raspberrypi/linux/issues/3850 See: https://github.com/raspberrypi/firmware/issues/1882 Signed-off-by: Phil Elwell Signed-off-by: Maarten Vanraes Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 7396e2823e328..b1f84b37032a7 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -3280,7 +3280,7 @@ static void bcmgenet_get_hw_addr(struct bcmgenet_priv *priv, } /* Returns a reusable dma control register value */ -static u32 bcmgenet_dma_disable(struct bcmgenet_priv *priv) +static u32 bcmgenet_dma_disable(struct bcmgenet_priv *priv, bool flush_rx) { unsigned int i; u32 reg; @@ -3305,6 +3305,14 @@ static u32 bcmgenet_dma_disable(struct bcmgenet_priv *priv) udelay(10); bcmgenet_umac_writel(priv, 0, UMAC_TX_FLUSH); + if (flush_rx) { + reg = bcmgenet_rbuf_ctrl_get(priv); + bcmgenet_rbuf_ctrl_set(priv, reg | BIT(0)); + udelay(10); + bcmgenet_rbuf_ctrl_set(priv, reg); + udelay(10); + } + return dma_ctrl; } @@ -3368,8 +3376,8 @@ static int bcmgenet_open(struct net_device *dev) bcmgenet_set_hw_addr(priv, dev->dev_addr); - /* Disable RX/TX DMA and flush TX queues */ - dma_ctrl = bcmgenet_dma_disable(priv); + /* Disable RX/TX DMA and flush TX and RX queues */ + dma_ctrl = bcmgenet_dma_disable(priv, true); /* Reinitialize TDMA and RDMA and SW housekeeping */ ret = bcmgenet_init_dma(priv); @@ -4235,7 +4243,7 @@ static int bcmgenet_resume(struct device *d) bcmgenet_hfb_create_rxnfc_filter(priv, rule); /* Disable RX/TX DMA and flush TX queues */ - dma_ctrl = bcmgenet_dma_disable(priv); + dma_ctrl = bcmgenet_dma_disable(priv, false); /* Reinitialize TDMA and RDMA and SW housekeeping */ ret = bcmgenet_init_dma(priv); -- GitLab From e8acd2d209a387f2358c2c83fe894b444db9ea46 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 2 Apr 2024 18:43:45 +0200 Subject: [PATCH 322/989] gpiolib: Fix triggering "kobject: 'gpiochipX' is not initialized, yet" kobject_get() errors When a gpiochip gets added by loading a module, then another driver may be waiting for that gpiochip to load on the deferred-probe list. If the deferred-probe for the consumer of gpiochip then triggers between the gpiodev_add_to_list_unlocked() calls which makes gpio_device_find() see the chip and the gpiochip_setup_dev() later then gpio_device_find() does a kobject_get() on an uninitialized kobject since the kobject is initialized by gpiochip_setup_dev() calling device_initialize(): arizona spi-10WM5102:00: cannot find GPIO chip arizona, deferring arizona spi-10WM5102:00: cannot find GPIO chip arizona, deferring ------------[ cut here ]------------ kobject: 'gpiochip5' (00000000241466f2): is not initialized, yet kobject_get() is being called. WARNING: CPU: 3 PID: 42 at lib/kobject.c:640 kobject_get+0x43/0x70 Call Trace: kobject_get gpio_device_find gpiod_find_and_request gpiod_get snd_byt_wm5102_mc_probe Not only is the device not initialized yet, but when the gpio-device is added to the list things like the irqchip also have not been initialized yet. So gpio_device_find() should really ignore the gpio-device until gpiochip_add_data_with_key() is fully done. Add a device_is_registered() check to gpio_device_find() to ignore gpio-devices on the list which are not yet fully initialized. Fixes: aab5c6f20023 ("gpio: set device type for GPIO chips") Suggested-by: Bartosz Golaszewski Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko [Bartosz: fix a typo in commit message] Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 59ccf9a3e1539..94903fc1c1459 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1175,6 +1175,9 @@ struct gpio_device *gpio_device_find(const void *data, list_for_each_entry_srcu(gdev, &gpio_devices, list, srcu_read_lock_held(&gpio_devices_srcu)) { + if (!device_is_registered(&gdev->dev)) + continue; + guard(srcu)(&gdev->srcu); gc = srcu_dereference(gdev->chip, &gdev->srcu); -- GitLab From 1d86c2b3946e69d6b0b93568d312aae6247847c0 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:03 -0400 Subject: [PATCH 323/989] arm64: dts: imx8-ss-lsio: fix pwm lpcg indices lpcg's arg0 should use clock indices instead of index. pwm0_lpcg: clock-controller@5d400000 { ... // Col1 Col2 clocks = <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>, // 0 0 <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>, // 1 1 <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>, // 2 4 <&lsio_bus_clk>, // 3 5 <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>; // 4 6 clock-indices = , , , , ; }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. pwm1 { .... clocks = <&pwm1_lpcg 4>, <&pwm1_lpcg 1>; ^^ ^^ should be: clocks = <&pwm1_lpcg IMX_LPCG_CLK_6>, <&pwm1_lpcg IMX_LPCG_CLK_1>; }; Arg0 is divided by 4 in lpcg driver, so index 0 and 1 will be get by pwm driver, which are same as IMX_LPCG_CLK_6 and IMX_LPCG_CLK_1. Even it can work, but code logic is wrong. Fixed it by use correct indices. Cc: stable@vger.kernel.org Fixes: 23fa99b205ea ("arm64: dts: freescale: imx8-ss-lsio: add support for lsio_pwm0-3") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi index 7e510b21bbac5..764c1a08e3b11 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi @@ -25,8 +25,8 @@ lsio_subsys: bus@5d000000 { compatible = "fsl,imx27-pwm"; reg = <0x5d000000 0x10000>; clock-names = "ipg", "per"; - clocks = <&pwm0_lpcg 4>, - <&pwm0_lpcg 1>; + clocks = <&pwm0_lpcg IMX_LPCG_CLK_6>, + <&pwm0_lpcg IMX_LPCG_CLK_1>; assigned-clocks = <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; #pwm-cells = <3>; @@ -38,8 +38,8 @@ lsio_subsys: bus@5d000000 { compatible = "fsl,imx27-pwm"; reg = <0x5d010000 0x10000>; clock-names = "ipg", "per"; - clocks = <&pwm1_lpcg 4>, - <&pwm1_lpcg 1>; + clocks = <&pwm1_lpcg IMX_LPCG_CLK_6>, + <&pwm1_lpcg IMX_LPCG_CLK_1>; assigned-clocks = <&clk IMX_SC_R_PWM_1 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; #pwm-cells = <3>; @@ -51,8 +51,8 @@ lsio_subsys: bus@5d000000 { compatible = "fsl,imx27-pwm"; reg = <0x5d020000 0x10000>; clock-names = "ipg", "per"; - clocks = <&pwm2_lpcg 4>, - <&pwm2_lpcg 1>; + clocks = <&pwm2_lpcg IMX_LPCG_CLK_6>, + <&pwm2_lpcg IMX_LPCG_CLK_1>; assigned-clocks = <&clk IMX_SC_R_PWM_2 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; #pwm-cells = <3>; @@ -64,8 +64,8 @@ lsio_subsys: bus@5d000000 { compatible = "fsl,imx27-pwm"; reg = <0x5d030000 0x10000>; clock-names = "ipg", "per"; - clocks = <&pwm3_lpcg 4>, - <&pwm3_lpcg 1>; + clocks = <&pwm3_lpcg IMX_LPCG_CLK_6>, + <&pwm3_lpcg IMX_LPCG_CLK_1>; assigned-clocks = <&clk IMX_SC_R_PWM_3 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; #pwm-cells = <3>; -- GitLab From 808e7716edcdb39d3498b9f567ef6017858b49aa Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:04 -0400 Subject: [PATCH 324/989] arm64: dts: imx8-ss-conn: fix usb lpcg indices usb2_lpcg: clock-controller@5b270000 { ... Col1 Col2 clocks = <&conn_ahb_clk>, <&conn_ipg_clk>; // 0 6 clock-indices = , ; // 0 7 ... }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. usbotg1: usb@5b0d0000 { ... clocks = <&usb2_lpcg 0>; ^^ Should be: clocks = <&usb2_lpcg IMX_LPCG_CLK_6>; }; usbphy1: usbphy@5b100000 { clocks = <&usb2_lpcg 1>; ^^ SHould be: clocks = <&usb2_lpcg IMX_LPCG_CLK_7>; }; Arg0 is divided by 4 in lpcg driver. So lpcg will do dummy enable. Fix it by use correct clock indices. Cc: stable@vger.kernel.org Fixes: 8065fc937f0f ("arm64: dts: imx8dxl: add usb1 and usb2 support") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi index af2259e997967..4aaf5a0c1ed8a 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi @@ -41,7 +41,7 @@ conn_subsys: bus@5b000000 { interrupts = ; fsl,usbphy = <&usbphy1>; fsl,usbmisc = <&usbmisc1 0>; - clocks = <&usb2_lpcg 0>; + clocks = <&usb2_lpcg IMX_LPCG_CLK_6>; ahb-burst-config = <0x0>; tx-burst-size-dword = <0x10>; rx-burst-size-dword = <0x10>; @@ -58,7 +58,7 @@ conn_subsys: bus@5b000000 { usbphy1: usbphy@5b100000 { compatible = "fsl,imx7ulp-usbphy"; reg = <0x5b100000 0x1000>; - clocks = <&usb2_lpcg 1>; + clocks = <&usb2_lpcg IMX_LPCG_CLK_7>; power-domains = <&pd IMX_SC_R_USB_0_PHY>; status = "disabled"; }; -- GitLab From f72b544a514c07d34a0d9d5380f5905b3731e647 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:05 -0400 Subject: [PATCH 325/989] arm64: dts: imx8-ss-dma: fix spi lpcg indices spi0_lpcg: clock-controller@5a400000 { ... Col0 Col1 clocks = <&clk IMX_SC_R_SPI_0 IMX_SC_PM_CLK_PER>,// 0 1 <&dma_ipg_clk>; // 1 4 clock-indices = , ; }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. lpspi0: spi@5a000000 { ... clocks = <&spi0_lpcg 0>, <&spi0_lpcg 1>; ^ ^ Should be: clocks = <&spi0_lpcg IMX_LPCG_CLK_0>, <&spi0_lpcg IMX_LPCG_CLK_4>; }; Arg0 is divided by 4 in lpcg driver. <&spi0_lpcg 0> and <&spi0_lpcg 1> are IMX_SC_PM_CLK_PER. Although code can work, code logic is wrong. It should use IMX_LPCG_CLK_0 and IMX_LPCG_CLK_4 for lpcg arg0. Cc: stable@vger.kernel.org Fixes: c4098885e790 ("arm64: dts: imx8dxl: add lpspi support") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi index cab3468b1875e..169cf885c7442 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi @@ -28,8 +28,8 @@ dma_subsys: bus@5a000000 { #size-cells = <0>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&spi0_lpcg 0>, - <&spi0_lpcg 1>; + clocks = <&spi0_lpcg IMX_LPCG_CLK_0>, + <&spi0_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_SPI_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <60000000>; @@ -44,8 +44,8 @@ dma_subsys: bus@5a000000 { #size-cells = <0>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&spi1_lpcg 0>, - <&spi1_lpcg 1>; + clocks = <&spi1_lpcg IMX_LPCG_CLK_0>, + <&spi1_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_SPI_1 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <60000000>; @@ -60,8 +60,8 @@ dma_subsys: bus@5a000000 { #size-cells = <0>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&spi2_lpcg 0>, - <&spi2_lpcg 1>; + clocks = <&spi2_lpcg IMX_LPCG_CLK_0>, + <&spi2_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_SPI_2 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <60000000>; @@ -76,8 +76,8 @@ dma_subsys: bus@5a000000 { #size-cells = <0>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&spi3_lpcg 0>, - <&spi3_lpcg 1>; + clocks = <&spi3_lpcg IMX_LPCG_CLK_0>, + <&spi3_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_SPI_3 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <60000000>; -- GitLab From 9055d87bce7276234173fa90e9702af31b3f5353 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:06 -0400 Subject: [PATCH 326/989] arm64: dts: imx8-ss-dma: fix pwm lpcg indices adma_pwm_lpcg: clock-controller@5a590000 { ... col1 col2 clocks = <&clk IMX_SC_R_LCD_0_PWM_0 IMX_SC_PM_CLK_PER>,// 0 0 <&dma_ipg_clk>; // 1 4 clock-indices = , ; ... }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. adma_pwm: pwm@5a190000 { ... clocks = <&adma_pwm_lpcg 1>, <&adma_pwm_lpcg 0>; ^^ ^^ Should be clocks = <&adma_pwm_lpcg IMX_LPCG_CLK_4>, <&adma_pwm_lpcg IMX_LPCG_CLK_0>; }; Arg0 will be divided by 4 in lcpg driver, so pwm will get IMX_SC_PM_CLK_PER by <&adma_pwm_lpcg 1>, <&adma_pwm_lpcg 0>. Although function can work, code logic is wrong. Fix it by use correct indices. Cc: stable@vger.kernel.org Fixes: f1d6a6b991ef ("arm64: dts: imx8qxp: add adma_pwm in adma") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi index 169cf885c7442..e3e911e577907 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi @@ -145,8 +145,8 @@ dma_subsys: bus@5a000000 { compatible = "fsl,imx8qxp-pwm", "fsl,imx27-pwm"; reg = <0x5a190000 0x1000>; interrupts = ; - clocks = <&adma_pwm_lpcg 1>, - <&adma_pwm_lpcg 0>; + clocks = <&adma_pwm_lpcg IMX_LPCG_CLK_4>, + <&adma_pwm_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "per"; assigned-clocks = <&clk IMX_SC_R_LCD_0_PWM_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; -- GitLab From 81975080f14167610976e968e8016e92d836266f Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:07 -0400 Subject: [PATCH 327/989] arm64: dts: imx8-ss-dma: fix adc lpcg indices adc0_lpcg: clock-controller@5ac80000 { ... Col1 Col2 clocks = <&clk IMX_SC_R_ADC_0 IMX_SC_PM_CLK_PER>, // 0 0 <&dma_ipg_clk>; // 1 4 clock-indices = , ; }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. adc0: adc@5a880000 { clocks = <&adc0_lpcg 0>, <&adc0_lpcg 1>; ^^ ^^ clocks = <&adc0_lpcg IMX_LPCG_CLK_0>, <&adc0_lpcg IMX_LPCG_CLK_4>; Arg0 is divided by 4 in lpcg driver. So adc get IMX_SC_PM_CLK_PER by <&adc0_lpcg 0>, <&adc0_lpcg 1>. Although function can work, code logic is wrong. Fix it by using correct indices. Cc: stable@vger.kernel.org Fixes: 1db044b25d2e ("arm64: dts: imx8dxl: add adc0 support") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi index e3e911e577907..23d59b34bc4ce 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi @@ -355,8 +355,8 @@ dma_subsys: bus@5a000000 { reg = <0x5a880000 0x10000>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&adc0_lpcg 0>, - <&adc0_lpcg 1>; + clocks = <&adc0_lpcg IMX_LPCG_CLK_0>, + <&adc0_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_ADC_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; @@ -370,8 +370,8 @@ dma_subsys: bus@5a000000 { reg = <0x5a890000 0x10000>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&adc1_lpcg 0>, - <&adc1_lpcg 1>; + clocks = <&adc1_lpcg IMX_LPCG_CLK_0>, + <&adc1_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_ADC_1 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; -- GitLab From 0893392334b5dffdf616a53679c6a2942c46391b Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:08 -0400 Subject: [PATCH 328/989] arm64: dts: imx8-ss-dma: fix can lpcg indices can0_lpcg: clock-controller@5acd0000 { ... Col1 Col2 clocks = <&clk IMX_SC_R_CAN_0 IMX_SC_PM_CLK_PER>, // 0 0 <&dma_ipg_clk>, // 1 4 <&dma_ipg_clk>; // 2 5 clock-indices = , , ; } Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. flexcan1: can@5a8d0000 { clocks = <&can0_lpcg 1>, <&can0_lpcg 0>; ^^ ^^ Should be: clocks = <&can0_lpcg IMX_LPCG_CLK_4>, <&can0_lpcg IMX_LPCG_CLK_0>; }; Arg0 is divided by 4 in lpcg driver. flexcan driver get IMX_SC_PM_CLK_PER by <&can0_lpcg 1> and <&can0_lpcg 0>. Although function can work, code logic is wrong. Fix it by using correct clock indices. Cc: stable@vger.kernel.org Fixes: 5e7d5b023e03 ("arm64: dts: imx8qxp: add flexcan in adma") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi index 23d59b34bc4ce..f7a91d43a0ffe 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi @@ -384,8 +384,8 @@ dma_subsys: bus@5a000000 { reg = <0x5a8d0000 0x10000>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&can0_lpcg 1>, - <&can0_lpcg 0>; + clocks = <&can0_lpcg IMX_LPCG_CLK_4>, + <&can0_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "per"; assigned-clocks = <&clk IMX_SC_R_CAN_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <40000000>; @@ -405,8 +405,8 @@ dma_subsys: bus@5a000000 { * CAN1 shares CAN0's clock and to enable CAN0's clock it * has to be powered on. */ - clocks = <&can0_lpcg 1>, - <&can0_lpcg 0>; + clocks = <&can0_lpcg IMX_LPCG_CLK_4>, + <&can0_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "per"; assigned-clocks = <&clk IMX_SC_R_CAN_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <40000000>; @@ -426,8 +426,8 @@ dma_subsys: bus@5a000000 { * CAN2 shares CAN0's clock and to enable CAN0's clock it * has to be powered on. */ - clocks = <&can0_lpcg 1>, - <&can0_lpcg 0>; + clocks = <&can0_lpcg IMX_LPCG_CLK_4>, + <&can0_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "per"; assigned-clocks = <&clk IMX_SC_R_CAN_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <40000000>; -- GitLab From 00b436182138310bb8d362b912b12a9df8f72ca3 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:09 -0400 Subject: [PATCH 329/989] arm64: dts: imx8qm-ss-dma: fix can lpcg indices can1_lpcg: clock-controller@5ace0000 { ... Col1 Col2 clocks = <&clk IMX_SC_R_CAN_1 IMX_SC_PM_CLK_PER>,// 0 0 <&dma_ipg_clk>, // 1 4 <&dma_ipg_clk>; // 2 5 clock-indices = , , ; }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver &flexcan2 { clocks = <&can1_lpcg 1>, <&can1_lpcg 0>; ^^ ^^ Should be: clocks = <&can1_lpcg IMX_LPCG_CLK_4>, <&can1_lpcg IMX_LPCG_CLK_0>; }; Arg0 is divided by 4 in lpcg driver. So flexcan get IMX_SC_PM_CLK_PER by <&can1_lpcg 1> and <&can1_lpcg 0>. Although function work, code logic is wrong. Fix it by using correct clock indices. Cc: stable@vger.kernel.org Fixes: be85831de020 ("arm64: dts: imx8qm: add can node in devicetree") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi index 11626fae5f97f..aa9f28c4431d0 100644 --- a/arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi @@ -153,15 +153,15 @@ }; &flexcan2 { - clocks = <&can1_lpcg 1>, - <&can1_lpcg 0>; + clocks = <&can1_lpcg IMX_LPCG_CLK_4>, + <&can1_lpcg IMX_LPCG_CLK_0>; assigned-clocks = <&clk IMX_SC_R_CAN_1 IMX_SC_PM_CLK_PER>; fsl,clk-source = /bits/ 8 <1>; }; &flexcan3 { - clocks = <&can2_lpcg 1>, - <&can2_lpcg 0>; + clocks = <&can2_lpcg IMX_LPCG_CLK_4>, + <&can2_lpcg IMX_LPCG_CLK_0>; assigned-clocks = <&clk IMX_SC_R_CAN_2 IMX_SC_PM_CLK_PER>; fsl,clk-source = /bits/ 8 <1>; }; -- GitLab From e6ec07dc6dd498415bc8cc49437d5ec9e09cc48e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 20 Mar 2024 10:38:58 +0100 Subject: [PATCH 330/989] s390/mm: fix NULL pointer dereference The recently added check to figure out if a fault happened on gmap ASCE dereferences the gmap pointer in lowcore without checking that it is not NULL. For all non-KVM processes the pointer is NULL, so that some value from lowcore will be read. With the current layouts of struct gmap and struct lowcore the read value (aka ASCE) is zero, so that this doesn't lead to any observable bug; at least currently. Fix this by adding the missing NULL pointer check. Fixes: 64c3431808bd ("s390/entry: compare gmap asce to determine guest/host fault") Acked-by: Sven Schnelle Reviewed-by: Claudio Imbrenda Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index c421dd44ffbe0..0c66b32e0f9f1 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -75,7 +75,7 @@ static enum fault_type get_fault_type(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_PGSTE)) return KERNEL_FAULT; gmap = (struct gmap *)S390_lowcore.gmap; - if (regs->cr1 == gmap->asce) + if (gmap && gmap->asce == regs->cr1) return GMAP_FAULT; return KERNEL_FAULT; } -- GitLab From 01cac82ae02b43983173ea8e475a1c999edd25a6 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 20 Mar 2024 23:47:48 +0100 Subject: [PATCH 331/989] s390/atomic: mark all functions __always_inline Atomic functions are quite ubiquitous and may be called by noinstr ones, introducing unwanted instrumentation. They are very small, so there are no significant downsides to force-inlining them. Signed-off-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20240320230007.4782-2-iii@linux.ibm.com Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/atomic.h | 44 +++++++++++++++--------------- arch/s390/include/asm/atomic_ops.h | 22 +++++++-------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index 7138d189cc420..0c4cad7d5a5b1 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -15,31 +15,31 @@ #include #include -static inline int arch_atomic_read(const atomic_t *v) +static __always_inline int arch_atomic_read(const atomic_t *v) { return __atomic_read(v); } #define arch_atomic_read arch_atomic_read -static inline void arch_atomic_set(atomic_t *v, int i) +static __always_inline void arch_atomic_set(atomic_t *v, int i) { __atomic_set(v, i); } #define arch_atomic_set arch_atomic_set -static inline int arch_atomic_add_return(int i, atomic_t *v) +static __always_inline int arch_atomic_add_return(int i, atomic_t *v) { return __atomic_add_barrier(i, &v->counter) + i; } #define arch_atomic_add_return arch_atomic_add_return -static inline int arch_atomic_fetch_add(int i, atomic_t *v) +static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { return __atomic_add_barrier(i, &v->counter); } #define arch_atomic_fetch_add arch_atomic_fetch_add -static inline void arch_atomic_add(int i, atomic_t *v) +static __always_inline void arch_atomic_add(int i, atomic_t *v) { __atomic_add(i, &v->counter); } @@ -50,11 +50,11 @@ static inline void arch_atomic_add(int i, atomic_t *v) #define arch_atomic_fetch_sub(_i, _v) arch_atomic_fetch_add(-(int)(_i), _v) #define ATOMIC_OPS(op) \ -static inline void arch_atomic_##op(int i, atomic_t *v) \ +static __always_inline void arch_atomic_##op(int i, atomic_t *v) \ { \ __atomic_##op(i, &v->counter); \ } \ -static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \ +static __always_inline int arch_atomic_fetch_##op(int i, atomic_t *v) \ { \ return __atomic_##op##_barrier(i, &v->counter); \ } @@ -74,7 +74,7 @@ ATOMIC_OPS(xor) #define arch_atomic_xchg(v, new) (arch_xchg(&((v)->counter), new)) -static inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) +static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { return __atomic_cmpxchg(&v->counter, old, new); } @@ -82,31 +82,31 @@ static inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) #define ATOMIC64_INIT(i) { (i) } -static inline s64 arch_atomic64_read(const atomic64_t *v) +static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { return __atomic64_read(v); } #define arch_atomic64_read arch_atomic64_read -static inline void arch_atomic64_set(atomic64_t *v, s64 i) +static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { __atomic64_set(v, i); } #define arch_atomic64_set arch_atomic64_set -static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { return __atomic64_add_barrier(i, (long *)&v->counter) + i; } #define arch_atomic64_add_return arch_atomic64_add_return -static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return __atomic64_add_barrier(i, (long *)&v->counter); } #define arch_atomic64_fetch_add arch_atomic64_fetch_add -static inline void arch_atomic64_add(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { __atomic64_add(i, (long *)&v->counter); } @@ -114,20 +114,20 @@ static inline void arch_atomic64_add(s64 i, atomic64_t *v) #define arch_atomic64_xchg(v, new) (arch_xchg(&((v)->counter), new)) -static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { return __atomic64_cmpxchg((long *)&v->counter, old, new); } #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg -#define ATOMIC64_OPS(op) \ -static inline void arch_atomic64_##op(s64 i, atomic64_t *v) \ -{ \ - __atomic64_##op(i, (long *)&v->counter); \ -} \ -static inline long arch_atomic64_fetch_##op(s64 i, atomic64_t *v) \ -{ \ - return __atomic64_##op##_barrier(i, (long *)&v->counter); \ +#define ATOMIC64_OPS(op) \ +static __always_inline void arch_atomic64_##op(s64 i, atomic64_t *v) \ +{ \ + __atomic64_##op(i, (long *)&v->counter); \ +} \ +static __always_inline long arch_atomic64_fetch_##op(s64 i, atomic64_t *v) \ +{ \ + return __atomic64_##op##_barrier(i, (long *)&v->counter); \ } ATOMIC64_OPS(and) diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h index 50510e08b893b..7fa5f96a553a4 100644 --- a/arch/s390/include/asm/atomic_ops.h +++ b/arch/s390/include/asm/atomic_ops.h @@ -8,7 +8,7 @@ #ifndef __ARCH_S390_ATOMIC_OPS__ #define __ARCH_S390_ATOMIC_OPS__ -static inline int __atomic_read(const atomic_t *v) +static __always_inline int __atomic_read(const atomic_t *v) { int c; @@ -18,14 +18,14 @@ static inline int __atomic_read(const atomic_t *v) return c; } -static inline void __atomic_set(atomic_t *v, int i) +static __always_inline void __atomic_set(atomic_t *v, int i) { asm volatile( " st %1,%0\n" : "=R" (v->counter) : "d" (i)); } -static inline s64 __atomic64_read(const atomic64_t *v) +static __always_inline s64 __atomic64_read(const atomic64_t *v) { s64 c; @@ -35,7 +35,7 @@ static inline s64 __atomic64_read(const atomic64_t *v) return c; } -static inline void __atomic64_set(atomic64_t *v, s64 i) +static __always_inline void __atomic64_set(atomic64_t *v, s64 i) { asm volatile( " stg %1,%0\n" @@ -45,7 +45,7 @@ static inline void __atomic64_set(atomic64_t *v, s64 i) #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES #define __ATOMIC_OP(op_name, op_type, op_string, op_barrier) \ -static inline op_type op_name(op_type val, op_type *ptr) \ +static __always_inline op_type op_name(op_type val, op_type *ptr) \ { \ op_type old; \ \ @@ -96,7 +96,7 @@ __ATOMIC_CONST_OPS(__atomic64_add_const, long, "agsi") #else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ #define __ATOMIC_OP(op_name, op_string) \ -static inline int op_name(int val, int *ptr) \ +static __always_inline int op_name(int val, int *ptr) \ { \ int old, new; \ \ @@ -122,7 +122,7 @@ __ATOMIC_OPS(__atomic_xor, "xr") #undef __ATOMIC_OPS #define __ATOMIC64_OP(op_name, op_string) \ -static inline long op_name(long val, long *ptr) \ +static __always_inline long op_name(long val, long *ptr) \ { \ long old, new; \ \ @@ -154,7 +154,7 @@ __ATOMIC64_OPS(__atomic64_xor, "xgr") #endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ -static inline int __atomic_cmpxchg(int *ptr, int old, int new) +static __always_inline int __atomic_cmpxchg(int *ptr, int old, int new) { asm volatile( " cs %[old],%[new],%[ptr]" @@ -164,7 +164,7 @@ static inline int __atomic_cmpxchg(int *ptr, int old, int new) return old; } -static inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new) +static __always_inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new) { int old_expected = old; @@ -176,7 +176,7 @@ static inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new) return old == old_expected; } -static inline long __atomic64_cmpxchg(long *ptr, long old, long new) +static __always_inline long __atomic64_cmpxchg(long *ptr, long old, long new) { asm volatile( " csg %[old],%[new],%[ptr]" @@ -186,7 +186,7 @@ static inline long __atomic64_cmpxchg(long *ptr, long old, long new) return old; } -static inline bool __atomic64_cmpxchg_bool(long *ptr, long old, long new) +static __always_inline bool __atomic64_cmpxchg_bool(long *ptr, long old, long new) { long old_expected = old; -- GitLab From c9c260681f521e4ad9f9f4cc71fe35b978e06222 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 20 Mar 2024 23:47:49 +0100 Subject: [PATCH 332/989] s390/preempt: mark all functions __always_inline preempt_count-related functions are quite ubiquitous and may be called by noinstr ones, introducing unwanted instrumentation. Here is one example call chain: irqentry_nmi_enter() # noinstr lockdep_hardirqs_enabled() this_cpu_read() __pcpu_size_call_return() this_cpu_read_*() this_cpu_generic_read() __this_cpu_generic_read_nopreempt() preempt_disable_notrace() __preempt_count_inc() __preempt_count_add() They are very small, so there are no significant downsides to force-inlining them. Signed-off-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20240320230007.4782-3-iii@linux.ibm.com Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/preempt.h | 36 ++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h index bf15da0fedbca..0e3da500e98c1 100644 --- a/arch/s390/include/asm/preempt.h +++ b/arch/s390/include/asm/preempt.h @@ -12,12 +12,12 @@ #define PREEMPT_NEED_RESCHED 0x80000000 #define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED) -static inline int preempt_count(void) +static __always_inline int preempt_count(void) { return READ_ONCE(S390_lowcore.preempt_count) & ~PREEMPT_NEED_RESCHED; } -static inline void preempt_count_set(int pc) +static __always_inline void preempt_count_set(int pc) { int old, new; @@ -29,22 +29,22 @@ static inline void preempt_count_set(int pc) old, new) != old); } -static inline void set_preempt_need_resched(void) +static __always_inline void set_preempt_need_resched(void) { __atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); } -static inline void clear_preempt_need_resched(void) +static __always_inline void clear_preempt_need_resched(void) { __atomic_or(PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); } -static inline bool test_preempt_need_resched(void) +static __always_inline bool test_preempt_need_resched(void) { return !(READ_ONCE(S390_lowcore.preempt_count) & PREEMPT_NEED_RESCHED); } -static inline void __preempt_count_add(int val) +static __always_inline void __preempt_count_add(int val) { /* * With some obscure config options and CONFIG_PROFILE_ALL_BRANCHES @@ -59,17 +59,17 @@ static inline void __preempt_count_add(int val) __atomic_add(val, &S390_lowcore.preempt_count); } -static inline void __preempt_count_sub(int val) +static __always_inline void __preempt_count_sub(int val) { __preempt_count_add(-val); } -static inline bool __preempt_count_dec_and_test(void) +static __always_inline bool __preempt_count_dec_and_test(void) { return __atomic_add(-1, &S390_lowcore.preempt_count) == 1; } -static inline bool should_resched(int preempt_offset) +static __always_inline bool should_resched(int preempt_offset) { return unlikely(READ_ONCE(S390_lowcore.preempt_count) == preempt_offset); @@ -79,45 +79,45 @@ static inline bool should_resched(int preempt_offset) #define PREEMPT_ENABLED (0) -static inline int preempt_count(void) +static __always_inline int preempt_count(void) { return READ_ONCE(S390_lowcore.preempt_count); } -static inline void preempt_count_set(int pc) +static __always_inline void preempt_count_set(int pc) { S390_lowcore.preempt_count = pc; } -static inline void set_preempt_need_resched(void) +static __always_inline void set_preempt_need_resched(void) { } -static inline void clear_preempt_need_resched(void) +static __always_inline void clear_preempt_need_resched(void) { } -static inline bool test_preempt_need_resched(void) +static __always_inline bool test_preempt_need_resched(void) { return false; } -static inline void __preempt_count_add(int val) +static __always_inline void __preempt_count_add(int val) { S390_lowcore.preempt_count += val; } -static inline void __preempt_count_sub(int val) +static __always_inline void __preempt_count_sub(int val) { S390_lowcore.preempt_count -= val; } -static inline bool __preempt_count_dec_and_test(void) +static __always_inline bool __preempt_count_dec_and_test(void) { return !--S390_lowcore.preempt_count && tif_need_resched(); } -static inline bool should_resched(int preempt_offset) +static __always_inline bool should_resched(int preempt_offset) { return unlikely(preempt_count() == preempt_offset && tif_need_resched()); -- GitLab From e9f3af02f63909f41b43c28330434cc437639c5c Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 29 Feb 2024 15:00:28 +0100 Subject: [PATCH 333/989] s390/pai: fix sampling event removal for PMU device driver In case of a sampling event, the PAI PMU device drivers need a reference to this event. Currently to PMU device driver reference is removed when a sampling event is destroyed. This may lead to situations where the reference of the PMU device driver is removed while being used by a different sampling event. Reset the event reference pointer of the PMU device driver when a sampling event is deleted and before the next one might be added. Fixes: 39d62336f5c1 ("s390/pai: add support for cryptography counters") Signed-off-by: Thomas Richter Acked-by: Sumanth Korikkar Signed-off-by: Vasily Gorbik --- arch/s390/kernel/perf_pai_crypto.c | 10 +++++++--- arch/s390/kernel/perf_pai_ext.c | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index 823d652e3917f..4ad472d130a3c 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -90,7 +90,6 @@ static void paicrypt_event_destroy(struct perf_event *event) event->cpu); struct paicrypt_map *cpump = mp->mapptr; - cpump->event = NULL; static_branch_dec(&pai_key); mutex_lock(&pai_reserve_mutex); debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d" @@ -356,10 +355,15 @@ static int paicrypt_add(struct perf_event *event, int flags) static void paicrypt_stop(struct perf_event *event, int flags) { - if (!event->attr.sample_period) /* Counting */ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + + if (!event->attr.sample_period) { /* Counting */ paicrypt_read(event); - else /* Sampling */ + } else { /* Sampling */ perf_sched_cb_dec(event->pmu); + cpump->event = NULL; + } event->hw.state = PERF_HES_STOPPED; } diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index 616a25606cd63..a6da7e0cc7a66 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -122,7 +122,6 @@ static void paiext_event_destroy(struct perf_event *event) free_page(PAI_SAVE_AREA(event)); mutex_lock(&paiext_reserve_mutex); - cpump->event = NULL; if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */ paiext_free(mp); paiext_root_free(); @@ -362,10 +361,15 @@ static int paiext_add(struct perf_event *event, int flags) static void paiext_stop(struct perf_event *event, int flags) { - if (!event->attr.sample_period) /* Counting */ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + + if (!event->attr.sample_period) { /* Counting */ paiext_read(event); - else /* Sampling */ + } else { /* Sampling */ perf_sched_cb_dec(event->pmu); + cpump->event = NULL; + } event->hw.state = PERF_HES_STOPPED; } -- GitLab From 378ca2d2ad410a1cd5690d06b46c5e2297f4c8c0 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Tue, 26 Mar 2024 18:12:13 +0100 Subject: [PATCH 334/989] s390/entry: align system call table on 8 bytes Align system call table on 8 bytes. With sys_call_table entry size of 8 bytes that eliminates the possibility of a system call pointer crossing cache line boundary. Cc: stable@kernel.org Suggested-by: Ulrich Weigand Reviewed-by: Alexander Gordeev Signed-off-by: Sumanth Korikkar Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 787394978bc0f..3dc85638bc63b 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -635,6 +635,7 @@ SYM_DATA_START_LOCAL(daton_psw) SYM_DATA_END(daton_psw) .section .rodata, "a" + .balign 8 #define SYSCALL(esame,emu) .quad __s390x_ ## esame SYM_DATA_START(sys_call_table) #include "asm/syscall_table.h" -- GitLab From 438d3fc46f0deba24da7ded046c818e7bf434d24 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 24 Feb 2024 10:12:34 +0100 Subject: [PATCH 335/989] dt-bindings: clock: keystone: remove unstable remark Keystone clock controller bindings were marked as work-in-progress / unstable in 2013 in commit b9e0d40c0d83 ("clk: keystone: add Keystone PLL clock driver") and commit 7affe5685c96 ("clk: keystone: Add gate control clock driver") Almost eleven years is enough, so drop the "unstable" remark and expect usual ABI rules. Signed-off-by: Krzysztof Kozlowski Acked-by: Stephen Boyd Acked-by: Rob Herring Link: https://lore.kernel.org/r/20240224091236.10146-1-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/clock/keystone-gate.txt | 2 -- Documentation/devicetree/bindings/clock/keystone-pll.txt | 2 -- 2 files changed, 4 deletions(-) diff --git a/Documentation/devicetree/bindings/clock/keystone-gate.txt b/Documentation/devicetree/bindings/clock/keystone-gate.txt index c5aa187026e3a..43f6fb6c93927 100644 --- a/Documentation/devicetree/bindings/clock/keystone-gate.txt +++ b/Documentation/devicetree/bindings/clock/keystone-gate.txt @@ -1,5 +1,3 @@ -Status: Unstable - ABI compatibility may be broken in the future - Binding for Keystone gate control driver which uses PSC controller IP. This binding uses the common clock binding[1]. diff --git a/Documentation/devicetree/bindings/clock/keystone-pll.txt b/Documentation/devicetree/bindings/clock/keystone-pll.txt index 9a3fbc6656065..69b0eb7c03c9e 100644 --- a/Documentation/devicetree/bindings/clock/keystone-pll.txt +++ b/Documentation/devicetree/bindings/clock/keystone-pll.txt @@ -1,5 +1,3 @@ -Status: Unstable - ABI compatibility may be broken in the future - Binding for keystone PLLs. The main PLL IP typically has a multiplier, a divider and a post divider. The additional PLL IPs like ARMPLL, DDRPLL and PAPLL are controlled by the memory mapped register where as the Main -- GitLab From 63fd4d7dc45db58a348624fd46ed74509c458054 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 24 Feb 2024 10:12:35 +0100 Subject: [PATCH 336/989] dt-bindings: clock: ti: remove unstable remark Several TI SoC clock bindings were marked as work-in-progress / unstable between 2013-2016, for example in commit f60b1ea5ea7a ("CLK: TI: add support for gate clock"). It was enough of time to consider them stable and expect usual ABI rules. Signed-off-by: Krzysztof Kozlowski Acked-by: Stephen Boyd Acked-by: Rob Herring Acked-by: Tony Lindgren Link: https://lore.kernel.org/r/20240224091236.10146-2-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/clock/ti/adpll.txt | 2 -- Documentation/devicetree/bindings/clock/ti/apll.txt | 2 -- Documentation/devicetree/bindings/clock/ti/autoidle.txt | 2 -- Documentation/devicetree/bindings/clock/ti/clockdomain.txt | 2 -- Documentation/devicetree/bindings/clock/ti/composite.txt | 2 -- Documentation/devicetree/bindings/clock/ti/divider.txt | 2 -- Documentation/devicetree/bindings/clock/ti/dpll.txt | 2 -- Documentation/devicetree/bindings/clock/ti/fapll.txt | 2 -- .../devicetree/bindings/clock/ti/fixed-factor-clock.txt | 2 -- Documentation/devicetree/bindings/clock/ti/gate.txt | 2 -- Documentation/devicetree/bindings/clock/ti/interface.txt | 2 -- Documentation/devicetree/bindings/clock/ti/mux.txt | 2 -- 12 files changed, 24 deletions(-) diff --git a/Documentation/devicetree/bindings/clock/ti/adpll.txt b/Documentation/devicetree/bindings/clock/ti/adpll.txt index 4c8a2ce2cd701..3122360adcf3c 100644 --- a/Documentation/devicetree/bindings/clock/ti/adpll.txt +++ b/Documentation/devicetree/bindings/clock/ti/adpll.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments ADPLL clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped ADPLL with two to three selectable input clocks and three to four children. diff --git a/Documentation/devicetree/bindings/clock/ti/apll.txt b/Documentation/devicetree/bindings/clock/ti/apll.txt index ade4dd4c30f0e..bbd505c1199df 100644 --- a/Documentation/devicetree/bindings/clock/ti/apll.txt +++ b/Documentation/devicetree/bindings/clock/ti/apll.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments APLL clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped APLL with usually two selectable input clocks (reference clock and bypass clock), with analog phase locked diff --git a/Documentation/devicetree/bindings/clock/ti/autoidle.txt b/Documentation/devicetree/bindings/clock/ti/autoidle.txt index 7c735dde9fe97..05645a10a9e33 100644 --- a/Documentation/devicetree/bindings/clock/ti/autoidle.txt +++ b/Documentation/devicetree/bindings/clock/ti/autoidle.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments autoidle clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register mapped clock which can be put to idle automatically by hardware based on the usage and a configuration bit setting. Autoidle clock is never an individual diff --git a/Documentation/devicetree/bindings/clock/ti/clockdomain.txt b/Documentation/devicetree/bindings/clock/ti/clockdomain.txt index 9c6199249ce59..edf0b5d427682 100644 --- a/Documentation/devicetree/bindings/clock/ti/clockdomain.txt +++ b/Documentation/devicetree/bindings/clock/ti/clockdomain.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments clockdomain. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1] in consumer role. Every clock on TI SoC belongs to one clockdomain, but software only needs this information for specific clocks which require diff --git a/Documentation/devicetree/bindings/clock/ti/composite.txt b/Documentation/devicetree/bindings/clock/ti/composite.txt index 33ac7c9ad053c..6f7e1331b5466 100644 --- a/Documentation/devicetree/bindings/clock/ti/composite.txt +++ b/Documentation/devicetree/bindings/clock/ti/composite.txt @@ -1,7 +1,5 @@ Binding for TI composite clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped composite clock with multiple different sub-types; diff --git a/Documentation/devicetree/bindings/clock/ti/divider.txt b/Documentation/devicetree/bindings/clock/ti/divider.txt index 9b13b32974f99..4d7c76f0b3569 100644 --- a/Documentation/devicetree/bindings/clock/ti/divider.txt +++ b/Documentation/devicetree/bindings/clock/ti/divider.txt @@ -1,7 +1,5 @@ Binding for TI divider clock -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped adjustable clock rate divider that does not gate and has only one input clock or parent. By default the value programmed into diff --git a/Documentation/devicetree/bindings/clock/ti/dpll.txt b/Documentation/devicetree/bindings/clock/ti/dpll.txt index 37a7cb6ad07d8..14a1b72c2e712 100644 --- a/Documentation/devicetree/bindings/clock/ti/dpll.txt +++ b/Documentation/devicetree/bindings/clock/ti/dpll.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments DPLL clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped DPLL with usually two selectable input clocks (reference clock and bypass clock), with digital phase locked diff --git a/Documentation/devicetree/bindings/clock/ti/fapll.txt b/Documentation/devicetree/bindings/clock/ti/fapll.txt index c19b3f253b8cf..88986ef39ddd2 100644 --- a/Documentation/devicetree/bindings/clock/ti/fapll.txt +++ b/Documentation/devicetree/bindings/clock/ti/fapll.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments FAPLL clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped FAPLL with usually two selectable input clocks (reference clock and bypass clock), and one or more child diff --git a/Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt b/Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt index 518e3c1422762..dc69477b6e98e 100644 --- a/Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt +++ b/Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt @@ -1,7 +1,5 @@ Binding for TI fixed factor rate clock sources. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1], and also uses the autoidle support from TI autoidle clock [2]. diff --git a/Documentation/devicetree/bindings/clock/ti/gate.txt b/Documentation/devicetree/bindings/clock/ti/gate.txt index 4982615c01b9c..a8e0335b006a0 100644 --- a/Documentation/devicetree/bindings/clock/ti/gate.txt +++ b/Documentation/devicetree/bindings/clock/ti/gate.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments gate clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. This clock is quite much similar to the basic gate-clock [2], however, it supports a number of additional features. If no register diff --git a/Documentation/devicetree/bindings/clock/ti/interface.txt b/Documentation/devicetree/bindings/clock/ti/interface.txt index d3eb5ca92a7fe..85fb1f2d2d286 100644 --- a/Documentation/devicetree/bindings/clock/ti/interface.txt +++ b/Documentation/devicetree/bindings/clock/ti/interface.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments interface clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. This clock is quite much similar to the basic gate-clock [2], however, it supports a number of additional features, including diff --git a/Documentation/devicetree/bindings/clock/ti/mux.txt b/Documentation/devicetree/bindings/clock/ti/mux.txt index b33f641f10432..cd56d3c1c09f3 100644 --- a/Documentation/devicetree/bindings/clock/ti/mux.txt +++ b/Documentation/devicetree/bindings/clock/ti/mux.txt @@ -1,7 +1,5 @@ Binding for TI mux clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped multiplexer with multiple input clock signals or parents, one of which can be selected as output. This clock does not -- GitLab From 9117a64403e66b295a875e172954b758477e5f57 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 24 Feb 2024 10:12:36 +0100 Subject: [PATCH 337/989] dt-bindings: remoteproc: ti,davinci: remove unstable remark TI Davinci remoteproc bindings were marked as work-in-progress / unstable in 2017 in commit ae67b8007816 ("dt-bindings: remoteproc: Add bindings for Davinci DSP processors"). Almost seven years is enough, so drop the "unstable" remark and expect usual ABI rules. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Mathieu Poirier Acked-by: Rob Herring Link: https://lore.kernel.org/r/20240224091236.10146-3-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- .../devicetree/bindings/remoteproc/ti,davinci-rproc.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/Documentation/devicetree/bindings/remoteproc/ti,davinci-rproc.txt b/Documentation/devicetree/bindings/remoteproc/ti,davinci-rproc.txt index 25f8658e216ff..48a49c516b62c 100644 --- a/Documentation/devicetree/bindings/remoteproc/ti,davinci-rproc.txt +++ b/Documentation/devicetree/bindings/remoteproc/ti,davinci-rproc.txt @@ -1,9 +1,6 @@ TI Davinci DSP devices ======================= -Binding status: Unstable - Subject to changes for DT representation of clocks - and resets - The TI Davinci family of SoCs usually contains a TI DSP Core sub-system that is used to offload some of the processor-intensive tasks or algorithms, for achieving various system level goals. -- GitLab From 6fad9df49b40fdb7d8458167ebbde46a8681f729 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 25 Mar 2024 11:48:32 +0100 Subject: [PATCH 338/989] dt-bindings: soc: fsl: narrow regex for unit address to hex numbers Regular expression used to match the unit address part should not allow non-hex numbers. Acked-by: Rob Herring Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240325104833.33372-1-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- .../devicetree/bindings/soc/fsl/fsl,layerscape-dcfg.yaml | 2 +- .../devicetree/bindings/soc/fsl/fsl,layerscape-scfg.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-dcfg.yaml b/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-dcfg.yaml index 397f75909b205..ce1a6505eb514 100644 --- a/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-dcfg.yaml +++ b/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-dcfg.yaml @@ -51,7 +51,7 @@ properties: ranges: true patternProperties: - "^clock-controller@[0-9a-z]+$": + "^clock-controller@[0-9a-f]+$": $ref: /schemas/clock/fsl,flexspi-clock.yaml# required: diff --git a/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-scfg.yaml b/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-scfg.yaml index 8d088b5fe8236..a6a511b00a128 100644 --- a/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-scfg.yaml +++ b/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-scfg.yaml @@ -41,7 +41,7 @@ properties: ranges: true patternProperties: - "^interrupt-controller@[a-z0-9]+$": + "^interrupt-controller@[a-f0-9]+$": $ref: /schemas/interrupt-controller/fsl,ls-extirq.yaml# required: -- GitLab From 500b42091c1dd878b7a8a59ef89aba85e0054b7b Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 25 Mar 2024 11:48:33 +0100 Subject: [PATCH 339/989] dt-bindings: timer: narrow regex for unit address to hex numbers Regular expression used to match the unit address part should not allow non-hex numbers. Expect at least one hex digit as well. Acked-by: Rob Herring Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240325104833.33372-2-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- .../devicetree/bindings/timer/arm,arch_timer_mmio.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/timer/arm,arch_timer_mmio.yaml b/Documentation/devicetree/bindings/timer/arm,arch_timer_mmio.yaml index 7a4a6ab85970d..ab8f28993139e 100644 --- a/Documentation/devicetree/bindings/timer/arm,arch_timer_mmio.yaml +++ b/Documentation/devicetree/bindings/timer/arm,arch_timer_mmio.yaml @@ -60,7 +60,7 @@ properties: be implemented in an always-on power domain." patternProperties: - '^frame@[0-9a-z]*$': + '^frame@[0-9a-f]+$': type: object additionalProperties: false description: A timer node has up to 8 frame sub-nodes, each with the following properties. -- GitLab From 0200ceed3042222a1f78b3e79ec71a5a52977e3a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 29 Feb 2024 22:51:38 +0000 Subject: [PATCH 340/989] vboxsf: remove redundant variable out_len The variable out_len is being used to accumulate the number of bytes but it is not being used for any other purpose. The variable is redundant and can be removed. Cleans up clang scan build warning: fs/vboxsf/utils.c:443:9: warning: variable 'out_len' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240229225138.351909-1-colin.i.king@gmail.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- fs/vboxsf/utils.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index 72ac9320e6a35..9515bbf0b54ce 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -440,7 +440,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len, { const char *in; char *out; - size_t out_len; size_t out_bound_len; size_t in_bound_len; @@ -448,7 +447,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len, in_bound_len = utf8_len; out = name; - out_len = 0; /* Reserve space for terminating 0 */ out_bound_len = name_bound_len - 1; @@ -469,7 +467,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len, out += nb; out_bound_len -= nb; - out_len += nb; } *out = 0; -- GitLab From b017a0cea627fcbe158fc2c214fe893e18c4d0c4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 25 Mar 2024 16:35:21 +0000 Subject: [PATCH 341/989] arm64/ptrace: Use saved floating point state type to determine SVE layout The SVE register sets have two different formats, one of which is a wrapped version of the standard FPSIMD register set and another with actual SVE register data. At present we check TIF_SVE to see if full SVE register state should be provided when reading the SVE regset but if we were in a syscall we may have saved only floating point registers even though that is set. Fix this and simplify the logic by checking and using the format which we recorded when deciding if we should use FPSIMD or SVE format. Fixes: 8c845e273104 ("arm64/sve: Leave SVE enabled on syscall if we don't context switch") Cc: # 6.2.x Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240325-arm64-ptrace-fp-type-v1-1-8dc846caf11f@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/ptrace.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 162b030ab9da3..0d022599eb61b 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -761,7 +761,6 @@ static void sve_init_header_from_task(struct user_sve_header *header, { unsigned int vq; bool active; - bool fpsimd_only; enum vec_type task_type; memset(header, 0, sizeof(*header)); @@ -777,12 +776,10 @@ static void sve_init_header_from_task(struct user_sve_header *header, case ARM64_VEC_SVE: if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) header->flags |= SVE_PT_VL_INHERIT; - fpsimd_only = !test_tsk_thread_flag(target, TIF_SVE); break; case ARM64_VEC_SME: if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) header->flags |= SVE_PT_VL_INHERIT; - fpsimd_only = false; break; default: WARN_ON_ONCE(1); @@ -790,7 +787,7 @@ static void sve_init_header_from_task(struct user_sve_header *header, } if (active) { - if (fpsimd_only) { + if (target->thread.fp_type == FP_STATE_FPSIMD) { header->flags |= SVE_PT_REGS_FPSIMD; } else { header->flags |= SVE_PT_REGS_SVE; -- GitLab From de3f64b738af57e2732b91a0774facc675b75b54 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 1 Nov 2023 11:49:48 +0100 Subject: [PATCH 342/989] vboxsf: Avoid an spurious warning if load_nls_xxx() fails If an load_nls_xxx() function fails a few lines above, the 'sbi->bdi_id' is still 0. So, in the error handling path, we will call ida_simple_remove(..., 0) which is not allocated yet. In order to prevent a spurious "ida_free called for id=0 which is not allocated." message, tweak the error handling path and add a new label. Fixes: 0fd169576648 ("fs: Add VirtualBox guest shared folder (vboxsf) support") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/d09eaaa4e2e08206c58a1a27ca9b3e81dc168773.1698835730.git.christophe.jaillet@wanadoo.fr Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- fs/vboxsf/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index cabe8ac4fefc5..1ed6b1a76bf92 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -151,7 +151,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) if (!sbi->nls) { vbg_err("vboxsf: Count not load '%s' nls\n", nls_name); err = -EINVAL; - goto fail_free; + goto fail_destroy_idr; } } @@ -224,6 +224,7 @@ fail_free: ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id); if (sbi->nls) unload_nls(sbi->nls); +fail_destroy_idr: idr_destroy(&sbi->ino_idr); kfree(sbi); return err; -- GitLab From 0141d68f86d78953fb4c3983d666e92f7df4a43d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 1 Nov 2023 11:49:49 +0100 Subject: [PATCH 343/989] vboxsf: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/b3c057c86b73f0309a6362031d21f4d7ebb60587.1698835730.git.christophe.jaillet@wanadoo.fr Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- fs/vboxsf/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 1ed6b1a76bf92..ffb1d565da398 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -155,7 +155,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) } } - sbi->bdi_id = ida_simple_get(&vboxsf_bdi_ida, 0, 0, GFP_KERNEL); + sbi->bdi_id = ida_alloc(&vboxsf_bdi_ida, GFP_KERNEL); if (sbi->bdi_id < 0) { err = sbi->bdi_id; goto fail_free; @@ -221,7 +221,7 @@ fail_unmap: vboxsf_unmap_folder(sbi->root); fail_free: if (sbi->bdi_id >= 0) - ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id); + ida_free(&vboxsf_bdi_ida, sbi->bdi_id); if (sbi->nls) unload_nls(sbi->nls); fail_destroy_idr: @@ -269,7 +269,7 @@ static void vboxsf_put_super(struct super_block *sb) vboxsf_unmap_folder(sbi->root); if (sbi->bdi_id >= 0) - ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id); + ida_free(&vboxsf_bdi_ida, sbi->bdi_id); if (sbi->nls) unload_nls(sbi->nls); -- GitLab From 1ece2c43b88660ddbdf8ecb772e9c41ed9cda3dd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 19 Mar 2024 12:32:04 -0400 Subject: [PATCH 344/989] vboxsf: explicitly deny setlease attempts vboxsf does not break leases on its own, so it can't properly handle the case where the hypervisor changes the data. Don't allow file leases on vboxsf. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240319-setlease-v1-1-5997d67e04b3@kernel.org Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- fs/vboxsf/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index 2307f8037efc3..118dedef8ebe8 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -218,6 +218,7 @@ const struct file_operations vboxsf_reg_fops = { .release = vboxsf_file_release, .fsync = noop_fsync, .splice_read = filemap_splice_read, + .setlease = simple_nosetlease, }; const struct inode_operations vboxsf_reg_iops = { -- GitLab From 1057c4c36ef8b236a2e28edef301da0801338c5f Mon Sep 17 00:00:00 2001 From: Nikita Travkin Date: Wed, 3 Apr 2024 16:31:40 +0500 Subject: [PATCH 345/989] thermal: gov_power_allocator: Allow binding without cooling devices IPA was recently refactored to split out memory allocation into a separate funciton. That funciton was made to return -EINVAL if there is zero power_actors and thus no memory to allocate. This causes IPA to fail probing when the thermal zone has no attached cooling devices. Since cooling devices can attach after the thermal zone is created and the governer is attached to it, failing probe due to the lack of cooling devices is incorrect. Change the allocate_actors_buffer() to return success when there is no cooling devices present. Fixes: 912e97c67cc3 ("thermal: gov_power_allocator: Move memory allocation out of throttle()") Signed-off-by: Nikita Travkin Reviewed-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_power_allocator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 1b17dc4c219cc..ec637071ef1fa 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -606,7 +606,7 @@ static int allocate_actors_buffer(struct power_allocator_params *params, /* There might be no cooling devices yet. */ if (!num_actors) { - ret = -EINVAL; + ret = 0; goto clean_state; } -- GitLab From da781936e7c301e6197eb6513775748e79fb2575 Mon Sep 17 00:00:00 2001 From: Nikita Travkin Date: Wed, 3 Apr 2024 16:31:41 +0500 Subject: [PATCH 346/989] thermal: gov_power_allocator: Allow binding without trip points IPA probe function was recently refactored to perform extra error checks and make sure the thermal zone has trip points necessary for the IPA operation. With this change, if a thermal zone is probed such that it has no trip points that IPA can use, IPA will fail and the TZ won't be created. This is the case if a platform defines a TZ without cooling devices and only with "hot"/"critical" trip points, often found on some Qualcomm devices [1]. Documentation across IPA code (notably get_governor_trips() kerneldoc) suggests that IPA is supposed to handle such TZ even if it won't actually do anything. This commit partially reverts the previous change to allow IPA to bind to such "empty" thermal zones. Fixes: e83747c2f8e3 ("thermal: gov_power_allocator: Set up trip points earlier") Link: arch/arm64/boot/dts/qcom/sc7180.dtsi#n4776 # [1] Signed-off-by: Nikita Travkin Reviewed-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_power_allocator.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index ec637071ef1fa..e25e48d76aa79 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -679,11 +679,6 @@ static int power_allocator_bind(struct thermal_zone_device *tz) return -ENOMEM; get_governor_trips(tz, params); - if (!params->trip_max) { - dev_warn(&tz->device, "power_allocator: missing trip_max\n"); - kfree(params); - return -EINVAL; - } ret = check_power_actors(tz, params); if (ret < 0) { @@ -714,9 +709,10 @@ static int power_allocator_bind(struct thermal_zone_device *tz) else params->sustainable_power = tz->tzp->sustainable_power; - estimate_pid_constants(tz, tz->tzp->sustainable_power, - params->trip_switch_on, - params->trip_max->temperature); + if (params->trip_max) + estimate_pid_constants(tz, tz->tzp->sustainable_power, + params->trip_switch_on, + params->trip_max->temperature); reset_pid_controller(params); -- GitLab From 90ca6956d3834db4060f87700e2fcbb699c4e4fd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 21 Mar 2024 17:42:12 +0300 Subject: [PATCH 347/989] ice: Fix freeing uninitialized pointers Automatically cleaned up pointers need to be initialized before exiting their scope. In this case, they need to be initialized to NULL before any return statement. Fixes: 90f821d72e11 ("ice: avoid unnecessary devm_ usage") Signed-off-by: Dan Carpenter Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_common.c | 10 +++++----- drivers/net/ethernet/intel/ice/ice_ethtool.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index db4b2844e1f71..d9f6cc71d900a 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1002,8 +1002,8 @@ static void ice_get_itr_intrl_gran(struct ice_hw *hw) */ int ice_init_hw(struct ice_hw *hw) { - struct ice_aqc_get_phy_caps_data *pcaps __free(kfree); - void *mac_buf __free(kfree); + struct ice_aqc_get_phy_caps_data *pcaps __free(kfree) = NULL; + void *mac_buf __free(kfree) = NULL; u16 mac_buf_len; int status; @@ -3272,7 +3272,7 @@ int ice_update_link_info(struct ice_port_info *pi) return status; if (li->link_info & ICE_AQ_MEDIA_AVAILABLE) { - struct ice_aqc_get_phy_caps_data *pcaps __free(kfree); + struct ice_aqc_get_phy_caps_data *pcaps __free(kfree) = NULL; pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL); if (!pcaps) @@ -3420,7 +3420,7 @@ ice_cfg_phy_fc(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, int ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool ena_auto_link_update) { - struct ice_aqc_get_phy_caps_data *pcaps __free(kfree); + struct ice_aqc_get_phy_caps_data *pcaps __free(kfree) = NULL; struct ice_aqc_set_phy_cfg_data cfg = { 0 }; struct ice_hw *hw; int status; @@ -3561,7 +3561,7 @@ int ice_cfg_phy_fec(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, enum ice_fec_mode fec) { - struct ice_aqc_get_phy_caps_data *pcaps __free(kfree); + struct ice_aqc_get_phy_caps_data *pcaps __free(kfree) = NULL; struct ice_hw *hw; int status; diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 255a9c8151b45..78b833b3e1d7e 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -941,11 +941,11 @@ static u64 ice_loopback_test(struct net_device *netdev) struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *orig_vsi = np->vsi, *test_vsi; struct ice_pf *pf = orig_vsi->back; + u8 *tx_frame __free(kfree) = NULL; u8 broadcast[ETH_ALEN], ret = 0; int num_frames, valid_frames; struct ice_tx_ring *tx_ring; struct ice_rx_ring *rx_ring; - u8 *tx_frame __free(kfree); int i; netdev_info(netdev, "loopback test\n"); -- GitLab From 8edfc7a40e3300fc6c5fa7a3228a24d5bcd86ba5 Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Mon, 25 Mar 2024 21:19:01 +0100 Subject: [PATCH 348/989] ice: fix enabling RX VLAN filtering ice_port_vlan_on/off() was introduced in commit 2946204b3fa8 ("ice: implement bridge port vlan"). But ice_port_vlan_on() incorrectly assigns ena_rx_filtering to inner_vlan_ops in DVM mode. This causes an error when rx_filtering cannot be enabled in legacy mode. Reproducer: echo 1 > /sys/class/net/$PF/device/sriov_numvfs ip link set $PF vf 0 spoofchk off trust on vlan 3 dmesg: ice 0000:41:00.0: failed to enable Rx VLAN filtering for VF 0 VSI 9 during VF rebuild, error -95 Fixes: 2946204b3fa8 ("ice: implement bridge port vlan") Signed-off-by: Petr Oros Reviewed-by: Michal Swiatkowski Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- .../ethernet/intel/ice/ice_vf_vsi_vlan_ops.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c index 80dc4bcdd3a41..b3e1bdcb80f84 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c +++ b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c @@ -26,24 +26,22 @@ static void ice_port_vlan_on(struct ice_vsi *vsi) struct ice_vsi_vlan_ops *vlan_ops; struct ice_pf *pf = vsi->back; - if (ice_is_dvm_ena(&pf->hw)) { - vlan_ops = &vsi->outer_vlan_ops; - - /* setup outer VLAN ops */ - vlan_ops->set_port_vlan = ice_vsi_set_outer_port_vlan; - vlan_ops->clear_port_vlan = ice_vsi_clear_outer_port_vlan; + /* setup inner VLAN ops */ + vlan_ops = &vsi->inner_vlan_ops; - /* setup inner VLAN ops */ - vlan_ops = &vsi->inner_vlan_ops; + if (ice_is_dvm_ena(&pf->hw)) { vlan_ops->add_vlan = noop_vlan_arg; vlan_ops->del_vlan = noop_vlan_arg; vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping; vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping; vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion; vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion; - } else { - vlan_ops = &vsi->inner_vlan_ops; + /* setup outer VLAN ops */ + vlan_ops = &vsi->outer_vlan_ops; + vlan_ops->set_port_vlan = ice_vsi_set_outer_port_vlan; + vlan_ops->clear_port_vlan = ice_vsi_clear_outer_port_vlan; + } else { vlan_ops->set_port_vlan = ice_vsi_set_inner_port_vlan; vlan_ops->clear_port_vlan = ice_vsi_clear_inner_port_vlan; } -- GitLab From 0e110732473e14d6520e49d75d2c88ef7d46fe67 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Tue, 2 Apr 2024 16:05:49 +0200 Subject: [PATCH 349/989] x86/retpoline: Do the necessary fixup to the Zen3/4 srso return thunk for !SRSO The srso_alias_untrain_ret() dummy thunk in the !CONFIG_MITIGATION_SRSO case is there only for the altenative in CALL_UNTRAIN_RET to have a symbol to resolve. However, testing with kernels which don't have CONFIG_MITIGATION_SRSO enabled, leads to the warning in patch_return() to fire: missing return thunk: srso_alias_untrain_ret+0x0/0x10-0x0: eb 0e 66 66 2e WARNING: CPU: 0 PID: 0 at arch/x86/kernel/alternative.c:826 apply_returns (arch/x86/kernel/alternative.c:826 Put in a plain "ret" there so that gcc doesn't put a return thunk in in its place which special and gets checked. In addition: ERROR: modpost: "srso_alias_untrain_ret" [arch/x86/kvm/kvm-amd.ko] undefined! make[2]: *** [scripts/Makefile.modpost:145: Module.symvers] Chyba 1 make[1]: *** [/usr/src/linux-6.8.3/Makefile:1873: modpost] Chyba 2 make: *** [Makefile:240: __sub-make] Chyba 2 since !SRSO builds would use the dummy return thunk as reported by petr.pisar@atlas.cz, https://bugzilla.kernel.org/show_bug.cgi?id=218679. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202404020901.da75a60f-oliver.sang@intel.com Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/all/202404020901.da75a60f-oliver.sang@intel.com/ Signed-off-by: Linus Torvalds --- arch/x86/lib/retpoline.S | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 02cde194a99e6..0795b3464058b 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -228,8 +228,11 @@ SYM_CODE_END(srso_return_thunk) #else /* !CONFIG_MITIGATION_SRSO */ /* Dummy for the alternative in CALL_UNTRAIN_RET. */ SYM_CODE_START(srso_alias_untrain_ret) - RET + ANNOTATE_UNRET_SAFE + ret + int3 SYM_FUNC_END(srso_alias_untrain_ret) +__EXPORT_THUNK(srso_alias_untrain_ret) #define JMP_SRSO_UNTRAIN_RET "ud2" #endif /* CONFIG_MITIGATION_SRSO */ -- GitLab From 701b38995e5bdd2a293936c55782140423827fb1 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 3 Apr 2024 09:57:29 +0200 Subject: [PATCH 350/989] security: Place security_path_post_mknod() where the original IMA call was Commit 08abce60d63f ("security: Introduce path_post_mknod hook") introduced security_path_post_mknod(), to replace the IMA-specific call to ima_post_path_mknod(). For symmetry with security_path_mknod(), security_path_post_mknod() was called after a successful mknod operation, for any file type, rather than only for regular files at the time there was the IMA call. However, as reported by VFS maintainers, successful mknod operation does not mean that the dentry always has an inode attached to it (for example, not for FIFOs on a SAMBA mount). If that condition happens, the kernel crashes when security_path_post_mknod() attempts to verify if the inode associated to the dentry is private. Move security_path_post_mknod() where the ima_post_path_mknod() call was, which is obviously correct from IMA/EVM perspective. IMA/EVM are the only in-kernel users, and only need to inspect regular files. Reported-by: Steve French Closes: https://lore.kernel.org/linux-kernel/CAH2r5msAVzxCUHHG8VKrMPUKQHmBpE6K9_vjhgDa1uAvwx4ppw@mail.gmail.com/ Suggested-by: Al Viro Fixes: 08abce60d63f ("security: Introduce path_post_mknod hook") Signed-off-by: Roberto Sassu Reviewed-by: Christian Brauner Reviewed-by: Mimi Zohar Acked-by: Paul Moore Signed-off-by: Linus Torvalds --- fs/namei.c | 7 ++----- security/security.c | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index ceb9ddf8dfdd4..c5b2a25be7d04 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4050,6 +4050,8 @@ retry: case 0: case S_IFREG: error = vfs_create(idmap, path.dentry->d_inode, dentry, mode, true); + if (!error) + security_path_post_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, @@ -4060,11 +4062,6 @@ retry: dentry, mode, 0); break; } - - if (error) - goto out2; - - security_path_post_mknod(idmap, dentry); out2: done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { diff --git a/security/security.c b/security/security.c index 7e118858b545c..0a9a0ac3f2662 100644 --- a/security/security.c +++ b/security/security.c @@ -1793,11 +1793,11 @@ int security_path_mknod(const struct path *dir, struct dentry *dentry, EXPORT_SYMBOL(security_path_mknod); /** - * security_path_post_mknod() - Update inode security field after file creation + * security_path_post_mknod() - Update inode security after reg file creation * @idmap: idmap of the mount * @dentry: new file * - * Update inode security field after a file has been created. + * Update inode security field after a regular file has been created. */ void security_path_post_mknod(struct mnt_idmap *idmap, struct dentry *dentry) { -- GitLab From 3f5eb32513e75eb321919a703800d4e13e9d3ba8 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 3 Apr 2024 14:18:39 +0300 Subject: [PATCH 351/989] ASoC: SOF: Intel: lnl: Disable DMIC/SSP offload on remove During probe the DMIC/SSP offload is enabled and it is not reversed on remove. Add a remove wrapper for LNL to disable the offload for DMIC and SSP similarly to what is done during probe. Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Link: https://msgid.link/r/20240403111839.27259-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/lnl.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/sound/soc/sof/intel/lnl.c b/sound/soc/sof/intel/lnl.c index d1c73d407e68e..aeb4350cce6bb 100644 --- a/sound/soc/sof/intel/lnl.c +++ b/sound/soc/sof/intel/lnl.c @@ -29,15 +29,17 @@ static const struct snd_sof_debugfs_map lnl_dsp_debugfs[] = { }; /* this helps allows the DSP to setup DMIC/SSP */ -static int hdac_bus_offload_dmic_ssp(struct hdac_bus *bus) +static int hdac_bus_offload_dmic_ssp(struct hdac_bus *bus, bool enable) { int ret; - ret = hdac_bus_eml_enable_offload(bus, true, AZX_REG_ML_LEPTR_ID_INTEL_SSP, true); + ret = hdac_bus_eml_enable_offload(bus, true, + AZX_REG_ML_LEPTR_ID_INTEL_SSP, enable); if (ret < 0) return ret; - ret = hdac_bus_eml_enable_offload(bus, true, AZX_REG_ML_LEPTR_ID_INTEL_DMIC, true); + ret = hdac_bus_eml_enable_offload(bus, true, + AZX_REG_ML_LEPTR_ID_INTEL_DMIC, enable); if (ret < 0) return ret; @@ -52,7 +54,19 @@ static int lnl_hda_dsp_probe(struct snd_sof_dev *sdev) if (ret < 0) return ret; - return hdac_bus_offload_dmic_ssp(sof_to_bus(sdev)); + return hdac_bus_offload_dmic_ssp(sof_to_bus(sdev), true); +} + +static void lnl_hda_dsp_remove(struct snd_sof_dev *sdev) +{ + int ret; + + ret = hdac_bus_offload_dmic_ssp(sof_to_bus(sdev), false); + if (ret < 0) + dev_warn(sdev->dev, + "Failed to disable offload for DMIC/SSP: %d\n", ret); + + hda_dsp_remove(sdev); } static int lnl_hda_dsp_resume(struct snd_sof_dev *sdev) @@ -63,7 +77,7 @@ static int lnl_hda_dsp_resume(struct snd_sof_dev *sdev) if (ret < 0) return ret; - return hdac_bus_offload_dmic_ssp(sof_to_bus(sdev)); + return hdac_bus_offload_dmic_ssp(sof_to_bus(sdev), true); } static int lnl_hda_dsp_runtime_resume(struct snd_sof_dev *sdev) @@ -74,7 +88,7 @@ static int lnl_hda_dsp_runtime_resume(struct snd_sof_dev *sdev) if (ret < 0) return ret; - return hdac_bus_offload_dmic_ssp(sof_to_bus(sdev)); + return hdac_bus_offload_dmic_ssp(sof_to_bus(sdev), true); } static int lnl_dsp_post_fw_run(struct snd_sof_dev *sdev) @@ -97,9 +111,11 @@ int sof_lnl_ops_init(struct snd_sof_dev *sdev) /* common defaults */ memcpy(&sof_lnl_ops, &sof_hda_common_ops, sizeof(struct snd_sof_dsp_ops)); - /* probe */ - if (!sdev->dspless_mode_selected) + /* probe/remove */ + if (!sdev->dspless_mode_selected) { sof_lnl_ops.probe = lnl_hda_dsp_probe; + sof_lnl_ops.remove = lnl_hda_dsp_remove; + } /* shutdown */ sof_lnl_ops.shutdown = hda_dsp_shutdown; -- GitLab From 64d845f651267deb62bcf013ce37e2360161fdf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20H=C3=B6gander?= Date: Tue, 19 Mar 2024 14:33:23 +0200 Subject: [PATCH 352/989] drm/i915/psr: Calculate PIPE_SRCSZ_ERLY_TPT value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When early transport is enabled we need to write PIPE_SRCSZ_ERLY_TPT on every flip doing selective update. This patch calculates PIPE_SRCSZ_ERLY_TPT same way as is done for PSR2_MAN_TRK_CTL value and stores i in intel_crtc_state->pipe_srcsz_early_tpt to be written later during flip. Bspec: 68927 Signed-off-by: Jouni Högander Reviewed-by: Mika Kahola Link: https://patchwork.freedesktop.org/patch/msgid/20240319123327.1661097-2-jouni.hogander@intel.com (cherry picked from commit f3b899f0b4b17fa0b20e27c23f78604d5686383d) Signed-off-by: Rodrigo Vivi --- .../gpu/drm/i915/display/intel_display_types.h | 2 ++ drivers/gpu/drm/i915/display/intel_psr.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h b/drivers/gpu/drm/i915/display/intel_display_types.h index 9104f18753b48..bf3f942e19c3d 100644 --- a/drivers/gpu/drm/i915/display/intel_display_types.h +++ b/drivers/gpu/drm/i915/display/intel_display_types.h @@ -1423,6 +1423,8 @@ struct intel_crtc_state { u32 psr2_man_track_ctl; + u32 pipe_srcsz_early_tpt; + struct drm_rect psr2_su_area; /* Variable Refresh Rate state */ diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index 6927785fd6ff2..2c4978e189a3a 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -2051,6 +2051,20 @@ exit: crtc_state->psr2_man_track_ctl = val; } +static u32 psr2_pipe_srcsz_early_tpt_calc(struct intel_crtc_state *crtc_state, + bool full_update) +{ + int width, height; + + if (!crtc_state->enable_psr2_su_region_et || full_update) + return 0; + + width = drm_rect_width(&crtc_state->psr2_su_area); + height = drm_rect_height(&crtc_state->psr2_su_area); + + return PIPESRC_WIDTH(width - 1) | PIPESRC_HEIGHT(height - 1); +} + static void clip_area_update(struct drm_rect *overlap_damage_area, struct drm_rect *damage_area, struct drm_rect *pipe_src) @@ -2338,6 +2352,8 @@ int intel_psr2_sel_fetch_update(struct intel_atomic_state *state, skip_sel_fetch_set_loop: psr2_man_trk_ctl_calc(crtc_state, full_update); + crtc_state->pipe_srcsz_early_tpt = + psr2_pipe_srcsz_early_tpt_calc(crtc_state, full_update); return 0; } -- GitLab From 4e29234353a4378a49e5ee6f5683678d7e101e17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20H=C3=B6gander?= Date: Tue, 19 Mar 2024 14:33:24 +0200 Subject: [PATCH 353/989] drm/i915/psr: Move writing early transport pipe src MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently PIPE_SRCSZ_ERLY_TPT is written in intel_display.c:intel_set_pipe_src_size. This doesn't work as intel_set_pipe_src_size is called only on modeset. Bspec: 68927 Fixes: 3291bbb93e16 ("drm/i915/psr: Configure PIPE_SRCSZ_ERLY_TPT for psr2 early transport") Signed-off-by: Jouni Högander Reviewed-by: Mika Kahola Link: https://patchwork.freedesktop.org/patch/msgid/20240319123327.1661097-3-jouni.hogander@intel.com (cherry picked from commit b52c4093b0c9089b00b42823d41986a94d32e341) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_display.c | 9 --------- drivers/gpu/drm/i915/display/intel_psr.c | 7 +++++++ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index ab2f52d21bad8..8af9e6128277a 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -2709,15 +2709,6 @@ static void intel_set_pipe_src_size(const struct intel_crtc_state *crtc_state) */ intel_de_write(dev_priv, PIPESRC(pipe), PIPESRC_WIDTH(width - 1) | PIPESRC_HEIGHT(height - 1)); - - if (!crtc_state->enable_psr2_su_region_et) - return; - - width = drm_rect_width(&crtc_state->psr2_su_area); - height = drm_rect_height(&crtc_state->psr2_su_area); - - intel_de_write(dev_priv, PIPE_SRCSZ_ERLY_TPT(pipe), - PIPESRC_WIDTH(width - 1) | PIPESRC_HEIGHT(height - 1)); } static bool intel_pipe_is_interlaced(const struct intel_crtc_state *crtc_state) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index 2c4978e189a3a..47175e4156bd7 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -1994,6 +1994,7 @@ static void psr_force_hw_tracking_exit(struct intel_dp *intel_dp) void intel_psr2_program_trans_man_trk_ctl(const struct intel_crtc_state *crtc_state) { + struct intel_crtc *crtc = to_intel_crtc(crtc_state->uapi.crtc); struct drm_i915_private *dev_priv = to_i915(crtc_state->uapi.crtc->dev); enum transcoder cpu_transcoder = crtc_state->cpu_transcoder; struct intel_encoder *encoder; @@ -2013,6 +2014,12 @@ void intel_psr2_program_trans_man_trk_ctl(const struct intel_crtc_state *crtc_st intel_de_write(dev_priv, PSR2_MAN_TRK_CTL(cpu_transcoder), crtc_state->psr2_man_track_ctl); + + if (!crtc_state->enable_psr2_su_region_et) + return; + + intel_de_write(dev_priv, PIPE_SRCSZ_ERLY_TPT(crtc->pipe), + crtc_state->pipe_srcsz_early_tpt); } static void psr2_man_trk_ctl_calc(struct intel_crtc_state *crtc_state, -- GitLab From bf1f6f8d0b193561f213209b902edda634b6c74a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20H=C3=B6gander?= Date: Tue, 19 Mar 2024 14:33:25 +0200 Subject: [PATCH 354/989] drm/i915/psr: Fix intel_psr2_sel_fetch_et_alignment usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we are not aligning selective update area to cover cursor fully when cursor is not updated by itself but still in the selective update area. Fix this by checking cursor separately after drm_atomic_add_affected_planes. Bspec: 68927 Fixes: 1bff93b8bc27 ("drm/i915/psr: Extend SU area to cover cursor fully if needed") Signed-off-by: Jouni Högander Reviewed-by: Mika Kahola Link: https://patchwork.freedesktop.org/patch/msgid/20240319123327.1661097-4-jouni.hogander@intel.com (cherry picked from commit d37b3dac68e26669f03f768b3afc9abc094c9ac9) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_psr.c | 55 ++++++++++++++---------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index 47175e4156bd7..b6e539f1342c2 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -2116,21 +2116,36 @@ static void intel_psr2_sel_fetch_pipe_alignment(struct intel_crtc_state *crtc_st * cursor fully when cursor is in SU area. */ static void -intel_psr2_sel_fetch_et_alignment(struct intel_crtc_state *crtc_state, - struct intel_plane_state *cursor_state) +intel_psr2_sel_fetch_et_alignment(struct intel_atomic_state *state, + struct intel_crtc *crtc) { - struct drm_rect inter; + struct intel_crtc_state *crtc_state = intel_atomic_get_new_crtc_state(state, crtc); + struct intel_plane_state *new_plane_state; + struct intel_plane *plane; + int i; - if (!crtc_state->enable_psr2_su_region_et || - !cursor_state->uapi.visible) + if (!crtc_state->enable_psr2_su_region_et) return; - inter = crtc_state->psr2_su_area; - if (!drm_rect_intersect(&inter, &cursor_state->uapi.dst)) - return; + for_each_new_intel_plane_in_state(state, plane, new_plane_state, i) { + struct drm_rect inter; - clip_area_update(&crtc_state->psr2_su_area, &cursor_state->uapi.dst, - &crtc_state->pipe_src); + if (new_plane_state->uapi.crtc != crtc_state->uapi.crtc) + continue; + + if (plane->id != PLANE_CURSOR) + continue; + + if (!new_plane_state->uapi.visible) + continue; + + inter = crtc_state->psr2_su_area; + if (!drm_rect_intersect(&inter, &new_plane_state->uapi.dst)) + continue; + + clip_area_update(&crtc_state->psr2_su_area, &new_plane_state->uapi.dst, + &crtc_state->pipe_src); + } } /* @@ -2173,8 +2188,7 @@ int intel_psr2_sel_fetch_update(struct intel_atomic_state *state, { struct drm_i915_private *dev_priv = to_i915(state->base.dev); struct intel_crtc_state *crtc_state = intel_atomic_get_new_crtc_state(state, crtc); - struct intel_plane_state *new_plane_state, *old_plane_state, - *cursor_plane_state = NULL; + struct intel_plane_state *new_plane_state, *old_plane_state; struct intel_plane *plane; bool full_update = false; int i, ret; @@ -2259,13 +2273,6 @@ int intel_psr2_sel_fetch_update(struct intel_atomic_state *state, damaged_area.x2 += new_plane_state->uapi.dst.x1 - src.x1; clip_area_update(&crtc_state->psr2_su_area, &damaged_area, &crtc_state->pipe_src); - - /* - * Cursor plane new state is stored to adjust su area to cover - * cursor are fully. - */ - if (plane->id == PLANE_CURSOR) - cursor_plane_state = new_plane_state; } /* @@ -2294,9 +2301,13 @@ int intel_psr2_sel_fetch_update(struct intel_atomic_state *state, if (ret) return ret; - /* Adjust su area to cover cursor fully as necessary */ - if (cursor_plane_state) - intel_psr2_sel_fetch_et_alignment(crtc_state, cursor_plane_state); + /* + * Adjust su area to cover cursor fully as necessary (early + * transport). This needs to be done after + * drm_atomic_add_affected_planes to ensure visible cursor is added into + * affected planes even when cursor is not updated by itself. + */ + intel_psr2_sel_fetch_et_alignment(state, crtc); intel_psr2_sel_fetch_pipe_alignment(crtc_state); -- GitLab From 94bf3e60e1a61973cdb6488af873b8de66250c77 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Wed, 27 Mar 2024 21:05:46 +0100 Subject: [PATCH 355/989] drm/i915/gt: Limit the reserved VM space to only the platforms that need it Commit 9bb66c179f50 ("drm/i915: Reserve some kernel space per vm") reduces the available VM space of one page in order to apply Wa_16018031267 and Wa_16018063123. This page was reserved indiscrimitely in all platforms even when not needed. Limit it to DG2 onwards. Fixes: 9bb66c179f50 ("drm/i915: Reserve some kernel space per vm") Signed-off-by: Andi Shyti Cc: Andrzej Hajda Cc: Chris Wilson Cc: Jonathan Cavitt Cc: Nirmoy Das Reviewed-by: Nirmoy Das Acked-by: Michal Mrozek Link: https://patchwork.freedesktop.org/patch/msgid/20240327200546.640108-1-andi.shyti@linux.intel.com (cherry picked from commit 9721634441d5dedba7f9eebb2bf0c9411cbafc4e) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 3 +++ drivers/gpu/drm/i915/gt/intel_gt.c | 6 ++++++ drivers/gpu/drm/i915/gt/intel_gt.h | 9 +++++---- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index fa46d2308b0ed..81bf2216371be 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -961,6 +961,9 @@ static int gen8_init_rsvd(struct i915_address_space *vm) struct i915_vma *vma; int ret; + if (!intel_gt_needs_wa_16018031267(vm->gt)) + return 0; + /* The memory will be used only by GPU. */ obj = i915_gem_object_create_lmem(i915, PAGE_SIZE, I915_BO_ALLOC_VOLATILE | diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index a425db5ed3a22..6a2c2718bcc38 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -1024,6 +1024,12 @@ enum i915_map_type intel_gt_coherent_map_type(struct intel_gt *gt, return I915_MAP_WC; } +bool intel_gt_needs_wa_16018031267(struct intel_gt *gt) +{ + /* Wa_16018031267, Wa_16018063123 */ + return IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 55), IP_VER(12, 71)); +} + bool intel_gt_needs_wa_22016122933(struct intel_gt *gt) { return MEDIA_VER_FULL(gt->i915) == IP_VER(13, 0) && gt->type == GT_MEDIA; diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h index 608f5c8729285..003eb93b826fd 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.h +++ b/drivers/gpu/drm/i915/gt/intel_gt.h @@ -82,17 +82,18 @@ struct drm_printer; ##__VA_ARGS__); \ } while (0) -#define NEEDS_FASTCOLOR_BLT_WABB(engine) ( \ - IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 55), IP_VER(12, 71)) && \ - engine->class == COPY_ENGINE_CLASS && engine->instance == 0) - static inline bool gt_is_root(struct intel_gt *gt) { return !gt->info.id; } +bool intel_gt_needs_wa_16018031267(struct intel_gt *gt); bool intel_gt_needs_wa_22016122933(struct intel_gt *gt); +#define NEEDS_FASTCOLOR_BLT_WABB(engine) ( \ + intel_gt_needs_wa_16018031267(engine->gt) && \ + engine->class == COPY_ENGINE_CLASS && engine->instance == 0) + static inline struct intel_gt *uc_to_gt(struct intel_uc *uc) { return container_of(uc, struct intel_gt, uc); -- GitLab From bc9a1ec01289e6e7259dc5030b413a9c6654a99a Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Thu, 28 Mar 2024 08:34:03 +0100 Subject: [PATCH 356/989] drm/i915/gt: Disable HW load balancing for CCS The hardware should not dynamically balance the load between CCS engines. Wa_14019159160 recommends disabling it across all platforms. Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement") Signed-off-by: Andi Shyti Cc: Chris Wilson Cc: Joonas Lahtinen Cc: Matt Roper Cc: # v6.2+ Reviewed-by: Matt Roper Acked-by: Michal Mrozek Link: https://patchwork.freedesktop.org/patch/msgid/20240328073409.674098-2-andi.shyti@linux.intel.com (cherry picked from commit f5d2904cf814f20b79e3e4c1b24a4ccc2411b7e0) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_gt_regs.h | 1 + drivers/gpu/drm/i915/gt/intel_workarounds.c | 23 +++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h index 50962cfd1353a..31b102604e3df 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h @@ -1477,6 +1477,7 @@ #define ECOBITS_PPGTT_CACHE4B (0 << 8) #define GEN12_RCU_MODE _MMIO(0x14800) +#define XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE REG_BIT(1) #define GEN12_RCU_MODE_CCS_ENABLE REG_BIT(0) #define CHV_FUSE_GT _MMIO(VLV_GUNIT_BASE + 0x2168) diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 25413809b9dc9..4865eb5ca9c91 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -51,7 +51,8 @@ * registers belonging to BCS, VCS or VECS should be implemented in * xcs_engine_wa_init(). Workarounds for registers not belonging to a specific * engine's MMIO range but that are part of of the common RCS/CCS reset domain - * should be implemented in general_render_compute_wa_init(). + * should be implemented in general_render_compute_wa_init(). The settings + * about the CCS load balancing should be added in ccs_engine_wa_mode(). * * - GT workarounds: the list of these WAs is applied whenever these registers * revert to their default values: on GPU reset, suspend/resume [1]_, etc. @@ -2854,6 +2855,22 @@ add_render_compute_tuning_settings(struct intel_gt *gt, wa_write_clr(wal, GEN8_GARBCNTL, GEN12_BUS_HASH_CTL_BIT_EXC); } +static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_list *wal) +{ + struct intel_gt *gt = engine->gt; + + if (!IS_DG2(gt->i915)) + return; + + /* + * Wa_14019159160: This workaround, along with others, leads to + * significant challenges in utilizing load balancing among the + * CCS slices. Consequently, an architectural decision has been + * made to completely disable automatic CCS load balancing. + */ + wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE); +} + /* * The workarounds in this function apply to shared registers in * the general render reset domain that aren't tied to a @@ -3004,8 +3021,10 @@ engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal * to a single RCS/CCS engine's workaround list since * they're reset as part of the general render domain reset. */ - if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) + if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) { general_render_compute_wa_init(engine, wal); + ccs_engine_wa_mode(engine, wal); + } if (engine->class == COMPUTE_CLASS) ccs_engine_wa_init(engine, wal); -- GitLab From ea315f98e5d6d3191b74beb0c3e5fc16081d517c Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Thu, 28 Mar 2024 08:34:04 +0100 Subject: [PATCH 357/989] drm/i915/gt: Do not generate the command streamer for all the CCS We want a fixed load CCS balancing consisting in all slices sharing one single user engine. For this reason do not create the intel_engine_cs structure with its dedicated command streamer for CCS slices beyond the first. Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement") Signed-off-by: Andi Shyti Cc: Chris Wilson Cc: Joonas Lahtinen Cc: Matt Roper Cc: # v6.2+ Acked-by: Michal Mrozek Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240328073409.674098-3-andi.shyti@linux.intel.com (cherry picked from commit c7a5aa4e57f88470313a8277eb299b221b86e3b1) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 1ade568ffbfa4..7a6dc371c384e 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -908,6 +908,23 @@ static intel_engine_mask_t init_engine_mask(struct intel_gt *gt) info->engine_mask &= ~BIT(GSC0); } + /* + * Do not create the command streamer for CCS slices beyond the first. + * All the workload submitted to the first engine will be shared among + * all the slices. + * + * Once the user will be allowed to customize the CCS mode, then this + * check needs to be removed. + */ + if (IS_DG2(gt->i915)) { + u8 first_ccs = __ffs(CCS_MASK(gt)); + + /* Mask off all the CCS engine */ + info->engine_mask &= ~GENMASK(CCS3, CCS0); + /* Put back in the first CCS engine */ + info->engine_mask |= BIT(_CCS(first_ccs)); + } + return info->engine_mask; } -- GitLab From 6db31251bb265813994bfb104eb4b4d0f44d64fb Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Thu, 28 Mar 2024 08:34:05 +0100 Subject: [PATCH 358/989] drm/i915/gt: Enable only one CCS for compute workload Enable only one CCS engine by default with all the compute sices allocated to it. While generating the list of UABI engines to be exposed to the user, exclude any additional CCS engines beyond the first instance. This change can be tested with igt i915_query. Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement") Signed-off-by: Andi Shyti Cc: Chris Wilson Cc: Joonas Lahtinen Cc: Matt Roper Cc: # v6.2+ Reviewed-by: Matt Roper Acked-by: Michal Mrozek Link: https://patchwork.freedesktop.org/patch/msgid/20240328073409.674098-4-andi.shyti@linux.intel.com (cherry picked from commit 2bebae0112b117de7e8a7289277a4bd2403b9e17) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/Makefile | 1 + drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c | 39 +++++++++++++++++++++ drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h | 13 +++++++ drivers/gpu/drm/i915/gt/intel_gt_regs.h | 5 +++ drivers/gpu/drm/i915/gt/intel_workarounds.c | 7 ++++ 5 files changed, 65 insertions(+) create mode 100644 drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c create mode 100644 drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 4c2f85632391a..fba73c38e2356 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -118,6 +118,7 @@ gt-y += \ gt/intel_ggtt_fencing.o \ gt/intel_gt.o \ gt/intel_gt_buffer_pool.o \ + gt/intel_gt_ccs_mode.o \ gt/intel_gt_clock_utils.o \ gt/intel_gt_debugfs.o \ gt/intel_gt_engines_debugfs.o \ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c new file mode 100644 index 0000000000000..044219c5960a5 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2024 Intel Corporation + */ + +#include "i915_drv.h" +#include "intel_gt.h" +#include "intel_gt_ccs_mode.h" +#include "intel_gt_regs.h" + +void intel_gt_apply_ccs_mode(struct intel_gt *gt) +{ + int cslice; + u32 mode = 0; + int first_ccs = __ffs(CCS_MASK(gt)); + + if (!IS_DG2(gt->i915)) + return; + + /* Build the value for the fixed CCS load balancing */ + for (cslice = 0; cslice < I915_MAX_CCS; cslice++) { + if (CCS_MASK(gt) & BIT(cslice)) + /* + * If available, assign the cslice + * to the first available engine... + */ + mode |= XEHP_CCS_MODE_CSLICE(cslice, first_ccs); + + else + /* + * ... otherwise, mark the cslice as + * unavailable if no CCS dispatches here + */ + mode |= XEHP_CCS_MODE_CSLICE(cslice, + XEHP_CCS_MODE_CSLICE_MASK); + } + + intel_uncore_write(gt->uncore, XEHP_CCS_MODE, mode); +} diff --git a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h new file mode 100644 index 0000000000000..9e5549caeb269 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2024 Intel Corporation + */ + +#ifndef __INTEL_GT_CCS_MODE_H__ +#define __INTEL_GT_CCS_MODE_H__ + +struct intel_gt; + +void intel_gt_apply_ccs_mode(struct intel_gt *gt); + +#endif /* __INTEL_GT_CCS_MODE_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h index 31b102604e3df..743fe35667227 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h @@ -1480,6 +1480,11 @@ #define XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE REG_BIT(1) #define GEN12_RCU_MODE_CCS_ENABLE REG_BIT(0) +#define XEHP_CCS_MODE _MMIO(0x14804) +#define XEHP_CCS_MODE_CSLICE_MASK REG_GENMASK(2, 0) /* CCS0-3 + rsvd */ +#define XEHP_CCS_MODE_CSLICE_WIDTH ilog2(XEHP_CCS_MODE_CSLICE_MASK + 1) +#define XEHP_CCS_MODE_CSLICE(cslice, ccs) (ccs << (cslice * XEHP_CCS_MODE_CSLICE_WIDTH)) + #define CHV_FUSE_GT _MMIO(VLV_GUNIT_BASE + 0x2168) #define CHV_FGT_DISABLE_SS0 (1 << 10) #define CHV_FGT_DISABLE_SS1 (1 << 11) diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 4865eb5ca9c91..6ec3582c97357 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -10,6 +10,7 @@ #include "intel_engine_regs.h" #include "intel_gpu_commands.h" #include "intel_gt.h" +#include "intel_gt_ccs_mode.h" #include "intel_gt_mcr.h" #include "intel_gt_print.h" #include "intel_gt_regs.h" @@ -2869,6 +2870,12 @@ static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_li * made to completely disable automatic CCS load balancing. */ wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE); + + /* + * After having disabled automatic load balancing we need to + * assign all slices to a single CCS. We will call it CCS mode 1 + */ + intel_gt_apply_ccs_mode(gt); } /* -- GitLab From f7caddfd558e32db0ae944256e623a259538b357 Mon Sep 17 00:00:00 2001 From: Ankit Nautiyal Date: Tue, 5 Mar 2024 11:14:43 +0530 Subject: [PATCH 359/989] drm/i915/dp: Fix the computation for compressed_bpp for DISPLAY < 13 For DISPLAY < 13, compressed bpp is chosen from a list of supported compressed bpps. Fix the condition to choose the appropriate compressed bpp from the list. Fixes: 1c56e9a39833 ("drm/i915/dp: Get optimal link config to have best compressed bpp") Cc: Ankit Nautiyal Cc: Stanislav Lisovskiy Cc: Jani Nikula Cc: # v6.7+ Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10162 Signed-off-by: Ankit Nautiyal Reviewed-by: Suraj Kandpal Link: https://patchwork.freedesktop.org/patch/msgid/20240305054443.2489895-1-ankit.k.nautiyal@intel.com (cherry picked from commit 5a1da42b50f3594e18738885c2f23ed36629dd00) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 4016cf6b6c618..36afbb68d87d4 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -1917,8 +1917,9 @@ icl_dsc_compute_link_config(struct intel_dp *intel_dp, dsc_max_bpp = min(dsc_max_bpp, pipe_bpp - 1); for (i = 0; i < ARRAY_SIZE(valid_dsc_bpp); i++) { - if (valid_dsc_bpp[i] < dsc_min_bpp || - valid_dsc_bpp[i] > dsc_max_bpp) + if (valid_dsc_bpp[i] < dsc_min_bpp) + continue; + if (valid_dsc_bpp[i] > dsc_max_bpp) break; ret = dsc_compute_link_config(intel_dp, -- GitLab From 51bc63392e96ca45d7be98bc43c180b174ffca09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 2 Apr 2024 16:51:46 +0300 Subject: [PATCH 360/989] drm/i915/mst: Limit MST+DSC to TGL+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MST code currently assumes that glk+ already supports MST+DSC, which is incorrect. We need to check for TGL+ actually. ICL does support SST+DSC, but supposedly it can't do MST+FEC which will also rule out MST+DSC. Note that a straight TGL+ check doesn't work here because DSC support can get fused out, so we do need to also check 'has_dsc'. Cc: stable@vger.kernel.org Fixes: d51f25eb479a ("drm/i915: Add DSC support to MST path") Reviewed-by: Uma Shankar Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240402135148.23011-6-ville.syrjala@linux.intel.com (cherry picked from commit c9c92f286dbdf872390ef3e74dbe5f0641e46f55) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_display_device.h | 1 + drivers/gpu/drm/i915/display/intel_dp_mst.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_display_device.h b/drivers/gpu/drm/i915/display/intel_display_device.h index fe42688137863..9b1bce2624b9e 100644 --- a/drivers/gpu/drm/i915/display/intel_display_device.h +++ b/drivers/gpu/drm/i915/display/intel_display_device.h @@ -47,6 +47,7 @@ struct drm_printer; #define HAS_DPT(i915) (DISPLAY_VER(i915) >= 13) #define HAS_DSB(i915) (DISPLAY_INFO(i915)->has_dsb) #define HAS_DSC(__i915) (DISPLAY_RUNTIME_INFO(__i915)->has_dsc) +#define HAS_DSC_MST(__i915) (DISPLAY_VER(__i915) >= 12 && HAS_DSC(__i915)) #define HAS_FBC(i915) (DISPLAY_RUNTIME_INFO(i915)->fbc_mask != 0) #define HAS_FPGA_DBG_UNCLAIMED(i915) (DISPLAY_INFO(i915)->has_fpga_dbg) #define HAS_FW_BLC(i915) (DISPLAY_VER(i915) >= 3) diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c index 53aec023ce92f..b651c990af85f 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c @@ -1355,7 +1355,7 @@ intel_dp_mst_mode_valid_ctx(struct drm_connector *connector, return 0; } - if (DISPLAY_VER(dev_priv) >= 10 && + if (HAS_DSC_MST(dev_priv) && drm_dp_sink_supports_dsc(intel_connector->dp.dsc_dpcd)) { /* * TBD pass the connector BPC, -- GitLab From 99f855082f228cdcecd6ab768d3b8b505e0eb028 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 2 Apr 2024 16:51:47 +0300 Subject: [PATCH 361/989] drm/i915/mst: Reject FEC+MST on ICL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ICL supposedly doesn't support FEC on MST. Reject it. Cc: stable@vger.kernel.org Fixes: d51f25eb479a ("drm/i915: Add DSC support to MST path") Reviewed-by: Uma Shankar Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240402135148.23011-7-ville.syrjala@linux.intel.com (cherry picked from commit b648ce2a28ba83c4fa67c61fcc5983e15e9d4afb) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 36afbb68d87d4..abd62bebc46d0 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -1422,7 +1422,8 @@ static bool intel_dp_source_supports_fec(struct intel_dp *intel_dp, if (DISPLAY_VER(dev_priv) >= 12) return true; - if (DISPLAY_VER(dev_priv) == 11 && encoder->port != PORT_A) + if (DISPLAY_VER(dev_priv) == 11 && encoder->port != PORT_A && + !intel_crtc_has_type(pipe_config, INTEL_OUTPUT_DP_MST)) return true; return false; -- GitLab From 27fcec6c27caec05a512d2f1be7f855d8899cb8d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Mar 2024 22:25:45 -0400 Subject: [PATCH 362/989] bcachefs: Clear recovery_passes_required as they complete without errors Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 4 +--- fs/bcachefs/recovery_passes.c | 41 +++++++++++++++++++++++++++-------- fs/bcachefs/util.h | 10 +++++++++ 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e8b4340092936..88b794d5dbd06 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -838,9 +838,7 @@ use_clean: } if (!test_bit(BCH_FS_error, &c->flags) && - (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || - !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) { - memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); + !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) { memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); write_sb = true; } diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 0089d9456b100..b066c5155b742 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -59,18 +59,23 @@ static struct recovery_pass_fn recovery_pass_fns[] = { #undef x }; -u64 bch2_recovery_passes_to_stable(u64 v) -{ - static const u8 map[] = { +static const u8 passes_to_stable_map[] = { #define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, BCH_RECOVERY_PASSES() #undef x - }; +}; + +static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) +{ + return passes_to_stable_map[pass]; +} +u64 bch2_recovery_passes_to_stable(u64 v) +{ u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); + ret |= BIT_ULL(passes_to_stable_map[i]); return ret; } @@ -116,13 +121,13 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, enum bch_recovery_pass pass) { - __le64 s = cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(pass))); + enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - if (!(ext->recovery_passes_required[0] & s)) { - ext->recovery_passes_required[0] |= s; + if (!test_bit_le64(s, ext->recovery_passes_required)) { + __set_bit_le64(s, ext->recovery_passes_required); bch2_write_super(c); } mutex_unlock(&c->sb_lock); @@ -130,6 +135,21 @@ int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, return bch2_run_explicit_recovery_pass(c, pass); } +static void bch2_clear_recovery_pass_required(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (test_bit_le64(s, ext->recovery_passes_required)) { + __clear_bit_le64(s, ext->recovery_passes_required); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); +} + u64 bch2_fsck_recovery_passes(void) { u64 ret = 0; @@ -218,6 +238,9 @@ int bch2_run_recovery_passes(struct bch_fs *c) c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); + if (!test_bit(BCH_FS_error, &c->flags)) + bch2_clear_recovery_pass_required(c, c->curr_recovery_pass); + c->curr_recovery_pass++; } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 175aee3074c7d..4577c0789a3a7 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -797,4 +797,14 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr) addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); } +static inline void __clear_bit_le64(size_t bit, __le64 *addr) +{ + addr[bit / 64] &= !cpu_to_le64(BIT_ULL(bit % 64)); +} + +static inline bool test_bit_le64(size_t bit, __le64 *addr) +{ + return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; +} + #endif /* _BCACHEFS_UTIL_H */ -- GitLab From bdbf953b3c9036e284e28c9484dda5c81b2a45fa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Mar 2024 18:56:26 -0400 Subject: [PATCH 363/989] bcachefs: bch2_shoot_down_journal_keys() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 19 +++++++++++++++++++ fs/bcachefs/btree_journal_iter.h | 4 ++++ fs/bcachefs/recovery.c | 22 ++++++++++++---------- 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 1f588264575db..5cbcbfe85235b 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -567,3 +567,22 @@ int bch2_journal_keys_sort(struct bch_fs *c) bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); return 0; } + +void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, + unsigned level_min, unsigned level_max, + struct bpos start, struct bpos end) +{ + struct journal_keys *keys = &c->journal_keys; + size_t dst = 0; + + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) + if (!(i->btree_id == btree && + i->level >= level_min && + i->level <= level_max && + bpos_ge(i->k->k.p, start) && + bpos_le(i->k->k.p, end))) + keys->data[dst++] = *i; + keys->nr = keys->gap = dst; +} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 09cd53091a05b..af25046ebcaa7 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -66,4 +66,8 @@ void bch2_journal_entries_free(struct bch_fs *); int bch2_journal_keys_sort(struct bch_fs *); +void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, + unsigned, unsigned, + struct bpos, struct bpos); + #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 88b794d5dbd06..7cbf30e3ef73b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -47,7 +47,7 @@ static bool btree_id_is_alloc(enum btree_id id) } /* for -o reconstruct_alloc: */ -static void do_reconstruct_alloc(struct bch_fs *c) +static void bch2_reconstruct_alloc(struct bch_fs *c) { bch2_journal_log_msg(c, "dropping alloc info"); bch_info(c, "dropping and reconstructing all alloc info"); @@ -82,15 +82,17 @@ static void do_reconstruct_alloc(struct bch_fs *c) c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - struct journal_keys *keys = &c->journal_keys; - size_t src, dst; - - move_gap(keys, keys->nr); - for (src = 0, dst = 0; src < keys->nr; src++) - if (!btree_id_is_alloc(keys->data[src].btree_id)) - keys->data[dst++] = keys->data[src]; - keys->nr = keys->gap = dst; + bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_freespace, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); } /* @@ -731,7 +733,7 @@ use_clean: c->journal_replay_seq_end = blacklist_seq - 1; if (c->opts.reconstruct_alloc) - do_reconstruct_alloc(c); + bch2_reconstruct_alloc(c); zero_out_btree_mem_ptr(&c->journal_keys); -- GitLab From ca1e02f7e9a1352b3f7b04de821ae74c9e07df74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Mar 2024 19:26:33 -0400 Subject: [PATCH 364/989] bcachefs: Etyzinger cleanups Pull out eytzinger.c and kill eytzinger_cmp_fn. We now provide eytzinger0_sort and eytzinger0_sort_r, which use the standard cmp_func_t and cmp_r_func_t callbacks. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/eytzinger.c | 234 ++++++++++++++++++++++++++++ fs/bcachefs/eytzinger.h | 63 ++++---- fs/bcachefs/journal_seq_blacklist.c | 3 +- fs/bcachefs/replicas.c | 19 ++- fs/bcachefs/util.c | 143 ----------------- fs/bcachefs/util.h | 4 - 7 files changed, 285 insertions(+), 182 deletions(-) create mode 100644 fs/bcachefs/eytzinger.c diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index f6d86eb4b9765..0933493a322c4 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -37,6 +37,7 @@ bcachefs-y := \ error.o \ extents.o \ extent_update.o \ + eytzinger.o \ fs.o \ fs-common.o \ fs-ioctl.o \ diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c new file mode 100644 index 0000000000000..4ce5e957a6e91 --- /dev/null +++ b/fs/bcachefs/eytzinger.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "eytzinger.h" + +/** + * is_aligned - is this pointer & size okay for word-wide copying? + * @base: pointer to data + * @size: size of each element + * @align: required alignment (typically 4 or 8) + * + * Returns true if elements can be copied using word loads and stores. + * The size must be a multiple of the alignment, and the base address must + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + * + * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" + * to "if ((a | b) & mask)", so we do that by hand. + */ +__attribute_const__ __always_inline +static bool is_aligned(const void *base, size_t size, unsigned char align) +{ + unsigned char lsbits = (unsigned char)size; + + (void)base; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + lsbits |= (unsigned char)(uintptr_t)base; +#endif + return (lsbits & (align - 1)) == 0; +} + +/** + * swap_words_32 - swap two elements in 32-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 4) + * + * Exchange the two objects in memory. This exploits base+index addressing, + * which basically all CPUs have, to minimize loop overhead computations. + * + * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the + * bottom of the loop, even though the zero flag is still valid from the + * subtract (since the intervening mov instructions don't alter the flags). + * Gcc 8.1.0 doesn't have that problem. + */ +static void swap_words_32(void *a, void *b, size_t n) +{ + do { + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + } while (n); +} + +/** + * swap_words_64 - swap two elements in 64-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 8) + * + * Exchange the two objects in memory. This exploits base+index + * addressing, which basically all CPUs have, to minimize loop overhead + * computations. + * + * We'd like to use 64-bit loads if possible. If they're not, emulating + * one requires base+index+4 addressing which x86 has but most other + * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, + * but it's possible to have 64-bit loads without 64-bit pointers (e.g. + * x32 ABI). Are there any cases the kernel needs to worry about? + */ +static void swap_words_64(void *a, void *b, size_t n) +{ + do { +#ifdef CONFIG_64BIT + u64 t = *(u64 *)(a + (n -= 8)); + *(u64 *)(a + n) = *(u64 *)(b + n); + *(u64 *)(b + n) = t; +#else + /* Use two 32-bit transfers to avoid base+index+4 addressing */ + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + + t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; +#endif + } while (n); +} + +/** + * swap_bytes - swap two elements a byte at a time + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size + * + * This is the fallback if alignment doesn't allow using larger chunks. + */ +static void swap_bytes(void *a, void *b, size_t n) +{ + do { + char t = ((char *)a)[--n]; + ((char *)a)[n] = ((char *)b)[n]; + ((char *)b)[n] = t; + } while (n); +} + +/* + * The values are arbitrary as long as they can't be confused with + * a pointer, but small integers make for the smallest compare + * instructions. + */ +#define SWAP_WORDS_64 (swap_r_func_t)0 +#define SWAP_WORDS_32 (swap_r_func_t)1 +#define SWAP_BYTES (swap_r_func_t)2 +#define SWAP_WRAPPER (swap_r_func_t)3 + +struct wrapper { + cmp_func_t cmp; + swap_func_t swap; +}; + +/* + * The function pointer is last to make tail calls most efficient if the + * compiler decides not to inline this function. + */ +static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) +{ + if (swap_func == SWAP_WRAPPER) { + ((const struct wrapper *)priv)->swap(a, b, (int)size); + return; + } + + if (swap_func == SWAP_WORDS_64) + swap_words_64(a, b, size); + else if (swap_func == SWAP_WORDS_32) + swap_words_32(a, b, size); + else if (swap_func == SWAP_BYTES) + swap_bytes(a, b, size); + else + swap_func(a, b, (int)size, priv); +} + +#define _CMP_WRAPPER ((cmp_r_func_t)0L) + +static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) +{ + if (cmp == _CMP_WRAPPER) + return ((const struct wrapper *)priv)->cmp(a, b); + return cmp(a, b, priv); +} + +static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, const void *priv, + size_t l, size_t r) +{ + return do_cmp(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + cmp_func, priv); +} + +static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, + swap_r_func_t swap_func, const void *priv, + size_t l, size_t r) +{ + do_swap(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size, swap_func, priv); +} + +void eytzinger0_sort_r(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + int i, c, r; + + /* called from 'sort' without swap function, let's pick the default */ + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) + swap_func = NULL; + + if (!swap_func) { + if (is_aligned(base, size, 8)) + swap_func = SWAP_WORDS_64; + else if (is_aligned(base, size, 4)) + swap_func = SWAP_WORDS_32; + else + swap_func = SWAP_BYTES; + } + + /* heapify */ + for (i = n / 2 - 1; i >= 0; --i) { + for (r = i; r * 2 + 1 < n; r = c) { + c = r * 2 + 1; + + if (c + 1 < n && + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) + c++; + + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) + break; + + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + } + } + + /* sort */ + for (i = n - 1; i > 0; --i) { + eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); + + for (r = 0; r * 2 + 1 < i; r = c) { + c = r * 2 + 1; + + if (c + 1 < i && + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) + c++; + + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) + break; + + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + } + } +} + +void eytzinger0_sort(void *base, size_t n, size_t size, + cmp_func_t cmp_func, + swap_func_t swap_func) +{ + struct wrapper w = { + .cmp = cmp_func, + .swap = swap_func, + }; + + return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); +} diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index b04750dbf870b..ee0e2df33322d 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -5,23 +5,33 @@ #include #include -#include "util.h" +#ifdef EYTZINGER_DEBUG +#define EYTZINGER_BUG_ON(cond) BUG_ON(cond) +#else +#define EYTZINGER_BUG_ON(cond) +#endif /* * Traversal for trees in eytzinger layout - a full binary tree layed out in an - * array - */ - -/* - * One based indexing version: + * array. * - * With one based indexing each level of the tree starts at a power of two - - * good for cacheline alignment: + * Consider using an eytzinger tree any time you would otherwise be doing binary + * search over an array. Binary search is a worst case scenario for branch + * prediction and prefetching, but in an eytzinger tree every node's children + * are adjacent in memory, thus we can prefetch children before knowing the + * result of the comparison, assuming multiple nodes fit on a cacheline. + * + * Two variants are provided, for one based indexing and zero based indexing. + * + * Zero based indexing is more convenient, but one based indexing has better + * alignment and thus better performance because each new level of the tree + * starts at a power of two, and thus if element 0 was cacheline aligned, each + * new level will be as well. */ static inline unsigned eytzinger1_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + child; } @@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size) static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); @@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; @@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, unsigned shift = __fls(size) - b; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); i ^= 1U << b; i <<= 1; @@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, unsigned shift; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); /* * sign bit trick: @@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) static inline unsigned eytzinger0_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + 1 + child; } @@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) -typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); - /* return greatest node <= @search, or -1 if not found */ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, - eytzinger_cmp_fn cmp, const void *search) + cmp_func_t cmp, const void *search) { unsigned i, n = 0; @@ -244,21 +252,24 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, do { i = n; - n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); + n = eytzinger0_child(i, cmp(base + i * size, search) <= 0); } while (n < nr); if (n & 1) { /* @i was greater than @search, return previous node: */ - - if (i == eytzinger0_first(nr)) - return -1; - return eytzinger0_prev(i, nr); } else { return i; } } +static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) +{ + ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + return eytzinger0_next(idx, size); +} + #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ void *_base = (base); \ @@ -269,13 +280,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, int _res; \ \ while (_i < _nr && \ - (_res = _cmp(_search, _base + _i * _size, _size))) \ + (_res = _cmp(_search, _base + _i * _size))) \ _i = eytzinger0_child(_i, _res > 0); \ _i; \ }) -void eytzinger0_sort(void *, size_t, size_t, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); +void eytzinger0_sort_r(void *, size_t, size_t, + cmp_r_func_t, swap_r_func_t, const void *); +void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t); #endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index b5303874fc35b..37a024e034d49 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -95,8 +95,7 @@ out: return ret ?: bch2_blacklist_table_initialize(c); } -static int journal_seq_blacklist_table_cmp(const void *_l, - const void *_r, size_t size) +static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) { const struct journal_seq_blacklist_table_entry *l = _l; const struct journal_seq_blacklist_table_entry *r = _r; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index cc2672c120312..678b9c20e2514 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -6,12 +6,15 @@ #include "replicas.h" #include "super-io.h" +#include + static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); /* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ -static int bch2_memcmp(const void *l, const void *r, size_t size) +static int bch2_memcmp(const void *l, const void *r, const void *priv) { + size_t size = (size_t) priv; return memcmp(l, r, size); } @@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { - eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); + eytzinger0_sort_r(r->entries, r->nr, r->entry_size, + bch2_memcmp, NULL, (void *)(size_t)r->entry_size); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, @@ -228,7 +232,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, verify_replicas_entry(search); -#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) +#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) idx = eytzinger0_find(r->entries, r->nr, r->entry_size, entry_cmp, search); #undef entry_cmp @@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, { unsigned i; - sort_cmp_size(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - bch2_memcmp, NULL); + sort_r(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + bch2_memcmp, NULL, + (void *)(size_t)cpu_r->entry_size); for (i = 0; i < cpu_r->nr; i++) { struct bch_replicas_entry_v1 *e = diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 216fadf16928b..92c6ad75e702a 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -707,149 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -static int alignment_ok(const void *base, size_t align) -{ - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || - ((unsigned long)base & (align - 1)) == 0; -} - -static void u32_swap(void *a, void *b, size_t size) -{ - u32 t = *(u32 *)a; - *(u32 *)a = *(u32 *)b; - *(u32 *)b = t; -} - -static void u64_swap(void *a, void *b, size_t size) -{ - u64 t = *(u64 *)a; - *(u64 *)a = *(u64 *)b; - *(u64 *)b = t; -} - -static void generic_swap(void *a, void *b, size_t size) -{ - char t; - - do { - t = *(char *)a; - *(char *)a++ = *(char *)b; - *(char *)b++ = t; - } while (--size > 0); -} - -static inline int do_cmp(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - size_t l, size_t r) -{ - return cmp_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -static inline void do_swap(void *base, size_t n, size_t size, - void (*swap_func)(void *, void *, size_t), - size_t l, size_t r) -{ - swap_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -void eytzinger0_sort(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)) -{ - int i, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for (i = n / 2 - 1; i >= 0; --i) { - for (r = i; r * 2 + 1 < n; r = c) { - c = r * 2 + 1; - - if (c + 1 < n && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } - - /* sort */ - for (i = n - 1; i > 0; --i) { - do_swap(base, n, size, swap_func, 0, i); - - for (r = 0; r * 2 + 1 < i; r = c) { - c = r * 2 + 1; - - if (c + 1 < i && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } -} - -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t size)) -{ - /* pre-scale counters for performance */ - int i = (num/2 - 1) * size, n = num * size, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for ( ; i >= 0; i -= size) { - for (r = i; r * 2 + size < n; r = c) { - c = r * 2 + size; - if (c < n - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } - - /* sort */ - for (i = n - size; i > 0; i -= size) { - swap_func(base, base + i, size); - for (r = 0; r * 2 + size < i; r = c) { - c = r * 2 + size; - if (c < i - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } -} - #if 0 void eytzinger1_test(void) { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 4577c0789a3a7..b7e7c29278fc0 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -631,10 +631,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); - /* just the memmove, doesn't update @_nr */ #define __array_insert_item(_array, _nr, _pos) \ memmove(&(_array)[(_pos) + 1], \ -- GitLab From f2f61f4192de536fa36bebe49dc2c241213b53ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Mar 2024 22:17:40 -0400 Subject: [PATCH 365/989] bcachefs: bch2_btree_root_alloc() -> bch2_btree_root_alloc_fake() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 8 ++++---- fs/bcachefs/btree_update_interior.h | 2 +- fs/bcachefs/recovery.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2b46e6ad248ac..32397b99752fd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2466,7 +2466,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_inmem(c, b); } -static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) +static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level) { struct bch_fs *c = trans->c; struct closure cl; @@ -2485,7 +2485,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) set_btree_node_fake(b); set_btree_node_need_rewrite(b); - b->c.level = 0; + b->c.level = level; b->c.btree_id = id; bkey_btree_ptr_init(&b->key); @@ -2512,9 +2512,9 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) return 0; } -void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); + bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level)); } static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 1d41d3f4b107c..88dcf5a22a3bd 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -171,7 +171,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, struct bkey_i *, unsigned, bool); void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); -void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); +void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned); static inline unsigned btree_update_reserve_required(struct bch_fs *c, struct btree *b) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7cbf30e3ef73b..f234de0ac834d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -476,7 +476,7 @@ static int read_btree_roots(struct bch_fs *c) if (!r->b) { r->alive = false; r->level = 0; - bch2_btree_root_alloc(c, i); + bch2_btree_root_alloc_fake(c, i, 0); } } fsck_err: @@ -929,7 +929,7 @@ int bch2_fs_initialize(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); for (unsigned i = 0; i < BTREE_ID_NR; i++) - bch2_btree_root_alloc(c, i); + bch2_btree_root_alloc_fake(c, i, 0); for_each_member_device(c, ca) bch2_dev_usage_init(ca); -- GitLab From b268aa4e7fb8be3c50e25a09008fb2feed2cd345 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Mar 2024 16:18:41 -0400 Subject: [PATCH 366/989] bcachefs: Don't skip fake btree roots in fsck When a btree root is unreadable, we might still have keys fro the journal to walk and mark. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e5d2c6daa663d..9db9c8a5beaa0 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -931,9 +931,6 @@ static int bch2_gc_btree_init(struct btree_trans *trans, b = bch2_btree_id_root(c, btree_id)->b; - if (btree_node_fake(b)) - return 0; - six_lock_read(&b->c.lock, NULL, NULL); printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->min_key); -- GitLab From 4409b8081d1624af814a9cda781ad9cdda3973cb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Mar 2024 23:11:46 -0400 Subject: [PATCH 367/989] bcachefs: Repair pass for scanning for btree nodes If a btree root or interior btree node goes bad, we're going to lose a lot of data, unless we can recover the nodes that it pointed to by scanning. Fortunately btree node headers are fully self describing, and additionally the magic number is xored with the filesytem UUID, so we can do so safely. This implements the scanning - next patch will rework topology repair to make use of the found nodes. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs.h | 3 + fs/bcachefs/btree_node_scan.c | 495 ++++++++++++++++++++++++++++ fs/bcachefs/btree_node_scan.h | 11 + fs/bcachefs/btree_node_scan_types.h | 30 ++ fs/bcachefs/extents.c | 52 +-- fs/bcachefs/extents.h | 1 + fs/bcachefs/opts.h | 4 +- fs/bcachefs/recovery.c | 54 +-- fs/bcachefs/recovery_passes.c | 1 + fs/bcachefs/recovery_passes_types.h | 1 + fs/bcachefs/super.c | 3 + 12 files changed, 605 insertions(+), 51 deletions(-) create mode 100644 fs/bcachefs/btree_node_scan.c create mode 100644 fs/bcachefs/btree_node_scan.h create mode 100644 fs/bcachefs/btree_node_scan_types.h diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 0933493a322c4..66ca0bbee6394 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -17,6 +17,7 @@ bcachefs-y := \ btree_journal_iter.o \ btree_key_cache.o \ btree_locking.o \ + btree_node_scan.o \ btree_trans_commit.o \ btree_update.o \ btree_update_interior.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 963162a627cd6..93a61dbaa3d86 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -456,6 +456,7 @@ enum bch_time_stats { #include "alloc_types.h" #include "btree_types.h" +#include "btree_node_scan_types.h" #include "btree_write_buffer_types.h" #include "buckets_types.h" #include "buckets_waiting_for_journal_types.h" @@ -1103,6 +1104,8 @@ struct bch_fs { struct journal_keys journal_keys; struct list_head journal_iters; + struct find_btree_nodes found_btree_nodes; + u64 last_bucket_seq_cleanup; u64 counters_on_mount[BCH_COUNTER_NR]; diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c new file mode 100644 index 0000000000000..3f33be7e5e5c2 --- /dev/null +++ b/fs/bcachefs/btree_node_scan.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_journal_iter.h" +#include "btree_node_scan.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "error.h" +#include "journal_io.h" +#include "recovery_passes.h" + +#include +#include + +struct find_btree_nodes_worker { + struct closure *cl; + struct find_btree_nodes *f; + struct bch_dev *ca; +}; + +static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) +{ + prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie); + bch2_bpos_to_text(out, n->min_key); + prt_str(out, "-"); + bch2_bpos_to_text(out, n->max_key); + + if (n->range_updated) + prt_str(out, " range updated"); + if (n->overwritten) + prt_str(out, " overwritten"); + + for (unsigned i = 0; i < n->nr_ptrs; i++) { + prt_char(out, ' '); + bch2_extent_ptr_to_text(out, c, n->ptrs + i); + } +} + +static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) +{ + printbuf_indent_add(out, 2); + darray_for_each(nodes, i) { + found_btree_node_to_text(out, c, i); + prt_newline(out); + } + printbuf_indent_sub(out, 2); +} + +static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) +{ + struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k); + + set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); + bp->k.p = f->max_key; + bp->v.seq = cpu_to_le64(f->cookie); + bp->v.sectors_written = 0; + bp->v.flags = 0; + bp->v.min_key = f->min_key; + SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated); + memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); +} + +static bool found_btree_node_is_readable(struct btree_trans *trans, + const struct found_btree_node *f) +{ + struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k; + + found_btree_node_to_key(&k.k, f); + + struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false); + bool ret = !IS_ERR_OR_NULL(b); + if (ret) + six_unlock_read(&b->c.lock); + + /* + * We might update this node's range; if that happens, we need the node + * to be re-read so the read path can trim keys that are no longer in + * this node + */ + if (b != btree_node_root(trans->c, b)) + bch2_btree_node_evict(trans, &k.k); + return ret; +} + +static int found_btree_node_cmp_cookie(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: + cmp_int(l->cookie, r->cookie); +} + +/* + * Given two found btree nodes, if their sequence numbers are equal, take the + * one that's readable: + */ +static int found_btree_node_cmp_time(const struct found_btree_node *l, + const struct found_btree_node *r) +{ + return cmp_int(l->seq, r->seq); +} + +static int found_btree_node_cmp_pos(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->min_key, r->min_key) ?: + -found_btree_node_cmp_time(l, r); +} + +static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, + struct bio *bio, struct btree_node *bn, u64 offset) +{ + struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); + + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); + bio->bi_iter.bi_sector = offset; + bch2_bio_map(bio, bn, PAGE_SIZE); + + submit_bio_wait(bio); + if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + "IO error in try_read_btree_node() at %llu: %s", + offset, bch2_blk_status_to_str(bio->bi_status))) + return; + + if (le64_to_cpu(bn->magic) != bset_magic(c)) + return; + + rcu_read_lock(); + struct found_btree_node n = { + .btree_id = BTREE_NODE_ID(bn), + .level = BTREE_NODE_LEVEL(bn), + .seq = BTREE_NODE_SEQ(bn), + .cookie = le64_to_cpu(bn->keys.seq), + .min_key = bn->min_key, + .max_key = bn->max_key, + .nr_ptrs = 1, + .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, + .ptrs[0].offset = offset, + .ptrs[0].dev = ca->dev_idx, + .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)), + }; + rcu_read_unlock(); + + if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { + mutex_lock(&f->lock); + if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { + bch_err(c, "try_read_btree_node() can't handle endian conversion"); + f->ret = -EINVAL; + goto unlock; + } + + if (darray_push(&f->nodes, n)) + f->ret = -ENOMEM; +unlock: + mutex_unlock(&f->lock); + } +} + +static int read_btree_nodes_worker(void *p) +{ + struct find_btree_nodes_worker *w = p; + struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); + struct bch_dev *ca = w->ca; + void *buf = (void *) __get_free_page(GFP_KERNEL); + struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); + unsigned long last_print = jiffies; + + if (!buf || !bio) { + bch_err(c, "read_btree_nodes_worker: error allocating bio/buf"); + w->f->ret = -ENOMEM; + goto err; + } + + for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) + for (unsigned bucket_offset = 0; + bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; + bucket_offset += btree_sectors(c)) { + if (time_after(jiffies, last_print + HZ * 30)) { + u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; + u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; + + bch_info(ca, "%s: %2u%% done", __func__, + (unsigned) div64_u64(cur_sector * 100, end_sector)); + last_print = jiffies; + } + + try_read_btree_node(w->f, ca, bio, buf, + bucket * ca->mi.bucket_size + bucket_offset); + } +err: + bio_put(bio); + free_page((unsigned long) buf); + percpu_ref_get(&ca->io_ref); + closure_put(w->cl); + kfree(w); + return 0; +} + +static int read_btree_nodes(struct find_btree_nodes *f) +{ + struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); + struct closure cl; + int ret = 0; + + closure_init_stack(&cl); + + for_each_online_member(c, ca) { + struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); + struct task_struct *t; + + if (!w) { + percpu_ref_put(&ca->io_ref); + ret = -ENOMEM; + goto err; + } + + percpu_ref_get(&ca->io_ref); + closure_get(&cl); + w->cl = &cl; + w->f = f; + w->ca = ca; + + t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); + ret = IS_ERR_OR_NULL(t); + if (ret) { + percpu_ref_put(&ca->io_ref); + closure_put(&cl); + f->ret = ret; + bch_err(c, "error starting kthread: %i", ret); + break; + } + } +err: + closure_sync(&cl); + return f->ret ?: ret; +} + +static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) +{ + while (n + 1 < end && + found_btree_node_cmp_pos(n, n + 1) > 0) { + swap(n[0], n[1]); + n++; + } +} + +static int handle_overwrites(struct bch_fs *c, + struct found_btree_node *start, + struct found_btree_node *end) +{ + struct found_btree_node *n; +again: + for (n = start + 1; + n < end && + n->btree_id == start->btree_id && + n->level == start->level && + bpos_lt(n->min_key, start->max_key); + n++) { + int cmp = found_btree_node_cmp_time(start, n); + + if (cmp > 0) { + if (bpos_cmp(start->max_key, n->max_key) >= 0) + n->overwritten = true; + else { + n->range_updated = true; + n->min_key = bpos_successor(start->max_key); + n->range_updated = true; + bubble_up(n, end); + goto again; + } + } else if (cmp < 0) { + BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); + + start->max_key = bpos_predecessor(n->min_key); + start->range_updated = true; + } else { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "overlapping btree nodes with same seq! halting\n "); + found_btree_node_to_text(&buf, c, start); + prt_str(&buf, "\n "); + found_btree_node_to_text(&buf, c, n); + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + return -1; + } + } + + return 0; +} + +int bch2_scan_for_btree_nodes(struct bch_fs *c) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + struct printbuf buf = PRINTBUF; + size_t dst; + int ret = 0; + + if (f->nodes.nr) + return 0; + + mutex_init(&f->lock); + + ret = read_btree_nodes(f); + if (ret) + return ret; + + if (!f->nodes.nr) { + bch_err(c, "%s: no btree nodes found", __func__); + ret = -EINVAL; + goto err; + } + + if (0 && c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes found:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); + + dst = 0; + darray_for_each(f->nodes, i) { + struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL; + + if (prev && + prev->cookie == i->cookie) { + if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { + bch_err(c, "%s: found too many replicas for btree node", __func__); + ret = -EINVAL; + goto err; + } + prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; + } else { + f->nodes.data[dst++] = *i; + } + } + f->nodes.nr = dst; + + sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); + + if (0 && c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + dst = 0; + darray_for_each(f->nodes, i) { + if (i->overwritten) + continue; + + ret = handle_overwrites(c, i, &darray_top(f->nodes)); + if (ret) + goto err; + + BUG_ON(i->overwritten); + f->nodes.data[dst++] = *i; + } + f->nodes.nr = dst; + + if (c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); +err: + printbuf_exit(&buf); + return ret; +} + +static int found_btree_node_range_start_cmp(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->max_key, r->min_key); +} + +#define for_each_found_btree_node_in_range(_f, _search, _idx) \ + for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ + sizeof((_f)->nodes.data[0]), \ + found_btree_node_range_start_cmp, &search); \ + _idx < (_f)->nodes.nr && \ + (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ + (_f)->nodes.data[_idx].level == _search.level && \ + bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ + _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) + +bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + + struct found_btree_node search = { + .btree_id = b->c.btree_id, + .level = b->c.level, + .min_key = b->data->min_key, + .max_key = b->key.k.p, + }; + + for_each_found_btree_node_in_range(f, search, idx) + if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data)) + return true; + return false; +} + +bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) +{ + struct found_btree_node search = { + .btree_id = btree, + .level = 0, + .min_key = POS_MIN, + .max_key = SPOS_MAX, + }; + + for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) + return true; + return false; +} + +int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos node_min, struct bpos node_max) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + + int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + return ret; + + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level); + bch2_bpos_to_text(&buf, node_min); + prt_str(&buf, " - "); + bch2_bpos_to_text(&buf, node_max); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + + struct found_btree_node search = { + .btree_id = btree, + .level = level, + .min_key = node_min, + .max_key = node_max, + }; + + for_each_found_btree_node_in_range(f, search, idx) { + struct found_btree_node n = f->nodes.data[idx]; + + n.range_updated |= bpos_lt(n.min_key, node_min); + n.min_key = bpos_max(n.min_key, node_min); + + n.range_updated |= bpos_gt(n.max_key, node_max); + n.max_key = bpos_min(n.max_key, node_max); + + struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; + + found_btree_node_to_key(&tmp.k, &n); + + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); + bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); + printbuf_exit(&buf); + + BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL)); + + ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); + if (ret) + return ret; + } + + return 0; +} + +void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) +{ + darray_exit(&f->nodes); +} diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h new file mode 100644 index 0000000000000..08687b209787b --- /dev/null +++ b/fs/bcachefs/btree_node_scan.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_NODE_SCAN_H +#define _BCACHEFS_BTREE_NODE_SCAN_H + +int bch2_scan_for_btree_nodes(struct bch_fs *); +bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *); +bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); +int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos); +void bch2_find_btree_nodes_exit(struct find_btree_nodes *); + +#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */ diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h new file mode 100644 index 0000000000000..abb7b27d556a9 --- /dev/null +++ b/fs/bcachefs/btree_node_scan_types.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H +#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H + +#include "darray.h" + +struct found_btree_node { + bool range_updated:1; + bool overwritten:1; + u8 btree_id; + u8 level; + u32 seq; + u64 cookie; + + struct bpos min_key; + struct bpos max_key; + + unsigned nr_ptrs; + struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; +}; + +typedef DARRAY(struct found_btree_node) found_btree_nodes; + +struct find_btree_nodes { + int ret; + struct mutex lock; + found_btree_nodes nodes; +}; + +#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index b2432d88cda64..0e3ca99fbd2de 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -978,6 +978,31 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) return bkey_deleted(k.k); } +void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) +{ + struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + + if (!ca) { + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : ""); + } else { + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, "ptr: %u:%llu:%u gen %u", + ptr->dev, b, offset, ptr->gen); + if (ptr->cached) + prt_str(out, " cached"); + if (ptr->unwritten) + prt_str(out, " unwritten"); + if (ca && ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } +} + void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -993,31 +1018,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, " "); switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: { - const struct bch_extent_ptr *ptr = entry_to_ptr(entry); - struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; - - if (!ca) { - prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : ""); - } else { - u32 offset; - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - - prt_printf(out, "ptr: %u:%llu:%u gen %u", - ptr->dev, b, offset, ptr->gen); - if (ptr->cached) - prt_str(out, " cached"); - if (ptr->unwritten) - prt_str(out, " unwritten"); - if (ca && ptr_stale(ca, ptr)) - prt_printf(out, " stale"); - } + case BCH_EXTENT_ENTRY_ptr: + bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry)); break; - } + case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 3fd0169b98c18..528e817eacbda 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -676,6 +676,7 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 084247c13bd8f..1ac4135cca1c3 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -368,11 +368,11 @@ enum fsck_err_opts { OPT_STR_NOLIMIT(bch2_recovery_passes), \ BCH2_NO_SB_OPT, 0, \ NULL, "Exit recovery after specified pass") \ - x(keep_journal, u8, \ + x(retain_recovery_info, u8, \ 0, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ - NULL, "Don't free journal entries/keys after startup")\ + NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\ x(read_entire_journal, u8, \ 0, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f234de0ac834d..24671020f22b1 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -4,6 +4,7 @@ #include "alloc_background.h" #include "bkey_buf.h" #include "btree_journal_iter.h" +#include "btree_node_scan.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -271,7 +272,7 @@ int bch2_journal_replay(struct bch_fs *c) bch2_trans_put(trans); trans = NULL; - if (!c->opts.keep_journal && + if (!c->opts.retain_recovery_info && c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) bch2_journal_keys_put_initial(c); @@ -435,10 +436,9 @@ static int journal_replay_early(struct bch_fs *c, static int read_btree_roots(struct bch_fs *c) { - unsigned i; int ret = 0; - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (!r->alive) @@ -447,33 +447,36 @@ static int read_btree_roots(struct bch_fs *c) if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) continue; - if (r->error) { - __fsck_err(c, - btree_id_is_alloc(i) - ? FSCK_CAN_IGNORE : 0, - btree_root_bkey_invalid, - "invalid btree root %s", - bch2_btree_id_str(i)); - if (i == BTREE_ID_alloc) + if (mustfix_fsck_err_on((ret = r->error), + c, btree_root_bkey_invalid, + "invalid btree root %s", + bch2_btree_id_str(i)) || + mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), + c, btree_root_read_error, + "error reading btree root %s l=%u: %s", + bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { + if (btree_id_is_alloc(i)) { + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - } + r->error = 0; + } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { + bch_info(c, "will run btree node scan"); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + } - ret = bch2_btree_root_read(c, i, &r->key, r->level); - if (ret) { - fsck_err(c, - btree_root_read_error, - "error reading btree root %s", - bch2_btree_id_str(i)); - if (btree_id_is_alloc(i)) - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ret = 0; } } - for (i = 0; i < BTREE_ID_NR; i++) { + for (unsigned i = 0; i < BTREE_ID_NR; i++) { struct btree_root *r = bch2_btree_id_root(c, i); - if (!r->b) { + if (!r->b && !r->error) { r->alive = false; r->level = 0; bch2_btree_root_alloc_fake(c, i, 0); @@ -653,7 +656,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { + if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) { struct genradix_iter iter; struct journal_replay **i; @@ -883,9 +886,10 @@ use_clean: out: bch2_flush_fsck_errs(c); - if (!c->opts.keep_journal && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + if (!c->opts.retain_recovery_info) { bch2_journal_keys_put_initial(c); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); + } kfree(clean); if (!ret && diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index b066c5155b742..cb501460d6152 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -4,6 +4,7 @@ #include "alloc_background.h" #include "backpointers.h" #include "btree_gc.h" +#include "btree_node_scan.h" #include "ec.h" #include "fsck.h" #include "inode.h" diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index f305212857068..840542cfd65b6 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -13,6 +13,7 @@ * must never change: */ #define BCH_RECOVERY_PASSES() \ + x(scan_for_btree_nodes, 37, 0) \ x(check_topology, 4, 0) \ x(alloc_read, 0, PASS_ALWAYS) \ x(stripes_read, 1, PASS_ALWAYS) \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index bc026a77eb99d..ed63018f21bef 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -15,6 +15,7 @@ #include "btree_gc.h" #include "btree_journal_iter.h" #include "btree_key_cache.h" +#include "btree_node_scan.h" #include "btree_update_interior.h" #include "btree_io.h" #include "btree_write_buffer.h" @@ -536,6 +537,7 @@ static void __bch2_fs_free(struct bch_fs *c) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); @@ -560,6 +562,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); bch2_journal_keys_put_initial(c); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); -- GitLab From 43f5ea4646b2271a9a5af3729dfdf644d69b3282 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Mar 2024 22:45:30 -0400 Subject: [PATCH 368/989] bcachefs: Topology repair now uses nodes found by scanning to fill holes With the new btree node scan code, we can now recover from corrupt btree roots - simply create a new fake root at depth 1, and then insert all the leaves we found. If the root wasn't corrupt but there's corruption elsewhere in the btree, we can fill in holes as needed with the newest version of a given node(s) from the scan; we also check if a given btree node is older than what we found from the scan. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 303 ++++++++++++++++++++++------------ fs/bcachefs/sb-errors_types.h | 3 +- 2 files changed, 199 insertions(+), 107 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 9db9c8a5beaa0..6280da1244b55 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -13,6 +13,7 @@ #include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" +#include "btree_node_scan.h" #include "btree_update_interior.h" #include "btree_io.h" #include "btree_gc.h" @@ -41,6 +42,7 @@ #define DROP_THIS_NODE 10 #define DROP_PREV_NODE 11 +#define DID_FILL_FROM_SCAN 12 static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) { @@ -129,6 +131,17 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) struct bkey_i_btree_ptr_v2 *new; int ret; + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, " -> "); + bch2_bpos_to_text(&buf, new_min); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) return -BCH_ERR_ENOMEM_gc_repair_key; @@ -154,6 +167,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) struct bkey_i_btree_ptr_v2 *new; int ret; + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, " -> "); + bch2_bpos_to_text(&buf, new_max); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); if (ret) return ret; @@ -185,127 +209,138 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) return 0; } -static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, - struct btree *prev, struct btree *cur) +static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b, + struct btree *prev, struct btree *cur, + struct bpos *pulled_from_scan) { struct bpos expected_start = !prev ? b->data->min_key : bpos_successor(prev->key.k.p); - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; int ret = 0; - if (!prev) { - prt_printf(&buf1, "start of node: "); - bch2_bpos_to_text(&buf1, b->data->min_key); - } else { - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); + + if (bpos_eq(expected_start, cur->data->min_key)) + return 0; + + prt_printf(&buf, " at btree %s level %u:\n parent: ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + if (prev) { + prt_printf(&buf, "\n prev: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key)); } - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); - - if (prev && - bpos_gt(expected_start, cur->data->min_key) && - BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { - /* cur overwrites prev: */ - - if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, - cur->data->min_key), c, - btree_node_topology_overwritten_by_next_node, - "btree node overwritten by next node at btree %s level %u:\n" - " node %s\n" - " next %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = DROP_PREV_NODE; - goto out; - } + prt_str(&buf, "\n next: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key)); - if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, - bpos_predecessor(cur->data->min_key)), c, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " node %s\n" - " next %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) - ret = set_node_max(c, prev, - bpos_predecessor(cur->data->min_key)); - } else { - /* prev overwrites cur: */ - - if (mustfix_fsck_err_on(bpos_ge(expected_start, - cur->data->max_key), c, - btree_node_topology_overwritten_by_prev_node, - "btree node overwritten by prev node at btree %s level %u:\n" - " prev %s\n" - " node %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = DROP_THIS_NODE; - goto out; - } + if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */ + if (b->c.level == 1 && + bpos_lt(*pulled_from_scan, cur->data->min_key)) { + ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, + expected_start, + bpos_predecessor(cur->data->min_key)); + if (ret) + goto err; - if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, - btree_node_topology_bad_min_key, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " node %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) - ret = set_node_min(c, cur, expected_start); + *pulled_from_scan = cur->data->min_key; + ret = DID_FILL_FROM_SCAN; + } else { + if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, + "btree node with incorrect min_key%s", buf.buf)) + ret = set_node_min(c, cur, expected_start); + } + } else { /* overlap */ + if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */ + if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */ + if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node, + "btree node overwritten by next node%s", buf.buf)) + ret = DROP_PREV_NODE; + } else { + if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, + "btree node with incorrect max_key%s", buf.buf)) + ret = set_node_max(c, prev, + bpos_predecessor(cur->data->min_key)); + } + } else { + if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */ + if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node, + "btree node overwritten by prev node%s", buf.buf)) + ret = DROP_THIS_NODE; + } else { + if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, + "btree node with incorrect min_key%s", buf.buf)) + ret = set_node_min(c, cur, expected_start); + } + } } -out: +err: fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); + printbuf_exit(&buf); return ret; } static int btree_repair_node_end(struct bch_fs *c, struct btree *b, - struct btree *child) + struct btree *child, struct bpos *pulled_from_scan) { - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; int ret = 0; - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); - bch2_bpos_to_text(&buf2, b->key.k.p); + if (bpos_eq(child->key.k.p, b->key.k.p)) + return 0; - if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " %s\n" - " expected %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = set_node_max(c, child, b->key.k.p); - if (ret) - goto err; + prt_printf(&buf, "at btree %s level %u:\n parent: ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + prt_str(&buf, "\n child: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key)); + + if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, + "btree node with incorrect max_key%s", buf.buf)) { + if (b->c.level == 1 && + bpos_lt(*pulled_from_scan, b->key.k.p)) { + ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, + bpos_successor(child->key.k.p), b->key.k.p); + if (ret) + goto err; + + *pulled_from_scan = b->key.k.p; + ret = DID_FILL_FROM_SCAN; + } else { + ret = set_node_max(c, child, b->key.k.p); + } } err: fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); + printbuf_exit(&buf); return ret; } -static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b) +static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b, + struct bpos *pulled_from_scan) { struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf prev_k, cur_k; struct btree *prev = NULL, *cur = NULL; - bool have_child, dropped_children = false; + bool have_child, new_pass = false; struct printbuf buf = PRINTBUF; int ret = 0; if (!b->c.level) return 0; -again: - prev = NULL; - have_child = dropped_children = false; + bch2_bkey_buf_init(&prev_k); bch2_bkey_buf_init(&cur_k); +again: + cur = prev = NULL; + have_child = new_pass = false; bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); iter.prefetch = true; @@ -332,9 +367,10 @@ again: b->c.level - 1, buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); cur = NULL; + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: + bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); if (ret) break; continue; @@ -344,7 +380,23 @@ again: if (ret) break; - ret = btree_repair_node_boundaries(c, b, prev, cur); + if (bch2_btree_node_is_stale(c, cur)) { + bch_info(c, "btree node %s older than nodes found by scanning", buf.buf); + six_unlock_read(&cur->c.lock); + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + cur = NULL; + if (ret) + break; + continue; + } + + ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan); + if (ret == DID_FILL_FROM_SCAN) { + new_pass = true; + ret = 0; + } if (ret == DROP_THIS_NODE) { six_unlock_read(&cur->c.lock); @@ -370,8 +422,6 @@ again: break; bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&prev_k, c); - bch2_bkey_buf_exit(&cur_k, c); goto again; } else if (ret) break; @@ -383,7 +433,11 @@ again: if (!ret && !IS_ERR_OR_NULL(prev)) { BUG_ON(cur); - ret = btree_repair_node_end(c, b, prev); + ret = btree_repair_node_end(c, b, prev, pulled_from_scan); + if (ret == DID_FILL_FROM_SCAN) { + new_pass = true; + ret = 0; + } } if (!IS_ERR_OR_NULL(prev)) @@ -397,6 +451,10 @@ again: goto err; bch2_btree_and_journal_iter_exit(&iter); + + if (new_pass) + goto again; + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); iter.prefetch = true; @@ -413,7 +471,7 @@ again: if (ret) goto err; - ret = bch2_btree_repair_topology_recurse(trans, cur); + ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan); six_unlock_read(&cur->c.lock); cur = NULL; @@ -421,7 +479,7 @@ again: bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); - dropped_children = true; + new_pass = true; } if (ret) @@ -448,12 +506,14 @@ fsck_err: six_unlock_read(&cur->c.lock); bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&prev_k, c); - bch2_bkey_buf_exit(&cur_k, c); - if (!ret && dropped_children) + if (!ret && new_pass) goto again; + BUG_ON(!ret && bch2_btree_node_check_topology(trans, b)); + + bch2_bkey_buf_exit(&prev_k, c); + bch2_bkey_buf_exit(&cur_k, c); printbuf_exit(&buf); return ret; } @@ -461,32 +521,63 @@ fsck_err: int bch2_check_topology(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree *b; - unsigned i; + struct bpos pulled_from_scan = POS_MIN; int ret = 0; - for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { struct btree_root *r = bch2_btree_id_root(c, i); + bool reconstructed_root = false; - if (!r->alive) - continue; + if (r->error) { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + break; +reconstruct_root: + bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i)); - b = r->b; - if (btree_node_fake(b)) - continue; + r->alive = false; + r->error = 0; + + if (!bch2_btree_has_scanned_nodes(c, i)) { + mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing, + "no nodes found for btree %s, continue?", bch2_btree_id_str(i)); + bch2_btree_root_alloc_fake(c, i, 0); + } else { + bch2_btree_root_alloc_fake(c, i, 1); + ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); + if (ret) + break; + } + + bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + reconstructed_root = true; + } + + struct btree *b = r->b; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - ret = bch2_btree_repair_topology_recurse(trans, b); + ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan); six_unlock_read(&b->c.lock); if (ret == DROP_THIS_NODE) { - bch_err(c, "empty btree root - repair unimplemented"); - ret = -BCH_ERR_fsck_repair_unimplemented; + bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + r->b = NULL; + + if (!reconstructed_root) + goto reconstruct_root; + + bch_err(c, "empty btree root %s", bch2_btree_id_str(i)); + bch2_btree_root_alloc_fake(c, i, 0); + r->alive = false; + ret = 0; } } - +fsck_err: bch2_trans_put(trans); - return ret; } diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 73e9634df8ffb..2fec03a24c95b 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -267,7 +267,8 @@ x(subvol_unreachable, 259) \ x(btree_node_bkey_bad_u64s, 260) \ x(btree_node_topology_empty_interior_node, 261) \ - x(btree_ptr_v2_min_key_bad, 262) + x(btree_ptr_v2_min_key_bad, 262) \ + x(btree_root_unreadable_and_scan_found_nothing, 263) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, -- GitLab From 55936afe11077a84d9e1c5068169af328bbf2811 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Mar 2024 23:03:42 -0400 Subject: [PATCH 369/989] bcachefs: Flag btrees with missing data We need this to know when we should attempt to reconstruct the snapshots btree Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/btree_io.c | 13 +++++++++---- fs/bcachefs/recovery.c | 23 +++++++++++++++++++++++ fs/bcachefs/recovery.h | 2 ++ fs/bcachefs/super-io.c | 9 ++++++++- 6 files changed, 44 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 93a61dbaa3d86..d1a0e54785032 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -797,6 +797,7 @@ struct bch_fs { u64 features; u64 compat; unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)]; + u64 btrees_lost_data; } sb; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index bff8750ac0d74..63102992d9556 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -818,6 +818,7 @@ struct bch_sb_field_ext { struct bch_sb_field field; __le64 recovery_passes_required[2]; __le64 errors_silent[8]; + __le64 btrees_lost_data; }; struct bch_sb_field_downgrade_entry { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index f3f27bb85a5ba..d7de82ac38935 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1264,10 +1264,12 @@ out: return retry_read; fsck_err: if (ret == -BCH_ERR_btree_node_read_err_want_retry || - ret == -BCH_ERR_btree_node_read_err_must_retry) + ret == -BCH_ERR_btree_node_read_err_must_retry) { retry_read = 1; - else + } else { set_btree_node_read_error(b); + bch2_btree_lost_data(c, b->c.btree_id); + } goto out; } @@ -1328,6 +1330,7 @@ start: if (!can_retry) { set_btree_node_read_error(b); + bch2_btree_lost_data(c, b->c.btree_id); break; } } @@ -1527,9 +1530,10 @@ fsck_err: ret = -1; } - if (ret) + if (ret) { set_btree_node_read_error(b); - else if (*saw_error) + bch2_btree_lost_data(c, b->c.btree_id); + } else if (*saw_error) bch2_btree_node_rewrite_async(c, b); for (i = 0; i < ra->nr; i++) { @@ -1665,6 +1669,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bch2_fatal_error(c); set_btree_node_read_error(b); + bch2_btree_lost_data(c, b->c.btree_id); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); printbuf_exit(&buf); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 24671020f22b1..b3c67ae3d3b2e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -33,6 +33,20 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } +void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) +{ + u64 b = BIT_ULL(btree); + + if (!(c->sb.btrees_lost_data & b)) { + bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree)); + + mutex_lock(&c->sb_lock); + bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } +} + static bool btree_id_is_alloc(enum btree_id id) { switch (id) { @@ -470,6 +484,7 @@ static int read_btree_roots(struct bch_fs *c) } ret = 0; + bch2_btree_lost_data(c, i); } } @@ -848,6 +863,14 @@ use_clean: write_sb = true; } + if (c->opts.fsck && + !test_bit(BCH_FS_error, &c->flags) && + c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && + ext->btrees_lost_data) { + ext->btrees_lost_data = 0; + write_sb = true; + } + if (c->opts.fsck && !test_bit(BCH_FS_error, &c->flags) && !test_bit(BCH_FS_errors_not_fixed, &c->flags)) { diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 3962fd87b50d8..4bf818de1f2fe 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H +void bch2_btree_lost_data(struct bch_fs *, enum btree_id); + int bch2_journal_replay(struct bch_fs *); int bch2_fs_recovery(struct bch_fs *); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e15f8b1f30c2a..e0aa3655b63b4 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -527,9 +527,11 @@ static void bch2_sb_update(struct bch_fs *c) memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); - if (ext) + if (ext) { le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, sizeof(c->sb.errors_silent) * 8); + c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); + } for_each_member_device(c, ca) { struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); @@ -1162,6 +1164,11 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, kfree(errors_silent); } + + prt_printf(out, "Btrees with missing data:"); + prt_tab(out); + prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data)); + prt_newline(out); } static const struct bch_sb_field_ops bch_sb_field_ops_ext = { -- GitLab From a292be3b68f3fdad6cff50c716174f51b119efd1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Mar 2024 22:50:19 -0400 Subject: [PATCH 370/989] bcachefs: Reconstruct missing snapshot nodes When the snapshots btree is going, we'll have to delete huge amounts of data - unless we can reconstruct it by looking at the keys that refer to it. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/recovery.c | 1 + fs/bcachefs/recovery_passes_types.h | 1 + fs/bcachefs/sb-errors_types.h | 3 +- fs/bcachefs/snapshot.c | 173 +++++++++++++++++++++++++++- fs/bcachefs/snapshot.h | 26 ++++- 6 files changed, 199 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d1a0e54785032..a31a5f706929e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -615,6 +615,7 @@ struct bch_dev { */ #define BCH_FS_FLAGS() \ + x(new_fs) \ x(started) \ x(may_go_rw) \ x(rw) \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b3c67ae3d3b2e..b76c16152579c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -938,6 +938,7 @@ int bch2_fs_initialize(struct bch_fs *c) int ret; bch_notice(c, "initializing new filesystem"); + set_bit(BCH_FS_new_fs, &c->flags); mutex_lock(&c->sb_lock); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 840542cfd65b6..773aea9a0080f 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -32,6 +32,7 @@ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ x(bucket_gens_init, 17, 0) \ + x(reconstruct_snapshots, 38, 0) \ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 2fec03a24c95b..2f8f4d2388b04 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -268,7 +268,8 @@ x(btree_node_bkey_bad_u64s, 260) \ x(btree_node_topology_empty_interior_node, 261) \ x(btree_ptr_v2_min_key_bad, 262) \ - x(btree_root_unreadable_and_scan_found_nothing, 263) + x(btree_root_unreadable_and_scan_found_nothing, 263) \ + x(snapshot_node_missing, 264) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 4577ee7939a2a..0e806f04f3d7c 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -8,6 +8,7 @@ #include "errcode.h" #include "error.h" #include "fs.h" +#include "recovery_passes.h" #include "snapshot.h" #include @@ -574,6 +575,13 @@ static int check_snapshot_tree(struct btree_trans *trans, u32 subvol_id; ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); + bch_err_fn(c, ret); + + if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */ + ret = 0; + goto err; + } + if (ret) goto err; @@ -731,7 +739,6 @@ static int check_snapshot(struct btree_trans *trans, u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); u32 real_depth; struct printbuf buf = PRINTBUF; - bool should_have_subvol; u32 i, id; int ret = 0; @@ -777,7 +784,7 @@ static int check_snapshot(struct btree_trans *trans, } } - should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && + bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && !BCH_SNAPSHOT_DELETED(&s); if (should_have_subvol) { @@ -879,6 +886,154 @@ int bch2_check_snapshots(struct bch_fs *c) return ret; } +static int check_snapshot_exists(struct btree_trans *trans, u32 id) +{ + struct bch_fs *c = trans->c; + + if (bch2_snapshot_equiv(c, id)) + return 0; + + u32 tree_id; + int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); + if (ret) + return ret; + + struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); + ret = PTR_ERR_OR_ZERO(snapshot); + if (ret) + return ret; + + bkey_snapshot_init(&snapshot->k_i); + snapshot->k.p = POS(0, id); + snapshot->v.tree = cpu_to_le32(tree_id); + snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + + return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: + bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i)); +} + +/* Figure out which snapshot nodes belong in the same tree: */ +struct snapshot_tree_reconstruct { + enum btree_id btree; + struct bpos cur_pos; + snapshot_id_list cur_ids; + DARRAY(snapshot_id_list) trees; +}; + +static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r) +{ + darray_for_each(r->trees, i) + darray_exit(i); + darray_exit(&r->trees); + darray_exit(&r->cur_ids); +} + +static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos) +{ + return r->btree == BTREE_ID_inodes + ? r->cur_pos.offset == pos.offset + : r->cur_pos.inode == pos.inode; +} + +static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) +{ + darray_for_each(*l, i) + if (snapshot_list_has_id(r, *i)) + return true; + return false; +} + +static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) +{ + bool first = true; + darray_for_each(*s, i) { + if (!first) + prt_char(out, ' '); + first = false; + prt_printf(out, "%u", *i); + } +} + +static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r) +{ + if (r->cur_ids.nr) { + darray_for_each(r->trees, i) + if (snapshot_id_lists_have_common(i, &r->cur_ids)) { + int ret = snapshot_list_merge(c, i, &r->cur_ids); + if (ret) + return ret; + goto out; + } + darray_push(&r->trees, r->cur_ids); + darray_init(&r->cur_ids); + } +out: + r->cur_ids.nr = 0; + return 0; +} + +static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos) +{ + if (!same_snapshot(r, pos)) + snapshot_tree_reconstruct_next(c, r); + r->cur_pos = pos; + return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot); +} + +int bch2_reconstruct_snapshots(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct printbuf buf = PRINTBUF; + struct snapshot_tree_reconstruct r = {}; + int ret = 0; + + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { + if (btree_type_has_snapshots(btree)) { + r.btree = btree; + + ret = for_each_btree_key(trans, iter, btree, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({ + get_snapshot_trees(c, &r, k.k->p); + })); + if (ret) + goto err; + + snapshot_tree_reconstruct_next(c, &r); + } + } + + darray_for_each(r.trees, t) { + printbuf_reset(&buf); + snapshot_id_list_to_text(&buf, t); + + darray_for_each(*t, id) { + if (fsck_err_on(!bch2_snapshot_equiv(c, *id), + c, snapshot_node_missing, + "snapshot node %u from tree %s missing", *id, buf.buf)) { + if (t->nr > 1) { + bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; + } + + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_snapshot_exists(trans, *id)); + if (ret) + goto err; + } + } + } +fsck_err: +err: + bch2_trans_put(trans); + snapshot_tree_reconstruct_exit(&r); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + /* * Mark a snapshot as deleted, for future cleanup: */ @@ -1689,6 +1844,20 @@ int bch2_snapshots_read(struct bch_fs *c) POS_MIN, 0, k, (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); bch_err_fn(c, ret); + + /* + * It's important that we check if we need to reconstruct snapshots + * before going RW, so we mark that pass as required in the superblock - + * otherwise, we could end up deleting keys with missing snapshot nodes + * instead + */ + BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && + test_bit(BCH_FS_may_go_rw, &c->flags)); + + if (bch2_err_matches(ret, EIO) || + (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))) + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots); + return ret; } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 331f20fd8d03d..b7d2fed37c4f3 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -209,15 +209,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) { - int ret; - BUG_ON(snapshot_list_has_id(s, id)); - ret = darray_push(s, id); + int ret = darray_push(s, id); if (ret) bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); return ret; } +static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + int ret = snapshot_list_has_id(s, id) + ? 0 + : darray_push(s, id); + if (ret) + bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); + return ret; +} + +static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src) +{ + darray_for_each(*src, i) { + int ret = snapshot_list_add_nodup(c, dst, *i); + if (ret) + return ret; + } + + return 0; +} + int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s); int bch2_snapshot_get_subvol(struct btree_trans *, u32, @@ -229,6 +248,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); +int bch2_reconstruct_snapshots(struct bch_fs *); int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); void bch2_delete_dead_snapshots_work(struct work_struct *); -- GitLab From 4c02e63dadc7f5a92732ce3e267d0511749f60fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Mar 2024 18:43:00 -0400 Subject: [PATCH 371/989] bcachefs: Check for extents that point to same space In backpointer repair, if we get a missing backpointer - but there's already a backpointer that points to an existing extent - we've got multiple extents that point to the same space and need to decide which to keep. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 173 ++++++++++++++++++++++++++++++++-- fs/bcachefs/sb-errors_types.h | 3 +- 2 files changed, 168 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 762c8ddfc5e73..114328acde720 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -8,6 +8,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" +#include "checksum.h" #include "error.h" #include @@ -418,6 +419,84 @@ struct extents_to_bp_state { struct bkey_buf last_flushed; }; +static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree, + struct bkey_s_c extent, unsigned dev) +{ + struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bch2_bkey_drop_device(bkey_i_to_s(n), dev); + return bch2_btree_insert_trans(trans, btree, n, 0); +} + +static int check_extent_checksum(struct btree_trans *trans, + enum btree_id btree, struct bkey_s_c extent, + enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct printbuf buf = PRINTBUF; + void *data_buf = NULL; + struct bio *bio = NULL; + size_t bytes; + int ret = 0; + + if (bkey_is_btree_ptr(extent.k)) + return false; + + bkey_for_each_ptr_decode(extent.k, ptrs, p, entry) + if (p.ptr.dev == dev) + goto found; + BUG(); +found: + if (!p.crc.csum_type) + return false; + + bytes = p.crc.compressed_size << 9; + + struct bch_dev *ca = bch_dev_bkey_exists(c, dev); + if (!bch2_dev_get_ioref(ca, READ)) + return false; + + data_buf = kvmalloc(bytes, GFP_KERNEL); + if (!data_buf) { + ret = -ENOMEM; + goto err; + } + + bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL); + bio->bi_iter.bi_sector = p.ptr.offset; + bch2_bio_map(bio, data_buf, bytes); + ret = submit_bio_wait(bio); + if (ret) + goto err; + + prt_str(&buf, "extents pointing to same space, but first extent checksum bad:"); + prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree)); + bch2_bkey_val_to_text(&buf, c, extent); + prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree)); + bch2_bkey_val_to_text(&buf, c, extent2); + + struct nonce nonce = extent_nonce(extent.k->version, p.crc); + struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); + if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), + c, dup_backpointer_to_bad_csum_extent, + "%s", buf.buf)) + ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; +fsck_err: +err: + if (bio) + bio_put(bio); + kvfree(data_buf); + percpu_ref_put(&ca->io_ref); + printbuf_exit(&buf); + return ret; +} + static int check_bp_exists(struct btree_trans *trans, struct extents_to_bp_state *s, struct bpos bucket, @@ -425,7 +504,8 @@ static int check_bp_exists(struct btree_trans *trans, struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; - struct btree_iter bp_iter = { NULL }; + struct btree_iter bp_iter = {}; + struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; struct bkey_buf tmp; @@ -433,13 +513,19 @@ static int check_bp_exists(struct btree_trans *trans, bch2_bkey_buf_init(&tmp); + if (!bch2_dev_bucket_exists(c, bucket)) { + prt_str(&buf, "extent for nonexistent device:bucket "); + bch2_bpos_to_text(&buf, bucket); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, orig_k); + bch_err(c, "%s", buf.buf); + return -BCH_ERR_fsck_repair_unimplemented; + } + if (bpos_lt(bucket, s->bucket_start) || bpos_gt(bucket, s->bucket_end)) return 0; - if (!bch2_dev_bucket_exists(c, bucket)) - goto missing; - bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bucket_pos_to_bp(c, bucket, bp.bucket_offset), 0); @@ -465,21 +551,94 @@ static int check_bp_exists(struct btree_trans *trans, ret = -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } - goto missing; + + goto check_existing_bp; } out: err: fsck_err: + bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); bch2_bkey_buf_exit(&tmp, c); printbuf_exit(&buf); return ret; +check_existing_bp: + /* Do we have a backpointer for a different extent? */ + if (bp_k.k->type != KEY_TYPE_backpointer) + goto missing; + + struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v; + + struct bkey_s_c other_extent = + bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0); + ret = bkey_err(other_extent); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + ret = 0; + if (ret) + goto err; + + if (!other_extent.k) + goto missing; + + if (bch2_extents_match(orig_k, other_extent)) { + printbuf_reset(&buf); + prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n "); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, other_extent); + bch_err(c, "%s", buf.buf); + + if (other_extent.k->size <= orig_k.k->size) { + ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode); + if (ret) + goto err; + goto out; + } else { + ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode); + if (ret) + goto err; + goto missing; + } + } + + ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); + if (ret < 0) + goto err; + if (ret) { + ret = 0; + goto missing; + } + + ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode); + if (ret < 0) + goto err; + if (ret) { + ret = 0; + goto out; + } + + printbuf_reset(&buf); + prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, other_extent); + bch_err(c, "%s", buf.buf); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; missing: + printbuf_reset(&buf); prt_printf(&buf, "missing backpointer for btree=%s l=%u ", bch2_btree_id_str(bp.btree_id), bp.level); bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\nbp pos "); - bch2_bpos_to_text(&buf, bp_iter.pos); + prt_printf(&buf, "\n got: "); + bch2_bkey_val_to_text(&buf, c, bp_k); + + struct bkey_i_backpointer n_bp_k; + bkey_backpointer_init(&n_bp_k.k_i); + n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + n_bp_k.v = bp; + prt_printf(&buf, "\n want: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 2f8f4d2388b04..d7d609131030a 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -269,7 +269,8 @@ x(btree_node_topology_empty_interior_node, 261) \ x(btree_ptr_v2_min_key_bad, 262) \ x(btree_root_unreadable_and_scan_found_nothing, 263) \ - x(snapshot_node_missing, 264) + x(snapshot_node_missing, 264) \ + x(dup_backpointer_to_bad_csum_extent, 265) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, -- GitLab From cc0532900bcf1896a81dcdd30873ffa6c4f6926b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 31 Mar 2024 02:03:03 -0400 Subject: [PATCH 372/989] bcachefs: Subvolume reconstruction We can now recreate missing subvolumes from dirents and/or inodes. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 167 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 148 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b704fb0dda951..c1edc5647ba08 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -63,9 +63,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, u32 *snapshot, u64 *inum) { struct bch_subvolume s; - int ret; - - ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); *snapshot = le32_to_cpu(s.snapshot); *inum = le64_to_cpu(s.inode); @@ -170,7 +168,8 @@ err: /* Get lost+found, create if it doesn't exist: */ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - struct bch_inode_unpacked *lostfound) + struct bch_inode_unpacked *lostfound, + u64 reattaching_inum) { struct bch_fs *c = trans->c; struct qstr lostfound_str = QSTR("lost+found"); @@ -185,19 +184,36 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, return ret; subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; - u32 subvol_snapshot; - ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol), - &subvol_snapshot, &root_inum.inum); - bch_err_msg(c, ret, "looking up root subvol"); + struct bch_subvolume subvol; + ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), + false, 0, &subvol); + bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", + le32_to_cpu(st.master_subvol), snapshot); if (ret) return ret; + if (!subvol.inode) { + struct btree_iter iter; + struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(subvol); + if (ret) + return ret; + + subvol->v.inode = cpu_to_le64(reattaching_inum); + bch2_trans_iter_exit(trans, &iter); + } + + root_inum.inum = le64_to_cpu(subvol.inode); + struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; u32 root_inode_snapshot = snapshot; ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); - bch_err_msg(c, ret, "looking up root inode"); + bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", + root_inum.inum, le32_to_cpu(st.master_subvol)); if (ret) return ret; @@ -293,7 +309,7 @@ static int reattach_inode(struct btree_trans *trans, snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); } - ret = lookup_lostfound(trans, dirent_snapshot, &lostfound); + ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum); if (ret) return ret; @@ -364,6 +380,85 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume return ret; } +static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum) +{ + struct bch_fs *c = trans->c; + + if (!bch2_snapshot_is_leaf(c, snapshotid)) { + bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); + return -BCH_ERR_fsck_repair_unimplemented; + } + + /* + * If inum isn't set, that means we're being called from check_dirents, + * not check_inodes - the root of this subvolume doesn't exist or we + * would have found it there: + */ + if (!inum) { + struct btree_iter inode_iter = {}; + struct bch_inode_unpacked new_inode; + u64 cpu = raw_smp_processor_id(); + + bch2_inode_init_early(c, &new_inode); + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); + + new_inode.bi_subvol = subvolid; + + int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: + bch2_btree_iter_traverse(&inode_iter) ?: + bch2_inode_write(trans, &inode_iter, &new_inode); + bch2_trans_iter_exit(trans, &inode_iter); + if (ret) + return ret; + + inum = new_inode.bi_inum; + } + + bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum); + + struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); + int ret = PTR_ERR_OR_ZERO(new_subvol); + if (ret) + return ret; + + bkey_subvolume_init(&new_subvol->k_i); + new_subvol->k.p.offset = subvolid; + new_subvol->v.snapshot = cpu_to_le32(snapshotid); + new_subvol->v.inode = cpu_to_le64(inum); + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0); + if (ret) + return ret; + + struct btree_iter iter; + struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, snapshotid), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(s); + bch_err_msg(c, ret, "getting snapshot %u", snapshotid); + if (ret) + return ret; + + u32 snapshot_tree = le32_to_cpu(s->v.tree); + + s->v.subvol = cpu_to_le32(subvolid); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); + bch2_trans_iter_exit(trans, &iter); + + struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshot_trees, POS(0, snapshot_tree), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(st); + bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree); + if (ret) + return ret; + + if (!st->v.master_subvol) + st->v.master_subvol = cpu_to_le32(subvolid); + + bch2_trans_iter_exit(trans, &iter); + return 0; +} + struct snapshots_seen_entry { u32 id; u32 equiv; @@ -1065,6 +1160,11 @@ static int check_inode(struct btree_trans *trans, if (ret && !bch2_err_matches(ret, ENOENT)) goto err; + if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { + ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); + goto do_update; + } + if (fsck_err_on(ret, c, inode_bi_subvol_missing, "inode %llu:%u bi_subvol points to missing subvolume %u", @@ -1082,7 +1182,7 @@ static int check_inode(struct btree_trans *trans, do_update = true; } } - +do_update: if (do_update) { ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck updating inode"); @@ -1785,6 +1885,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); u32 parent_snapshot; + u32 new_parent_subvol = 0; u64 parent_inum; struct printbuf buf = PRINTBUF; int ret = 0; @@ -1793,6 +1894,27 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret && !bch2_err_matches(ret, ENOENT)) return ret; + if (ret || + (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { + int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); + if (ret2 && !bch2_err_matches(ret, ENOENT)) + return ret2; + } + + if (ret && + !new_parent_subvol && + (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { + /* + * Couldn't find a subvol for dirent's snapshot - but we lost + * subvols, so we need to reconstruct: + */ + ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); + if (ret) + return ret; + + parent_snapshot = d.k->p.snapshot; + } + if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, "dirent parent_subvol points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || @@ -1801,10 +1923,10 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", parent_snapshot, (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - u32 new_parent_subvol; - ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); - if (ret) - goto err; + if (!new_parent_subvol) { + bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); + return -BCH_ERR_fsck_repair_unimplemented; + } struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); ret = PTR_ERR_OR_ZERO(new_dirent); @@ -1850,9 +1972,16 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; + goto err; - if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol, + if (ret) { + bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); + ret = -BCH_ERR_fsck_repair_unimplemented; + ret = 0; + goto err; + } + + if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, c, inode_bi_parent_wrong, "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", target_inum, @@ -1860,13 +1989,13 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * subvol_root.bi_parent_subvol = parent_subvol; ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; } ret = check_dirent_target(trans, iter, d, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; out: err: fsck_err: -- GitLab From 09d4c2acbf4c864fef0f520bbcba256c9a19102e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 1 Apr 2024 00:00:56 -0400 Subject: [PATCH 373/989] bcachefs: reconstruct_inode() If an inode is missing, but corresponding extents and dirent still exist, it's well worth recreating it - this does so. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c1edc5647ba08..8e2010212cc37 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -459,6 +459,33 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub return 0; } +static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked new_inode; + + bch2_inode_init_early(c, &new_inode); + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL); + new_inode.bi_size = size; + new_inode.bi_inum = inum; + + return __bch2_fsck_write_inode(trans, &new_inode, snapshot); +} + +static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum) +{ + struct btree_iter iter = {}; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); + struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); + bch2_trans_iter_exit(trans, &iter); + int ret = bkey_err(k); + if (ret) + return ret; + + return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG); +} + struct snapshots_seen_entry { u32 id; u32 equiv; @@ -1535,6 +1562,17 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; if (k.k->type != KEY_TYPE_whiteout) { + if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + inode->last_pos.inode--; + ret = -BCH_ERR_transaction_restart_nested; + goto err; + } + if (fsck_err_on(!i, c, extent_in_missing_inode, "extent in missing inode:\n %s", (printbuf_reset(&buf), @@ -2012,7 +2050,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct snapshots_seen *s) { struct bch_fs *c = trans->c; - struct bkey_s_c_dirent d; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; struct bpos equiv; @@ -2051,6 +2088,17 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); dir->first_this_inode = false; + if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + dir->last_pos.inode--; + ret = -BCH_ERR_transaction_restart_nested; + goto err; + } + if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, "dirent in nonexisting directory:\n%s", (printbuf_reset(&buf), @@ -2085,7 +2133,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (k.k->type != KEY_TYPE_dirent) goto out; - d = bkey_s_c_to_dirent(k); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL) { ret = check_dirent_to_subvol(trans, iter, d); -- GitLab From de164a7f19248fb03229a4af9b0db333d9591e55 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 21 Mar 2024 23:54:19 -0700 Subject: [PATCH 374/989] nios2: Only use built-in devicetree blob if configured to do so Starting with commit 7b937cc243e5 ("of: Create of_root if no dtb provided by firmware"), attempts to boot nios2 images with an external devicetree blob result in a crash. Kernel panic - not syncing: early_init_dt_alloc_memory_arch: Failed to allocate 72 bytes align=0x40 For nios2, a built-in devicetree blob always overrides devicetree blobs provided by ROMMON/BIOS. This includes the new dummy devicetree blob. Result is that the dummy devicetree blob is used even if an external devicetree blob is provided. Since the dummy devicetree blob does not include any memory information, memory allocations fail, resulting in the crash. To fix the problem, only use the built-in devicetree blob if CONFIG_NIOS2_DTB_SOURCE_BOOL is enabled. Fixes: 7b937cc243e5 ("of: Create of_root if no dtb provided by firmware") Cc: Frank Rowand Cc: Stephen Boyd Cc: Rob Herring Signed-off-by: Guenter Roeck Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20240322065419.162416-1-linux@roeck-us.net Signed-off-by: Rob Herring --- arch/nios2/kernel/prom.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/nios2/kernel/prom.c b/arch/nios2/kernel/prom.c index 8d98af5c7201b..9a8393e6b4a85 100644 --- a/arch/nios2/kernel/prom.c +++ b/arch/nios2/kernel/prom.c @@ -21,7 +21,8 @@ void __init early_init_devtree(void *params) { - __be32 *dtb = (u32 *)__dtb_start; + __be32 __maybe_unused *dtb = (u32 *)__dtb_start; + #if defined(CONFIG_NIOS2_DTB_AT_PHYS_ADDR) if (be32_to_cpup((__be32 *)CONFIG_NIOS2_DTB_PHYS_ADDR) == OF_DT_HEADER) { @@ -30,8 +31,11 @@ void __init early_init_devtree(void *params) return; } #endif + +#ifdef CONFIG_NIOS2_DTB_SOURCE_BOOL if (be32_to_cpu((__be32) *dtb) == OF_DT_HEADER) params = (void *)__dtb_start; +#endif early_init_dt_scan(params); } -- GitLab From 173217bd73365867378b5e75a86f0049e1069ee8 Mon Sep 17 00:00:00 2001 From: Ritvik Budhiraja Date: Tue, 2 Apr 2024 14:01:28 -0500 Subject: [PATCH 375/989] smb3: retrying on failed server close In the current implementation, CIFS close sends a close to the server and does not check for the success of the server close. This patch adds functionality to check for server close return status and retries in case of an EBUSY or EAGAIN error. This can help avoid handle leaks Cc: stable@vger.kernel.org Signed-off-by: Ritvik Budhiraja Signed-off-by: Steve French --- fs/smb/client/cached_dir.c | 6 ++-- fs/smb/client/cifsfs.c | 11 +++++++ fs/smb/client/cifsglob.h | 7 +++-- fs/smb/client/file.c | 63 ++++++++++++++++++++++++++++++++++---- fs/smb/client/smb1ops.c | 4 +-- fs/smb/client/smb2ops.c | 9 +++--- fs/smb/client/smb2pdu.c | 2 +- 7 files changed, 85 insertions(+), 17 deletions(-) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index a0017724d5239..13a9d7acf8f8e 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -417,6 +417,7 @@ smb2_close_cached_fid(struct kref *ref) { struct cached_fid *cfid = container_of(ref, struct cached_fid, refcount); + int rc; spin_lock(&cfid->cfids->cfid_list_lock); if (cfid->on_list) { @@ -430,9 +431,10 @@ smb2_close_cached_fid(struct kref *ref) cfid->dentry = NULL; if (cfid->is_open) { - SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, + rc = SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, cfid->fid.volatile_fid); - atomic_dec(&cfid->tcon->num_remote_opens); + if (rc != -EBUSY && rc != -EAGAIN) + atomic_dec(&cfid->tcon->num_remote_opens); } free_cached_dir(cfid); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index aa6f1ecb7c0e8..d41eedbff674a 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -156,6 +156,7 @@ struct workqueue_struct *decrypt_wq; struct workqueue_struct *fileinfo_put_wq; struct workqueue_struct *cifsoplockd_wq; struct workqueue_struct *deferredclose_wq; +struct workqueue_struct *serverclose_wq; __u32 cifs_lock_secret; /* @@ -1888,6 +1889,13 @@ init_cifs(void) goto out_destroy_cifsoplockd_wq; } + serverclose_wq = alloc_workqueue("serverclose", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!serverclose_wq) { + rc = -ENOMEM; + goto out_destroy_serverclose_wq; + } + rc = cifs_init_inodecache(); if (rc) goto out_destroy_deferredclose_wq; @@ -1962,6 +1970,8 @@ out_destroy_decrypt_wq: destroy_workqueue(decrypt_wq); out_destroy_cifsiod_wq: destroy_workqueue(cifsiod_wq); +out_destroy_serverclose_wq: + destroy_workqueue(serverclose_wq); out_clean_proc: cifs_proc_clean(); return rc; @@ -1991,6 +2001,7 @@ exit_cifs(void) destroy_workqueue(cifsoplockd_wq); destroy_workqueue(decrypt_wq); destroy_workqueue(fileinfo_put_wq); + destroy_workqueue(serverclose_wq); destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 286afbe346be0..77ca7861a2cc8 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -442,10 +442,10 @@ struct smb_version_operations { /* set fid protocol-specific info */ void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32); /* close a file */ - void (*close)(const unsigned int, struct cifs_tcon *, + int (*close)(const unsigned int, struct cifs_tcon *, struct cifs_fid *); /* close a file, returning file attributes and timestamps */ - void (*close_getattr)(const unsigned int xid, struct cifs_tcon *tcon, + int (*close_getattr)(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *pfile_info); /* send a flush request to the server */ int (*flush)(const unsigned int, struct cifs_tcon *, struct cifs_fid *); @@ -1439,6 +1439,7 @@ struct cifsFileInfo { bool swapfile:1; bool oplock_break_cancelled:1; bool status_file_deleted:1; /* file has been deleted */ + bool offload:1; /* offload final part of _put to a wq */ unsigned int oplock_epoch; /* epoch from the lease break */ __u32 oplock_level; /* oplock/lease level from the lease break */ int count; @@ -1447,6 +1448,7 @@ struct cifsFileInfo { struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ struct work_struct put; /* work for the final part of _put */ + struct work_struct serverclose; /* work for serverclose */ struct delayed_work deferred; bool deferred_close_scheduled; /* Flag to indicate close is scheduled */ char *symlink_target; @@ -2103,6 +2105,7 @@ extern struct workqueue_struct *decrypt_wq; extern struct workqueue_struct *fileinfo_put_wq; extern struct workqueue_struct *cifsoplockd_wq; extern struct workqueue_struct *deferredclose_wq; +extern struct workqueue_struct *serverclose_wq; extern __u32 cifs_lock_secret; extern mempool_t *cifs_sm_req_poolp; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index ab536ce8a04a5..9be37d0fe724e 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -459,6 +459,7 @@ cifs_down_write(struct rw_semaphore *sem) } static void cifsFileInfo_put_work(struct work_struct *work); +void serverclose_work(struct work_struct *work); struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock, @@ -505,6 +506,7 @@ struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); INIT_WORK(&cfile->put, cifsFileInfo_put_work); + INIT_WORK(&cfile->serverclose, serverclose_work); INIT_DELAYED_WORK(&cfile->deferred, smb2_deferred_work_close); mutex_init(&cfile->fh_mutex); spin_lock_init(&cfile->file_info_lock); @@ -596,6 +598,40 @@ static void cifsFileInfo_put_work(struct work_struct *work) cifsFileInfo_put_final(cifs_file); } +void serverclose_work(struct work_struct *work) +{ + struct cifsFileInfo *cifs_file = container_of(work, + struct cifsFileInfo, serverclose); + + struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); + + struct TCP_Server_Info *server = tcon->ses->server; + int rc = 0; + int retries = 0; + int MAX_RETRIES = 4; + + do { + if (server->ops->close_getattr) + rc = server->ops->close_getattr(0, tcon, cifs_file); + else if (server->ops->close) + rc = server->ops->close(0, tcon, &cifs_file->fid); + + if (rc == -EBUSY || rc == -EAGAIN) { + retries++; + msleep(250); + } + } while ((rc == -EBUSY || rc == -EAGAIN) && (retries < MAX_RETRIES) + ); + + if (retries == MAX_RETRIES) + pr_warn("Serverclose failed %d times, giving up\n", MAX_RETRIES); + + if (cifs_file->offload) + queue_work(fileinfo_put_wq, &cifs_file->put); + else + cifsFileInfo_put_final(cifs_file); +} + /** * cifsFileInfo_put - release a reference of file priv data * @@ -636,10 +672,13 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, struct cifs_fid fid = {}; struct cifs_pending_open open; bool oplock_break_cancelled; + bool serverclose_offloaded = false; spin_lock(&tcon->open_file_lock); spin_lock(&cifsi->open_file_lock); spin_lock(&cifs_file->file_info_lock); + + cifs_file->offload = offload; if (--cifs_file->count > 0) { spin_unlock(&cifs_file->file_info_lock); spin_unlock(&cifsi->open_file_lock); @@ -681,13 +720,20 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, if (!tcon->need_reconnect && !cifs_file->invalidHandle) { struct TCP_Server_Info *server = tcon->ses->server; unsigned int xid; + int rc = 0; xid = get_xid(); if (server->ops->close_getattr) - server->ops->close_getattr(xid, tcon, cifs_file); + rc = server->ops->close_getattr(xid, tcon, cifs_file); else if (server->ops->close) - server->ops->close(xid, tcon, &cifs_file->fid); + rc = server->ops->close(xid, tcon, &cifs_file->fid); _free_xid(xid); + + if (rc == -EBUSY || rc == -EAGAIN) { + // Server close failed, hence offloading it as an async op + queue_work(serverclose_wq, &cifs_file->serverclose); + serverclose_offloaded = true; + } } if (oplock_break_cancelled) @@ -695,10 +741,15 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, cifs_del_pending_open(&open); - if (offload) - queue_work(fileinfo_put_wq, &cifs_file->put); - else - cifsFileInfo_put_final(cifs_file); + // if serverclose has been offloaded to wq (on failure), it will + // handle offloading put as well. If serverclose not offloaded, + // we need to handle offloading put here. + if (!serverclose_offloaded) { + if (offload) + queue_work(fileinfo_put_wq, &cifs_file->put); + else + cifsFileInfo_put_final(cifs_file); + } } int cifs_open(struct inode *inode, struct file *file) diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index a9eaba8083b0d..212ec6f66ec65 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -753,11 +753,11 @@ cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); } -static void +static int cifs_close_file(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *fid) { - CIFSSMBClose(xid, tcon, fid->netfid); + return CIFSSMBClose(xid, tcon, fid->netfid); } static int diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 35bf7eb315cdf..87b63f6ad2e2e 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1412,14 +1412,14 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) memcpy(cfile->fid.create_guid, fid->create_guid, 16); } -static void +static int smb2_close_file(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *fid) { - SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); + return SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); } -static void +static int smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile) { @@ -1430,7 +1430,7 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon, rc = __SMB2_close(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, &file_inf); if (rc) - return; + return rc; inode = d_inode(cfile->dentry); @@ -1459,6 +1459,7 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon, /* End of file and Attributes should not have to be updated on close */ spin_unlock(&inode->i_lock); + return rc; } static int diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 3ea688558e6c9..c0c4933af5fc3 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -3628,9 +3628,9 @@ replay_again: memcpy(&pbuf->network_open_info, &rsp->network_open_info, sizeof(pbuf->network_open_info)); + atomic_dec(&tcon->num_remote_opens); } - atomic_dec(&tcon->num_remote_opens); close_exit: SMB2_close_free(&rqst); free_rsp_buf(resp_buftype, rsp); -- GitLab From ca545b7f0823f19db0f1148d59bc5e1a56634502 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:33:53 -0300 Subject: [PATCH 376/989] smb: client: fix potential UAF in cifs_debug_files_proc_show() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 ++ fs/smb/client/cifsglob.h | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 226d4835c92db..c9aec9a38ad36 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -250,6 +250,8 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); list_for_each_entry(cfile, &tcon->openFileList, tlist) { diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 77ca7861a2cc8..f6a302205f89c 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -2325,4 +2325,14 @@ struct smb2_compound_vars { struct kvec ea_iov; }; +static inline bool cifs_ses_exiting(struct cifs_ses *ses) +{ + bool ret; + + spin_lock(&ses->ses_lock); + ret = ses->ses_status == SES_EXITING; + spin_unlock(&ses->ses_lock); + return ret; +} + #endif /* _CIFS_GLOB_H */ -- GitLab From 58acd1f497162e7d282077f816faa519487be045 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:33:54 -0300 Subject: [PATCH 377/989] smb: client: fix potential UAF in cifs_dump_full_key() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/ioctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index c012dfdba80d4..855ac5a62edfa 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -247,7 +247,9 @@ static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server_it, &cifs_tcp_ses_list, tcp_ses_list) { list_for_each_entry(ses_it, &server_it->smb_ses_list, smb_ses_list) { - if (ses_it->Suid == out.session_id) { + spin_lock(&ses_it->ses_lock); + if (ses_it->ses_status != SES_EXITING && + ses_it->Suid == out.session_id) { ses = ses_it; /* * since we are using the session outside the crit @@ -255,9 +257,11 @@ static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug * so increment its refcount */ cifs_smb_ses_inc_refcount(ses); + spin_unlock(&ses_it->ses_lock); found = true; goto search_end; } + spin_unlock(&ses_it->ses_lock); } } search_end: -- GitLab From d3da25c5ac84430f89875ca7485a3828150a7e0a Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:33:55 -0300 Subject: [PATCH 378/989] smb: client: fix potential UAF in cifs_stats_proc_write() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index c9aec9a38ad36..8535c99074626 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -678,6 +678,8 @@ static ssize_t cifs_stats_proc_write(struct file *file, } #endif /* CONFIG_CIFS_STATS2 */ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { atomic_set(&tcon->num_smbs_sent, 0); spin_lock(&tcon->stat_lock); -- GitLab From 0865ffefea197b437ba78b5dd8d8e256253efd65 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:33:56 -0300 Subject: [PATCH 379/989] smb: client: fix potential UAF in cifs_stats_proc_show() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 8535c99074626..c71ae5c043060 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -759,6 +759,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) } #endif /* STATS2 */ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { i++; seq_printf(m, "\n%d) %s", i, tcon->tree_name); -- GitLab From 705c76fbf726c7a2f6ff9143d4013b18daaaebf1 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:33:58 -0300 Subject: [PATCH 380/989] smb: client: fix potential UAF in smb2_is_valid_lease_break() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2misc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index 82b84a4941dd2..ed6d033dce96d 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -622,6 +622,8 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); cifs_stats_inc( -- GitLab From 22863485a4626ec6ecf297f4cc0aef709bc862e4 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:33:59 -0300 Subject: [PATCH 381/989] smb: client: fix potential UAF in smb2_is_valid_oplock_break() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2misc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index ed6d033dce96d..cc72be5a93a93 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -699,6 +699,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); -- GitLab From 69ccf040acddf33a3a85ec0f6b45ef84b0f7ec29 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:34:00 -0300 Subject: [PATCH 382/989] smb: client: fix potential UAF in is_valid_oplock_break() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/misc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 1ea22b3955a2f..33ac4f8f5050c 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -481,6 +481,8 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->tid != buf->Tid) continue; -- GitLab From 63981561ffd2d4987807df4126f96a11e18b0c1d Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:34:02 -0300 Subject: [PATCH 383/989] smb: client: fix potential UAF in smb2_is_network_name_deleted() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 87b63f6ad2e2e..b156eefa75d7c 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2481,6 +2481,8 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) { spin_lock(&tcon->tc_lock); -- GitLab From e0e50401cc3921c9eaf1b0e667db174519ea939f Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:34:04 -0300 Subject: [PATCH 384/989] smb: client: fix potential UAF in cifs_signal_cifsd_for_reconnect() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 95e4bda4fd517..85679ae106fd5 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -175,6 +175,8 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; spin_lock(&ses->chan_lock); for (i = 0; i < ses->chan_count; i++) { if (!ses->chans[i].server) -- GitLab From cffaefd15a8f423cdee5d8eac15d267bc92de314 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 20 Mar 2024 19:02:15 +0100 Subject: [PATCH 385/989] vdso: Use CONFIG_PAGE_SHIFT in vdso/datapage.h Both the vdso rework and the CONFIG_PAGE_SHIFT changes were merged during the v6.9 merge window, so it is now possible to use CONFIG_PAGE_SHIFT instead of including asm/page.h in the vdso. This avoids the workaround for arm64 - commit 8b3843ae3634 ("vdso/datapage: Quick fix - use asm/page-def.h for ARM64") and addresses a build warning for powerpc64: In file included from :4: In file included from /home/arnd/arm-soc/arm-soc/lib/vdso/gettimeofday.c:5: In file included from ../include/vdso/datapage.h:25: arch/powerpc/include/asm/page.h:230:9: error: result of comparison of constant 13835058055282163712 with expression of type 'unsigned long' is always true [-Werror,-Wtautological-constant-out-of-range-compare] 230 | return __pa(kaddr) >> PAGE_SHIFT; | ^~~~~~~~~~~ arch/powerpc/include/asm/page.h:217:37: note: expanded from macro '__pa' 217 | VIRTUAL_WARN_ON((unsigned long)(x) < PAGE_OFFSET); \ | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~ arch/powerpc/include/asm/page.h:202:73: note: expanded from macro 'VIRTUAL_WARN_ON' 202 | #define VIRTUAL_WARN_ON(x) WARN_ON(IS_ENABLED(CONFIG_DEBUG_VIRTUAL) && (x)) | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~ arch/powerpc/include/asm/bug.h:88:25: note: expanded from macro 'WARN_ON' 88 | int __ret_warn_on = !!(x); \ | ^ Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Acked-by: Michael Ellerman (powerpc) Link: https://lore.kernel.org/r/20240320180228.136371-1-arnd@kernel.org --- arch/powerpc/include/asm/vdso/gettimeofday.h | 3 +-- include/vdso/datapage.h | 8 +------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index f0a4cf01e85c0..78302f6c25800 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -4,7 +4,6 @@ #ifndef __ASSEMBLY__ -#include #include #include #include @@ -95,7 +94,7 @@ const struct vdso_data *__arch_get_vdso_data(void); static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) { - return (void *)vd + PAGE_SIZE; + return (void *)vd + (1U << CONFIG_PAGE_SHIFT); } #endif diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 5d5c0b8efff2d..c71ddb6d46914 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -19,12 +19,6 @@ #include #include -#ifdef CONFIG_ARM64 -#include -#else -#include -#endif - #ifdef CONFIG_ARCH_HAS_VDSO_DATA #include #else @@ -132,7 +126,7 @@ extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden */ union vdso_data_store { struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; + u8 page[1U << CONFIG_PAGE_SHIFT]; }; /* -- GitLab From dd19e827d63ac60debf117676d1126bff884bdb8 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Wed, 20 Mar 2024 17:09:25 -0700 Subject: [PATCH 386/989] idpf: fix kernel panic on unknown packet types In the very rare case where a packet type is unknown to the driver, idpf_rx_process_skb_fields would return early without calling eth_type_trans to set the skb protocol / the network layer handler. This is especially problematic if tcpdump is running when such a packet is received, i.e. it would cause a kernel panic. Instead, call eth_type_trans for every single packet, even when the packet type is unknown. Fixes: 3a8845af66ed ("idpf: add RX splitq napi poll support") Reported-by: Balazs Nemeth Signed-off-by: Joshua Hay Reviewed-by: Jesse Brandeburg Reviewed-by: Przemek Kitszel Tested-by: Salvatore Daniele Signed-off-by: Pavan Kumar Linga Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 6dd7a66bb8979..f5bc4a2780745 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2941,6 +2941,8 @@ static int idpf_rx_process_skb_fields(struct idpf_queue *rxq, rx_ptype = le16_get_bits(rx_desc->ptype_err_fflags0, VIRTCHNL2_RX_FLEX_DESC_ADV_PTYPE_M); + skb->protocol = eth_type_trans(skb, rxq->vport->netdev); + decoded = rxq->vport->rx_ptype_lkup[rx_ptype]; /* If we don't know the ptype we can't do anything else with it. Just * pass it up the stack as-is. @@ -2951,8 +2953,6 @@ static int idpf_rx_process_skb_fields(struct idpf_queue *rxq, /* process RSS/hash */ idpf_rx_hash(rxq, skb, rx_desc, &decoded); - skb->protocol = eth_type_trans(skb, rxq->vport->netdev); - if (le16_get_bits(rx_desc->hdrlen_flags, VIRTCHNL2_RX_FLEX_DESC_ADV_RSC_M)) return idpf_rx_rsc(rxq, skb, rx_desc, &decoded); -- GitLab From f7c52345ccc96343c0a05bdea3121c8ac7b67d5f Mon Sep 17 00:00:00 2001 From: Kwangjin Ko Date: Tue, 2 Apr 2024 17:14:03 +0900 Subject: [PATCH 387/989] cxl/core: Fix initialization of mbox_cmd.size_out in get event Since mbox_cmd.size_out is overwritten with the actual output size in the function below, it needs to be initialized every time. cxl_internal_send_cmd -> __cxl_pci_mbox_send_cmd Problem scenario: 1) The size_out variable is initially set to the size of the mailbox. 2) Read an event. - size_out is set to 160 bytes(header 32B + one event 128B). - Two event are created while reading. 3) Read the new *two* events. - size_out is still set to 160 bytes. - Although the value of out_len is 288 bytes, only 160 bytes are copied from the mailbox register to the local variable. - record_count is set to 2. - Accessing records[1] will result in reading incorrect data. Fixes: 6ebe28f9ec72 ("cxl/mem: Read, trace, and clear events on driver load") Tested-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Signed-off-by: Kwangjin Ko Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 50146161887d5..f0f54aeccc872 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -958,13 +958,14 @@ static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, .payload_in = &log_type, .size_in = sizeof(log_type), .payload_out = payload, - .size_out = mds->payload_size, .min_out = struct_size(payload, records, 0), }; do { int rc, i; + mbox_cmd.size_out = mds->payload_size; + rc = cxl_internal_send_cmd(mds, &mbox_cmd); if (rc) { dev_err_ratelimited(dev, -- GitLab From e7d24c0aa8e678f41457d1304e2091cac6fd1a2e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Mar 2024 07:42:57 +0100 Subject: [PATCH 388/989] gcc-plugins/stackleak: Avoid .head.text section The .head.text section carries the startup code that runs with the MMU off or with a translation of memory that deviates from the ordinary one. So avoid instrumentation with the stackleak plugin, which already avoids .init.text and .noinstr.text entirely. Fixes: 48204aba801f1b51 ("x86/sme: Move early SME kernel encryption handling into .head.text") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202403221630.2692c998-oliver.sang@intel.com Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240328064256.2358634-2-ardb+git@google.com Signed-off-by: Kees Cook --- scripts/gcc-plugins/stackleak_plugin.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c index c5c2ce113c923..d20c47d21ad83 100644 --- a/scripts/gcc-plugins/stackleak_plugin.c +++ b/scripts/gcc-plugins/stackleak_plugin.c @@ -467,6 +467,8 @@ static bool stackleak_gate(void) return false; if (STRING_EQUAL(section, ".entry.text")) return false; + if (STRING_EQUAL(section, ".head.text")) + return false; } return track_frame_size >= 0; -- GitLab From bbda3ba626b9f57ff6063058877eca856f5b734d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2024 10:06:20 +0200 Subject: [PATCH 389/989] ubsan: fix unused variable warning in test module This is one of the drivers with an unused variable that is marked 'const'. Adding a __used annotation here avoids the warning and lets us enable the option by default: lib/test_ubsan.c:137:28: error: unused variable 'skip_ubsan_array' [-Werror,-Wunused-const-variable] Fixes: 4a26f49b7b3d ("ubsan: expand tests and reporting") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240403080702.3509288-3-arnd@kernel.org Signed-off-by: Kees Cook --- lib/test_ubsan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c index 276c12140ee26..c288df9372ede 100644 --- a/lib/test_ubsan.c +++ b/lib/test_ubsan.c @@ -134,7 +134,7 @@ static const test_ubsan_fp test_ubsan_array[] = { }; /* Excluded because they Oops the module. */ -static const test_ubsan_fp skip_ubsan_array[] = { +static __used const test_ubsan_fp skip_ubsan_array[] = { test_ubsan_divrem_overflow, }; -- GitLab From 9c573cd313433f6c1f7236fe64b9b743500c1628 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 9 Mar 2024 12:24:48 -0800 Subject: [PATCH 390/989] randomize_kstack: Improve entropy diffusion The kstack_offset variable was really only ever using the low bits for kernel stack offset entropy. Add a ror32() to increase bit diffusion. Suggested-by: Arnd Bergmann Fixes: 39218ff4c625 ("stack: Optionally randomize kernel stack offset each syscall") Link: https://lore.kernel.org/r/20240309202445.work.165-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/randomize_kstack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index 5d868505a94e4..6d92b68efbf6c 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -80,7 +80,7 @@ DECLARE_PER_CPU(u32, kstack_offset); if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \ &randomize_kstack_offset)) { \ u32 offset = raw_cpu_read(kstack_offset); \ - offset ^= (rand); \ + offset = ror32(offset, 5) ^ (rand); \ raw_cpu_write(kstack_offset, offset); \ } \ } while (0) -- GitLab From 95197779091166b9ed4b1c630c13600abf94ada7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2024 10:06:22 +0200 Subject: [PATCH 391/989] i2c: pxa: hide unused icr_bits[] variable The function using this is hidden in an #ifdef, so the variable needs the same one for a clean W=1 build: drivers/i2c/busses/i2c-pxa.c:327:26: error: 'icr_bits' defined but not used [-Werror=unused-const-variable=] Fixes: d6a7b5f84b5c ("[ARM] 4827/1: fix two warnings in drivers/i2c/busses/i2c-pxa.c") Signed-off-by: Arnd Bergmann Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-pxa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-pxa.c b/drivers/i2c/busses/i2c-pxa.c index 76f79b68cef84..888ca636f3f3b 100644 --- a/drivers/i2c/busses/i2c-pxa.c +++ b/drivers/i2c/busses/i2c-pxa.c @@ -324,6 +324,7 @@ static void decode_ISR(unsigned int val) decode_bits(KERN_DEBUG "ISR", isr_bits, ARRAY_SIZE(isr_bits), val); } +#ifdef CONFIG_I2C_PXA_SLAVE static const struct bits icr_bits[] = { PXA_BIT(ICR_START, "START", NULL), PXA_BIT(ICR_STOP, "STOP", NULL), @@ -342,7 +343,6 @@ static const struct bits icr_bits[] = { PXA_BIT(ICR_UR, "UR", "ur"), }; -#ifdef CONFIG_I2C_PXA_SLAVE static void decode_ICR(unsigned int val) { decode_bits(KERN_DEBUG "ICR", icr_bits, ARRAY_SIZE(icr_bits), val); -- GitLab From c27fa53b858b4ee6552a719aa599c250cf98a586 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 3 Apr 2024 09:26:38 +0200 Subject: [PATCH 392/989] riscv: Fix vector state restore in rt_sigreturn() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RISC-V Vector specification states in "Appendix D: Calling Convention for Vector State" [1] that "Executing a system call causes all caller-saved vector registers (v0-v31, vl, vtype) and vstart to become unspecified.". In the RISC-V kernel this is called "discarding the vstate". Returning from a signal handler via the rt_sigreturn() syscall, vector discard is also performed. However, this is not an issue since the vector state should be restored from the sigcontext, and therefore not care about the vector discard. The "live state" is the actual vector register in the running context, and the "vstate" is the vector state of the task. A dirty live state, means that the vstate and live state are not in synch. When vectorized user_from_copy() was introduced, an bug sneaked in at the restoration code, related to the discard of the live state. An example when this go wrong: 1. A userland application is executing vector code 2. The application receives a signal, and the signal handler is entered. 3. The application returns from the signal handler, using the rt_sigreturn() syscall. 4. The live vector state is discarded upon entering the rt_sigreturn(), and the live state is marked as "dirty", indicating that the live state need to be synchronized with the current vstate. 5. rt_sigreturn() restores the vstate, except the Vector registers, from the sigcontext 6. rt_sigreturn() restores the Vector registers, from the sigcontext, and now the vectorized user_from_copy() is used. The dirty live state from the discard is saved to the vstate, making the vstate corrupt. 7. rt_sigreturn() returns to the application, which crashes due to corrupted vstate. Note that the vectorized user_from_copy() is invoked depending on the value of CONFIG_RISCV_ISA_V_UCOPY_THRESHOLD. Default is 768, which means that vlen has to be larger than 128b for this bug to trigger. The fix is simply to mark the live state as non-dirty/clean prior performing the vstate restore. Link: https://github.com/riscv/riscv-isa-manual/releases/download/riscv-isa-release-8abdb41-2024-03-26/unpriv-isa-asciidoc.pdf # [1] Reported-by: Charlie Jenkins Reported-by: Vineet Gupta Fixes: c2a658d41924 ("riscv: lib: vectorize copy_to_user/copy_from_user") Signed-off-by: Björn Töpel Reviewed-by: Andy Chiu Tested-by: Vineet Gupta Link: https://lore.kernel.org/r/20240403072638.567446-1-bjorn@kernel.org Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/signal.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 501e66debf697..5a2edd7f027e5 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -119,6 +119,13 @@ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec) struct __sc_riscv_v_state __user *state = sc_vec; void __user *datap; + /* + * Mark the vstate as clean prior performing the actual copy, + * to avoid getting the vstate incorrectly clobbered by the + * discarded vector state. + */ + riscv_v_vstate_set_restore(current, regs); + /* Copy everything of __sc_riscv_v_state except datap. */ err = __copy_from_user(¤t->thread.vstate, &state->v_state, offsetof(struct __riscv_v_ext_state, datap)); @@ -133,13 +140,7 @@ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec) * Copy the whole vector content from user space datap. Use * copy_from_user to prevent information leak. */ - err = copy_from_user(current->thread.vstate.datap, datap, riscv_v_vsize); - if (unlikely(err)) - return err; - - riscv_v_vstate_set_restore(current, regs); - - return err; + return copy_from_user(current->thread.vstate.datap, datap, riscv_v_vsize); } #else #define save_v_state(task, regs) (0) -- GitLab From e60aa472400b1ff8d0e6c563a2eb05916927f10a Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Thu, 14 Mar 2024 10:02:18 -0600 Subject: [PATCH 393/989] bcachefs: create debugfs dir for each btree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This creates a subdirectory for each individual btree under the btrees/ debugfs directory. Directory structure, before: /sys/kernel/debug/bcachefs/$FS_ID/btrees/ ├── alloc ├── alloc-bfloat-failed ├── alloc-formats ├── backpointers ├── backpointers-bfloat-failed ├── backpointers-formats ... Directory structure, after: /sys/kernel/debug/bcachefs/$FS_ID/btrees/ ├── alloc │   ├── bfloat-failed │   ├── formats │   └── keys ├── backpointers │   ├── bfloat-failed │   ├── formats │   └── keys ... Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 208ce6f0fc431..7b681ae91510e 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -866,6 +866,20 @@ void bch2_fs_debug_exit(struct bch_fs *c) debugfs_remove_recursive(c->fs_debug_dir); } +static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd) +{ + struct dentry *d; + + d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir); + + debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops); + + debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops); + + debugfs_create_file("bfloat-failed", 0400, d, bd, + &bfloat_failed_debug_ops); +} + void bch2_fs_debug_init(struct bch_fs *c) { struct btree_debug *bd; @@ -902,21 +916,7 @@ void bch2_fs_debug_init(struct bch_fs *c) bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - debugfs_create_file(bch2_btree_id_str(bd->id), - 0400, c->btree_debug_dir, bd, - &btree_debug_ops); - - snprintf(name, sizeof(name), "%s-formats", - bch2_btree_id_str(bd->id)); - - debugfs_create_file(name, 0400, c->btree_debug_dir, bd, - &btree_format_debug_ops); - - snprintf(name, sizeof(name), "%s-bfloat-failed", - bch2_btree_id_str(bd->id)); - - debugfs_create_file(name, 0400, c->btree_debug_dir, bd, - &bfloat_failed_debug_ops); + bch2_fs_debug_btree_init(c, bd); } } -- GitLab From cbc17e7802f5de37c7c262204baadfad3f7f99e5 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 28 Mar 2024 15:59:29 +0000 Subject: [PATCH 394/989] net: fec: Set mac_managed_pm during probe Setting mac_managed_pm during interface up is too late. In situations where the link is not brought up yet and the system suspends the regular PHY power management will run. Since the FEC ETHEREN control bit is cleared (automatically) on suspend the controller is off in resume. When the regular PHY power management resume path runs in this context it will write to the MII_DATA register but nothing will be transmitted on the MDIO bus. This can be observed by the following log: fec 5b040000.ethernet eth0: MDIO read timeout Microchip LAN87xx T1 5b040000.ethernet-1:04: PM: dpm_run_callback(): mdio_bus_phy_resume+0x0/0xc8 returns -110 Microchip LAN87xx T1 5b040000.ethernet-1:04: PM: failed to resume: error -110 The data written will however remain in the MII_DATA register. When the link later is set to administrative up it will trigger a call to fec_restart() which will restore the MII_SPEED register. This triggers the quirk explained in f166f890c8f0 ("net: ethernet: fec: Replace interrupt driven MDIO with polled IO") causing an extra MII_EVENT. This extra event desynchronizes all the MDIO register reads, causing them to complete too early. Leading all reads to read as 0 because fec_enet_mdio_wait() returns too early. When a Microchip LAN8700R PHY is connected to the FEC, the 0 reads causes the PHY to be initialized incorrectly and the PHY will not transmit any ethernet signal in this state. It cannot be brought out of this state without a power cycle of the PHY. Fixes: 557d5dc83f68 ("net: fec: use mac-managed PHY PM") Closes: https://lore.kernel.org/netdev/1f45bdbe-eab1-4e59-8f24-add177590d27@actia.se/ Signed-off-by: Wei Fang [jernberg: commit message] Signed-off-by: John Ernberg Link: https://lore.kernel.org/r/20240328155909.59613-2-john.ernberg@actia.se Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index d7693fdf640d5..8bd213da8fb6f 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2454,8 +2454,6 @@ static int fec_enet_mii_probe(struct net_device *ndev) fep->link = 0; fep->full_duplex = 0; - phy_dev->mac_managed_pm = true; - phy_attached_info(phy_dev); return 0; @@ -2467,10 +2465,12 @@ static int fec_enet_mii_init(struct platform_device *pdev) struct net_device *ndev = platform_get_drvdata(pdev); struct fec_enet_private *fep = netdev_priv(ndev); bool suppress_preamble = false; + struct phy_device *phydev; struct device_node *node; int err = -ENXIO; u32 mii_speed, holdtime; u32 bus_freq; + int addr; /* * The i.MX28 dual fec interfaces are not equal. @@ -2584,6 +2584,13 @@ static int fec_enet_mii_init(struct platform_device *pdev) goto err_out_free_mdiobus; of_node_put(node); + /* find all the PHY devices on the bus and set mac_managed_pm to true */ + for (addr = 0; addr < PHY_MAX_ADDR; addr++) { + phydev = mdiobus_get_phy(fep->mii_bus, addr); + if (phydev) + phydev->mac_managed_pm = true; + } + mii_cnt++; /* save fec0 mii_bus */ -- GitLab From c644920ce9220d83e070f575a4df711741c07f07 Mon Sep 17 00:00:00 2001 From: Duanqiang Wen Date: Tue, 2 Apr 2024 10:18:43 +0800 Subject: [PATCH 395/989] net: txgbe: fix i2c dev name cannot match clkdev txgbe clkdev shortened clk_name, so i2c_dev info_name also need to shorten. Otherwise, i2c_dev cannot initialize clock. Fixes: e30cef001da2 ("net: txgbe: fix clk_name exceed MAX_DEV_ID limits") Signed-off-by: Duanqiang Wen Link: https://lore.kernel.org/r/20240402021843.126192-1-duanqiangwen@net-swift.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c index 5b5d5e4310d12..2fa511227eac8 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c @@ -20,6 +20,8 @@ #include "txgbe_phy.h" #include "txgbe_hw.h" +#define TXGBE_I2C_CLK_DEV_NAME "i2c_dw" + static int txgbe_swnodes_register(struct txgbe *txgbe) { struct txgbe_nodes *nodes = &txgbe->nodes; @@ -571,8 +573,8 @@ static int txgbe_clock_register(struct txgbe *txgbe) char clk_name[32]; struct clk *clk; - snprintf(clk_name, sizeof(clk_name), "i2c_dw.%d", - pci_dev_id(pdev)); + snprintf(clk_name, sizeof(clk_name), "%s.%d", + TXGBE_I2C_CLK_DEV_NAME, pci_dev_id(pdev)); clk = clk_register_fixed_rate(NULL, clk_name, NULL, 0, 156250000); if (IS_ERR(clk)) @@ -634,7 +636,7 @@ static int txgbe_i2c_register(struct txgbe *txgbe) info.parent = &pdev->dev; info.fwnode = software_node_fwnode(txgbe->nodes.group[SWNODE_I2C]); - info.name = "i2c_designware"; + info.name = TXGBE_I2C_CLK_DEV_NAME; info.id = pci_dev_id(pdev); info.res = &DEFINE_RES_IRQ(pdev->irq); -- GitLab From b3da86d432b7cd65b025a11f68613e333d2483db Mon Sep 17 00:00:00 2001 From: Piotr Wejman Date: Mon, 1 Apr 2024 21:22:39 +0200 Subject: [PATCH 396/989] net: stmmac: fix rx queue priority assignment The driver should ensure that same priority is not mapped to multiple rx queues. From DesignWare Cores Ethernet Quality-of-Service Databook, section 17.1.29 MAC_RxQ_Ctrl2: "[...]The software must ensure that the content of this field is mutually exclusive to the PSRQ fields for other queues, that is, the same priority is not mapped to multiple Rx queues[...]" Previously rx_queue_priority() function was: - clearing all priorities from a queue - adding new priorities to that queue After this patch it will: - first assign new priorities to a queue - then remove those priorities from all other queues - keep other priorities previously assigned to that queue Fixes: a8f5102af2a7 ("net: stmmac: TX and RX queue priority configuration") Fixes: 2142754f8b9c ("net: stmmac: Add MAC related callbacks for XGMAC2") Signed-off-by: Piotr Wejman Link: https://lore.kernel.org/r/20240401192239.33942-1-piotrwejman90@gmail.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac4_core.c | 40 ++++++++++++++----- .../ethernet/stmicro/stmmac/dwxgmac2_core.c | 38 ++++++++++++++---- 2 files changed, 62 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index 6b6d0de096197..cef25efbdff99 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -92,19 +92,41 @@ static void dwmac4_rx_queue_priority(struct mac_device_info *hw, u32 prio, u32 queue) { void __iomem *ioaddr = hw->pcsr; - u32 base_register; - u32 value; + u32 clear_mask = 0; + u32 ctrl2, ctrl3; + int i; - base_register = (queue < 4) ? GMAC_RXQ_CTRL2 : GMAC_RXQ_CTRL3; - if (queue >= 4) - queue -= 4; + ctrl2 = readl(ioaddr + GMAC_RXQ_CTRL2); + ctrl3 = readl(ioaddr + GMAC_RXQ_CTRL3); - value = readl(ioaddr + base_register); + /* The software must ensure that the same priority + * is not mapped to multiple Rx queues + */ + for (i = 0; i < 4; i++) + clear_mask |= ((prio << GMAC_RXQCTRL_PSRQX_SHIFT(i)) & + GMAC_RXQCTRL_PSRQX_MASK(i)); + + ctrl2 &= ~clear_mask; + ctrl3 &= ~clear_mask; + + /* First assign new priorities to a queue, then + * clear them from others queues + */ + if (queue < 4) { + ctrl2 |= (prio << GMAC_RXQCTRL_PSRQX_SHIFT(queue)) & + GMAC_RXQCTRL_PSRQX_MASK(queue); - value &= ~GMAC_RXQCTRL_PSRQX_MASK(queue); - value |= (prio << GMAC_RXQCTRL_PSRQX_SHIFT(queue)) & + writel(ctrl2, ioaddr + GMAC_RXQ_CTRL2); + writel(ctrl3, ioaddr + GMAC_RXQ_CTRL3); + } else { + queue -= 4; + + ctrl3 |= (prio << GMAC_RXQCTRL_PSRQX_SHIFT(queue)) & GMAC_RXQCTRL_PSRQX_MASK(queue); - writel(value, ioaddr + base_register); + + writel(ctrl3, ioaddr + GMAC_RXQ_CTRL3); + writel(ctrl2, ioaddr + GMAC_RXQ_CTRL2); + } } static void dwmac4_tx_queue_priority(struct mac_device_info *hw, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index 1af2f89a0504a..e841e312077ef 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -105,17 +105,41 @@ static void dwxgmac2_rx_queue_prio(struct mac_device_info *hw, u32 prio, u32 queue) { void __iomem *ioaddr = hw->pcsr; - u32 value, reg; + u32 clear_mask = 0; + u32 ctrl2, ctrl3; + int i; - reg = (queue < 4) ? XGMAC_RXQ_CTRL2 : XGMAC_RXQ_CTRL3; - if (queue >= 4) + ctrl2 = readl(ioaddr + XGMAC_RXQ_CTRL2); + ctrl3 = readl(ioaddr + XGMAC_RXQ_CTRL3); + + /* The software must ensure that the same priority + * is not mapped to multiple Rx queues + */ + for (i = 0; i < 4; i++) + clear_mask |= ((prio << XGMAC_PSRQ_SHIFT(i)) & + XGMAC_PSRQ(i)); + + ctrl2 &= ~clear_mask; + ctrl3 &= ~clear_mask; + + /* First assign new priorities to a queue, then + * clear them from others queues + */ + if (queue < 4) { + ctrl2 |= (prio << XGMAC_PSRQ_SHIFT(queue)) & + XGMAC_PSRQ(queue); + + writel(ctrl2, ioaddr + XGMAC_RXQ_CTRL2); + writel(ctrl3, ioaddr + XGMAC_RXQ_CTRL3); + } else { queue -= 4; - value = readl(ioaddr + reg); - value &= ~XGMAC_PSRQ(queue); - value |= (prio << XGMAC_PSRQ_SHIFT(queue)) & XGMAC_PSRQ(queue); + ctrl3 |= (prio << XGMAC_PSRQ_SHIFT(queue)) & + XGMAC_PSRQ(queue); - writel(value, ioaddr + reg); + writel(ctrl3, ioaddr + XGMAC_RXQ_CTRL3); + writel(ctrl2, ioaddr + XGMAC_RXQ_CTRL2); + } } static void dwxgmac2_tx_queue_prio(struct mac_device_info *hw, u32 prio, -- GitLab From de99e1ea3a35f23ff83a31d6b08f43d27b2c6345 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 2 Apr 2024 09:16:34 +0200 Subject: [PATCH 397/989] net: phy: micrel: lan8814: Fix when enabling/disabling 1-step timestamping There are 2 issues with the blamed commit. 1. When the phy is initialized, it would enable the disabled of UDPv4 checksums. The UDPv6 checksum is already enabled by default. So when 1-step is configured then it would clear these flags. 2. After the 1-step is configured, then if 2-step is configured then the 1-step would be still configured because it is not clearing the flag. So the sync frames will still have origin timestamps set. Fix this by reading first the value of the register and then just change bit 12 as this one determines if the timestamp needs to be inserted in the frame, without changing any other bits. Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy") Signed-off-by: Horatiu Vultur Reviewed-by: Divya Koppera Link: https://lore.kernel.org/r/20240402071634.2483524-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 0f8a8ad7ea0bd..ddb50a0e2bc82 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -2431,6 +2431,7 @@ static int lan8814_hwtstamp(struct mii_timestamper *mii_ts, struct lan8814_ptp_rx_ts *rx_ts, *tmp; int txcfg = 0, rxcfg = 0; int pkt_ts_enable; + int tx_mod; ptp_priv->hwts_tx_type = config->tx_type; ptp_priv->rx_filter = config->rx_filter; @@ -2477,9 +2478,14 @@ static int lan8814_hwtstamp(struct mii_timestamper *mii_ts, lanphy_write_page_reg(ptp_priv->phydev, 5, PTP_RX_TIMESTAMP_EN, pkt_ts_enable); lanphy_write_page_reg(ptp_priv->phydev, 5, PTP_TX_TIMESTAMP_EN, pkt_ts_enable); - if (ptp_priv->hwts_tx_type == HWTSTAMP_TX_ONESTEP_SYNC) + tx_mod = lanphy_read_page_reg(ptp_priv->phydev, 5, PTP_TX_MOD); + if (ptp_priv->hwts_tx_type == HWTSTAMP_TX_ONESTEP_SYNC) { lanphy_write_page_reg(ptp_priv->phydev, 5, PTP_TX_MOD, - PTP_TX_MOD_TX_PTP_SYNC_TS_INSERT_); + tx_mod | PTP_TX_MOD_TX_PTP_SYNC_TS_INSERT_); + } else if (ptp_priv->hwts_tx_type == HWTSTAMP_TX_ON) { + lanphy_write_page_reg(ptp_priv->phydev, 5, PTP_TX_MOD, + tx_mod & ~PTP_TX_MOD_TX_PTP_SYNC_TS_INSERT_); + } if (config->rx_filter != HWTSTAMP_FILTER_NONE) lan8814_config_ts_intr(ptp_priv->phydev, true); -- GitLab From 7eb322360b0266481e560d1807ee79e0cef5742b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Apr 2024 13:41:33 +0000 Subject: [PATCH 398/989] net/sched: fix lockdep splat in qdisc_tree_reduce_backlog() qdisc_tree_reduce_backlog() is called with the qdisc lock held, not RTNL. We must use qdisc_lookup_rcu() instead of qdisc_lookup() syzbot reported: WARNING: suspicious RCU usage 6.1.74-syzkaller #0 Not tainted ----------------------------- net/sched/sch_api.c:305 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 3 locks held by udevd/1142: #0: ffffffff87c729a0 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:306 [inline] #0: ffffffff87c729a0 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:747 [inline] #0: ffffffff87c729a0 (rcu_read_lock){....}-{1:2}, at: net_tx_action+0x64a/0x970 net/core/dev.c:5282 #1: ffff888171861108 (&sch->q.lock){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:350 [inline] #1: ffff888171861108 (&sch->q.lock){+.-.}-{2:2}, at: net_tx_action+0x754/0x970 net/core/dev.c:5297 #2: ffffffff87c729a0 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:306 [inline] #2: ffffffff87c729a0 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:747 [inline] #2: ffffffff87c729a0 (rcu_read_lock){....}-{1:2}, at: qdisc_tree_reduce_backlog+0x84/0x580 net/sched/sch_api.c:792 stack backtrace: CPU: 1 PID: 1142 Comm: udevd Not tainted 6.1.74-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024 Call Trace: [] __dump_stack lib/dump_stack.c:88 [inline] [] dump_stack_lvl+0x1b1/0x28f lib/dump_stack.c:106 [] dump_stack+0x15/0x1e lib/dump_stack.c:113 [] lockdep_rcu_suspicious+0x1b9/0x260 kernel/locking/lockdep.c:6592 [] qdisc_lookup+0xac/0x6f0 net/sched/sch_api.c:305 [] qdisc_tree_reduce_backlog+0x243/0x580 net/sched/sch_api.c:811 [] pfifo_tail_enqueue+0x32c/0x4b0 net/sched/sch_fifo.c:51 [] qdisc_enqueue include/net/sch_generic.h:833 [inline] [] netem_dequeue+0xeb3/0x15d0 net/sched/sch_netem.c:723 [] dequeue_skb net/sched/sch_generic.c:292 [inline] [] qdisc_restart net/sched/sch_generic.c:397 [inline] [] __qdisc_run+0x249/0x1e60 net/sched/sch_generic.c:415 [] qdisc_run+0xd6/0x260 include/net/pkt_sched.h:125 [] net_tx_action+0x7c9/0x970 net/core/dev.c:5313 [] __do_softirq+0x2bd/0x9bd kernel/softirq.c:616 [] invoke_softirq kernel/softirq.c:447 [inline] [] __irq_exit_rcu+0xca/0x230 kernel/softirq.c:700 [] irq_exit_rcu+0x9/0x20 kernel/softirq.c:712 [] sysvec_apic_timer_interrupt+0x42/0x90 arch/x86/kernel/apic/apic.c:1107 [] asm_sysvec_apic_timer_interrupt+0x1b/0x20 arch/x86/include/asm/idtentry.h:656 Fixes: d636fc5dd692 ("net: sched: add rcu annotations around qdisc->qdisc_sleeping") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Jiri Pirko Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240402134133.2352776-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 65e05b0c98e46..60239378d43fb 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -809,7 +809,7 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) notify = !sch->q.qlen && !WARN_ON_ONCE(!n && !qdisc_is_offloaded); /* TODO: perform the search on a per txq basis */ - sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); + sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { WARN_ON_ONCE(parentid != TC_H_ROOT); break; -- GitLab From c0de6ab920aafb56feab56058e46b688e694a246 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Tue, 2 Apr 2024 12:48:36 -0700 Subject: [PATCH 399/989] net: mana: Fix Rx DMA datasize and skb_over_panic mana_get_rxbuf_cfg() aligns the RX buffer's DMA datasize to be multiple of 64. So a packet slightly bigger than mtu+14, say 1536, can be received and cause skb_over_panic. Sample dmesg: [ 5325.237162] skbuff: skb_over_panic: text:ffffffffc043277a len:1536 put:1536 head:ff1100018b517000 data:ff1100018b517100 tail:0x700 end:0x6ea dev: [ 5325.243689] ------------[ cut here ]------------ [ 5325.245748] kernel BUG at net/core/skbuff.c:192! [ 5325.247838] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [ 5325.258374] RIP: 0010:skb_panic+0x4f/0x60 [ 5325.302941] Call Trace: [ 5325.304389] [ 5325.315794] ? skb_panic+0x4f/0x60 [ 5325.317457] ? asm_exc_invalid_op+0x1f/0x30 [ 5325.319490] ? skb_panic+0x4f/0x60 [ 5325.321161] skb_put+0x4e/0x50 [ 5325.322670] mana_poll+0x6fa/0xb50 [mana] [ 5325.324578] __napi_poll+0x33/0x1e0 [ 5325.326328] net_rx_action+0x12e/0x280 As discussed internally, this alignment is not necessary. To fix this bug, remove it from the code. So oversized packets will be marked as CQE_RX_TRUNCATED by NIC, and dropped. Cc: stable@vger.kernel.org Fixes: 2fbbd712baf1 ("net: mana: Enable RX path to handle various MTU sizes") Signed-off-by: Haiyang Zhang Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/1712087316-20886-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/mana_en.c | 2 +- include/net/mana/mana.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 59287c6e6cee6..d8af5e7e15b4d 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -601,7 +601,7 @@ static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size, *alloc_size = mtu + MANA_RXBUF_PAD + *headroom; - *datasize = ALIGN(mtu + ETH_HLEN, MANA_RX_DATA_ALIGN); + *datasize = mtu + ETH_HLEN; } static int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu) diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 76147feb0d10a..4eeedf14711b3 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -39,7 +39,6 @@ enum TRI_STATE { #define COMP_ENTRY_SIZE 64 #define RX_BUFFERS_PER_QUEUE 512 -#define MANA_RX_DATA_ALIGN 64 #define MAX_SEND_BUFFERS_PER_QUEUE 256 -- GitLab From 3137b83a90646917c90951d66489db466b4ae106 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2024 10:06:48 +0200 Subject: [PATCH 400/989] ata: sata_mv: Fix PCI device ID table declaration compilation warning Building with W=1 shows a warning for an unused variable when CONFIG_PCI is diabled: drivers/ata/sata_mv.c:790:35: error: unused variable 'mv_pci_tbl' [-Werror,-Wunused-const-variable] static const struct pci_device_id mv_pci_tbl[] = { Move the table into the same block that containsn the pci_driver definition. Fixes: 7bb3c5290ca0 ("sata_mv: Remove PCI dependency") Signed-off-by: Arnd Bergmann Signed-off-by: Damien Le Moal --- drivers/ata/sata_mv.c | 63 +++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index e82786c63fbd7..9bec0aee92e04 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -787,37 +787,6 @@ static const struct ata_port_info mv_port_info[] = { }, }; -static const struct pci_device_id mv_pci_tbl[] = { - { PCI_VDEVICE(MARVELL, 0x5040), chip_504x }, - { PCI_VDEVICE(MARVELL, 0x5041), chip_504x }, - { PCI_VDEVICE(MARVELL, 0x5080), chip_5080 }, - { PCI_VDEVICE(MARVELL, 0x5081), chip_508x }, - /* RocketRAID 1720/174x have different identifiers */ - { PCI_VDEVICE(TTI, 0x1720), chip_6042 }, - { PCI_VDEVICE(TTI, 0x1740), chip_6042 }, - { PCI_VDEVICE(TTI, 0x1742), chip_6042 }, - - { PCI_VDEVICE(MARVELL, 0x6040), chip_604x }, - { PCI_VDEVICE(MARVELL, 0x6041), chip_604x }, - { PCI_VDEVICE(MARVELL, 0x6042), chip_6042 }, - { PCI_VDEVICE(MARVELL, 0x6080), chip_608x }, - { PCI_VDEVICE(MARVELL, 0x6081), chip_608x }, - - { PCI_VDEVICE(ADAPTEC2, 0x0241), chip_604x }, - - /* Adaptec 1430SA */ - { PCI_VDEVICE(ADAPTEC2, 0x0243), chip_7042 }, - - /* Marvell 7042 support */ - { PCI_VDEVICE(MARVELL, 0x7042), chip_7042 }, - - /* Highpoint RocketRAID PCIe series */ - { PCI_VDEVICE(TTI, 0x2300), chip_7042 }, - { PCI_VDEVICE(TTI, 0x2310), chip_7042 }, - - { } /* terminate list */ -}; - static const struct mv_hw_ops mv5xxx_ops = { .phy_errata = mv5_phy_errata, .enable_leds = mv5_enable_leds, @@ -4303,6 +4272,36 @@ static int mv_pci_init_one(struct pci_dev *pdev, static int mv_pci_device_resume(struct pci_dev *pdev); #endif +static const struct pci_device_id mv_pci_tbl[] = { + { PCI_VDEVICE(MARVELL, 0x5040), chip_504x }, + { PCI_VDEVICE(MARVELL, 0x5041), chip_504x }, + { PCI_VDEVICE(MARVELL, 0x5080), chip_5080 }, + { PCI_VDEVICE(MARVELL, 0x5081), chip_508x }, + /* RocketRAID 1720/174x have different identifiers */ + { PCI_VDEVICE(TTI, 0x1720), chip_6042 }, + { PCI_VDEVICE(TTI, 0x1740), chip_6042 }, + { PCI_VDEVICE(TTI, 0x1742), chip_6042 }, + + { PCI_VDEVICE(MARVELL, 0x6040), chip_604x }, + { PCI_VDEVICE(MARVELL, 0x6041), chip_604x }, + { PCI_VDEVICE(MARVELL, 0x6042), chip_6042 }, + { PCI_VDEVICE(MARVELL, 0x6080), chip_608x }, + { PCI_VDEVICE(MARVELL, 0x6081), chip_608x }, + + { PCI_VDEVICE(ADAPTEC2, 0x0241), chip_604x }, + + /* Adaptec 1430SA */ + { PCI_VDEVICE(ADAPTEC2, 0x0243), chip_7042 }, + + /* Marvell 7042 support */ + { PCI_VDEVICE(MARVELL, 0x7042), chip_7042 }, + + /* Highpoint RocketRAID PCIe series */ + { PCI_VDEVICE(TTI, 0x2300), chip_7042 }, + { PCI_VDEVICE(TTI, 0x2310), chip_7042 }, + + { } /* terminate list */ +}; static struct pci_driver mv_pci_driver = { .name = DRV_NAME, @@ -4315,6 +4314,7 @@ static struct pci_driver mv_pci_driver = { #endif }; +MODULE_DEVICE_TABLE(pci, mv_pci_tbl); /** * mv_print_info - Dump key info to kernel log for perusal. @@ -4487,7 +4487,6 @@ static void __exit mv_exit(void) MODULE_AUTHOR("Brett Russ"); MODULE_DESCRIPTION("SCSI low-level driver for Marvell SATA controllers"); MODULE_LICENSE("GPL v2"); -MODULE_DEVICE_TABLE(pci, mv_pci_tbl); MODULE_VERSION(DRV_VERSION); MODULE_ALIAS("platform:" DRV_NAME); -- GitLab From e85006ae7430aef780cc4f0849692e266a102ec0 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 3 Apr 2024 04:33:49 +0000 Subject: [PATCH 401/989] ata: sata_gemini: Check clk_enable() result The call to clk_enable() in gemini_sata_start_bridge() can fail. Add a check to detect such failure. Signed-off-by: Chen Ni Signed-off-by: Damien Le Moal --- drivers/ata/sata_gemini.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/ata/sata_gemini.c b/drivers/ata/sata_gemini.c index 400b22ee99c33..4c270999ba3cc 100644 --- a/drivers/ata/sata_gemini.c +++ b/drivers/ata/sata_gemini.c @@ -200,7 +200,10 @@ int gemini_sata_start_bridge(struct sata_gemini *sg, unsigned int bridge) pclk = sg->sata0_pclk; else pclk = sg->sata1_pclk; - clk_enable(pclk); + ret = clk_enable(pclk); + if (ret) + return ret; + msleep(10); /* Do not keep clocking a bridge that is not online */ -- GitLab From 9852b1dc6a140365977d7bfb5fa03d413b3417ad Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2024 22:23:37 +0200 Subject: [PATCH 402/989] x86/numa/32: Include missing The __vmalloc_start_set declaration is in a header that is not included in numa_32.c in current linux-next: arch/x86/mm/numa_32.c: In function 'initmem_init': arch/x86/mm/numa_32.c:57:9: error: '__vmalloc_start_set' undeclared (first use in this function) 57 | __vmalloc_start_set = true; | ^~~~~~~~~~~~~~~~~~~ arch/x86/mm/numa_32.c:57:9: note: each undeclared identifier is reported only once for each function it appears in Add an explicit #include. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240403202344.3463169-1-arnd@kernel.org --- arch/x86/mm/numa_32.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 104544359d69c..025fd7ea5d69f 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -24,6 +24,7 @@ #include #include +#include #include "numa_internal.h" -- GitLab From 7d8ed162e6a92268d4b2b84d364a931216102c8e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 2 Apr 2024 13:26:59 +0000 Subject: [PATCH 403/989] memblock tests: fix undefined reference to `early_pfn_to_nid' commit 6a9531c3a880 ("memblock: fix crash when reserved memory is not added to memory") introduce the usage of early_pfn_to_nid, which is not defined in memblock tests. The original definition of early_pfn_to_nid is defined in mm.h, so let add this in the corresponding mm.h. Signed-off-by: Wei Yang CC: Yajun Deng CC: Mike Rapoport Link: https://lore.kernel.org/r/20240402132701.29744-2-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- tools/include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h index f3c82ab5b14cd..7d73da0980473 100644 --- a/tools/include/linux/mm.h +++ b/tools/include/linux/mm.h @@ -37,4 +37,9 @@ static inline void totalram_pages_add(long count) { } +static inline int early_pfn_to_nid(unsigned long pfn) +{ + return 0; +} + #endif -- GitLab From e0f5a8e74be88f2476e58b25d3b49a9521bdc4ec Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 2 Apr 2024 13:27:00 +0000 Subject: [PATCH 404/989] memblock tests: fix undefined reference to `panic' commit e96c6b8f212a ("memblock: report failures when memblock_can_resize is not set") introduced the usage of panic, which is not defined in memblock test. Let's define it directly in panic.h to fix it. Signed-off-by: Wei Yang CC: Song Shuai CC: Mike Rapoport Link: https://lore.kernel.org/r/20240402132701.29744-3-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- tools/include/linux/kernel.h | 1 + tools/include/linux/panic.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 tools/include/linux/panic.h diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index 4b0673bf52c2e..07cfad817d539 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/tools/include/linux/panic.h b/tools/include/linux/panic.h new file mode 100644 index 0000000000000..9c8f17a41ce8e --- /dev/null +++ b/tools/include/linux/panic.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_PANIC_H +#define _TOOLS_LINUX_PANIC_H + +#include +#include +#include + +static inline void panic(const char *fmt, ...) +{ + va_list argp; + + va_start(argp, fmt); + vfprintf(stderr, fmt, argp); + va_end(argp); + exit(-1); +} + +#endif -- GitLab From 592447f6cb3c20d606d6c5d8e6af68e99707b786 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 2 Apr 2024 13:27:01 +0000 Subject: [PATCH 405/989] memblock tests: fix undefined reference to `BIT' commit 772dd0342727 ("mm: enumerate all gfp flags") define gfp flags with the help of BIT, while gfp_types.h doesn't include header file for the definition. This through an error on building memblock tests. Let's include linux/bits.h to fix it. Signed-off-by: Wei Yang CC: Suren Baghdasaryan CC: Michal Hocko Link: https://lore.kernel.org/r/20240402132701.29744-4-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- include/linux/gfp_types.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 868c8fb1bbc1c..13becafe41df0 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -2,6 +2,8 @@ #ifndef __LINUX_GFP_TYPES_H #define __LINUX_GFP_TYPES_H +#include + /* The typedef is in types.h but we want the documentation here */ #if 0 /** -- GitLab From 9ab4ad295622a3481818856762471c1f8c830e18 Mon Sep 17 00:00:00 2001 From: Nikita Kiryushin Date: Mon, 1 Apr 2024 22:14:18 +0300 Subject: [PATCH 406/989] tg3: Remove residual error handling in tg3_suspend As of now, tg3_power_down_prepare always ends with success, but the error handling code from former tg3_set_power_state call is still here. This code became unreachable in commit c866b7eac073 ("tg3: Do not use legacy PCI power management"). Remove (now unreachable) error handling code for simplification and change tg3_power_down_prepare to a void function as its result is no more checked. Signed-off-by: Nikita Kiryushin Reviewed-by: Michael Chan Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240401191418.361747-1-kiryushin@ancud.ru Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/tg3.c | 30 ++++------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 62ff4381ac83c..e6ff3c9bd7e50 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -4019,7 +4019,7 @@ static int tg3_power_up(struct tg3 *tp) static int tg3_setup_phy(struct tg3 *, bool); -static int tg3_power_down_prepare(struct tg3 *tp) +static void tg3_power_down_prepare(struct tg3 *tp) { u32 misc_host_ctrl; bool device_should_wake, do_low_power; @@ -4263,7 +4263,7 @@ static int tg3_power_down_prepare(struct tg3 *tp) tg3_ape_driver_state_change(tp, RESET_KIND_SHUTDOWN); - return 0; + return; } static void tg3_power_down(struct tg3 *tp) @@ -18084,7 +18084,6 @@ static int tg3_suspend(struct device *device) { struct net_device *dev = dev_get_drvdata(device); struct tg3 *tp = netdev_priv(dev); - int err = 0; rtnl_lock(); @@ -18108,32 +18107,11 @@ static int tg3_suspend(struct device *device) tg3_flag_clear(tp, INIT_COMPLETE); tg3_full_unlock(tp); - err = tg3_power_down_prepare(tp); - if (err) { - int err2; - - tg3_full_lock(tp, 0); - - tg3_flag_set(tp, INIT_COMPLETE); - err2 = tg3_restart_hw(tp, true); - if (err2) - goto out; - - tg3_timer_start(tp); - - netif_device_attach(dev); - tg3_netif_start(tp); - -out: - tg3_full_unlock(tp); - - if (!err2) - tg3_phy_start(tp); - } + tg3_power_down_prepare(tp); unlock: rtnl_unlock(); - return err; + return 0; } static int tg3_resume(struct device *device) -- GitLab From 99485c4c026f024e7cb82da84c7951dbe3deb584 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 26 Mar 2024 17:07:35 +0100 Subject: [PATCH 407/989] x86/coco: Require seeding RNG with RDRAND on CoCo systems There are few uses of CoCo that don't rely on working cryptography and hence a working RNG. Unfortunately, the CoCo threat model means that the VM host cannot be trusted and may actively work against guests to extract secrets or manipulate computation. Since a malicious host can modify or observe nearly all inputs to guests, the only remaining source of entropy for CoCo guests is RDRAND. If RDRAND is broken -- due to CPU hardware fault -- the RNG as a whole is meant to gracefully continue on gathering entropy from other sources, but since there aren't other sources on CoCo, this is catastrophic. This is mostly a concern at boot time when initially seeding the RNG, as after that the consequences of a broken RDRAND are much more theoretical. So, try at boot to seed the RNG using 256 bits of RDRAND output. If this fails, panic(). This will also trigger if the system is booted without RDRAND, as RDRAND is essential for a safe CoCo boot. Add this deliberately to be "just a CoCo x86 driver feature" and not part of the RNG itself. Many device drivers and platforms have some desire to contribute something to the RNG, and add_device_randomness() is specifically meant for this purpose. Any driver can call it with seed data of any quality, or even garbage quality, and it can only possibly make the quality of the RNG better or have no effect, but can never make it worse. Rather than trying to build something into the core of the RNG, consider the particular CoCo issue just a CoCo issue, and therefore separate it all out into driver (well, arch/platform) code. [ bp: Massage commit message. ] Signed-off-by: Jason A. Donenfeld Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Elena Reshetova Reviewed-by: Kirill A. Shutemov Reviewed-by: Theodore Ts'o Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240326160735.73531-1-Jason@zx2c4.com --- arch/x86/coco/core.c | 41 +++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/coco.h | 2 ++ arch/x86/kernel/setup.c | 2 ++ 3 files changed, 45 insertions(+) diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index d07be9d05cd03..ddd4efdc79d66 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -3,13 +3,17 @@ * Confidential Computing Platform Capability checks * * Copyright (C) 2021 Advanced Micro Devices, Inc. + * Copyright (C) 2024 Jason A. Donenfeld . All Rights Reserved. * * Author: Tom Lendacky */ #include #include +#include +#include +#include #include #include @@ -148,3 +152,40 @@ u64 cc_mkdec(u64 val) } } EXPORT_SYMBOL_GPL(cc_mkdec); + +__init void cc_random_init(void) +{ + /* + * The seed is 32 bytes (in units of longs), which is 256 bits, which + * is the security level that the RNG is targeting. + */ + unsigned long rng_seed[32 / sizeof(long)]; + size_t i, longs; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return; + + /* + * Since the CoCo threat model includes the host, the only reliable + * source of entropy that can be neither observed nor manipulated is + * RDRAND. Usually, RDRAND failure is considered tolerable, but since + * CoCo guests have no other unobservable source of entropy, it's + * important to at least ensure the RNG gets some initial random seeds. + */ + for (i = 0; i < ARRAY_SIZE(rng_seed); i += longs) { + longs = arch_get_random_longs(&rng_seed[i], ARRAY_SIZE(rng_seed) - i); + + /* + * A zero return value means that the guest doesn't have RDRAND + * or the CPU is physically broken, and in both cases that + * means most crypto inside of the CoCo instance will be + * broken, defeating the purpose of CoCo in the first place. So + * just panic here because it's absolutely unsafe to continue + * executing. + */ + if (longs == 0) + panic("RDRAND is defective."); + } + add_device_randomness(rng_seed, sizeof(rng_seed)); + memzero_explicit(rng_seed, sizeof(rng_seed)); +} diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index fb7388bbc212f..c086699b0d0c5 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -22,6 +22,7 @@ static inline void cc_set_mask(u64 mask) u64 cc_mkenc(u64 val); u64 cc_mkdec(u64 val); +void cc_random_init(void); #else #define cc_vendor (CC_VENDOR_NONE) @@ -34,6 +35,7 @@ static inline u64 cc_mkdec(u64 val) { return val; } +static inline void cc_random_init(void) { } #endif #endif /* _ASM_X86_COCO_H */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0109e6c510e02..e125e059e2c45 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -991,6 +992,7 @@ void __init setup_arch(char **cmdline_p) * memory size. */ mem_encrypt_setup_arch(); + cc_random_init(); efi_fake_memmap(); efi_find_mirror(); -- GitLab From 54f5f47b6055c6b57cbc41a440f8ca8b2ec4275a Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 27 Mar 2024 16:43:15 +0100 Subject: [PATCH 408/989] x86/kvm/Kconfig: Have KVM_AMD_SEV select ARCH_HAS_CC_PLATFORM The functionality to load SEV-SNP guests by the host will soon rely on cc_platform* helpers because the cpu_feature* API with the early patching is insufficient when SNP support needs to be disabled late. Therefore, pull that functionality in. Fixes: 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Tested-by: Srikanth Aithal Link: https://lore.kernel.org/r/20240327154317.29909-4-bp@alien8.de --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3aaf7e86a859a..0ebdd088f28b8 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -122,6 +122,7 @@ config KVM_AMD_SEV default y depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) + select ARCH_HAS_CC_PLATFORM help Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. -- GitLab From bc6f707fc0feec72acc2f49c312eb31d257363a3 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 27 Mar 2024 16:43:16 +0100 Subject: [PATCH 409/989] x86/cc: Add cc_platform_set/_clear() helpers Add functionality to set and/or clear different attributes of the machine as a confidential computing platform. Add the first one too: whether the machine is running as a host for SEV-SNP guests. Fixes: 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Tested-by: Srikanth Aithal Link: https://lore.kernel.org/r/20240327154317.29909-5-bp@alien8.de --- arch/x86/coco/core.c | 52 +++++++++++++++++++++++++++++++++++++ include/linux/cc_platform.h | 12 +++++++++ 2 files changed, 64 insertions(+) diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index ddd4efdc79d66..b31ef2424d194 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -20,6 +20,11 @@ enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE; u64 cc_mask __ro_after_init; +static struct cc_attr_flags { + __u64 host_sev_snp : 1, + __resv : 63; +} cc_flags; + static bool noinstr intel_cc_platform_has(enum cc_attr attr) { switch (attr) { @@ -93,6 +98,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_GUEST_SEV_SNP: return sev_status & MSR_AMD64_SEV_SNP_ENABLED; + case CC_ATTR_HOST_SEV_SNP: + return cc_flags.host_sev_snp; + default: return false; } @@ -153,6 +161,50 @@ u64 cc_mkdec(u64 val) } EXPORT_SYMBOL_GPL(cc_mkdec); +static void amd_cc_platform_clear(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_HOST_SEV_SNP: + cc_flags.host_sev_snp = 0; + break; + default: + break; + } +} + +void cc_platform_clear(enum cc_attr attr) +{ + switch (cc_vendor) { + case CC_VENDOR_AMD: + amd_cc_platform_clear(attr); + break; + default: + break; + } +} + +static void amd_cc_platform_set(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_HOST_SEV_SNP: + cc_flags.host_sev_snp = 1; + break; + default: + break; + } +} + +void cc_platform_set(enum cc_attr attr) +{ + switch (cc_vendor) { + case CC_VENDOR_AMD: + amd_cc_platform_set(attr); + break; + default: + break; + } +} + __init void cc_random_init(void) { /* diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index cb0d6cd1c12f2..60693a1458946 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -90,6 +90,14 @@ enum cc_attr { * Examples include TDX Guest. */ CC_ATTR_HOTPLUG_DISABLED, + + /** + * @CC_ATTR_HOST_SEV_SNP: AMD SNP enabled on the host. + * + * The host kernel is running with the necessary features + * enabled to run SEV-SNP guests. + */ + CC_ATTR_HOST_SEV_SNP, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM @@ -107,10 +115,14 @@ enum cc_attr { * * FALSE - Specified Confidential Computing attribute is not active */ bool cc_platform_has(enum cc_attr attr); +void cc_platform_set(enum cc_attr attr); +void cc_platform_clear(enum cc_attr attr); #else /* !CONFIG_ARCH_HAS_CC_PLATFORM */ static inline bool cc_platform_has(enum cc_attr attr) { return false; } +static inline void cc_platform_set(enum cc_attr attr) { } +static inline void cc_platform_clear(enum cc_attr attr) { } #endif /* CONFIG_ARCH_HAS_CC_PLATFORM */ -- GitLab From 0ecaefb303de69929dc0036d5021d01cec7ea052 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 27 Mar 2024 16:43:17 +0100 Subject: [PATCH 410/989] x86/CPU/AMD: Track SNP host status with cc_platform_*() The host SNP worthiness can determined later, after alternatives have been patched, in snp_rmptable_init() depending on cmdline options like iommu=pt which is incompatible with SNP, for example. Which means that one cannot use X86_FEATURE_SEV_SNP and will need to have a special flag for that control. Use that newly added CC_ATTR_HOST_SEV_SNP in the appropriate places. Move kdump_sev_callback() to its rightful place, while at it. Fixes: 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Tested-by: Srikanth Aithal Link: https://lore.kernel.org/r/20240327154317.29909-6-bp@alien8.de --- arch/x86/include/asm/sev.h | 4 ++-- arch/x86/kernel/cpu/amd.c | 38 ++++++++++++++++++------------ arch/x86/kernel/cpu/mtrr/generic.c | 2 +- arch/x86/kernel/sev.c | 10 -------- arch/x86/kvm/svm/sev.c | 2 +- arch/x86/virt/svm/sev.c | 26 +++++++++++++------- drivers/crypto/ccp/sev-dev.c | 2 +- drivers/iommu/amd/init.c | 4 +++- 8 files changed, 49 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 07e125f325283..7f57382afee41 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -228,7 +228,6 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct sn void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 snp_get_unsupported_features(u64 status); u64 sev_get_status(void); -void kdump_sev_callback(void); void sev_show_status(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } @@ -258,7 +257,6 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 snp_get_unsupported_features(u64 status) { return 0; } static inline u64 sev_get_status(void) { return 0; } -static inline void kdump_sev_callback(void) { } static inline void sev_show_status(void) { } #endif @@ -270,6 +268,7 @@ int psmash(u64 pfn); int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable); int rmp_make_shared(u64 pfn, enum pg_level level); void snp_leak_pages(u64 pfn, unsigned int npages); +void kdump_sev_callback(void); #else static inline bool snp_probe_rmptable_info(void) { return false; } static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; } @@ -282,6 +281,7 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 as } static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} +static inline void kdump_sev_callback(void) { } #endif #endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 6d8677e80ddbb..9bf17c9c29dad 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -345,6 +345,28 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } +static void bsp_determine_snp(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_ARCH_HAS_CC_PLATFORM + cc_vendor = CC_VENDOR_AMD; + + if (cpu_has(c, X86_FEATURE_SEV_SNP)) { + /* + * RMP table entry format is not architectural and is defined by the + * per-processor PPR. Restrict SNP support on the known CPU models + * for which the RMP table entry format is currently defined for. + */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && + c->x86 >= 0x19 && snp_probe_rmptable_info()) { + cc_platform_set(CC_ATTR_HOST_SEV_SNP); + } else { + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); + } + } +#endif +} + static void bsp_init_amd(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -452,21 +474,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) break; } - if (cpu_has(c, X86_FEATURE_SEV_SNP)) { - /* - * RMP table entry format is not architectural and it can vary by processor - * and is defined by the per-processor PPR. Restrict SNP support on the - * known CPU model and family for which the RMP table entry format is - * currently defined for. - */ - if (!boot_cpu_has(X86_FEATURE_ZEN3) && - !boot_cpu_has(X86_FEATURE_ZEN4) && - !boot_cpu_has(X86_FEATURE_ZEN5)) - setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); - else if (!snp_probe_rmptable_info()) - setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); - } - + bsp_determine_snp(c); return; warn: diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 422a4ddc2ab7c..7b29ebda024f4 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -108,7 +108,7 @@ static inline void k8_check_syscfg_dram_mod_en(void) (boot_cpu_data.x86 >= 0x0f))) return; - if (cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return; rdmsr(MSR_AMD64_SYSCFG, lo, hi); diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 7e1e63cc48e67..38ad066179d81 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -2284,16 +2284,6 @@ static int __init snp_init_platform_device(void) } device_initcall(snp_init_platform_device); -void kdump_sev_callback(void) -{ - /* - * Do wbinvd() on remote CPUs when SNP is enabled in order to - * safely do SNP_SHUTDOWN on the local CPU. - */ - if (cpu_feature_enabled(X86_FEATURE_SEV_SNP)) - wbinvd(); -} - void sev_show_status(void) { int i; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index ae0ac12382b92..3d310b473e059 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3174,7 +3174,7 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) unsigned long pfn; struct page *p; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); /* diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index cffe1157a90ac..ab0e8448bb6eb 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -77,7 +77,7 @@ static int __mfd_enable(unsigned int cpu) { u64 val; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return 0; rdmsrl(MSR_AMD64_SYSCFG, val); @@ -98,7 +98,7 @@ static int __snp_enable(unsigned int cpu) { u64 val; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return 0; rdmsrl(MSR_AMD64_SYSCFG, val); @@ -174,11 +174,11 @@ static int __init snp_rmptable_init(void) u64 rmptable_size; u64 val; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return 0; if (!amd_iommu_snp_en) - return 0; + goto nosnp; if (!probed_rmp_size) goto nosnp; @@ -225,7 +225,7 @@ skip_enable: return 0; nosnp: - setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); return -ENOSYS; } @@ -246,7 +246,7 @@ static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level) { struct rmpentry *large_entry, *entry; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return ERR_PTR(-ENODEV); entry = get_rmpentry(pfn); @@ -363,7 +363,7 @@ int psmash(u64 pfn) unsigned long paddr = pfn << PAGE_SHIFT; int ret; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return -ENODEV; if (!pfn_valid(pfn)) @@ -472,7 +472,7 @@ static int rmpupdate(u64 pfn, struct rmp_state *state) unsigned long paddr = pfn << PAGE_SHIFT; int ret, level; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return -ENODEV; level = RMP_TO_PG_LEVEL(state->pagesize); @@ -558,3 +558,13 @@ void snp_leak_pages(u64 pfn, unsigned int npages) spin_unlock(&snp_leaked_pages_list_lock); } EXPORT_SYMBOL_GPL(snp_leak_pages); + +void kdump_sev_callback(void) +{ + /* + * Do wbinvd() on remote CPUs when SNP is enabled in order to + * safely do SNP_SHUTDOWN on the local CPU. + */ + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + wbinvd(); +} diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index f44efbb89c346..2102377f727b1 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -1090,7 +1090,7 @@ static int __sev_snp_init_locked(int *error) void *arg = &data; int cmd, rc = 0; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return -ENODEV; sev = psp->sev_data; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index e7a44929f0daf..33228c1c8980f 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3228,7 +3228,7 @@ out: static void iommu_snp_enable(void) { #ifdef CONFIG_KVM_AMD_SEV - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return; /* * The SNP support requires that IOMMU must be enabled, and is @@ -3236,12 +3236,14 @@ static void iommu_snp_enable(void) */ if (no_iommu || iommu_default_passthrough()) { pr_err("SNP: IOMMU disabled or configured in passthrough mode, SNP cannot be supported.\n"); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); return; } amd_iommu_snp_en = check_feature(FEATURE_SNP); if (!amd_iommu_snp_en) { pr_err("SNP: IOMMU SNP feature not enabled, SNP cannot be supported.\n"); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); return; } -- GitLab From 72076fc9fe60b9143cd971fd8737718719bc512e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 4 Apr 2024 10:51:01 +0200 Subject: [PATCH 411/989] Revert "tg3: Remove residual error handling in tg3_suspend" This reverts commit 9ab4ad295622a3481818856762471c1f8c830e18. I went out of coffee and applied it to the wrong tree. Blame on me. Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/tg3.c | 30 +++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index e6ff3c9bd7e50..62ff4381ac83c 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -4019,7 +4019,7 @@ static int tg3_power_up(struct tg3 *tp) static int tg3_setup_phy(struct tg3 *, bool); -static void tg3_power_down_prepare(struct tg3 *tp) +static int tg3_power_down_prepare(struct tg3 *tp) { u32 misc_host_ctrl; bool device_should_wake, do_low_power; @@ -4263,7 +4263,7 @@ static void tg3_power_down_prepare(struct tg3 *tp) tg3_ape_driver_state_change(tp, RESET_KIND_SHUTDOWN); - return; + return 0; } static void tg3_power_down(struct tg3 *tp) @@ -18084,6 +18084,7 @@ static int tg3_suspend(struct device *device) { struct net_device *dev = dev_get_drvdata(device); struct tg3 *tp = netdev_priv(dev); + int err = 0; rtnl_lock(); @@ -18107,11 +18108,32 @@ static int tg3_suspend(struct device *device) tg3_flag_clear(tp, INIT_COMPLETE); tg3_full_unlock(tp); - tg3_power_down_prepare(tp); + err = tg3_power_down_prepare(tp); + if (err) { + int err2; + + tg3_full_lock(tp, 0); + + tg3_flag_set(tp, INIT_COMPLETE); + err2 = tg3_restart_hw(tp, true); + if (err2) + goto out; + + tg3_timer_start(tp); + + netif_device_attach(dev); + tg3_netif_start(tp); + +out: + tg3_full_unlock(tp); + + if (!err2) + tg3_phy_start(tp); + } unlock: rtnl_unlock(); - return 0; + return err; } static int tg3_resume(struct device *device) -- GitLab From a45e6889575c2067d3c0212b6bc1022891e65b91 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 28 Mar 2024 13:27:36 +0100 Subject: [PATCH 412/989] netfilter: nf_tables: release batch on table validation from abort path Unlike early commit path stage which triggers a call to abort, an explicit release of the batch is required on abort, otherwise mutex is released and commit_list remains in place. Add WARN_ON_ONCE to ensure commit_list is empty from the abort path before releasing the mutex. After this patch, commit_list is always assumed to be empty before grabbing the mutex, therefore 03c1f1ef1584 ("netfilter: Cleanup nft_net->module_list from nf_tables_exit_net()") only needs to release the pending modules for registration. Cc: stable@vger.kernel.org Fixes: c0391b6ab810 ("netfilter: nf_tables: missing validation from the abort path") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fd86f2720c9e7..ffcd3213c3358 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10455,10 +10455,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) struct nft_trans *trans, *next; LIST_HEAD(set_update_list); struct nft_trans_elem *te; + int err = 0; if (action == NFNL_ABORT_VALIDATE && nf_tables_validate(net) < 0) - return -EAGAIN; + err = -EAGAIN; list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { @@ -10655,7 +10656,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) else nf_tables_module_autoload_cleanup(net); - return 0; + return err; } static int nf_tables_abort(struct net *net, struct sk_buff *skb, @@ -10668,6 +10669,9 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, gc_seq = nft_gc_seq_begin(nft_net); ret = __nf_tables_abort(net, action); nft_gc_seq_end(nft_net, gc_seq); + + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + mutex_unlock(&nft_net->commit_mutex); return ret; @@ -11473,9 +11477,10 @@ static void __net_exit nf_tables_exit_net(struct net *net) gc_seq = nft_gc_seq_begin(nft_net); - if (!list_empty(&nft_net->commit_list) || - !list_empty(&nft_net->module_list)) - __nf_tables_abort(net, NFNL_ABORT_NONE); + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + + if (!list_empty(&nft_net->module_list)) + nf_tables_module_autoload_cleanup(net); __nft_release_tables(net); -- GitLab From 0d459e2ffb541841714839e8228b845458ed3b27 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 28 Mar 2024 14:23:55 +0100 Subject: [PATCH 413/989] netfilter: nf_tables: release mutex after nft_gc_seq_end from abort path The commit mutex should not be released during the critical section between nft_gc_seq_begin() and nft_gc_seq_end(), otherwise, async GC worker could collect expired objects and get the released commit lock within the same GC sequence. nf_tables_module_autoload() temporarily releases the mutex to load module dependencies, then it goes back to replay the transaction again. Move it at the end of the abort phase after nft_gc_seq_end() is called. Cc: stable@vger.kernel.org Fixes: 720344340fb9 ("netfilter: nf_tables: GC transaction race with abort path") Reported-by: Kuan-Ting Chen Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ffcd3213c3358..0d432d0674e12 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10651,11 +10651,6 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nf_tables_abort_release(trans); } - if (action == NFNL_ABORT_AUTOLOAD) - nf_tables_module_autoload(net); - else - nf_tables_module_autoload_cleanup(net); - return err; } @@ -10672,6 +10667,14 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + /* module autoload needs to happen after GC sequence update because it + * temporarily releases and grabs mutex again. + */ + if (action == NFNL_ABORT_AUTOLOAD) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + mutex_unlock(&nft_net->commit_mutex); return ret; -- GitLab From 24cea9677025e0de419989ecb692acd4bb34cac2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 2 Apr 2024 18:04:36 +0200 Subject: [PATCH 414/989] netfilter: nf_tables: flush pending destroy work before exit_net release Similar to 2c9f0293280e ("netfilter: nf_tables: flush pending destroy work before netlink notifier") to address a race between exit_net and the destroy workqueue. The trace below shows an element to be released via destroy workqueue while exit_net path (triggered via module removal) has already released the set that is used in such transaction. [ 1360.547789] BUG: KASAN: slab-use-after-free in nf_tables_trans_destroy_work+0x3f5/0x590 [nf_tables] [ 1360.547861] Read of size 8 at addr ffff888140500cc0 by task kworker/4:1/152465 [ 1360.547870] CPU: 4 PID: 152465 Comm: kworker/4:1 Not tainted 6.8.0+ #359 [ 1360.547882] Workqueue: events nf_tables_trans_destroy_work [nf_tables] [ 1360.547984] Call Trace: [ 1360.547991] [ 1360.547998] dump_stack_lvl+0x53/0x70 [ 1360.548014] print_report+0xc4/0x610 [ 1360.548026] ? __virt_addr_valid+0xba/0x160 [ 1360.548040] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 1360.548054] ? nf_tables_trans_destroy_work+0x3f5/0x590 [nf_tables] [ 1360.548176] kasan_report+0xae/0xe0 [ 1360.548189] ? nf_tables_trans_destroy_work+0x3f5/0x590 [nf_tables] [ 1360.548312] nf_tables_trans_destroy_work+0x3f5/0x590 [nf_tables] [ 1360.548447] ? __pfx_nf_tables_trans_destroy_work+0x10/0x10 [nf_tables] [ 1360.548577] ? _raw_spin_unlock_irq+0x18/0x30 [ 1360.548591] process_one_work+0x2f1/0x670 [ 1360.548610] worker_thread+0x4d3/0x760 [ 1360.548627] ? __pfx_worker_thread+0x10/0x10 [ 1360.548640] kthread+0x16b/0x1b0 [ 1360.548653] ? __pfx_kthread+0x10/0x10 [ 1360.548665] ret_from_fork+0x2f/0x50 [ 1360.548679] ? __pfx_kthread+0x10/0x10 [ 1360.548690] ret_from_fork_asm+0x1a/0x30 [ 1360.548707] [ 1360.548719] Allocated by task 192061: [ 1360.548726] kasan_save_stack+0x20/0x40 [ 1360.548739] kasan_save_track+0x14/0x30 [ 1360.548750] __kasan_kmalloc+0x8f/0xa0 [ 1360.548760] __kmalloc_node+0x1f1/0x450 [ 1360.548771] nf_tables_newset+0x10c7/0x1b50 [nf_tables] [ 1360.548883] nfnetlink_rcv_batch+0xbc4/0xdc0 [nfnetlink] [ 1360.548909] nfnetlink_rcv+0x1a8/0x1e0 [nfnetlink] [ 1360.548927] netlink_unicast+0x367/0x4f0 [ 1360.548935] netlink_sendmsg+0x34b/0x610 [ 1360.548944] ____sys_sendmsg+0x4d4/0x510 [ 1360.548953] ___sys_sendmsg+0xc9/0x120 [ 1360.548961] __sys_sendmsg+0xbe/0x140 [ 1360.548971] do_syscall_64+0x55/0x120 [ 1360.548982] entry_SYSCALL_64_after_hwframe+0x55/0x5d [ 1360.548994] Freed by task 192222: [ 1360.548999] kasan_save_stack+0x20/0x40 [ 1360.549009] kasan_save_track+0x14/0x30 [ 1360.549019] kasan_save_free_info+0x3b/0x60 [ 1360.549028] poison_slab_object+0x100/0x180 [ 1360.549036] __kasan_slab_free+0x14/0x30 [ 1360.549042] kfree+0xb6/0x260 [ 1360.549049] __nft_release_table+0x473/0x6a0 [nf_tables] [ 1360.549131] nf_tables_exit_net+0x170/0x240 [nf_tables] [ 1360.549221] ops_exit_list+0x50/0xa0 [ 1360.549229] free_exit_list+0x101/0x140 [ 1360.549236] unregister_pernet_operations+0x107/0x160 [ 1360.549245] unregister_pernet_subsys+0x1c/0x30 [ 1360.549254] nf_tables_module_exit+0x43/0x80 [nf_tables] [ 1360.549345] __do_sys_delete_module+0x253/0x370 [ 1360.549352] do_syscall_64+0x55/0x120 [ 1360.549360] entry_SYSCALL_64_after_hwframe+0x55/0x5d (gdb) list *__nft_release_table+0x473 0x1e033 is in __nft_release_table (net/netfilter/nf_tables_api.c:11354). 11349 list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { 11350 list_del(&flowtable->list); 11351 nft_use_dec(&table->use); 11352 nf_tables_flowtable_destroy(flowtable); 11353 } 11354 list_for_each_entry_safe(set, ns, &table->sets, list) { 11355 list_del(&set->list); 11356 nft_use_dec(&table->use); 11357 if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) 11358 nft_map_deactivate(&ctx, set); (gdb) [ 1360.549372] Last potentially related work creation: [ 1360.549376] kasan_save_stack+0x20/0x40 [ 1360.549384] __kasan_record_aux_stack+0x9b/0xb0 [ 1360.549392] __queue_work+0x3fb/0x780 [ 1360.549399] queue_work_on+0x4f/0x60 [ 1360.549407] nft_rhash_remove+0x33b/0x340 [nf_tables] [ 1360.549516] nf_tables_commit+0x1c6a/0x2620 [nf_tables] [ 1360.549625] nfnetlink_rcv_batch+0x728/0xdc0 [nfnetlink] [ 1360.549647] nfnetlink_rcv+0x1a8/0x1e0 [nfnetlink] [ 1360.549671] netlink_unicast+0x367/0x4f0 [ 1360.549680] netlink_sendmsg+0x34b/0x610 [ 1360.549690] ____sys_sendmsg+0x4d4/0x510 [ 1360.549697] ___sys_sendmsg+0xc9/0x120 [ 1360.549706] __sys_sendmsg+0xbe/0x140 [ 1360.549715] do_syscall_64+0x55/0x120 [ 1360.549725] entry_SYSCALL_64_after_hwframe+0x55/0x5d Fixes: 0935d5588400 ("netfilter: nf_tables: asynchronous release") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0d432d0674e12..17bf53cd0e848 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -11575,6 +11575,7 @@ static void __exit nf_tables_module_exit(void) unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); nft_chain_route_fini(); + nf_tables_trans_destroy_flush_work(); unregister_pernet_subsys(&nf_tables_net_ops); cancel_work_sync(&trans_gc_work); cancel_work_sync(&trans_destroy_work); -- GitLab From 994209ddf4f430946f6247616b2e33d179243769 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 1 Apr 2024 00:33:02 +0200 Subject: [PATCH 415/989] netfilter: nf_tables: reject new basechain after table flag update When dormant flag is toggled, hooks are disabled in the commit phase by iterating over current chains in table (existing and new). The following configuration allows for an inconsistent state: add table x add chain x y { type filter hook input priority 0; } add table x { flags dormant; } add chain x w { type filter hook input priority 1; } which triggers the following warning when trying to unregister chain w which is already unregistered. [ 127.322252] WARNING: CPU: 7 PID: 1211 at net/netfilter/core.c:50 1 __nf_unregister_net_hook+0x21a/0x260 [...] [ 127.322519] Call Trace: [ 127.322521] [ 127.322524] ? __warn+0x9f/0x1a0 [ 127.322531] ? __nf_unregister_net_hook+0x21a/0x260 [ 127.322537] ? report_bug+0x1b1/0x1e0 [ 127.322545] ? handle_bug+0x3c/0x70 [ 127.322552] ? exc_invalid_op+0x17/0x40 [ 127.322556] ? asm_exc_invalid_op+0x1a/0x20 [ 127.322563] ? kasan_save_free_info+0x3b/0x60 [ 127.322570] ? __nf_unregister_net_hook+0x6a/0x260 [ 127.322577] ? __nf_unregister_net_hook+0x21a/0x260 [ 127.322583] ? __nf_unregister_net_hook+0x6a/0x260 [ 127.322590] ? __nf_tables_unregister_hook+0x8a/0xe0 [nf_tables] [ 127.322655] nft_table_disable+0x75/0xf0 [nf_tables] [ 127.322717] nf_tables_commit+0x2571/0x2620 [nf_tables] Fixes: 179d9ba5559a ("netfilter: nf_tables: fix table flag updates") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 17bf53cd0e848..1eb51bf24fc2e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2449,6 +2449,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_stats __percpu *stats = NULL; struct nft_chain_hook hook = {}; + if (table->flags & __NFT_TABLE_F_UPDATE) + return -EINVAL; + if (flags & NFT_CHAIN_BINDING) return -EOPNOTSUPP; -- GitLab From 24225011d81b471acc0e1e315b7d9905459a6304 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Wed, 3 Apr 2024 15:22:04 +0800 Subject: [PATCH 416/989] netfilter: nf_tables: Fix potential data-race in __nft_flowtable_type_get() nft_unregister_flowtable_type() within nf_flow_inet_module_exit() can concurrent with __nft_flowtable_type_get() within nf_tables_newflowtable(). And thhere is not any protection when iterate over nf_tables_flowtables list in __nft_flowtable_type_get(). Therefore, there is pertential data-race of nf_tables_flowtables list entry. Use list_for_each_entry_rcu() to iterate over nf_tables_flowtables list in __nft_flowtable_type_get(), and use rcu_read_lock() in the caller nft_flowtable_type_get() to protect the entire type query process. Fixes: 3b49e2e94e6e ("netfilter: nf_tables: add flow table netlink frontend") Signed-off-by: Ziyang Xuan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1eb51bf24fc2e..e02d0ae4f4364 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8296,11 +8296,12 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, return err; } +/* call under rcu_read_lock */ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family) { const struct nf_flowtable_type *type; - list_for_each_entry(type, &nf_tables_flowtables, list) { + list_for_each_entry_rcu(type, &nf_tables_flowtables, list) { if (family == type->family) return type; } @@ -8312,9 +8313,13 @@ nft_flowtable_type_get(struct net *net, u8 family) { const struct nf_flowtable_type *type; + rcu_read_lock(); type = __nft_flowtable_type_get(family); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES -- GitLab From 1bc83a019bbe268be3526406245ec28c2458a518 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 3 Apr 2024 19:35:30 +0200 Subject: [PATCH 417/989] netfilter: nf_tables: discard table flag update with pending basechain deletion Hook unregistration is deferred to the commit phase, same occurs with hook updates triggered by the table dormant flag. When both commands are combined, this results in deleting a basechain while leaving its hook still registered in the core. Fixes: 179d9ba5559a ("netfilter: nf_tables: fix table flag updates") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e02d0ae4f4364..d89d779467197 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1209,10 +1209,11 @@ static bool nft_table_pending_update(const struct nft_ctx *ctx) return true; list_for_each_entry(trans, &nft_net->commit_list, list) { - if ((trans->msg_type == NFT_MSG_NEWCHAIN || - trans->msg_type == NFT_MSG_DELCHAIN) && - trans->ctx.table == ctx->table && - nft_trans_chain_update(trans)) + if (trans->ctx.table == ctx->table && + ((trans->msg_type == NFT_MSG_NEWCHAIN && + nft_trans_chain_update(trans)) || + (trans->msg_type == NFT_MSG_DELCHAIN && + nft_is_base_chain(trans->ctx.chain)))) return true; } -- GitLab From 596a4254915f94c927217fe09c33a6828f33fb25 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 2 Apr 2024 15:53:04 +0100 Subject: [PATCH 418/989] net: ravb: Always process TX descriptor ring The TX queue should be serviced each time the poll function is called, even if the full RX work budget has been consumed. This prevents starvation of the TX queue when RX bandwidth usage is high. Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20240402145305.82148-1-paul.barker.ct@bp.renesas.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index d1be030c88483..48803050abdbf 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1324,12 +1324,12 @@ static int ravb_poll(struct napi_struct *napi, int budget) int q = napi - priv->napi; int mask = BIT(q); int quota = budget; + bool unmask; /* Processing RX Descriptor Ring */ /* Clear RX interrupt */ ravb_write(ndev, ~(mask | RIS0_RESERVED), RIS0); - if (ravb_rx(ndev, "a, q)) - goto out; + unmask = !ravb_rx(ndev, "a, q); /* Processing TX Descriptor Ring */ spin_lock_irqsave(&priv->lock, flags); @@ -1339,6 +1339,9 @@ static int ravb_poll(struct napi_struct *napi, int budget) netif_wake_subqueue(ndev, q); spin_unlock_irqrestore(&priv->lock, flags); + if (!unmask) + goto out; + napi_complete(napi); /* Re-enable RX/TX interrupts */ -- GitLab From 101b76418d7163240bc74a7e06867dca0e51183e Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 2 Apr 2024 15:53:05 +0100 Subject: [PATCH 419/989] net: ravb: Always update error counters The error statistics should be updated each time the poll function is called, even if the full RX work budget has been consumed. This prevents the counts from becoming stuck when RX bandwidth usage is high. This also ensures that error counters are not updated after we've re-enabled interrupts as that could result in a race condition. Also drop an unnecessary space. Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20240402145305.82148-2-paul.barker.ct@bp.renesas.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 48803050abdbf..ba01c8cc3c906 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1339,6 +1339,15 @@ static int ravb_poll(struct napi_struct *napi, int budget) netif_wake_subqueue(ndev, q); spin_unlock_irqrestore(&priv->lock, flags); + /* Receive error message handling */ + priv->rx_over_errors = priv->stats[RAVB_BE].rx_over_errors; + if (info->nc_queues) + priv->rx_over_errors += priv->stats[RAVB_NC].rx_over_errors; + if (priv->rx_over_errors != ndev->stats.rx_over_errors) + ndev->stats.rx_over_errors = priv->rx_over_errors; + if (priv->rx_fifo_errors != ndev->stats.rx_fifo_errors) + ndev->stats.rx_fifo_errors = priv->rx_fifo_errors; + if (!unmask) goto out; @@ -1355,14 +1364,6 @@ static int ravb_poll(struct napi_struct *napi, int budget) } spin_unlock_irqrestore(&priv->lock, flags); - /* Receive error message handling */ - priv->rx_over_errors = priv->stats[RAVB_BE].rx_over_errors; - if (info->nc_queues) - priv->rx_over_errors += priv->stats[RAVB_NC].rx_over_errors; - if (priv->rx_over_errors != ndev->stats.rx_over_errors) - ndev->stats.rx_over_errors = priv->rx_over_errors; - if (priv->rx_fifo_errors != ndev->stats.rx_fifo_errors) - ndev->stats.rx_fifo_errors = priv->rx_fifo_errors; out: return budget - quota; } -- GitLab From c120209bce34c49dcaba32f15679574327d09f63 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 2 Apr 2024 20:33:56 +0200 Subject: [PATCH 420/989] net: dsa: sja1105: Fix parameters order in sja1110_pcs_mdio_write_c45() The definition and declaration of sja1110_pcs_mdio_write_c45() don't have parameters in the same order. Knowing that sja1110_pcs_mdio_write_c45() is used as a function pointer in 'sja1105_info' structure with .pcs_mdio_write_c45, and that we have: int (*pcs_mdio_write_c45)(struct mii_bus *bus, int phy, int mmd, int reg, u16 val); it is likely that the definition is the one to change. Found with cppcheck, funcArgOrderDifferent. Fixes: ae271547bba6 ("net: dsa: sja1105: C45 only transactions for PCS") Signed-off-by: Christophe JAILLET Reviewed-by: Michael Walle Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/ff2a5af67361988b3581831f7bd1eddebfb4c48f.1712082763.git.christophe.jaillet@wanadoo.fr Signed-off-by: Paolo Abeni --- drivers/net/dsa/sja1105/sja1105_mdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_mdio.c b/drivers/net/dsa/sja1105/sja1105_mdio.c index 833e55e4b9612..52ddb4ef259e9 100644 --- a/drivers/net/dsa/sja1105/sja1105_mdio.c +++ b/drivers/net/dsa/sja1105/sja1105_mdio.c @@ -94,7 +94,7 @@ int sja1110_pcs_mdio_read_c45(struct mii_bus *bus, int phy, int mmd, int reg) return tmp & 0xffff; } -int sja1110_pcs_mdio_write_c45(struct mii_bus *bus, int phy, int reg, int mmd, +int sja1110_pcs_mdio_write_c45(struct mii_bus *bus, int phy, int mmd, int reg, u16 val) { struct sja1105_mdio_private *mdio_priv = bus->priv; -- GitLab From b9846a386734e73a1414950ebfd50f04919f5e24 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Thu, 4 Apr 2024 09:47:13 +0530 Subject: [PATCH 421/989] ASoC: SOF: amd: fix for false dsp interrupts Before ACP firmware loading, DSP interrupts are not expected. Sometimes after reboot, it's observed that before ACP firmware is loaded false DSP interrupt is reported. Registering the interrupt handler before acp initialization causing false interrupts sometimes on reboot as ACP reset is not applied. Correct the sequence by invoking acp initialization sequence prior to registering interrupt handler. Fixes: 738a2b5e2cc9 ("ASoC: SOF: amd: Add IPC support for ACP IP block") Signed-off-by: Vijendar Mukunda Link: https://msgid.link/r/20240404041717.430545-1-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/acp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/soc/sof/amd/acp.c b/sound/soc/sof/amd/acp.c index be7dc1e02284a..c12c7f8205294 100644 --- a/sound/soc/sof/amd/acp.c +++ b/sound/soc/sof/amd/acp.c @@ -704,6 +704,10 @@ int amd_sof_acp_probe(struct snd_sof_dev *sdev) goto unregister_dev; } + ret = acp_init(sdev); + if (ret < 0) + goto free_smn_dev; + sdev->ipc_irq = pci->irq; ret = request_threaded_irq(sdev->ipc_irq, acp_irq_handler, acp_irq_thread, IRQF_SHARED, "AudioDSP", sdev); @@ -713,10 +717,6 @@ int amd_sof_acp_probe(struct snd_sof_dev *sdev) goto free_smn_dev; } - ret = acp_init(sdev); - if (ret < 0) - goto free_ipc_irq; - /* scan SoundWire capabilities exposed by DSDT */ ret = acp_sof_scan_sdw_devices(sdev, chip->sdw_acpi_dev_addr); if (ret < 0) { -- GitLab From 5bfc311dd6c376d350b39028b9000ad766ddc934 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 4 Apr 2024 15:11:05 +0300 Subject: [PATCH 422/989] usb: xhci: correct return value in case of STS_HCE If we get STS_HCE we give up on the interrupt, but for the purpose of IRQ handling that still counts as ours. We may return IRQ_NONE only if we are positive that it wasn't ours. Hence correct the default. Fixes: 2a25e66d676d ("xhci: print warning when HCE was set") Cc: stable@vger.kernel.org # v6.2+ Signed-off-by: Oliver Neukum Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240404121106.2842417-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 52278afea94be..575f0fd9c9f11 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -3133,7 +3133,7 @@ static int xhci_handle_events(struct xhci_hcd *xhci, struct xhci_interrupter *ir irqreturn_t xhci_irq(struct usb_hcd *hcd) { struct xhci_hcd *xhci = hcd_to_xhci(hcd); - irqreturn_t ret = IRQ_NONE; + irqreturn_t ret = IRQ_HANDLED; u32 status; spin_lock(&xhci->lock); @@ -3141,12 +3141,13 @@ irqreturn_t xhci_irq(struct usb_hcd *hcd) status = readl(&xhci->op_regs->status); if (status == ~(u32)0) { xhci_hc_died(xhci); - ret = IRQ_HANDLED; goto out; } - if (!(status & STS_EINT)) + if (!(status & STS_EINT)) { + ret = IRQ_NONE; goto out; + } if (status & STS_HCE) { xhci_warn(xhci, "WARNING: Host Controller Error\n"); @@ -3156,7 +3157,6 @@ irqreturn_t xhci_irq(struct usb_hcd *hcd) if (status & STS_FATAL) { xhci_warn(xhci, "WARNING: Host System Error\n"); xhci_halt(xhci); - ret = IRQ_HANDLED; goto out; } @@ -3167,7 +3167,6 @@ irqreturn_t xhci_irq(struct usb_hcd *hcd) */ status |= STS_EINT; writel(status, &xhci->op_regs->status); - ret = IRQ_HANDLED; /* This is the handler of the primary interrupter */ xhci_handle_events(xhci, xhci->interrupters[0]); -- GitLab From dda7e89e53d6ebf27c49df7d87a54e3e1614d332 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 4 Apr 2024 15:11:06 +0300 Subject: [PATCH 423/989] xhci: Fix root hub port null pointer dereference in xhci tracepoints The pointer from a xhci usb virt device to its root hub port (vdev->rhub_port) is set later when device is addressed, not while vdev is allocated. Tracepoints dereferenced this rhub_port pointer when freeing the virt device, which causes null pointer dereference if tracing is enabled and device is freed before addressed. This can happen if tracing is enabled and xhci driver is unloaded before a device is fully enumerated, or initial enumeration fails and device is reset and freed before retry. Don't dereference the rhub_port or show port numbers when tracing xhci_free_virt_device(). This info is not very useful anyway. Print the more useful slot id instead Fixes: 06790c19086f ("xhci: replace real & fake port with pointer to root hub port") Reported-by: Thinh Nguyen Closes: https://lore.kernel.org/linux-usb/20240402005007.klv2ij727fkz7rpd@synopsys.com/ Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240404121106.2842417-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-trace.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/usb/host/xhci-trace.h b/drivers/usb/host/xhci-trace.h index 1740000d54c29..5762564b9d733 100644 --- a/drivers/usb/host/xhci-trace.h +++ b/drivers/usb/host/xhci-trace.h @@ -172,8 +172,7 @@ DECLARE_EVENT_CLASS(xhci_log_free_virt_dev, __field(void *, vdev) __field(unsigned long long, out_ctx) __field(unsigned long long, in_ctx) - __field(int, hcd_portnum) - __field(int, hw_portnum) + __field(int, slot_id) __field(u16, current_mel) ), @@ -181,13 +180,12 @@ DECLARE_EVENT_CLASS(xhci_log_free_virt_dev, __entry->vdev = vdev; __entry->in_ctx = (unsigned long long) vdev->in_ctx->dma; __entry->out_ctx = (unsigned long long) vdev->out_ctx->dma; - __entry->hcd_portnum = (int) vdev->rhub_port->hcd_portnum; - __entry->hw_portnum = (int) vdev->rhub_port->hw_portnum; + __entry->slot_id = (int) vdev->slot_id; __entry->current_mel = (u16) vdev->current_mel; ), - TP_printk("vdev %p ctx %llx | %llx hcd_portnum %d hw_portnum %d current_mel %d", - __entry->vdev, __entry->in_ctx, __entry->out_ctx, - __entry->hcd_portnum, __entry->hw_portnum, __entry->current_mel + TP_printk("vdev %p slot %d ctx %llx | %llx current_mel %d", + __entry->vdev, __entry->slot_id, __entry->in_ctx, + __entry->out_ctx, __entry->current_mel ) ); -- GitLab From 69630926011c1f7170a465b7b5c228deb66e9372 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Mar 2024 00:02:00 +1100 Subject: [PATCH 424/989] powerpc/crypto/chacha-p10: Fix failure on non Power10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chacha-p10-crypto module provides optimised chacha routines for Power10. It also selects CRYPTO_ARCH_HAVE_LIB_CHACHA which says it provides chacha_crypt_arch() to generic code. Notably the module needs to provide chacha_crypt_arch() regardless of whether it is loaded on Power10 or an older CPU. The implementation of chacha_crypt_arch() already has a fallback to chacha_crypt_generic(), however the module as a whole fails to load on pre-Power10, because of the use of module_cpu_feature_match(). This breaks for example loading wireguard: jostaberry-1:~ # modprobe -v wireguard insmod /lib/modules/6.8.0-lp155.8.g7e0e887-default/kernel/arch/powerpc/crypto/chacha-p10-crypto.ko.zst modprobe: ERROR: could not insert 'wireguard': No such device Fix it by removing module_cpu_feature_match(), and instead check the CPU feature manually. If the CPU feature is not found, the module still loads successfully, but doesn't register the Power10 specific algorithms. That allows chacha_crypt_generic() to remain available for use, fixing the problem. [root@fedora ~]# modprobe -v wireguard insmod /lib/modules/6.8.0-00001-g786a790c4d79/kernel/net/ipv4/udp_tunnel.ko insmod /lib/modules/6.8.0-00001-g786a790c4d79/kernel/net/ipv6/ip6_udp_tunnel.ko insmod /lib/modules/6.8.0-00001-g786a790c4d79/kernel/lib/crypto/libchacha.ko insmod /lib/modules/6.8.0-00001-g786a790c4d79/kernel/arch/powerpc/crypto/chacha-p10-crypto.ko insmod /lib/modules/6.8.0-00001-g786a790c4d79/kernel/lib/crypto/libchacha20poly1305.ko insmod /lib/modules/6.8.0-00001-g786a790c4d79/kernel/drivers/net/wireguard/wireguard.ko [ 18.910452][ T721] wireguard: allowedips self-tests: pass [ 18.914999][ T721] wireguard: nonce counter self-tests: pass [ 19.029066][ T721] wireguard: ratelimiter self-tests: pass [ 19.029257][ T721] wireguard: WireGuard 1.0.0 loaded. See www.wireguard.com for information. [ 19.029361][ T721] wireguard: Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. Reported-by: Michal Suchánek Closes: https://lore.kernel.org/all/20240315122005.GG20665@kitsune.suse.cz/ Acked-by: Herbert Xu Signed-off-by: Michael Ellerman Link: https://msgid.link/20240328130200.3041687-1-mpe@ellerman.id.au --- arch/powerpc/crypto/chacha-p10-glue.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/crypto/chacha-p10-glue.c b/arch/powerpc/crypto/chacha-p10-glue.c index 74fb86b0d2097..7c728755852e1 100644 --- a/arch/powerpc/crypto/chacha-p10-glue.c +++ b/arch/powerpc/crypto/chacha-p10-glue.c @@ -197,6 +197,9 @@ static struct skcipher_alg algs[] = { static int __init chacha_p10_init(void) { + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return 0; + static_branch_enable(&have_p10); return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); @@ -204,10 +207,13 @@ static int __init chacha_p10_init(void) static void __exit chacha_p10_exit(void) { + if (!static_branch_likely(&have_p10)) + return; + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } -module_cpu_feature_match(PPC_MODULE_FEATURE_P10, chacha_p10_init); +module_init(chacha_p10_init); module_exit(chacha_p10_exit); MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (P10 accelerated)"); -- GitLab From 3f03e7717c2999473a3ed38475ed0a962ba9f393 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 27 Mar 2024 18:53:08 +0100 Subject: [PATCH 425/989] usb: phy: MAINTAINERS: mark Freescale USB PHY as orphaned Emails to the only maintainer bounce: : host nxp-com.mail.protection.outlook.com[52.101.68.39] said: 550 5.4.1 Recipient address rejected: Access denied. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240327175308.520317-1-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7c121493f43d0..3c78979fb8197 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8752,10 +8752,9 @@ S: Maintained F: drivers/usb/gadget/udc/fsl* FREESCALE USB PHY DRIVER -M: Ran Wang L: linux-usb@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org -S: Maintained +S: Orphan F: drivers/usb/phy/phy-fsl-usb* FREEVXFS FILESYSTEM -- GitLab From ce4c8d21054ae9396cd759fe6e8157b525616dc4 Mon Sep 17 00:00:00 2001 From: "Christian A. Ehrhardt" Date: Mon, 1 Apr 2024 23:05:15 +0200 Subject: [PATCH 426/989] usb: typec: ucsi: Fix connector check on init Fix issues when initially checking for a connector change: - Use the correct connector number not the entire CCI. - Call ->read under the PPM lock. - Remove a bogus READ_ONCE. Fixes: 808a8b9e0b87 ("usb: typec: ucsi: Check for notifications after init") Cc: stable@kernel.org Signed-off-by: Christian A. Ehrhardt Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240401210515.1902048-1-lk@c--e.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 31d8a46ae5e7c..bd6ae92aa39e7 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -1736,11 +1736,13 @@ static int ucsi_init(struct ucsi *ucsi) ucsi->connector = connector; ucsi->ntfy = ntfy; + mutex_lock(&ucsi->ppm_lock); ret = ucsi->ops->read(ucsi, UCSI_CCI, &cci, sizeof(cci)); + mutex_unlock(&ucsi->ppm_lock); if (ret) return ret; - if (UCSI_CCI_CONNECTOR(READ_ONCE(cci))) - ucsi_connector_change(ucsi, cci); + if (UCSI_CCI_CONNECTOR(cci)) + ucsi_connector_change(ucsi, UCSI_CCI_CONNECTOR(cci)); return 0; -- GitLab From 1500a7b2794d46beadd408132ddb5ef669c5c057 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Tue, 2 Apr 2024 13:09:50 +0200 Subject: [PATCH 427/989] usb: gadget: functionfs: Fix inverted DMA fence direction A "read" fence was installed when the DMABUF was to be written to, and a "write" fence was installed when the DMABUF was to be read from. Besides, dma_resv_usage_rw() should only be used when waiting for fences. Fixes: 7b07a2a7ca02 ("usb: gadget: functionfs: Add DMABUF import interface") Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20240402110951.16376-2-paul@crapouillou.net Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_fs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index bffbc1dc651f9..70e8532127ad5 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1578,6 +1578,7 @@ static int ffs_dmabuf_transfer(struct file *file, struct ffs_dmabuf_priv *priv; struct ffs_dma_fence *fence; struct usb_request *usb_req; + enum dma_resv_usage resv_dir; struct dma_buf *dmabuf; struct ffs_ep *ep; bool cookie; @@ -1665,8 +1666,9 @@ static int ffs_dmabuf_transfer(struct file *file, dma_fence_init(&fence->base, &ffs_dmabuf_fence_ops, &priv->lock, priv->context, seqno); - dma_resv_add_fence(dmabuf->resv, &fence->base, - dma_resv_usage_rw(epfile->in)); + resv_dir = epfile->in ? DMA_RESV_USAGE_WRITE : DMA_RESV_USAGE_READ; + + dma_resv_add_fence(dmabuf->resv, &fence->base, resv_dir); dma_resv_unlock(dmabuf->resv); /* Now that the dma_fence is in place, queue the transfer. */ -- GitLab From 862b416560c348f66d2091c56e71fd5462510cab Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Tue, 2 Apr 2024 13:09:51 +0200 Subject: [PATCH 428/989] usb: gadget: functionfs: Wait for fences before enqueueing DMABUF Instead of bailing when fences have already been installed on the DMABUF, wait for them (with a timeout) when doing a blocking operation. This fixes the issue where userspace would submit a DMABUF with fences already installed, with the (correct) expectation that it would just work. Fixes: 7b07a2a7ca02 ("usb: gadget: functionfs: Add DMABUF import interface") Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20240402110951.16376-3-paul@crapouillou.net Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_fs.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 70e8532127ad5..f855f1fc8e5e1 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -46,6 +46,8 @@ #define FUNCTIONFS_MAGIC 0xa647361 /* Chosen by a honest dice roll ;) */ +#define DMABUF_ENQUEUE_TIMEOUT_MS 5000 + MODULE_IMPORT_NS(DMA_BUF); /* Reference counter handling */ @@ -1580,9 +1582,11 @@ static int ffs_dmabuf_transfer(struct file *file, struct usb_request *usb_req; enum dma_resv_usage resv_dir; struct dma_buf *dmabuf; + unsigned long timeout; struct ffs_ep *ep; bool cookie; u32 seqno; + long retl; int ret; if (req->flags & ~USB_FFS_DMABUF_TRANSFER_MASK) @@ -1616,17 +1620,14 @@ static int ffs_dmabuf_transfer(struct file *file, goto err_attachment_put; /* Make sure we don't have writers */ - if (!dma_resv_test_signaled(dmabuf->resv, DMA_RESV_USAGE_WRITE)) { - pr_vdebug("FFS WRITE fence is not signaled\n"); - ret = -EBUSY; - goto err_resv_unlock; - } - - /* If we're writing to the DMABUF, make sure we don't have readers */ - if (epfile->in && - !dma_resv_test_signaled(dmabuf->resv, DMA_RESV_USAGE_READ)) { - pr_vdebug("FFS READ fence is not signaled\n"); - ret = -EBUSY; + timeout = nonblock ? 0 : msecs_to_jiffies(DMABUF_ENQUEUE_TIMEOUT_MS); + retl = dma_resv_wait_timeout(dmabuf->resv, + dma_resv_usage_rw(epfile->in), + true, timeout); + if (retl == 0) + retl = -EBUSY; + if (retl < 0) { + ret = (int)retl; goto err_resv_unlock; } -- GitLab From 3c88b8f471ee9512bc4ef02bebafdc53fb7c5d9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 27 Mar 2024 10:11:33 +0100 Subject: [PATCH 429/989] drm/xe: Use ring ops TLB invalidation for rebinds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For each rebind we insert a GuC TLB invalidation and add a corresponding unordered TLB invalidation fence. This might add a huge number of TLB invalidation fences to wait for so rather than doing that, defer the TLB invalidation to the next ring ops for each affected exec queue. Since the TLB is invalidated on exec_queue switch, we need to invalidate once for each affected exec_queue. v2: - Simplify if-statements around the tlb_flush_seqno. (Matthew Brost) - Add some comments and asserts. Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs") Cc: Matthew Brost Cc: # v6.8+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240327091136.3271-2-thomas.hellstrom@linux.intel.com (cherry picked from commit 4fc4899e86f7afbd09f4bcb899f0fc57e0296e62) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_exec_queue_types.h | 5 +++++ drivers/gpu/drm/xe/xe_pt.c | 6 ++++-- drivers/gpu/drm/xe/xe_ring_ops.c | 11 ++++------- drivers/gpu/drm/xe/xe_sched_job.c | 10 ++++++++++ drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++ drivers/gpu/drm/xe/xe_vm_types.h | 5 +++++ 6 files changed, 30 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 62b3d9d1d7cdd..462b331950320 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -148,6 +148,11 @@ struct xe_exec_queue { const struct xe_ring_ops *ring_ops; /** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */ struct drm_sched_entity *entity; + /** + * @tlb_flush_seqno: The seqno of the last rebind tlb flush performed + * Protected by @vm's resv. Unused if @vm == NULL. + */ + u64 tlb_flush_seqno; /** @lrc: logical ring context for this exec queue */ struct xe_lrc lrc[]; }; diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 7f54bc3e389d5..9fd65f5d3d803 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1254,11 +1254,13 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue * non-faulting LR, in particular on user-space batch buffer chaining, * it needs to be done here. */ - if ((rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) || - (!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) { + if ((!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) { ifence = kzalloc(sizeof(*ifence), GFP_KERNEL); if (!ifence) return ERR_PTR(-ENOMEM); + } else if (rebind && !xe_vm_in_lr_mode(vm)) { + /* We bump also if batch_invalidate_tlb is true */ + vm->tlb_flush_seqno++; } rfence = kzalloc(sizeof(*rfence), GFP_KERNEL); diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c index c4edffcd4a320..5b2b37b598130 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -219,10 +219,9 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc { u32 dw[MAX_JOB_SIZE_DW], i = 0; u32 ppgtt_flag = get_ppgtt_flag(job); - struct xe_vm *vm = job->q->vm; struct xe_gt *gt = job->q->gt; - if (vm && vm->batch_invalidate_tlb) { + if (job->ring_ops_flush_tlb) { dw[i++] = preparser_disable(true); i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, true, dw, i); @@ -270,7 +269,6 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc, struct xe_gt *gt = job->q->gt; struct xe_device *xe = gt_to_xe(gt); bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE; - struct xe_vm *vm = job->q->vm; dw[i++] = preparser_disable(true); @@ -282,13 +280,13 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc, i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i); } - if (vm && vm->batch_invalidate_tlb) + if (job->ring_ops_flush_tlb) i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, true, dw, i); dw[i++] = preparser_disable(false); - if (!vm || !vm->batch_invalidate_tlb) + if (!job->ring_ops_flush_tlb) i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, dw, i); @@ -317,7 +315,6 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job, struct xe_gt *gt = job->q->gt; struct xe_device *xe = gt_to_xe(gt); bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK); - struct xe_vm *vm = job->q->vm; u32 mask_flags = 0; dw[i++] = preparser_disable(true); @@ -327,7 +324,7 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job, mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS; /* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */ - i = emit_pipe_invalidate(mask_flags, vm && vm->batch_invalidate_tlb, dw, i); + i = emit_pipe_invalidate(mask_flags, job->ring_ops_flush_tlb, dw, i); /* hsdes: 1809175790 */ if (has_aux_ccs(xe)) diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c index 8151ddafb9407..b0c7fa4693cfe 100644 --- a/drivers/gpu/drm/xe/xe_sched_job.c +++ b/drivers/gpu/drm/xe/xe_sched_job.c @@ -250,6 +250,16 @@ bool xe_sched_job_completed(struct xe_sched_job *job) void xe_sched_job_arm(struct xe_sched_job *job) { + struct xe_exec_queue *q = job->q; + struct xe_vm *vm = q->vm; + + if (vm && !xe_sched_job_is_migration(q) && !xe_vm_in_lr_mode(vm) && + (vm->batch_invalidate_tlb || vm->tlb_flush_seqno != q->tlb_flush_seqno)) { + xe_vm_assert_held(vm); + q->tlb_flush_seqno = vm->tlb_flush_seqno; + job->ring_ops_flush_tlb = true; + } + drm_sched_job_arm(&job->drm); } diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h index b1d83da50a53d..5e12724219fdd 100644 --- a/drivers/gpu/drm/xe/xe_sched_job_types.h +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h @@ -39,6 +39,8 @@ struct xe_sched_job { } user_fence; /** @migrate_flush_flags: Additional flush flags for migration jobs */ u32 migrate_flush_flags; + /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */ + bool ring_ops_flush_tlb; /** @batch_addr: batch buffer address of job */ u64 batch_addr[]; }; diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index ae5fb565f6bf4..5747f136d24d1 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -264,6 +264,11 @@ struct xe_vm { bool capture_once; } error_capture; + /** + * @tlb_flush_seqno: Required TLB flush seqno for the next exec. + * protected by the vm resv. + */ + u64 tlb_flush_seqno; /** @batch_invalidate_tlb: Always invalidate TLB before batch start */ bool batch_invalidate_tlb; /** @xef: XE file handle for tracking this VM's drm client */ -- GitLab From a00e7e3fb4b9b30a9f2286a6f892b6e781e560a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 27 Mar 2024 10:11:34 +0100 Subject: [PATCH 430/989] drm/xe: Rework rebinding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of handling the vm's rebind fence separately, which is error prone if they are not strictly ordered, attach rebind fences as kernel fences to the vm's resv. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Rodrigo Vivi Cc: Matthew Brost Cc: # v6.8+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240327091136.3271-3-thomas.hellstrom@linux.intel.com (cherry picked from commit 5a091aff50b780ae29c7faf70a7a6c21c98a54c4) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_exec.c | 31 +++---------------------------- drivers/gpu/drm/xe/xe_pt.c | 2 +- drivers/gpu/drm/xe/xe_vm.c | 27 +++++++++------------------ drivers/gpu/drm/xe/xe_vm.h | 2 +- drivers/gpu/drm/xe/xe_vm_types.h | 3 --- 5 files changed, 14 insertions(+), 51 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c index 826c8b3896725..79469b76a8bee 100644 --- a/drivers/gpu/drm/xe/xe_exec.c +++ b/drivers/gpu/drm/xe/xe_exec.c @@ -152,7 +152,6 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file) struct drm_exec *exec = &vm_exec.exec; u32 i, num_syncs = 0, num_ufence = 0; struct xe_sched_job *job; - struct dma_fence *rebind_fence; struct xe_vm *vm; bool write_locked, skip_retry = false; ktime_t end = 0; @@ -294,35 +293,11 @@ retry: * Rebind any invalidated userptr or evicted BOs in the VM, non-compute * VM mode only. */ - rebind_fence = xe_vm_rebind(vm, false); - if (IS_ERR(rebind_fence)) { - err = PTR_ERR(rebind_fence); + err = xe_vm_rebind(vm, false); + if (err) goto err_put_job; - } - - /* - * We store the rebind_fence in the VM so subsequent execs don't get - * scheduled before the rebinds of userptrs / evicted BOs is complete. - */ - if (rebind_fence) { - dma_fence_put(vm->rebind_fence); - vm->rebind_fence = rebind_fence; - } - if (vm->rebind_fence) { - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, - &vm->rebind_fence->flags)) { - dma_fence_put(vm->rebind_fence); - vm->rebind_fence = NULL; - } else { - dma_fence_get(vm->rebind_fence); - err = drm_sched_job_add_dependency(&job->drm, - vm->rebind_fence); - if (err) - goto err_put_job; - } - } - /* Wait behind munmap style rebinds */ + /* Wait behind rebinds */ if (!xe_vm_in_lr_mode(vm)) { err = drm_sched_job_add_resv_dependencies(&job->drm, xe_vm_resv(vm), diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 9fd65f5d3d803..5fc4ad1e42981 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1299,7 +1299,7 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue } /* add shared fence now for pagetable delayed destroy */ - dma_resv_add_fence(xe_vm_resv(vm), fence, !rebind && + dma_resv_add_fence(xe_vm_resv(vm), fence, rebind || last_munmap_rebind ? DMA_RESV_USAGE_KERNEL : DMA_RESV_USAGE_BOOKKEEP); diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index f88faef4142bd..3b97e9c926091 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -522,7 +522,6 @@ static void preempt_rebind_work_func(struct work_struct *w) { struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work); struct drm_exec exec; - struct dma_fence *rebind_fence; unsigned int fence_count = 0; LIST_HEAD(preempt_fences); ktime_t end = 0; @@ -568,18 +567,11 @@ retry: if (err) goto out_unlock; - rebind_fence = xe_vm_rebind(vm, true); - if (IS_ERR(rebind_fence)) { - err = PTR_ERR(rebind_fence); + err = xe_vm_rebind(vm, true); + if (err) goto out_unlock; - } - - if (rebind_fence) { - dma_fence_wait(rebind_fence, false); - dma_fence_put(rebind_fence); - } - /* Wait on munmap style VM unbinds */ + /* Wait on rebinds and munmap style VM unbinds */ wait = dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_KERNEL, false, MAX_SCHEDULE_TIMEOUT); @@ -773,14 +765,14 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q, struct xe_sync_entry *syncs, u32 num_syncs, bool first_op, bool last_op); -struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker) +int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker) { - struct dma_fence *fence = NULL; + struct dma_fence *fence; struct xe_vma *vma, *next; lockdep_assert_held(&vm->lock); if (xe_vm_in_lr_mode(vm) && !rebind_worker) - return NULL; + return 0; xe_vm_assert_held(vm); list_for_each_entry_safe(vma, next, &vm->rebind_list, @@ -788,17 +780,17 @@ struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker) xe_assert(vm->xe, vma->tile_present); list_del_init(&vma->combined_links.rebind); - dma_fence_put(fence); if (rebind_worker) trace_xe_vma_rebind_worker(vma); else trace_xe_vma_rebind_exec(vma); fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false); if (IS_ERR(fence)) - return fence; + return PTR_ERR(fence); + dma_fence_put(fence); } - return fence; + return 0; } static void xe_vma_free(struct xe_vma *vma) @@ -1589,7 +1581,6 @@ static void vm_destroy_work_func(struct work_struct *w) XE_WARN_ON(vm->pt_root[id]); trace_xe_vm_free(vm); - dma_fence_put(vm->rebind_fence); kfree(vm); } diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h index 6df1f1c7f85d9..4853354336f29 100644 --- a/drivers/gpu/drm/xe/xe_vm.h +++ b/drivers/gpu/drm/xe/xe_vm.h @@ -207,7 +207,7 @@ int __xe_vm_userptr_needs_repin(struct xe_vm *vm); int xe_vm_userptr_check_repin(struct xe_vm *vm); -struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker); +int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker); int xe_vm_invalidate_vma(struct xe_vma *vma); diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index 5747f136d24d1..badf3945083d5 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -177,9 +177,6 @@ struct xe_vm { */ struct list_head rebind_list; - /** @rebind_fence: rebind fence from execbuf */ - struct dma_fence *rebind_fence; - /** * @destroy_work: worker to destroy VM, needed as a dma_fence signaling * from an irq context can be last put and the destroy needs to be able -- GitLab From fd1c8085113fb7c803fd81280f7e0bb25a5797ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 27 Mar 2024 10:11:35 +0100 Subject: [PATCH 431/989] drm/xe: Make TLB invalidation fences unordered MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit They can actually complete out-of-order, so allocate a unique fence context for each fence. Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs") Cc: Matthew Brost Cc: # v6.8+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240327091136.3271-4-thomas.hellstrom@linux.intel.com (cherry picked from commit 0453f1757501df2e82b66b3183a24bba5a6f8fa3) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 1 - drivers/gpu/drm/xe/xe_gt_types.h | 7 ------- drivers/gpu/drm/xe/xe_pt.c | 3 +-- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c index f03e077f81a04..e598a4363d019 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c @@ -61,7 +61,6 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt) INIT_LIST_HEAD(>->tlb_invalidation.pending_fences); spin_lock_init(>->tlb_invalidation.pending_lock); spin_lock_init(>->tlb_invalidation.lock); - gt->tlb_invalidation.fence_context = dma_fence_context_alloc(1); INIT_DELAYED_WORK(>->tlb_invalidation.fence_tdr, xe_gt_tlb_fence_timeout); diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h index 70c615dd14986..07b2f724ec456 100644 --- a/drivers/gpu/drm/xe/xe_gt_types.h +++ b/drivers/gpu/drm/xe/xe_gt_types.h @@ -177,13 +177,6 @@ struct xe_gt { * xe_gt_tlb_fence_timeout after the timeut interval is over. */ struct delayed_work fence_tdr; - /** @tlb_invalidation.fence_context: context for TLB invalidation fences */ - u64 fence_context; - /** - * @tlb_invalidation.fence_seqno: seqno to TLB invalidation fences, protected by - * tlb_invalidation.lock - */ - u32 fence_seqno; /** @tlb_invalidation.lock: protects TLB invalidation fences */ spinlock_t lock; } tlb_invalidation; diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 5fc4ad1e42981..29d1a31f98041 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1135,8 +1135,7 @@ static int invalidation_fence_init(struct xe_gt *gt, spin_lock_irq(>->tlb_invalidation.lock); dma_fence_init(&ifence->base.base, &invalidation_fence_ops, >->tlb_invalidation.lock, - gt->tlb_invalidation.fence_context, - ++gt->tlb_invalidation.fence_seqno); + dma_fence_context_alloc(1), 1); spin_unlock_irq(>->tlb_invalidation.lock); INIT_LIST_HEAD(&ifence->base.link); -- GitLab From 3edd52bead30879644bb69fe4aafde67d2cd8512 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 27 Mar 2024 10:11:36 +0100 Subject: [PATCH 432/989] drm/xe: Move vma rebinding to the drm_exec locking loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rebinding might allocate page-table bos, causing evictions. To support blocking locking during these evictions, perform the rebinding in the drm_exec locking loop. Also Reserve fence slots where actually needed rather than trying to predict how many fence slots will be needed over a complete wound-wait transaction. v2: - Remove a leftover call to xe_vm_rebind() (Matt Brost) - Add a helper function xe_vm_validate_rebind() (Matt Brost) v3: - Add comments and squash with previous patch (Matt Brost) Fixes: 24f947d58fe5 ("drm/xe: Use DRM GPUVM helpers for external- and evicted objects") Fixes: 29f424eb8702 ("drm/xe/exec: move fence reservation") Cc: Matthew Auld Signed-off-by: Thomas Hellström Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240327091136.3271-5-thomas.hellstrom@linux.intel.com (cherry picked from commit 7ee7dd6f301341d5b1204fc19fa620d7f7f7e90d) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_exec.c | 52 ++--------------- drivers/gpu/drm/xe/xe_gt_pagefault.c | 3 +- drivers/gpu/drm/xe/xe_pt.c | 14 +++++ drivers/gpu/drm/xe/xe_vm.c | 83 +++++++++++++++++++--------- drivers/gpu/drm/xe/xe_vm.h | 6 +- 5 files changed, 83 insertions(+), 75 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c index 79469b76a8bee..cc5e0f75de3c7 100644 --- a/drivers/gpu/drm/xe/xe_exec.c +++ b/drivers/gpu/drm/xe/xe_exec.c @@ -94,48 +94,16 @@ * Unlock all */ +/* + * Add validation and rebinding to the drm_exec locking loop, since both can + * trigger eviction which may require sleeping dma_resv locks. + */ static int xe_exec_fn(struct drm_gpuvm_exec *vm_exec) { struct xe_vm *vm = container_of(vm_exec->vm, struct xe_vm, gpuvm); - struct drm_gem_object *obj; - unsigned long index; - int num_fences; - int ret; - - ret = drm_gpuvm_validate(vm_exec->vm, &vm_exec->exec); - if (ret) - return ret; - - /* - * 1 fence slot for the final submit, and 1 more for every per-tile for - * GPU bind and 1 extra for CPU bind. Note that there are potentially - * many vma per object/dma-resv, however the fence slot will just be - * re-used, since they are largely the same timeline and the seqno - * should be in order. In the case of CPU bind there is dummy fence used - * for all CPU binds, so no need to have a per-tile slot for that. - */ - num_fences = 1 + 1 + vm->xe->info.tile_count; - /* - * We don't know upfront exactly how many fence slots we will need at - * the start of the exec, since the TTM bo_validate above can consume - * numerous fence slots. Also due to how the dma_resv_reserve_fences() - * works it only ensures that at least that many fence slots are - * available i.e if there are already 10 slots available and we reserve - * two more, it can just noop without reserving anything. With this it - * is quite possible that TTM steals some of the fence slots and then - * when it comes time to do the vma binding and final exec stage we are - * lacking enough fence slots, leading to some nasty BUG_ON() when - * adding the fences. Hence just add our own fences here, after the - * validate stage. - */ - drm_exec_for_each_locked_object(&vm_exec->exec, index, obj) { - ret = dma_resv_reserve_fences(obj->resv, num_fences); - if (ret) - return ret; - } - - return 0; + /* The fence slot added here is intended for the exec sched job. */ + return xe_vm_validate_rebind(vm, &vm_exec->exec, 1); } int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file) @@ -289,14 +257,6 @@ retry: goto err_exec; } - /* - * Rebind any invalidated userptr or evicted BOs in the VM, non-compute - * VM mode only. - */ - err = xe_vm_rebind(vm, false); - if (err) - goto err_put_job; - /* Wait behind rebinds */ if (!xe_vm_in_lr_mode(vm)) { err = drm_sched_job_add_resv_dependencies(&job->drm, diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c index 241c294270d91..fa9e9853c53ba 100644 --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c @@ -100,10 +100,9 @@ static int xe_pf_begin(struct drm_exec *exec, struct xe_vma *vma, { struct xe_bo *bo = xe_vma_bo(vma); struct xe_vm *vm = xe_vma_vm(vma); - unsigned int num_shared = 2; /* slots for bind + move */ int err; - err = xe_vm_prepare_vma(exec, vma, num_shared); + err = xe_vm_lock_vma(exec, vma); if (err) return err; diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 29d1a31f98041..4efc8c1a3d7a9 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1235,6 +1235,13 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue err = xe_pt_prepare_bind(tile, vma, entries, &num_entries); if (err) goto err; + + err = dma_resv_reserve_fences(xe_vm_resv(vm), 1); + if (!err && !xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm) + err = dma_resv_reserve_fences(xe_vma_bo(vma)->ttm.base.resv, 1); + if (err) + goto err; + xe_tile_assert(tile, num_entries <= ARRAY_SIZE(entries)); xe_vm_dbg_print_entries(tile_to_xe(tile), entries, num_entries); @@ -1577,6 +1584,7 @@ __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queu struct dma_fence *fence = NULL; struct invalidation_fence *ifence; struct xe_range_fence *rfence; + int err; LLIST_HEAD(deferred); @@ -1594,6 +1602,12 @@ __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queu xe_pt_calc_rfence_interval(vma, &unbind_pt_update, entries, num_entries); + err = dma_resv_reserve_fences(xe_vm_resv(vm), 1); + if (!err && !xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm) + err = dma_resv_reserve_fences(xe_vma_bo(vma)->ttm.base.resv, 1); + if (err) + return ERR_PTR(err); + ifence = kzalloc(sizeof(*ifence), GFP_KERNEL); if (!ifence) return ERR_PTR(-ENOMEM); diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 3b97e9c926091..62d1ef8867a84 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -482,17 +482,53 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec) return 0; } +/** + * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas + * @vm: The vm for which we are rebinding. + * @exec: The struct drm_exec with the locked GEM objects. + * @num_fences: The number of fences to reserve for the operation, not + * including rebinds and validations. + * + * Validates all evicted gem objects and rebinds their vmas. Note that + * rebindings may cause evictions and hence the validation-rebind + * sequence is rerun until there are no more objects to validate. + * + * Return: 0 on success, negative error code on error. In particular, + * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if + * the drm_exec transaction needs to be restarted. + */ +int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec, + unsigned int num_fences) +{ + struct drm_gem_object *obj; + unsigned long index; + int ret; + + do { + ret = drm_gpuvm_validate(&vm->gpuvm, exec); + if (ret) + return ret; + + ret = xe_vm_rebind(vm, false); + if (ret) + return ret; + } while (!list_empty(&vm->gpuvm.evict.list)); + + drm_exec_for_each_locked_object(exec, index, obj) { + ret = dma_resv_reserve_fences(obj->resv, num_fences); + if (ret) + return ret; + } + + return 0; +} + static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm, bool *done) { int err; - /* - * 1 fence for each preempt fence plus a fence for each tile from a - * possible rebind - */ - err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, vm->preempt.num_exec_queues + - vm->xe->info.tile_count); + err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0); if (err) return err; @@ -507,7 +543,7 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm, return 0; } - err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, vm->preempt.num_exec_queues); + err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0); if (err) return err; @@ -515,7 +551,13 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm, if (err) return err; - return drm_gpuvm_validate(&vm->gpuvm, exec); + /* + * Add validation and rebinding to the locking loop since both can + * cause evictions which may require blocing dma_resv locks. + * The fence reservation here is intended for the new preempt fences + * we attach at the end of the rebind work. + */ + return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues); } static void preempt_rebind_work_func(struct work_struct *w) @@ -996,35 +1038,26 @@ static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence) } /** - * xe_vm_prepare_vma() - drm_exec utility to lock a vma + * xe_vm_lock_vma() - drm_exec utility to lock a vma * @exec: The drm_exec object we're currently locking for. * @vma: The vma for witch we want to lock the vm resv and any attached * object's resv. - * @num_shared: The number of dma-fence slots to pre-allocate in the - * objects' reservation objects. * * Return: 0 on success, negative error code on error. In particular * may return -EDEADLK on WW transaction contention and -EINTR if * an interruptible wait is terminated by a signal. */ -int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma, - unsigned int num_shared) +int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma) { struct xe_vm *vm = xe_vma_vm(vma); struct xe_bo *bo = xe_vma_bo(vma); int err; XE_WARN_ON(!vm); - if (num_shared) - err = drm_exec_prepare_obj(exec, xe_vm_obj(vm), num_shared); - else - err = drm_exec_lock_obj(exec, xe_vm_obj(vm)); - if (!err && bo && !bo->vm) { - if (num_shared) - err = drm_exec_prepare_obj(exec, &bo->ttm.base, num_shared); - else - err = drm_exec_lock_obj(exec, &bo->ttm.base); - } + + err = drm_exec_lock_obj(exec, xe_vm_obj(vm)); + if (!err && bo && !bo->vm) + err = drm_exec_lock_obj(exec, &bo->ttm.base); return err; } @@ -1036,7 +1069,7 @@ static void xe_vma_destroy_unlocked(struct xe_vma *vma) drm_exec_init(&exec, 0, 0); drm_exec_until_all_locked(&exec) { - err = xe_vm_prepare_vma(&exec, vma, 0); + err = xe_vm_lock_vma(&exec, vma); drm_exec_retry_on_contention(&exec); if (XE_WARN_ON(err)) break; @@ -2503,7 +2536,7 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm, lockdep_assert_held_write(&vm->lock); - err = xe_vm_prepare_vma(exec, vma, 1); + err = xe_vm_lock_vma(exec, vma); if (err) return err; diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h index 4853354336f29..306cd0934a190 100644 --- a/drivers/gpu/drm/xe/xe_vm.h +++ b/drivers/gpu/drm/xe/xe_vm.h @@ -242,8 +242,10 @@ bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end); int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id); -int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma, - unsigned int num_shared); +int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma); + +int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec, + unsigned int num_fences); /** * xe_vm_resv() - Return's the vm's reservation object -- GitLab From 77a011012d7d8b98368a763bf74317c6d5ce00f1 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 1 Apr 2024 15:19:11 -0700 Subject: [PATCH 433/989] drm/xe: Use ordered wq for preempt fence waiting Preempt fences can sleep waiting for an exec queue suspend operation to complete. If the system_unbound_wq is used for waiting and the number of waiters exceeds max_active this will result in other users of the system_unbound_wq getting starved. Use a device private work queue for preempt fences to avoid starvation of the system_unbound_wq. Even though suspend operations can complete out-of-order, all suspend operations within a VM need to complete before the preempt rebind worker can start. With that, use a device private ordered wq for preempt fence waiting. v2: - Add comment about cleanup on failure (Matt R) - Update commit message (Lucas) Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Brost Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240401221913.139672-2-matthew.brost@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 37c15c4aae1fe3f67efd2641db8d8c25c2d524ab) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_device.c | 11 ++++++++++- drivers/gpu/drm/xe/xe_device_types.h | 3 +++ drivers/gpu/drm/xe/xe_preempt_fence.c | 2 +- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index ca85e81fdb443..d32ff3857e658 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -193,6 +193,9 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy) { struct xe_device *xe = to_xe_device(dev); + if (xe->preempt_fence_wq) + destroy_workqueue(xe->preempt_fence_wq); + if (xe->ordered_wq) destroy_workqueue(xe->ordered_wq); @@ -258,9 +261,15 @@ struct xe_device *xe_device_create(struct pci_dev *pdev, INIT_LIST_HEAD(&xe->pinned.external_vram); INIT_LIST_HEAD(&xe->pinned.evicted); + xe->preempt_fence_wq = alloc_ordered_workqueue("xe-preempt-fence-wq", 0); xe->ordered_wq = alloc_ordered_workqueue("xe-ordered-wq", 0); xe->unordered_wq = alloc_workqueue("xe-unordered-wq", 0, 0); - if (!xe->ordered_wq || !xe->unordered_wq) { + if (!xe->ordered_wq || !xe->unordered_wq || + !xe->preempt_fence_wq) { + /* + * Cleanup done in xe_device_destroy via + * drmm_add_action_or_reset register above + */ drm_err(&xe->drm, "Failed to allocate xe workqueues\n"); err = -ENOMEM; goto err; diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 9785eef2e5a4e..8e3a222b41cf0 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -363,6 +363,9 @@ struct xe_device { /** @ufence_wq: user fence wait queue */ wait_queue_head_t ufence_wq; + /** @preempt_fence_wq: used to serialize preempt fences */ + struct workqueue_struct *preempt_fence_wq; + /** @ordered_wq: used to serialize compute mode resume */ struct workqueue_struct *ordered_wq; diff --git a/drivers/gpu/drm/xe/xe_preempt_fence.c b/drivers/gpu/drm/xe/xe_preempt_fence.c index 7bce2a332603c..7d50c6e89d8e7 100644 --- a/drivers/gpu/drm/xe/xe_preempt_fence.c +++ b/drivers/gpu/drm/xe/xe_preempt_fence.c @@ -49,7 +49,7 @@ static bool preempt_fence_enable_signaling(struct dma_fence *fence) struct xe_exec_queue *q = pfence->q; pfence->error = q->ops->suspend(q); - queue_work(system_unbound_wq, &pfence->preempt_work); + queue_work(q->vm->xe->preempt_fence_wq, &pfence->preempt_work); return true; } -- GitLab From 05258a0a69b3c5d2c003f818702c0a52b6fea861 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Apr 2024 10:36:25 -0400 Subject: [PATCH 434/989] SUNRPC: Fix a slow server-side memory leak with RPC-over-TCP Jan Schunk reports that his small NFS servers suffer from memory exhaustion after just a few days. A bisect shows that commit e18e157bb5c8 ("SUNRPC: Send RPC message on TCP with a single sock_sendmsg() call") is the first bad commit. That commit assumed that sock_sendmsg() releases all the pages in the underlying bio_vec array, but the reality is that it doesn't. svc_xprt_release() releases the rqst's response pages, but the record marker page fragment isn't one of those, so it is never released. This is a narrow fix that can be applied to stable kernels. A more extensive fix is in the works. Reported-by: Jan Schunk Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218671 Fixes: e18e157bb5c8 ("SUNRPC: Send RPC message on TCP with a single sock_sendmsg() call") Cc: Alexander Duyck Cc: Jakub Kacinski Cc: David Howells Reviewed-by: David Howells Signed-off-by: Chuck Lever --- net/sunrpc/svcsock.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 545017a3daa4d..6b3f01beb294b 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1206,15 +1206,6 @@ err_noclose: * MSG_SPLICE_PAGES is used exclusively to reduce the number of * copy operations in this path. Therefore the caller must ensure * that the pages backing @xdr are unchanging. - * - * Note that the send is non-blocking. The caller has incremented - * the reference count on each page backing the RPC message, and - * the network layer will "put" these pages when transmission is - * complete. - * - * This is safe for our RPC services because the memory backing - * the head and tail components is never kmalloc'd. These always - * come from pages in the svc_rqst::rq_pages array. */ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, rpc_fraghdr marker, unsigned int *sentp) @@ -1244,6 +1235,7 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, 1 + count, sizeof(marker) + rqstp->rq_res.len); ret = sock_sendmsg(svsk->sk_sock, &msg); + page_frag_free(buf); if (ret < 0) return ret; *sentp += ret; -- GitLab From c4128304c2169b4664ed6fb6200f228cead2ab70 Mon Sep 17 00:00:00 2001 From: Kyle Tso Date: Thu, 4 Apr 2024 21:35:17 +0800 Subject: [PATCH 435/989] usb: typec: tcpm: Correct the PDO counting in pd_set Off-by-one errors happen because nr_snk_pdo and nr_src_pdo are incorrectly added one. The index of the loop is equal to the number of PDOs to be updated when leaving the loop and it doesn't need to be added one. When doing the power negotiation, TCPM relies on the "nr_snk_pdo" as the size of the local sink PDO array to match the Source capabilities of the partner port. If the off-by-one overflow occurs, a wrong RDO might be sent and unexpected power transfer might happen such as over voltage or over current (than expected). "nr_src_pdo" is used to set the Rp level when the port is in Source role. It is also the array size of the local Source capabilities when filling up the buffer which will be sent as the Source PDOs (such as in Power Negotiation). If the off-by-one overflow occurs, a wrong Rp level might be set and wrong Source PDOs will be sent to the partner port. This could potentially cause over current or port resets. Fixes: cd099cde4ed2 ("usb: typec: tcpm: Support multiple capabilities") Cc: stable@vger.kernel.org Signed-off-by: Kyle Tso Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240404133517.2707955-1-kyletso@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index c26fb70c3ec6c..ab6ed6111ed05 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -6855,14 +6855,14 @@ static int tcpm_pd_set(struct typec_port *p, struct usb_power_delivery *pd) if (data->sink_desc.pdo[0]) { for (i = 0; i < PDO_MAX_OBJECTS && data->sink_desc.pdo[i]; i++) port->snk_pdo[i] = data->sink_desc.pdo[i]; - port->nr_snk_pdo = i + 1; + port->nr_snk_pdo = i; port->operating_snk_mw = data->operating_snk_mw; } if (data->source_desc.pdo[0]) { for (i = 0; i < PDO_MAX_OBJECTS && data->source_desc.pdo[i]; i++) port->src_pdo[i] = data->source_desc.pdo[i]; - port->nr_src_pdo = i + 1; + port->nr_src_pdo = i; } switch (port->state) { -- GitLab From 6334b8e4553cc69f51e383c9de545082213d785e Mon Sep 17 00:00:00 2001 From: Norihiko Hama Date: Wed, 27 Mar 2024 11:35:50 +0900 Subject: [PATCH 436/989] usb: gadget: f_ncm: Fix UAF ncm object at re-bind after usb ep transport error When ncm function is working and then stop usb0 interface for link down, eth_stop() is called. At this piont, accidentally if usb transport error should happen in usb_ep_enable(), 'in_ep' and/or 'out_ep' may not be enabled. After that, ncm_disable() is called to disable for ncm unbind but gether_disconnect() is never called since 'in_ep' is not enabled. As the result, ncm object is released in ncm unbind but 'dev->port_usb' associated to 'ncm->port' is not NULL. And when ncm bind again to recover netdev, ncm object is reallocated but usb0 interface is already associated to previous released ncm object. Therefore, once usb0 interface is up and eth_start_xmit() is called, released ncm object is dereferrenced and it might cause use-after-free memory. [function unlink via configfs] usb0: eth_stop dev->port_usb=ffffff9b179c3200 --> error happens in usb_ep_enable(). NCM: ncm_disable: ncm=ffffff9b179c3200 --> no gether_disconnect() since ncm->port.in_ep->enabled is false. NCM: ncm_unbind: ncm unbind ncm=ffffff9b179c3200 NCM: ncm_free: ncm free ncm=ffffff9b179c3200 <-- released ncm [function link via configfs] NCM: ncm_alloc: ncm alloc ncm=ffffff9ac4f8a000 NCM: ncm_bind: ncm bind ncm=ffffff9ac4f8a000 NCM: ncm_set_alt: ncm=ffffff9ac4f8a000 alt=0 usb0: eth_open dev->port_usb=ffffff9b179c3200 <-- previous released ncm usb0: eth_start dev->port_usb=ffffff9b179c3200 <-- eth_start_xmit() --> dev->wrap() Unable to handle kernel paging request at virtual address dead00000000014f This patch addresses the issue by checking if 'ncm->netdev' is not NULL at ncm_disable() to call gether_disconnect() to deassociate 'dev->port_usb'. It's more reasonable to check 'ncm->netdev' to call gether_connect/disconnect rather than check 'ncm->port.in_ep->enabled' since it might not be enabled but the gether connection might be established. Signed-off-by: Norihiko Hama Cc: stable Link: https://lore.kernel.org/r/20240327023550.51214-1-Norihiko.Hama@alpsalpine.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_ncm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index 28f4e6552e845..0acc32ed99609 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -878,7 +878,7 @@ static int ncm_set_alt(struct usb_function *f, unsigned intf, unsigned alt) if (alt > 1) goto fail; - if (ncm->port.in_ep->enabled) { + if (ncm->netdev) { DBG(cdev, "reset ncm\n"); ncm->netdev = NULL; gether_disconnect(&ncm->port); @@ -1367,7 +1367,7 @@ static void ncm_disable(struct usb_function *f) DBG(cdev, "ncm deactivated\n"); - if (ncm->port.in_ep->enabled) { + if (ncm->netdev) { ncm->netdev = NULL; gether_disconnect(&ncm->port); } -- GitLab From b3b95964590a3d756d69ea8604c856de805479ad Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 4 Apr 2024 11:33:27 +0200 Subject: [PATCH 437/989] gpio: cdev: check for NULL labels when sanitizing them for irqs We need to take into account that a line's consumer label may be NULL and not try to kstrdup() it in that case but rather pass the NULL pointer up the stack to the interrupt request function. To that end: let make_irq_label() return NULL as a valid return value and use ERR_PTR() instead to signal an allocation failure to callers. Cc: stable@vger.kernel.org Fixes: b34490879baa ("gpio: cdev: sanitize the label before requesting the interrupt") Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/lkml/20240402093534.212283-1-naresh.kamboju@linaro.org/ Signed-off-by: Bartosz Golaszewski Tested-by: Anders Roxell --- drivers/gpio/gpiolib-cdev.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index fa96356102510..1426cc1c4a285 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -1085,7 +1085,16 @@ static u32 gpio_v2_line_config_debounce_period(struct gpio_v2_line_config *lc, static inline char *make_irq_label(const char *orig) { - return kstrdup_and_replace(orig, '/', ':', GFP_KERNEL); + char *new; + + if (!orig) + return NULL; + + new = kstrdup_and_replace(orig, '/', ':', GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + return new; } static inline void free_irq_label(const char *label) @@ -1158,8 +1167,8 @@ static int edge_detector_setup(struct line *line, irqflags |= IRQF_ONESHOT; label = make_irq_label(line->req->label); - if (!label) - return -ENOMEM; + if (IS_ERR(label)) + return PTR_ERR(label); /* Request a thread to read the events */ ret = request_threaded_irq(irq, edge_irq_handler, edge_irq_thread, @@ -2217,8 +2226,8 @@ static int lineevent_create(struct gpio_device *gdev, void __user *ip) goto out_free_le; label = make_irq_label(le->label); - if (!label) { - ret = -ENOMEM; + if (IS_ERR(label)) { + ret = PTR_ERR(label); goto out_free_le; } -- GitLab From d920a2ed8620be04a3301e1a9c2b7cc1de65f19d Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Tue, 5 Mar 2024 14:51:38 +0800 Subject: [PATCH 438/989] usb: Disable USB3 LPM at shutdown SanDisks USB3 storage may disapper after system reboot: usb usb2-port3: link state change xhci_hcd 0000:00:14.0: clear port3 link state change, portsc: 0x2c0 usb usb2-port3: do warm reset, port only xhci_hcd 0000:00:14.0: xhci_hub_status_data: stopping usb2 port polling xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x2b0, return 0x2b0 usb usb2-port3: not warm reset yet, waiting 50ms xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x2f0, return 0x2f0 usb usb2-port3: not warm reset yet, waiting 200ms ... xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x6802c0, return 0x7002c0 usb usb2-port3: not warm reset yet, waiting 200ms xhci_hcd 0000:00:14.0: clear port3 reset change, portsc: 0x4802c0 xhci_hcd 0000:00:14.0: clear port3 warm(BH) reset change, portsc: 0x4002c0 xhci_hcd 0000:00:14.0: clear port3 link state change, portsc: 0x2c0 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x2c0, return 0x2c0 usb usb2-port3: not enabled, trying warm reset again... This is due to the USB device still cause port change event after xHCI is shuted down: xhci_hcd 0000:38:00.0: // Setting command ring address to 0xffffe001 xhci_hcd 0000:38:00.0: xhci_resume: starting usb3 port polling. xhci_hcd 0000:38:00.0: xhci_hub_status_data: stopping usb4 port polling xhci_hcd 0000:38:00.0: xhci_hub_status_data: stopping usb3 port polling xhci_hcd 0000:38:00.0: hcd_pci_runtime_resume: 0 xhci_hcd 0000:38:00.0: xhci_shutdown: stopping usb3 port polling. xhci_hcd 0000:38:00.0: // Halt the HC xhci_hcd 0000:38:00.0: xhci_shutdown completed - status = 1 xhci_hcd 0000:00:14.0: xhci_shutdown: stopping usb1 port polling. xhci_hcd 0000:00:14.0: // Halt the HC xhci_hcd 0000:00:14.0: xhci_shutdown completed - status = 1 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x1203, return 0x203 xhci_hcd 0000:00:14.0: set port reset, actual port 2-3 status = 0x1311 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x201203, return 0x100203 xhci_hcd 0000:00:14.0: clear port3 reset change, portsc: 0x1203 xhci_hcd 0000:00:14.0: clear port3 warm(BH) reset change, portsc: 0x1203 xhci_hcd 0000:00:14.0: clear port3 link state change, portsc: 0x1203 xhci_hcd 0000:00:14.0: clear port3 connect change, portsc: 0x1203 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x1203, return 0x203 usb 2-3: device not accepting address 2, error -108 xhci_hcd 0000:00:14.0: xHCI dying or halted, can't queue_command xhci_hcd 0000:00:14.0: Set port 2-3 link state, portsc: 0x1203, write 0x11261 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x1263, return 0x263 xhci_hcd 0000:00:14.0: set port reset, actual port 2-3 status = 0x1271 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x12b1, return 0x2b1 usb usb2-port3: not reset yet, waiting 60ms ACPI: PM: Preparing to enter system sleep state S5 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x12f1, return 0x2f1 usb usb2-port3: not reset yet, waiting 200ms reboot: Restarting system The port change event is caused by LPM transition, so disabling LPM at shutdown to make sure the device is in U0 for warmboot. Signed-off-by: Kai-Heng Feng Cc: stable Link: https://lore.kernel.org/r/20240305065140.66801-1-kai.heng.feng@canonical.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/port.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/port.c b/drivers/usb/core/port.c index 686c01af03e63..0e1262a077aea 100644 --- a/drivers/usb/core/port.c +++ b/drivers/usb/core/port.c @@ -449,8 +449,10 @@ static void usb_port_shutdown(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); - if (port_dev->child) + if (port_dev->child) { usb_disable_usb2_hardware_lpm(port_dev->child); + usb_unlocked_disable_lpm(port_dev->child); + } } static const struct dev_pm_ops usb_port_pm_ops = { -- GitLab From 1fc9af813b25e146d3607669247d0f970f5a87c3 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 5 Jan 2024 21:46:11 +0300 Subject: [PATCH 439/989] drm/panfrost: Fix the error path in panfrost_mmu_map_fault_addr() Subject: [PATCH] drm/panfrost: Fix the error path in panfrost_mmu_map_fault_addr() If some the pages or sgt allocation failed, we shouldn't release the pages ref we got earlier, otherwise we will end up with unbalanced get/put_pages() calls. We should instead leave everything in place and let the BO release function deal with extra cleanup when the object is destroyed, or let the fault handler try again next time it's called. Fixes: 187d2929206e ("drm/panfrost: Add support for GPU heap allocations") Cc: Reviewed-by: Steven Price Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Boris Brezillon Co-developed-by: Dmitry Osipenko Signed-off-by: Dmitry Osipenko Link: https://patchwork.freedesktop.org/patch/msgid/20240105184624.508603-18-dmitry.osipenko@collabora.com --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index f38385fe76bbb..b91019cd5acb1 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -502,11 +502,18 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, mapping_set_unevictable(mapping); for (i = page_offset; i < page_offset + NUM_FAULT_PAGES; i++) { + /* Can happen if the last fault only partially filled this + * section of the pages array before failing. In that case + * we skip already filled pages. + */ + if (pages[i]) + continue; + pages[i] = shmem_read_mapping_page(mapping, i); if (IS_ERR(pages[i])) { ret = PTR_ERR(pages[i]); pages[i] = NULL; - goto err_pages; + goto err_unlock; } } @@ -514,7 +521,7 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, ret = sg_alloc_table_from_pages(sgt, pages + page_offset, NUM_FAULT_PAGES, 0, SZ_2M, GFP_KERNEL); if (ret) - goto err_pages; + goto err_unlock; ret = dma_map_sgtable(pfdev->dev, sgt, DMA_BIDIRECTIONAL, 0); if (ret) @@ -537,8 +544,6 @@ out: err_map: sg_free_table(sgt); -err_pages: - drm_gem_shmem_put_pages(&bo->base); err_unlock: dma_resv_unlock(obj->resv); err_bo: -- GitLab From 3ddf944b32f88741c303f0b21459dbb3872b8bc5 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 13 Mar 2024 14:48:27 +0100 Subject: [PATCH 440/989] x86/mce: Make sure to grab mce_sysfs_mutex in set_bank() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modifying a MCA bank's MCA_CTL bits which control which error types to be reported is done over /sys/devices/system/machinecheck/ ├── machinecheck0 │   ├── bank0 │   ├── bank1 │   ├── bank10 │   ├── bank11 ... sysfs nodes by writing the new bit mask of events to enable. When the write is accepted, the kernel deletes all current timers and reinits all banks. Doing that in parallel can lead to initializing a timer which is already armed and in the timer wheel, i.e., in use already: ODEBUG: init active (active state 0) object: ffff888063a28000 object type: timer_list hint: mce_timer_fn+0x0/0x240 arch/x86/kernel/cpu/mce/core.c:2642 WARNING: CPU: 0 PID: 8120 at lib/debugobjects.c:514 debug_print_object+0x1a0/0x2a0 lib/debugobjects.c:514 Fix that by grabbing the sysfs mutex as the rest of the MCA sysfs code does. Reported by: Yue Sun Reported by: xingwei lee Signed-off-by: Borislav Petkov (AMD) Cc: Link: https://lore.kernel.org/r/CAEkJfYNiENwQY8yV1LYJ9LjJs%2Bx_-PqMv98gKig55=2vbzffRw@mail.gmail.com --- arch/x86/kernel/cpu/mce/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b5cc557cfc373..84d41be6d06ba 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2500,12 +2500,14 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, return -EINVAL; b = &per_cpu(mce_banks_array, s->id)[bank]; - if (!b->init) return -ENODEV; b->ctl = new; + + mutex_lock(&mce_sysfs_mutex); mce_restart(); + mutex_unlock(&mce_sysfs_mutex); return size; } -- GitLab From 0551ec93a00d935efd86b45bf70cefdc4e515b65 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Apr 2024 14:47:17 +0200 Subject: [PATCH 441/989] nvme: don't create a multipath node for zero capacity devices Apparently there are nvme controllers around that report namespaces in the namespace list which have zero capacity. Return -ENXIO instead of -ENODEV from nvme_update_ns_info_block so we don't create a hidden multipath node for these namespaces but entirely ignore them. Fixes: 46e7422cda84 ("nvme: move common logic into nvme_update_ns_info") Reported-by: Nilay Shroff Signed-off-by: Christoph Hellwig Tested-by: Nilay Shroff Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 504dc352c458d..27281a9a8951d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2089,7 +2089,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if (id->ncap == 0) { /* namespace not allocated or attached */ info->is_removed = true; - ret = -ENODEV; + ret = -ENXIO; goto out; } lbaf = nvme_lbaf_index(id->flbas); -- GitLab From 95409e277d8343810adf8700d29d4329828d452b Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 3 Apr 2024 13:31:14 +0200 Subject: [PATCH 442/989] nvmet: implement unique discovery NQN Unique discovery NQNs allow to differentiate between discovery services from (typically physically separate) NVMe-oF subsystems. This is required for establishing secured connections as otherwise the credentials won't be unique and the integrity of the connection cannot be guaranteed. This patch adds a configfs attribute 'discovery_nqn' in the 'nvmet' configfs directory to specify the unique discovery NQN. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 47 ++++++++++++++++++++++++++++++++++ drivers/nvme/target/core.c | 7 +++++ 2 files changed, 54 insertions(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 77a6e817b3159..a2325330bf221 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1613,6 +1613,11 @@ static struct config_group *nvmet_subsys_make(struct config_group *group, return ERR_PTR(-EINVAL); } + if (sysfs_streq(name, nvmet_disc_subsys->subsysnqn)) { + pr_err("can't create subsystem using unique discovery NQN\n"); + return ERR_PTR(-EINVAL); + } + subsys = nvmet_subsys_alloc(name, NVME_NQN_NVME); if (IS_ERR(subsys)) return ERR_CAST(subsys); @@ -2159,7 +2164,49 @@ static const struct config_item_type nvmet_hosts_type = { static struct config_group nvmet_hosts_group; +static ssize_t nvmet_root_discovery_nqn_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%s\n", nvmet_disc_subsys->subsysnqn); +} + +static ssize_t nvmet_root_discovery_nqn_store(struct config_item *item, + const char *page, size_t count) +{ + struct list_head *entry; + size_t len; + + len = strcspn(page, "\n"); + if (!len || len > NVMF_NQN_FIELD_LEN - 1) + return -EINVAL; + + down_write(&nvmet_config_sem); + list_for_each(entry, &nvmet_subsystems_group.cg_children) { + struct config_item *item = + container_of(entry, struct config_item, ci_entry); + + if (!strncmp(config_item_name(item), page, len)) { + pr_err("duplicate NQN %s\n", config_item_name(item)); + up_write(&nvmet_config_sem); + return -EINVAL; + } + } + memset(nvmet_disc_subsys->subsysnqn, 0, NVMF_NQN_FIELD_LEN); + memcpy(nvmet_disc_subsys->subsysnqn, page, len); + up_write(&nvmet_config_sem); + + return len; +} + +CONFIGFS_ATTR(nvmet_root_, discovery_nqn); + +static struct configfs_attribute *nvmet_root_attrs[] = { + &nvmet_root_attr_discovery_nqn, + NULL, +}; + static const struct config_item_type nvmet_root_type = { + .ct_attrs = nvmet_root_attrs, .ct_owner = THIS_MODULE, }; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 6bbe4df0166ca..8860a3eb71ec8 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1541,6 +1541,13 @@ static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, } down_read(&nvmet_config_sem); + if (!strncmp(nvmet_disc_subsys->subsysnqn, subsysnqn, + NVMF_NQN_SIZE)) { + if (kref_get_unless_zero(&nvmet_disc_subsys->ref)) { + up_read(&nvmet_config_sem); + return nvmet_disc_subsys; + } + } list_for_each_entry(p, &port->subsystems, entry) { if (!strncmp(p->subsys->subsysnqn, subsysnqn, NVMF_NQN_SIZE)) { -- GitLab From db67bb39eff0fc5db21f49f8ae93e67ed2c5fe01 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 4 Apr 2024 16:41:30 +0200 Subject: [PATCH 443/989] nvmet-fc: move RCU read lock to nvmet_fc_assoc_exists The RCU lock is only needed for the lookup loop and not for list_ad_tail_rcu call. Thus move it down the call chain into nvmet_fc_assoc_exists. While at it also fix the name typo of the function. Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/target/fc.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index fd229f310c931..337ee1cb09ae6 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1115,16 +1115,21 @@ nvmet_fc_schedule_delete_assoc(struct nvmet_fc_tgt_assoc *assoc) } static bool -nvmet_fc_assoc_exits(struct nvmet_fc_tgtport *tgtport, u64 association_id) +nvmet_fc_assoc_exists(struct nvmet_fc_tgtport *tgtport, u64 association_id) { struct nvmet_fc_tgt_assoc *a; + bool found = false; + rcu_read_lock(); list_for_each_entry_rcu(a, &tgtport->assoc_list, a_list) { - if (association_id == a->association_id) - return true; + if (association_id == a->association_id) { + found = true; + break; + } } + rcu_read_unlock(); - return false; + return found; } static struct nvmet_fc_tgt_assoc * @@ -1164,13 +1169,11 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle) ran = ran << BYTES_FOR_QID_SHIFT; spin_lock_irqsave(&tgtport->lock, flags); - rcu_read_lock(); - if (!nvmet_fc_assoc_exits(tgtport, ran)) { + if (!nvmet_fc_assoc_exists(tgtport, ran)) { assoc->association_id = ran; list_add_tail_rcu(&assoc->a_list, &tgtport->assoc_list); done = true; } - rcu_read_unlock(); spin_unlock_irqrestore(&tgtport->lock, flags); } while (!done); -- GitLab From 205fb5fa6fde1b5b426015eb1ff69f2ff25ef5bb Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 4 Apr 2024 16:41:31 +0200 Subject: [PATCH 444/989] nvme-fc: rename free_ctrl callback to match name pattern Rename nvme_fc_nvme_ctrl_freed to nvme_fc_free_ctrl to match the name pattern for the callback. Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Daniel Wagner Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 68a5d971657bb..a5b29e9ad342d 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2428,7 +2428,7 @@ nvme_fc_ctrl_get(struct nvme_fc_ctrl *ctrl) * controller. Called after last nvme_put_ctrl() call */ static void -nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl) +nvme_fc_free_ctrl(struct nvme_ctrl *nctrl) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); @@ -3384,7 +3384,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, - .free_ctrl = nvme_fc_nvme_ctrl_freed, + .free_ctrl = nvme_fc_free_ctrl, .submit_async_event = nvme_fc_submit_async_event, .delete_ctrl = nvme_fc_delete_ctrl, .get_address = nvmf_get_address, -- GitLab From 1a4ea83a6e67f1415a1f17c1af5e9c814c882bb5 Mon Sep 17 00:00:00 2001 From: Yuanhe Shu Date: Mon, 26 Feb 2024 11:18:16 +0800 Subject: [PATCH 445/989] selftests/ftrace: Limit length in subsystem-enable tests While sched* events being traced and sched* events continuously happen, "[xx] event tracing - enable/disable with subsystem level files" would not stop as on some slower systems it seems to take forever. Select the first 100 lines of output would be enough to judge whether there are more than 3 types of sched events. Fixes: 815b18ea66d6 ("ftracetest: Add basic event tracing test cases") Cc: stable@vger.kernel.org Signed-off-by: Yuanhe Shu Acked-by: Masami Hiramatsu (Google) Acked-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan --- .../selftests/ftrace/test.d/event/subsystem-enable.tc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc index b1ede62498667..b7c8f29c09a97 100644 --- a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc +++ b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc @@ -18,7 +18,7 @@ echo 'sched:*' > set_event yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -lt 3 ]; then fail "at least fork, exec and exit events should be recorded" fi @@ -29,7 +29,7 @@ echo 1 > events/sched/enable yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -lt 3 ]; then fail "at least fork, exec and exit events should be recorded" fi @@ -40,7 +40,7 @@ echo 0 > events/sched/enable yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -ne 0 ]; then fail "any of scheduler events should not be recorded" fi -- GitLab From 2e91bb99b9d4f756e92e83c4453f894dda220f09 Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Wed, 3 Apr 2024 15:21:58 +0200 Subject: [PATCH 446/989] net: usb: ax88179_178a: avoid the interface always configured as random address After the commit d2689b6a86b9 ("net: usb: ax88179_178a: avoid two consecutive device resets"), reset is not executed from bind operation and mac address is not read from the device registers or the devicetree at that moment. Since the check to configure if the assigned mac address is random or not for the interface, happens after the bind operation from usbnet_probe, the interface keeps configured as random address, although the address is correctly read and set during open operation (the only reset now). In order to keep only one reset for the device and to avoid the interface always configured as random address, after reset, configure correctly the suitable field from the driver, if the mac address is read successfully from the device registers or the devicetree. Take into account if a locally administered address (random) was previously stored. cc: stable@vger.kernel.org # 6.6+ Fixes: d2689b6a86b9 ("net: usb: ax88179_178a: avoid two consecutive device resets") Reported-by: Dave Stevenson Signed-off-by: Jose Ignacio Tornos Martinez Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240403132158.344838-1-jtornosm@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/ax88179_178a.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 88e084534853d..a9c418890a1ca 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1273,6 +1273,8 @@ static void ax88179_get_mac_addr(struct usbnet *dev) if (is_valid_ether_addr(mac)) { eth_hw_addr_set(dev->net, mac); + if (!is_local_ether_addr(mac)) + dev->net->addr_assign_type = NET_ADDR_PERM; } else { netdev_info(dev->net, "invalid MAC address, using random\n"); eth_hw_addr_random(dev->net); -- GitLab From d313eb8b77557a6d5855f42d2234bd592c7b50dd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 Apr 2024 13:09:08 +0000 Subject: [PATCH 447/989] net/sched: act_skbmod: prevent kernel-infoleak syzbot found that tcf_skbmod_dump() was copying four bytes from kernel stack to user space [1]. The issue here is that 'struct tc_skbmod' has a four bytes hole. We need to clear the structure before filling fields. [1] BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:114 [inline] BUG: KMSAN: kernel-infoleak in copy_to_user_iter lib/iov_iter.c:24 [inline] BUG: KMSAN: kernel-infoleak in iterate_ubuf include/linux/iov_iter.h:29 [inline] BUG: KMSAN: kernel-infoleak in iterate_and_advance2 include/linux/iov_iter.h:245 [inline] BUG: KMSAN: kernel-infoleak in iterate_and_advance include/linux/iov_iter.h:271 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x366/0x2520 lib/iov_iter.c:185 instrument_copy_to_user include/linux/instrumented.h:114 [inline] copy_to_user_iter lib/iov_iter.c:24 [inline] iterate_ubuf include/linux/iov_iter.h:29 [inline] iterate_and_advance2 include/linux/iov_iter.h:245 [inline] iterate_and_advance include/linux/iov_iter.h:271 [inline] _copy_to_iter+0x366/0x2520 lib/iov_iter.c:185 copy_to_iter include/linux/uio.h:196 [inline] simple_copy_to_iter net/core/datagram.c:532 [inline] __skb_datagram_iter+0x185/0x1000 net/core/datagram.c:420 skb_copy_datagram_iter+0x5c/0x200 net/core/datagram.c:546 skb_copy_datagram_msg include/linux/skbuff.h:4050 [inline] netlink_recvmsg+0x432/0x1610 net/netlink/af_netlink.c:1962 sock_recvmsg_nosec net/socket.c:1046 [inline] sock_recvmsg+0x2c4/0x340 net/socket.c:1068 __sys_recvfrom+0x35a/0x5f0 net/socket.c:2242 __do_sys_recvfrom net/socket.c:2260 [inline] __se_sys_recvfrom net/socket.c:2256 [inline] __x64_sys_recvfrom+0x126/0x1d0 net/socket.c:2256 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was stored to memory at: pskb_expand_head+0x30f/0x19d0 net/core/skbuff.c:2253 netlink_trim+0x2c2/0x330 net/netlink/af_netlink.c:1317 netlink_unicast+0x9f/0x1260 net/netlink/af_netlink.c:1351 nlmsg_unicast include/net/netlink.h:1144 [inline] nlmsg_notify+0x21d/0x2f0 net/netlink/af_netlink.c:2610 rtnetlink_send+0x73/0x90 net/core/rtnetlink.c:741 rtnetlink_maybe_send include/linux/rtnetlink.h:17 [inline] tcf_add_notify net/sched/act_api.c:2048 [inline] tcf_action_add net/sched/act_api.c:2071 [inline] tc_ctl_action+0x146e/0x19d0 net/sched/act_api.c:2119 rtnetlink_rcv_msg+0x1737/0x1900 net/core/rtnetlink.c:6595 netlink_rcv_skb+0x375/0x650 net/netlink/af_netlink.c:2559 rtnetlink_rcv+0x34/0x40 net/core/rtnetlink.c:6613 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0xf4c/0x1260 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x10df/0x11f0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 ____sys_sendmsg+0x877/0xb60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x4a0 net/socket.c:2674 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was stored to memory at: __nla_put lib/nlattr.c:1041 [inline] nla_put+0x1c6/0x230 lib/nlattr.c:1099 tcf_skbmod_dump+0x23f/0xc20 net/sched/act_skbmod.c:256 tcf_action_dump_old net/sched/act_api.c:1191 [inline] tcf_action_dump_1+0x85e/0x970 net/sched/act_api.c:1227 tcf_action_dump+0x1fd/0x460 net/sched/act_api.c:1251 tca_get_fill+0x519/0x7a0 net/sched/act_api.c:1628 tcf_add_notify_msg net/sched/act_api.c:2023 [inline] tcf_add_notify net/sched/act_api.c:2042 [inline] tcf_action_add net/sched/act_api.c:2071 [inline] tc_ctl_action+0x1365/0x19d0 net/sched/act_api.c:2119 rtnetlink_rcv_msg+0x1737/0x1900 net/core/rtnetlink.c:6595 netlink_rcv_skb+0x375/0x650 net/netlink/af_netlink.c:2559 rtnetlink_rcv+0x34/0x40 net/core/rtnetlink.c:6613 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0xf4c/0x1260 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x10df/0x11f0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 ____sys_sendmsg+0x877/0xb60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x4a0 net/socket.c:2674 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Local variable opt created at: tcf_skbmod_dump+0x9d/0xc20 net/sched/act_skbmod.c:244 tcf_action_dump_old net/sched/act_api.c:1191 [inline] tcf_action_dump_1+0x85e/0x970 net/sched/act_api.c:1227 Bytes 188-191 of 248 are uninitialized Memory access of size 248 starts at ffff888117697680 Data copied to user address 00007ffe56d855f0 Fixes: 86da71b57383 ("net_sched: Introduce skbmod action") Signed-off-by: Eric Dumazet Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240403130908.93421-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/act_skbmod.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 39945b139c481..cd0accaf844a1 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -241,13 +241,13 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_skbmod *d = to_skbmod(a); unsigned char *b = skb_tail_pointer(skb); struct tcf_skbmod_params *p; - struct tc_skbmod opt = { - .index = d->tcf_index, - .refcnt = refcount_read(&d->tcf_refcnt) - ref, - .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - }; + struct tc_skbmod opt; struct tcf_t t; + memset(&opt, 0, sizeof(opt)); + opt.index = d->tcf_index; + opt.refcnt = refcount_read(&d->tcf_refcnt) - ref, + opt.bindcnt = atomic_read(&d->tcf_bindcnt) - bind; spin_lock_bh(&d->tcf_lock); opt.action = d->tcf_action; p = rcu_dereference_protected(d->skbmod_p, -- GitLab From 0c83842df40f86e529db6842231154772c20edcc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Apr 2024 12:20:51 +0000 Subject: [PATCH 448/989] netfilter: validate user input for expected length I got multiple syzbot reports showing old bugs exposed by BPF after commit 20f2505fb436 ("bpf: Try to avoid kzalloc in cgroup/{s,g}etsockopt") setsockopt() @optlen argument should be taken into account before copying data. BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in do_replace net/ipv4/netfilter/ip_tables.c:1111 [inline] BUG: KASAN: slab-out-of-bounds in do_ipt_set_ctl+0x902/0x3dd0 net/ipv4/netfilter/ip_tables.c:1627 Read of size 96 at addr ffff88802cd73da0 by task syz-executor.4/7238 CPU: 1 PID: 7238 Comm: syz-executor.4 Not tainted 6.9.0-rc2-next-20240403-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 kasan_check_range+0x282/0x290 mm/kasan/generic.c:189 __asan_memcpy+0x29/0x70 mm/kasan/shadow.c:105 copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] copy_from_sockptr include/linux/sockptr.h:55 [inline] do_replace net/ipv4/netfilter/ip_tables.c:1111 [inline] do_ipt_set_ctl+0x902/0x3dd0 net/ipv4/netfilter/ip_tables.c:1627 nf_setsockopt+0x295/0x2c0 net/netfilter/nf_sockopt.c:101 do_sock_setsockopt+0x3af/0x720 net/socket.c:2311 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x72/0x7a RIP: 0033:0x7fd22067dde9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fd21f9ff0c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00007fd2207abf80 RCX: 00007fd22067dde9 RDX: 0000000000000040 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 00007fd2206ca47a R08: 0000000000000001 R09: 0000000000000000 R10: 0000000020000880 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007fd2207abf80 R15: 00007ffd2d0170d8 Allocated by task 7238: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:370 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:387 kasan_kmalloc include/linux/kasan.h:211 [inline] __do_kmalloc_node mm/slub.c:4069 [inline] __kmalloc_noprof+0x200/0x410 mm/slub.c:4082 kmalloc_noprof include/linux/slab.h:664 [inline] __cgroup_bpf_run_filter_setsockopt+0xd47/0x1050 kernel/bpf/cgroup.c:1869 do_sock_setsockopt+0x6b4/0x720 net/socket.c:2293 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x72/0x7a The buggy address belongs to the object at ffff88802cd73da0 which belongs to the cache kmalloc-8 of size 8 The buggy address is located 0 bytes inside of allocated 1-byte region [ffff88802cd73da0, ffff88802cd73da1) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88802cd73020 pfn:0x2cd73 flags: 0xfff80000000000(node=0|zone=1|lastcpupid=0xfff) page_type: 0xffffefff(slab) raw: 00fff80000000000 ffff888015041280 dead000000000100 dead000000000122 raw: ffff88802cd73020 000000008080007f 00000001ffffefff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x12cc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY), pid 5103, tgid 2119833701 (syz-executor.4), ts 5103, free_ts 70804600828 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x1f3/0x230 mm/page_alloc.c:1490 prep_new_page mm/page_alloc.c:1498 [inline] get_page_from_freelist+0x2e7e/0x2f40 mm/page_alloc.c:3454 __alloc_pages_noprof+0x256/0x6c0 mm/page_alloc.c:4712 __alloc_pages_node_noprof include/linux/gfp.h:244 [inline] alloc_pages_node_noprof include/linux/gfp.h:271 [inline] alloc_slab_page+0x5f/0x120 mm/slub.c:2249 allocate_slab+0x5a/0x2e0 mm/slub.c:2412 new_slab mm/slub.c:2465 [inline] ___slab_alloc+0xcd1/0x14b0 mm/slub.c:3615 __slab_alloc+0x58/0xa0 mm/slub.c:3705 __slab_alloc_node mm/slub.c:3758 [inline] slab_alloc_node mm/slub.c:3936 [inline] __do_kmalloc_node mm/slub.c:4068 [inline] kmalloc_node_track_caller_noprof+0x286/0x450 mm/slub.c:4089 kstrdup+0x3a/0x80 mm/util.c:62 device_rename+0xb5/0x1b0 drivers/base/core.c:4558 dev_change_name+0x275/0x860 net/core/dev.c:1232 do_setlink+0xa4b/0x41f0 net/core/rtnetlink.c:2864 __rtnl_newlink net/core/rtnetlink.c:3680 [inline] rtnl_newlink+0x180b/0x20a0 net/core/rtnetlink.c:3727 rtnetlink_rcv_msg+0x89b/0x10d0 net/core/rtnetlink.c:6594 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2559 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1361 page last free pid 5146 tgid 5146 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1110 [inline] free_unref_page+0xd3c/0xec0 mm/page_alloc.c:2617 discard_slab mm/slub.c:2511 [inline] __put_partials+0xeb/0x130 mm/slub.c:2980 put_cpu_partial+0x17c/0x250 mm/slub.c:3055 __slab_free+0x2ea/0x3d0 mm/slub.c:4254 qlink_free mm/kasan/quarantine.c:163 [inline] qlist_free_all+0x9e/0x140 mm/kasan/quarantine.c:179 kasan_quarantine_reduce+0x14f/0x170 mm/kasan/quarantine.c:286 __kasan_slab_alloc+0x23/0x80 mm/kasan/common.c:322 kasan_slab_alloc include/linux/kasan.h:201 [inline] slab_post_alloc_hook mm/slub.c:3888 [inline] slab_alloc_node mm/slub.c:3948 [inline] __do_kmalloc_node mm/slub.c:4068 [inline] __kmalloc_node_noprof+0x1d7/0x450 mm/slub.c:4076 kmalloc_node_noprof include/linux/slab.h:681 [inline] kvmalloc_node_noprof+0x72/0x190 mm/util.c:634 bucket_table_alloc lib/rhashtable.c:186 [inline] rhashtable_rehash_alloc+0x9e/0x290 lib/rhashtable.c:367 rht_deferred_worker+0x4e1/0x2440 lib/rhashtable.c:427 process_one_work kernel/workqueue.c:3218 [inline] process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3299 worker_thread+0x86d/0xd70 kernel/workqueue.c:3380 kthread+0x2f0/0x390 kernel/kthread.c:388 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:243 Memory state around the buggy address: ffff88802cd73c80: 07 fc fc fc 05 fc fc fc 05 fc fc fc fa fc fc fc ffff88802cd73d00: fa fc fc fc fa fc fc fc fa fc fc fc fa fc fc fc >ffff88802cd73d80: fa fc fc fc 01 fc fc fc fa fc fc fc fa fc fc fc ^ ffff88802cd73e00: fa fc fc fc fa fc fc fc 05 fc fc fc 07 fc fc fc ffff88802cd73e80: 07 fc fc fc 07 fc fc fc 07 fc fc fc 07 fc fc fc Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Pablo Neira Ayuso Link: https://lore.kernel.org/r/20240404122051.2303764-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/bridge/netfilter/ebtables.c | 6 ++++++ net/ipv4/netfilter/arp_tables.c | 4 ++++ net/ipv4/netfilter/ip_tables.c | 4 ++++ net/ipv6/netfilter/ip6_tables.c | 4 ++++ 4 files changed, 18 insertions(+) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 99d82676f780a..cbd0e3586c3f6 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1111,6 +1111,8 @@ static int do_replace(struct net *net, sockptr_t arg, unsigned int len) struct ebt_table_info *newinfo; struct ebt_replace tmp; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1423,6 +1425,8 @@ static int update_counters(struct net *net, sockptr_t arg, unsigned int len) { struct ebt_replace hlp; + if (len < sizeof(hlp)) + return -EINVAL; if (copy_from_sockptr(&hlp, arg, sizeof(hlp))) return -EFAULT; @@ -2352,6 +2356,8 @@ static int compat_update_counters(struct net *net, sockptr_t arg, { struct compat_ebt_replace hlp; + if (len < sizeof(hlp)) + return -EINVAL; if (copy_from_sockptr(&hlp, arg, sizeof(hlp))) return -EFAULT; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 2407066b0fec1..b150c9929b12e 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -956,6 +956,8 @@ static int do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct arpt_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1254,6 +1256,8 @@ static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct arpt_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 7da1df4997d05..4876707595781 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1108,6 +1108,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ipt_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1492,6 +1494,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ipt_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index fd9f049d6d41e..636b360311c53 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1125,6 +1125,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ip6t_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1501,6 +1503,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ip6t_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; -- GitLab From 72d7cb5c190befbb095bae7737e71560ec0fcaa6 Mon Sep 17 00:00:00 2001 From: Shengyu Li Date: Wed, 27 Mar 2024 05:13:15 +0800 Subject: [PATCH 449/989] selftests/harness: Prevent infinite loop due to Assert in FIXTURE_TEARDOWN This patch addresses an issue in the selftests/harness where an assertion within FIXTURE_TEARDOWN could trigger an infinite loop. The problem arises because the teardown procedure is meant to execute once, but the presence of failing assertions (ASSERT_EQ(0, 1)) leads to repeated attempts to execute teardown due to the long jump mechanism used by the harness for handling assertions. To resolve this, the patch ensures that the teardown process runs only once, regardless of assertion outcomes, preventing the infinite loop and allowing tests to fail. A simple test demo(test.c): #include "kselftest_harness.h" FIXTURE(f) { int fd; }; FIXTURE_SETUP(f) { self->fd = 0; } FIXTURE_TEARDOWN(f) { TH_LOG("TEARDOWN"); ASSERT_EQ(0, 1); self->fd = -1; } TEST_F(f, open_close) { ASSERT_NE(self->fd, 1); } TEST_HARNESS_MAIN will always output the following output due to a dead loop until timeout: # test.c:15:open_close:TEARDOWN # test.c:16:open_close:Expected 0 (0) == 1 (1) # test.c:15:open_close:TEARDOWN # test.c:16:open_close:Expected 0 (0) == 1 (1) ... But here's what we should and expect to get: TAP version 13 1..1 # Starting 1 tests from 2 test cases. # RUN f.open_close ... # test.c:15:open_close:TEARDOWN # test.c:16:open_close:Expected 0 (0) == 1 (1) # open_close: Test terminated by assertion # FAIL f.open_close not ok 1 f.open_close # FAILED: 0 / 1 tests passed. # Totals: pass:0 fail:1 xfail:0 xpass:0 skip:0 error:0 also this is related to the issue mentioned in this patch https://patchwork.kernel.org/project/linux-kselftest/patch/e2ba3f8c-80e6-477d-9cea-1c9af820e0ed@alu.unizg.hr/ Signed-off-by: Shengyu Li Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest_harness.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 4fd735e48ee7e..230d628848859 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -383,6 +383,7 @@ FIXTURE_DATA(fixture_name) self; \ pid_t child = 1; \ int status = 0; \ + bool jmp = false; \ memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ if (setjmp(_metadata->env) == 0) { \ /* Use the same _metadata. */ \ @@ -399,8 +400,10 @@ _metadata->exit_code = KSFT_FAIL; \ } \ } \ + else \ + jmp = true; \ if (child == 0) { \ - if (_metadata->setup_completed && !_metadata->teardown_parent) \ + if (_metadata->setup_completed && !_metadata->teardown_parent && !jmp) \ fixture_name##_teardown(_metadata, &self, variant->data); \ _exit(0); \ } \ -- GitLab From 83092341e15d0dfee1caa8dc502f66c815ccd78a Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Thu, 4 Apr 2024 11:33:28 +0200 Subject: [PATCH 450/989] gpio: cdev: fix missed label sanitizing in debounce_setup() When adding sanitization of the label, the path through edge_detector_setup() that leads to debounce_setup() was overlooked. A request taking this path does not allocate a new label and the request label is freed twice when the request is released, resulting in memory corruption. Add label sanitization to debounce_setup(). Cc: stable@vger.kernel.org Fixes: b34490879baa ("gpio: cdev: sanitize the label before requesting the interrupt") Signed-off-by: Kent Gibson [Bartosz: rebased on top of the fix for empty GPIO labels] Co-developed-by: Bartosz Golaszewski Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-cdev.c | 49 +++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index 1426cc1c4a285..d09c7d7283655 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -728,6 +728,25 @@ static u32 line_event_id(int level) GPIO_V2_LINE_EVENT_FALLING_EDGE; } +static inline char *make_irq_label(const char *orig) +{ + char *new; + + if (!orig) + return NULL; + + new = kstrdup_and_replace(orig, '/', ':', GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + return new; +} + +static inline void free_irq_label(const char *label) +{ + kfree(label); +} + #ifdef CONFIG_HTE static enum hte_return process_hw_ts_thread(void *p) @@ -1015,6 +1034,7 @@ static int debounce_setup(struct line *line, unsigned int debounce_period_us) { unsigned long irqflags; int ret, level, irq; + char *label; /* try hardware */ ret = gpiod_set_debounce(line->desc, debounce_period_us); @@ -1037,11 +1057,17 @@ static int debounce_setup(struct line *line, unsigned int debounce_period_us) if (irq < 0) return -ENXIO; + label = make_irq_label(line->req->label); + if (IS_ERR(label)) + return -ENOMEM; + irqflags = IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING; ret = request_irq(irq, debounce_irq_handler, irqflags, - line->req->label, line); - if (ret) + label, line); + if (ret) { + free_irq_label(label); return ret; + } line->irq = irq; } else { ret = hte_edge_setup(line, GPIO_V2_LINE_FLAG_EDGE_BOTH); @@ -1083,25 +1109,6 @@ static u32 gpio_v2_line_config_debounce_period(struct gpio_v2_line_config *lc, return 0; } -static inline char *make_irq_label(const char *orig) -{ - char *new; - - if (!orig) - return NULL; - - new = kstrdup_and_replace(orig, '/', ':', GFP_KERNEL); - if (!new) - return ERR_PTR(-ENOMEM); - - return new; -} - -static inline void free_irq_label(const char *label) -{ - kfree(label); -} - static void edge_detector_stop(struct line *line) { if (line->irq) { -- GitLab From 8130b05c559d1aa83d0c8971b422ba0da18ef24a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 4 Apr 2024 12:42:00 +0200 Subject: [PATCH 451/989] PM: EM: fix wrong utilization estimation in em_cpu_energy() Commit 1b600da51073 ("PM: EM: Optimize em_cpu_energy() and remove division") has added back map_util_perf() in em_cpu_energy() computation which has been removed with the rework of scheduler/cpufreq interface. This is wrong because sugov_effective_cpu_perf() already takes care of mapping the utilization to a performance level. Fixes: 1b600da51073 ("PM: EM: Optimize em_cpu_energy() and remove division") Signed-off-by: Vincent Guittot Reviewed-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 770755df852f1..70cd7258cd29f 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -245,7 +245,6 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * max utilization to the allowed CPU capacity before calculating * effective performance. */ - max_util = map_util_perf(max_util); max_util = min(max_util, allowed_cpu_cap); /* -- GitLab From 90f8917e7a15f6dd508779048bdf00ce119b6ca0 Mon Sep 17 00:00:00 2001 From: Chaitanya Kumar Borah Date: Thu, 4 Apr 2024 13:48:13 -0500 Subject: [PATCH 452/989] ASoC: SOF: Core: Add remove_late() to sof_init_environment failure path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In cases where the sof driver is unable to find the firmware and/or topology file [1], it exits without releasing the i915 runtime pm wakeref [2]. This results in dmesg warnings[3] during suspend/resume or driver unbind. Add remove_late() to the failure path of sof_init_environment so that i915 wakeref is released appropriately [1] [ 8.990366] sof-audio-pci-intel-mtl 0000:00:1f.3: SOF firmware and/or topology file not found. [ 8.990396] sof-audio-pci-intel-mtl 0000:00:1f.3: Supported default profiles [ 8.990398] sof-audio-pci-intel-mtl 0000:00:1f.3: - ipc type 1 (Requested): [ 8.990399] sof-audio-pci-intel-mtl 0000:00:1f.3: Firmware file: intel/sof-ipc4/mtl/sof-mtl.ri [ 8.990401] sof-audio-pci-intel-mtl 0000:00:1f.3: Topology file: intel/sof-ace-tplg/sof-mtl-rt711-2ch.tplg [ 8.990402] sof-audio-pci-intel-mtl 0000:00:1f.3: Check if you have 'sof-firmware' package installed. [ 8.990403] sof-audio-pci-intel-mtl 0000:00:1f.3: Optionally it can be manually downloaded from: [ 8.990404] sof-audio-pci-intel-mtl 0000:00:1f.3: https://github.com/thesofproject/sof-bin/ [ 8.999088] sof-audio-pci-intel-mtl 0000:00:1f.3: error: sof_probe_work failed err: -2 [2] ref_tracker: 0000:00:02.0@ffff9b8511b6a378 has 1/5 users at track_intel_runtime_pm_wakeref.part.0+0x36/0x70 [i915] __intel_runtime_pm_get+0x51/0xb0 [i915] intel_runtime_pm_get+0x17/0x20 [i915] intel_display_power_get+0x2f/0x70 [i915] i915_audio_component_get_power+0x23/0x120 [i915] snd_hdac_display_power+0x89/0x130 [snd_hda_core] hda_codec_i915_init+0x3f/0x50 [snd_sof_intel_hda] hda_dsp_probe_early+0x170/0x250 [snd_sof_intel_hda_common] snd_sof_device_probe+0x224/0x320 [snd_sof] sof_pci_probe+0x15b/0x220 [snd_sof_pci] hda_pci_intel_probe+0x30/0x70 [snd_sof_intel_hda_common] local_pci_probe+0x4c/0xb0 pci_device_probe+0xcc/0x250 really_probe+0x18e/0x420 __driver_probe_device+0x7e/0x170 driver_probe_device+0x23/0xa0 [3] [ 484.105070] ------------[ cut here ]------------ [ 484.108238] thunderbolt 0000:00:0d.2: PM: pci_pm_suspend_late+0x0/0x50 returned 0 after 0 usecs [ 484.117106] i915 0000:00:02.0: i915 raw-wakerefs=1 wakelocks=1 on cleanup [ 484.792005] WARNING: CPU: 2 PID: 2405 at drivers/gpu/drm/i915/intel_runtime_pm.c:444 intel_runtime_pm_driver_release+0x6c/0x80 Tested-by: Rodrigo Vivi Reviewed-by: Rodrigo Vivi Reviewed-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Kai Vehmanen Signed-off-by: Chaitanya Kumar Borah Signed-off-by: Pierre-Louis Bossart Acked-by: Lucas De Marchi Link: https://github.com/thesofproject/linux/pull/4878 Signed-off-by: Rodrigo Vivi Link: https://msgid.link/r/20240404184813.134566-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sound/soc/sof/core.c b/sound/soc/sof/core.c index 9b00ede2a486a..cc84d4c81be9d 100644 --- a/sound/soc/sof/core.c +++ b/sound/soc/sof/core.c @@ -339,8 +339,7 @@ static int sof_init_environment(struct snd_sof_dev *sdev) ret = snd_sof_probe(sdev); if (ret < 0) { dev_err(sdev->dev, "failed to probe DSP %d\n", ret); - sof_ops_free(sdev); - return ret; + goto err_sof_probe; } /* check machine info */ @@ -358,15 +357,18 @@ static int sof_init_environment(struct snd_sof_dev *sdev) ret = validate_sof_ops(sdev); if (ret < 0) { snd_sof_remove(sdev); + snd_sof_remove_late(sdev); return ret; } } + return 0; + err_machine_check: - if (ret) { - snd_sof_remove(sdev); - sof_ops_free(sdev); - } + snd_sof_remove(sdev); +err_sof_probe: + snd_sof_remove_late(sdev); + sof_ops_free(sdev); return ret; } -- GitLab From dd33e5dc7247041b565014f66286c9566b0e32b6 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 19 Mar 2024 16:40:05 +0100 Subject: [PATCH 453/989] riscv: use KERN_INFO in do_trap Print the instruction dump with info instead of emergency level. The unhandled signal message is only for informational purpose. Fixes: b8a03a634129 ("riscv: add userland instruction dump to RISC-V splats") Signed-off-by: Andreas Schwab Reviewed-by: Conor Dooley Reviewed-by: Atish Patra Reviewed-by: Yunhui Cui Link: https://lore.kernel.org/r/mvmy1aegrhm.fsf@suse.de Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 868d6280cf667..05a16b1f0aee8 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -122,7 +122,7 @@ void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr) print_vma_addr(KERN_CONT " in ", instruction_pointer(regs)); pr_cont("\n"); __show_regs(regs); - dump_instr(KERN_EMERG, regs); + dump_instr(KERN_INFO, regs); } force_sig_fault(signo, code, (void __user *)addr); -- GitLab From 8a48ea87ce89fb701624f4b9e82556c81f30c7dc Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Tue, 26 Mar 2024 21:30:16 +0100 Subject: [PATCH 454/989] riscv: Fix warning by declaring arch_cpu_idle() as noinstr The following warning appears when using ftrace: [89855.443413] RCU not on for: arch_cpu_idle+0x0/0x1c [89855.445640] WARNING: CPU: 5 PID: 0 at include/linux/trace_recursion.h:162 arch_ftrace_ops_list_func+0x208/0x228 [89855.445824] Modules linked in: xt_conntrack(E) nft_chain_nat(E) xt_MASQUERADE(E) nf_conntrack_netlink(E) xt_addrtype(E) nft_compat(E) nf_tables(E) nfnetlink(E) br_netfilter(E) cfg80211(E) nls_iso8859_1(E) ofpart(E) redboot(E) cmdlinepart(E) cfi_cmdset_0001(E) virtio_net(E) cfi_probe(E) cfi_util(E) 9pnet_virtio(E) gen_probe(E) net_failover(E) virtio_rng(E) failover(E) 9pnet(E) physmap(E) map_funcs(E) chipreg(E) mtd(E) uio_pdrv_genirq(E) uio(E) dm_multipath(E) scsi_dh_rdac(E) scsi_dh_emc(E) scsi_dh_alua(E) drm(E) efi_pstore(E) backlight(E) ip_tables(E) x_tables(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E) xor(E) async_tx(E) raid6_pq(E) raid1(E) raid0(E) virtio_blk(E) [89855.451563] CPU: 5 PID: 0 Comm: swapper/5 Tainted: G E 6.8.0-rc6ubuntu-defconfig #2 [89855.451726] Hardware name: riscv-virtio,qemu (DT) [89855.451899] epc : arch_ftrace_ops_list_func+0x208/0x228 [89855.452016] ra : arch_ftrace_ops_list_func+0x208/0x228 [89855.452119] epc : ffffffff8016b216 ra : ffffffff8016b216 sp : ffffaf808090fdb0 [89855.452171] gp : ffffffff827c7680 tp : ffffaf808089ad40 t0 : ffffffff800c0dd8 [89855.452216] t1 : 0000000000000001 t2 : 0000000000000000 s0 : ffffaf808090fe30 [89855.452306] s1 : 0000000000000000 a0 : 0000000000000026 a1 : ffffffff82cd6ac8 [89855.452423] a2 : ffffffff800458c8 a3 : ffffaf80b1870640 a4 : 0000000000000000 [89855.452646] a5 : 0000000000000000 a6 : 00000000ffffffff a7 : ffffffffffffffff [89855.452698] s2 : ffffffff82766872 s3 : ffffffff80004caa s4 : ffffffff80ebea90 [89855.452743] s5 : ffffaf808089bd40 s6 : 8000000a00006e00 s7 : 0000000000000008 [89855.452787] s8 : 0000000000002000 s9 : 0000000080043700 s10: 0000000000000000 [89855.452831] s11: 0000000000000000 t3 : 0000000000100000 t4 : 0000000000000064 [89855.452874] t5 : 000000000000000c t6 : ffffaf80b182dbfc [89855.452929] status: 0000000200000100 badaddr: 0000000000000000 cause: 0000000000000003 [89855.453053] [] arch_ftrace_ops_list_func+0x208/0x228 [89855.453191] [] ftrace_call+0x8/0x22 [89855.453265] [] do_idle+0x24c/0x2ca [89855.453357] [] return_to_handler+0x0/0x26 [89855.453429] [] smp_callin+0x92/0xb6 [89855.453785] ---[ end trace 0000000000000000 ]--- To fix this, mark arch_cpu_idle() as noinstr, like it is done in commit a9cbc1b471d2 ("s390/idle: mark arch_cpu_idle() noinstr"). Reported-by: Evgenii Shatokhin Closes: https://lore.kernel.org/linux-riscv/51f21b87-ebed-4411-afbc-c00d3dea2bab@yadro.com/ Fixes: cfbc4f81c9d0 ("riscv: Select ARCH_WANTS_NO_INSTR") Signed-off-by: Alexandre Ghiti Reviewed-by: Andy Chiu Tested-by: Andy Chiu Acked-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240326203017.310422-2-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 92922dbd5b5c1..6abeecbfc51d0 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -37,7 +37,7 @@ EXPORT_SYMBOL(__stack_chk_guard); extern asmlinkage void ret_from_fork(void); -void arch_cpu_idle(void) +void noinstr arch_cpu_idle(void) { cpu_do_idle(); } -- GitLab From a370c2419e4680a27382d9231edcf739d5d74efc Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Tue, 26 Mar 2024 21:30:17 +0100 Subject: [PATCH 455/989] riscv: Disable preemption when using patch_map() patch_map() uses fixmap mappings to circumvent the non-writability of the kernel text mapping. The __set_fixmap() function only flushes the current cpu tlb, it does not emit an IPI so we must make sure that while we use a fixmap mapping, the current task is not migrated on another cpu which could miss the newly introduced fixmap mapping. So in order to avoid any task migration, disable the preemption. Reported-by: Andrea Parri Closes: https://lore.kernel.org/all/ZcS+GAaM25LXsBOl@andrea/ Reported-by: Andy Chiu Closes: https://lore.kernel.org/linux-riscv/CABgGipUMz3Sffu-CkmeUB1dKVwVQ73+7=sgC45-m0AE9RCjOZg@mail.gmail.com/ Fixes: cad539baa48f ("riscv: implement a memset like function for text") Fixes: 0ff7c3b33127 ("riscv: Use text_mutex instead of patch_lock") Co-developed-by: Andy Chiu Signed-off-by: Andy Chiu Signed-off-by: Alexandre Ghiti Acked-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240326203017.310422-3-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/patch.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c index 37e87fdcf6a00..30e12b310cab7 100644 --- a/arch/riscv/kernel/patch.c +++ b/arch/riscv/kernel/patch.c @@ -80,6 +80,8 @@ static int __patch_insn_set(void *addr, u8 c, size_t len) */ lockdep_assert_held(&text_mutex); + preempt_disable(); + if (across_pages) patch_map(addr + PAGE_SIZE, FIX_TEXT_POKE1); @@ -92,6 +94,8 @@ static int __patch_insn_set(void *addr, u8 c, size_t len) if (across_pages) patch_unmap(FIX_TEXT_POKE1); + preempt_enable(); + return 0; } NOKPROBE_SYMBOL(__patch_insn_set); @@ -122,6 +126,8 @@ static int __patch_insn_write(void *addr, const void *insn, size_t len) if (!riscv_patch_in_stop_machine) lockdep_assert_held(&text_mutex); + preempt_disable(); + if (across_pages) patch_map(addr + PAGE_SIZE, FIX_TEXT_POKE1); @@ -134,6 +140,8 @@ static int __patch_insn_write(void *addr, const void *insn, size_t len) if (across_pages) patch_unmap(FIX_TEXT_POKE1); + preempt_enable(); + return ret; } NOKPROBE_SYMBOL(__patch_insn_write); -- GitLab From d14fa1fcf69db9d070e75f1c4425211fa619dfc8 Mon Sep 17 00:00:00 2001 From: Stefan O'Rear Date: Wed, 27 Mar 2024 02:12:58 -0400 Subject: [PATCH 456/989] riscv: process: Fix kernel gp leakage childregs represents the registers which are active for the new thread in user context. For a kernel thread, childregs->gp is never used since the kernel gp is not touched by switch_to. For a user mode helper, the gp value can be observed in user space after execve or possibly by other means. [From the email thread] The /* Kernel thread */ comment is somewhat inaccurate in that it is also used for user_mode_helper threads, which exec a user process, e.g. /sbin/init or when /proc/sys/kernel/core_pattern is a pipe. Such threads do not have PF_KTHREAD set and are valid targets for ptrace etc. even before they exec. childregs is the *user* context during syscall execution and it is observable from userspace in at least five ways: 1. kernel_execve does not currently clear integer registers, so the starting register state for PID 1 and other user processes started by the kernel has sp = user stack, gp = kernel __global_pointer$, all other integer registers zeroed by the memset in the patch comment. This is a bug in its own right, but I'm unwilling to bet that it is the only way to exploit the issue addressed by this patch. 2. ptrace(PTRACE_GETREGSET): you can PTRACE_ATTACH to a user_mode_helper thread before it execs, but ptrace requires SIGSTOP to be delivered which can only happen at user/kernel boundaries. 3. /proc/*/task/*/syscall: this is perfectly happy to read pt_regs for user_mode_helpers before the exec completes, but gp is not one of the registers it returns. 4. PERF_SAMPLE_REGS_USER: LOCKDOWN_PERF normally prevents access to kernel addresses via PERF_SAMPLE_REGS_INTR, but due to this bug kernel addresses are also exposed via PERF_SAMPLE_REGS_USER which is permitted under LOCKDOWN_PERF. I have not attempted to write exploit code. 5. Much of the tracing infrastructure allows access to user registers. I have not attempted to determine which forms of tracing allow access to user registers without already allowing access to kernel registers. Fixes: 7db91e57a0ac ("RISC-V: Task implementation") Cc: stable@vger.kernel.org Signed-off-by: Stefan O'Rear Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240327061258.2370291-1-sorear@fastmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/process.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 6abeecbfc51d0..e4bc61c4e58af 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -27,8 +27,6 @@ #include #include -register unsigned long gp_in_global __asm__("gp"); - #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include unsigned long __stack_chk_guard __read_mostly; @@ -207,7 +205,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (unlikely(args->fn)) { /* Kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->gp = gp_in_global; /* Supervisor/Machine, irqs on: */ childregs->status = SR_PP | SR_PIE; -- GitLab From 01e5f4fc0fead3ef19000e1d2fc748e87aac3f02 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Apr 2024 15:50:26 -0400 Subject: [PATCH 457/989] bcachefs: Make snapshot_is_ancestor() safe Snapshot table accesses generally need to be checking for invalid snapshot ID now, fix one that was missed. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 0e806f04f3d7c..544322d5c2517 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -125,6 +125,15 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances return s->parent; } +static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor) +{ + const struct snapshot_t *s = __snapshot_t(t, id); + if (!s) + return false; + + return test_bit(ancestor - id - 1, s->is_ancestor); +} + bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { bool ret; @@ -140,13 +149,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) while (id && id < ancestor - IS_ANCESTOR_BITMAP) id = get_ancestor_below(t, id, ancestor); - if (id && id < ancestor) { - ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor); + ret = id && id < ancestor + ? test_ancestor_bitmap(t, id, ancestor) + : id == ancestor; - EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); - } else { - ret = id == ancestor; - } + EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); out: rcu_read_unlock(); -- GitLab From be42e4a621fee05e3299169fbb1068b473e779c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Apr 2024 16:51:40 -0400 Subject: [PATCH 458/989] bcachefs: Bump limit in btree_trans_too_many_iters() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/btree_types.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 24772538e4cc7..1d58d447b386c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -642,7 +642,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *); static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8) + if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8) return __bch2_btree_trans_too_many_iters(trans); return 0; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 9404d96c38f3b..e0c982a4195c7 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -364,7 +364,21 @@ struct btree_insert_entry { unsigned long ip_allocated; }; +/* Number of btree paths we preallocate, usually enough */ #define BTREE_ITER_INITIAL 64 +/* + * Lmiit for btree_trans_too_many_iters(); this is enough that almost all code + * paths should run inside this limit, and if they don't it usually indicates a + * bug (leaking/duplicated btree paths). + * + * exception: some fsck paths + * + * bugs with excessive path usage seem to have possibly been eliminated now, so + * we might consider eliminating this (and btree_trans_too_many_iter()) at some + * point. + */ +#define BTREE_ITER_NORMAL_LIMIT 256 +/* never exceed limit */ #define BTREE_ITER_MAX (1U << 10) struct btree_trans_commit_hook; -- GitLab From 9fb3036fe3d9414ae32a97d01d7ccf7550e168a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Apr 2024 19:15:53 -0400 Subject: [PATCH 459/989] bcachefs: Move btree_updates to debugfs sysfs is limited to PAGE_SIZE, and when we're debugging strange deadlocks/priority inversions we need to see the full list. Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- fs/bcachefs/sysfs.c | 6 ------ 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 7b681ae91510e..cd99b73994144 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -13,6 +13,7 @@ #include "btree_iter.h" #include "btree_locking.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "buckets.h" #include "debug.h" #include "error.h" @@ -668,7 +669,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, i->size = size; i->ret = 0; - do { + while (1) { err = flush_buf(i); if (err) return err; @@ -676,9 +677,12 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, if (!i->size) break; + if (done) + break; + done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); i->iter++; - } while (!done); + } if (i->buf.allocation_failure) return -ENOMEM; @@ -693,13 +697,45 @@ static const struct file_operations journal_pins_ops = { .read = bch2_journal_pins_read, }; +static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (!i->iter) { + bch2_btree_updates_to_text(&i->buf, c); + i->iter++; + } + + err = flush_buf(i); + if (err) + return err; + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations btree_updates_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_btree_updates_read, +}; + static int btree_transaction_stats_open(struct inode *inode, struct file *file) { struct bch_fs *c = inode->i_private; struct dump_iter *i; i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) return -ENOMEM; @@ -902,6 +938,9 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); + debugfs_create_file("btree_updates", 0400, c->fs_debug_dir, + c->btree_debug, &btree_updates_ops); + debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, c, &btree_transaction_stats_op); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index c86a93a8d8fc8..b18b0cc81b594 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -17,7 +17,6 @@ #include "btree_iter.h" #include "btree_key_cache.h" #include "btree_update.h" -#include "btree_update_interior.h" #include "btree_gc.h" #include "buckets.h" #include "clock.h" @@ -166,7 +165,6 @@ read_attribute(btree_write_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); read_attribute(journal_debug); -read_attribute(btree_updates); read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(stripes_heap); @@ -415,9 +413,6 @@ SHOW(bch2_fs) if (attr == &sysfs_journal_debug) bch2_journal_debug_to_text(out, &c->journal); - if (attr == &sysfs_btree_updates) - bch2_btree_updates_to_text(out, c); - if (attr == &sysfs_btree_cache) bch2_btree_cache_to_text(out, c); @@ -639,7 +634,6 @@ SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { &sysfs_flags, &sysfs_journal_debug, - &sysfs_btree_updates, &sysfs_btree_cache, &sysfs_btree_key_cache, &sysfs_new_stripes, -- GitLab From d880a43836d5e2ba951b10471104cdacc2eefbed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Apr 2024 19:52:10 -0400 Subject: [PATCH 460/989] bcachefs: Further improve btree_update_to_text() Print start and end level of the btree update; also a bit of cleanup. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 100 +++++++++++++--------------- fs/bcachefs/btree_update_interior.h | 3 +- 2 files changed, 48 insertions(+), 55 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 32397b99752fd..24fbd5be70a20 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -26,9 +26,9 @@ #include -const char * const bch2_btree_update_modes[] = { +static const char * const bch2_btree_update_modes[] = { #define x(t) #t, - BCH_WATERMARKS() + BTREE_UPDATE_MODES() #undef x NULL }; @@ -850,15 +850,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - BUG_ON(as->mode != BTREE_UPDATE_none); + BUG_ON(as->update_level_end < b->c.level); BUG_ON(!btree_node_dirty(b)); BUG_ON(!b->c.level); + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + as->mode = BTREE_UPDATE_node; as->b = b; + as->update_level_end = b->c.level; set_btree_node_write_blocked(b); list_add(&as->write_blocked_list, &b->write_blocked); @@ -1100,7 +1102,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level, bool split, unsigned flags) + unsigned level_start, bool split, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; @@ -1108,7 +1110,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) ? BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2] = { 0, 0 }; - unsigned update_level = level; + unsigned level_end = level_start; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; int ret = 0; u32 restart_count = trans->restart_count; @@ -1140,17 +1142,17 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, } while (1) { - nr_nodes[!!update_level] += 1 + split; - update_level++; + nr_nodes[!!level_end] += 1 + split; + level_end++; - ret = bch2_btree_path_upgrade(trans, path, update_level + 1); + ret = bch2_btree_path_upgrade(trans, path, level_end + 1); if (ret) return ERR_PTR(ret); - if (!btree_path_node(path, update_level)) { + if (!btree_path_node(path, level_end)) { /* Allocating new root? */ nr_nodes[1] += split; - update_level = BTREE_MAX_DEPTH; + level_end = BTREE_MAX_DEPTH; break; } @@ -1158,11 +1160,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * Always check for space for two keys, even if we won't have to * split at prior level - it might have been a merge instead: */ - if (bch2_btree_node_insert_fits(path->l[update_level].b, + if (bch2_btree_node_insert_fits(path->l[level_end].b, BKEY_BTREE_PTR_U64s_MAX * 2)) break; - split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); + split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); } if (!down_read_trylock(&c->gc_lock)) { @@ -1176,14 +1178,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); - as->c = c; - as->start_time = start_time; - as->ip_started = _RET_IP_; - as->mode = BTREE_UPDATE_none; - as->watermark = watermark; - as->took_gc_lock = true; - as->btree_id = path->btree_id; - as->update_level = update_level; + as->c = c; + as->start_time = start_time; + as->ip_started = _RET_IP_; + as->mode = BTREE_UPDATE_none; + as->watermark = watermark; + as->took_gc_lock = true; + as->btree_id = path->btree_id; + as->update_level_start = level_start; + as->update_level_end = level_end; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -1373,12 +1376,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, } static void -__bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter node_iter, - struct keylist *keys) +bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter node_iter, + struct keylist *keys) { struct bkey_i *insert = bch2_keylist_front(keys); struct bkey_packed *k; @@ -1534,7 +1537,7 @@ static void btree_split_insert_keys(struct btree_update *as, bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); - __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); + bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); BUG_ON(bch2_btree_node_check_topology(trans, b)); } @@ -1714,27 +1717,6 @@ err: goto out; } -static void -bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct keylist *keys) -{ - struct btree_path *linked; - unsigned i; - - __bch2_btree_insert_keys_interior(as, trans, path, b, - path->l[b->c.level].iter, keys); - - btree_update_updated_node(as, b); - - trans_for_each_path_with_node(trans, b, linked, i) - bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - - bch2_trans_verify_paths(trans); -} - /** * bch2_btree_insert_node - insert bkeys into a given btree node * @@ -1755,7 +1737,8 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t struct keylist *keys) { struct bch_fs *c = as->c; - struct btree_path *path = trans->paths + path_idx; + struct btree_path *path = trans->paths + path_idx, *linked; + unsigned i; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -1784,7 +1767,13 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t return ret; } - bch2_btree_insert_keys_interior(as, trans, path, b, keys); + bch2_btree_insert_keys_interior(as, trans, path, b, + path->l[b->c.level].iter, keys); + + trans_for_each_path_with_node(trans, b, linked, i) + bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); + + bch2_trans_verify_paths(trans); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; @@ -1798,6 +1787,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_maybe_compact_whiteouts(c, b)) bch2_trans_node_reinit_iter(trans, b); + btree_update_updated_node(as, b); bch2_btree_node_unlock_write(trans, path, b); BUG_ON(bch2_btree_node_check_topology(trans, b)); @@ -1807,7 +1797,7 @@ split: * We could attempt to avoid the transaction restart, by calling * bch2_btree_path_upgrade() and allocating more nodes: */ - if (b->c.level >= as->update_level) { + if (b->c.level >= as->update_level_end) { trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } @@ -2519,9 +2509,11 @@ void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned lev static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) { - prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", (void *) as->ip_started, bch2_btree_id_str(as->btree_id), + as->update_level_start, + as->update_level_end, bch2_watermarks[as->watermark], bch2_btree_update_modes[as->mode], as->nodes_written, diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 88dcf5a22a3bd..c1a479ebaad12 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -57,7 +57,8 @@ struct btree_update { unsigned took_gc_lock:1; enum btree_id btree_id; - unsigned update_level; + unsigned update_level_start; + unsigned update_level_end; struct disk_reservation disk_res; -- GitLab From 9802ff48f3fd8ae5d6699c5a32afc76769920c98 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Feb 2024 21:08:24 -0500 Subject: [PATCH 461/989] bcachefs: Print shutdown journal sequence number Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ed63018f21bef..8daf80a38d60c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -288,8 +288,13 @@ static void __bch2_fs_read_only(struct bch_fs *c) if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && !test_bit(BCH_FS_emergency_ro, &c->flags)) set_bit(BCH_FS_clean_shutdown, &c->flags); + bch2_fs_journal_stop(&c->journal); + bch_info(c, "%sshutdown complete, journal seq %llu", + test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", + c->journal.seq_ondisk); + /* * After stopping journal: */ -- GitLab From d4e655c49f474deffaf5ed7e65034b8167ee39c8 Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Mon, 1 Apr 2024 21:10:38 +0200 Subject: [PATCH 462/989] scsi: sg: Avoid race in error handling & drop bogus warn Commit 27f58c04a8f4 ("scsi: sg: Avoid sg device teardown race") introduced an incorrect WARN_ON_ONCE() and missed a sequence where sg_device_destroy() was used after scsi_device_put(). sg_device_destroy() is accessing the parent scsi_device request_queue which will already be set to NULL when the preceding call to scsi_device_put() removed the last reference to the parent scsi_device. Drop the incorrect WARN_ON_ONCE() - allowing more than one concurrent access to the sg device - and make sure sg_device_destroy() is not used after scsi_device_put() in the error handling. Link: https://lore.kernel.org/all/5375B275-D137-4D5F-BE25-6AF8ACAE41EF@linux.ibm.com Fixes: 27f58c04a8f4 ("scsi: sg: Avoid sg device teardown race") Cc: stable@vger.kernel.org Signed-off-by: Alexander Wetzel Link: https://lore.kernel.org/r/20240401191038.18359-1-Alexander@wetzel-home.de Tested-by: Sachin Sant Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/sg.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 386981c6976a5..baf870a03ecf6 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -285,6 +285,7 @@ sg_open(struct inode *inode, struct file *filp) int dev = iminor(inode); int flags = filp->f_flags; struct request_queue *q; + struct scsi_device *device; Sg_device *sdp; Sg_fd *sfp; int retval; @@ -301,11 +302,12 @@ sg_open(struct inode *inode, struct file *filp) /* This driver's module count bumped by fops_get in */ /* Prevent the device driver from vanishing while we sleep */ - retval = scsi_device_get(sdp->device); + device = sdp->device; + retval = scsi_device_get(device); if (retval) goto sg_put; - retval = scsi_autopm_get_device(sdp->device); + retval = scsi_autopm_get_device(device); if (retval) goto sdp_put; @@ -313,7 +315,7 @@ sg_open(struct inode *inode, struct file *filp) * check if O_NONBLOCK. Permits SCSI commands to be issued * during error recovery. Tread carefully. */ if (!((flags & O_NONBLOCK) || - scsi_block_when_processing_errors(sdp->device))) { + scsi_block_when_processing_errors(device))) { retval = -ENXIO; /* we are in error recovery for this device */ goto error_out; @@ -344,7 +346,7 @@ sg_open(struct inode *inode, struct file *filp) if (sdp->open_cnt < 1) { /* no existing opens */ sdp->sgdebug = 0; - q = sdp->device->request_queue; + q = device->request_queue; sdp->sg_tablesize = queue_max_segments(q); } sfp = sg_add_sfp(sdp); @@ -370,10 +372,11 @@ out_undo: error_mutex_locked: mutex_unlock(&sdp->open_rel_lock); error_out: - scsi_autopm_put_device(sdp->device); + scsi_autopm_put_device(device); sdp_put: - scsi_device_put(sdp->device); - goto sg_put; + kref_put(&sdp->d_ref, sg_device_destroy); + scsi_device_put(device); + return retval; } /* Release resources associated with a successful sg_open() @@ -2233,7 +2236,6 @@ sg_remove_sfp_usercontext(struct work_struct *work) "sg_remove_sfp: sfp=0x%p\n", sfp)); kfree(sfp); - WARN_ON_ONCE(kref_read(&sdp->d_ref) != 1); kref_put(&sdp->d_ref, sg_device_destroy); scsi_device_put(device); module_put(THIS_MODULE); -- GitLab From 8cb4a9a82b21623dbb4b3051dd30d98356cf95bc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Apr 2024 17:16:14 -0700 Subject: [PATCH 463/989] x86/cpufeatures: Add CPUID_LNX_5 to track recently added Linux-defined word Add CPUID_LNX_5 to track cpufeatures' word 21, and add the appropriate compile-time assert in KVM to prevent direct lookups on the features in CPUID_LNX_5. KVM uses X86_FEATURE_* flags to manage guest CPUID, and so must translate features that are scattered by Linux from the Linux-defined bit to the hardware-defined bit, i.e. should never try to directly access scattered features in guest CPUID. Opportunistically add NR_CPUID_WORDS to enum cpuid_leafs, along with a compile-time assert in KVM's CPUID infrastructure to ensure that future additions update cpuid_leafs along with NCAPINTS. No functional change intended. Fixes: 7f274e609f3d ("x86/cpufeatures: Add new word for scattered features") Cc: Sandipan Das Signed-off-by: Sean Christopherson Acked-by: Dave Hansen Signed-off-by: Linus Torvalds --- arch/x86/include/asm/cpufeature.h | 2 ++ arch/x86/kvm/reverse_cpuid.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 42157ddcc09d4..686e92d2663ee 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -33,6 +33,8 @@ enum cpuid_leafs CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, + CPUID_LNX_5, + NR_CPUID_WORDS, }; #define X86_CAP_FMT_NUM "%d:%d" diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index aadefcaa9561d..58ac8d69c94bd 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -102,10 +102,12 @@ static const struct cpuid_reg reverse_cpuid[] = { */ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) { + BUILD_BUG_ON(NR_CPUID_WORDS != NCAPINTS); BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); BUILD_BUG_ON(x86_leaf == CPUID_LNX_3); BUILD_BUG_ON(x86_leaf == CPUID_LNX_4); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_5); BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); } -- GitLab From d464dac47260a33add5a206fd3289ec1216e8435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 5 Apr 2024 07:58:10 +0200 Subject: [PATCH 464/989] usb: gadget: fsl: Initialize udc before using it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fsl_ep_queue() is only called by usb_ep_queue() (as ep->ops->queue()). So _ep isn't NULL. As ep->ops->queue = fsl_ep_queue, the ep was initialized by struct_ep_setup() and so ep->udc isn't NULL either. Drop the check for _ep being NULL and assign udc earlier to prevent following an uninitialized pointer in the two dev_vdbg()s in lines 878 and 882. This fixes a compiler warning when using clang and CONFIG_USB_GADGET_VERBOSE=y. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404050227.TTvcCPBu-lkp@intel.com/ Fixes: 6025f20f16c2 ("usb: gadget: fsl-udc: Replace custom log wrappers by dev_{err,warn,dbg,vdbg}") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240405055812.694123-2-u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/fsl_udc_core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/udc/fsl_udc_core.c b/drivers/usb/gadget/udc/fsl_udc_core.c index e82d03224f940..3432ebfae9787 100644 --- a/drivers/usb/gadget/udc/fsl_udc_core.c +++ b/drivers/usb/gadget/udc/fsl_udc_core.c @@ -868,7 +868,7 @@ fsl_ep_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t gfp_flags) { struct fsl_ep *ep = container_of(_ep, struct fsl_ep, ep); struct fsl_req *req = container_of(_req, struct fsl_req, req); - struct fsl_udc *udc; + struct fsl_udc *udc = ep->udc; unsigned long flags; int ret; @@ -878,7 +878,7 @@ fsl_ep_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t gfp_flags) dev_vdbg(&udc->gadget.dev, "%s, bad params\n", __func__); return -EINVAL; } - if (unlikely(!_ep || !ep->ep.desc)) { + if (unlikely(!ep->ep.desc)) { dev_vdbg(&udc->gadget.dev, "%s, bad ep\n", __func__); return -EINVAL; } @@ -887,7 +887,6 @@ fsl_ep_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t gfp_flags) return -EMSGSIZE; } - udc = ep->udc; if (!udc->driver || udc->gadget.speed == USB_SPEED_UNKNOWN) return -ESHUTDOWN; -- GitLab From 5957e0a28b5177849f7666d041b32f5dc7d27427 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Apr 2024 02:43:08 -0400 Subject: [PATCH 465/989] bcachefs: Fix rebalance from durability=0 device Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 34731ee0217f6..0022b51ce3c09 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -598,6 +598,8 @@ int bch2_data_update_init(struct btree_trans *trans, i++; } + unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); + /* * If current extent durability is less than io_opts.data_replicas, * we're not trying to rereplicate the extent up to data_replicas here - @@ -607,7 +609,7 @@ int bch2_data_update_init(struct btree_trans *trans, * rereplicate, currently, so that users don't get an unexpected -ENOSPC */ if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) && - durability_have >= io_opts.data_replicas) { + !durability_required) { m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; m->data_opts.rewrite_ptrs = 0; /* if iter == NULL, it's just a promote */ @@ -616,11 +618,18 @@ int bch2_data_update_init(struct btree_trans *trans, goto done; } - m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) + + m->op.nr_replicas = min(durability_removing, durability_required) + m->data_opts.extra_replicas; - m->op.nr_replicas_required = m->op.nr_replicas; - BUG_ON(!m->op.nr_replicas); + /* + * If device(s) were set to durability=0 after data was written to them + * we can end up with a duribilty=0 extent, and the normal algorithm + * that tries not to increase durability doesn't work: + */ + if (!(durability_have + durability_removing)) + m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); + + m->op.nr_replicas_required = m->op.nr_replicas; if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, -- GitLab From bc004f5038220b1891ef4107134ccae44be55109 Mon Sep 17 00:00:00 2001 From: Jammy Huang Date: Wed, 3 Apr 2024 17:02:46 +0800 Subject: [PATCH 466/989] drm/ast: Fix soft lockup There is a while-loop in ast_dp_set_on_off() that could lead to infinite-loop. This is because the register, VGACRI-Dx, checked in this API is a scratch register actually controlled by a MCU, named DPMCU, in BMC. These scratch registers are protected by scu-lock. If suc-lock is not off, DPMCU can not update these registers and then host will have soft lockup due to never updated status. DPMCU is used to control DP and relative registers to handshake with host's VGA driver. Even the most time-consuming task, DP's link training, is less than 100ms. 200ms should be enough. Signed-off-by: Jammy Huang Fixes: 594e9c04b586 ("drm/ast: Create the driver for ASPEED proprietory Display-Port") Reviewed-by: Jocelyn Falempe Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Cc: KuoHsiang Chou Cc: Thomas Zimmermann Cc: Dave Airlie Cc: Jocelyn Falempe Cc: dri-devel@lists.freedesktop.org Cc: # v5.19+ Link: https://patchwork.freedesktop.org/patch/msgid/20240403090246.1495487-1-jammy_huang@aspeedtech.com --- drivers/gpu/drm/ast/ast_dp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/ast/ast_dp.c b/drivers/gpu/drm/ast/ast_dp.c index ebb6d8ebd44eb..1e9259416980e 100644 --- a/drivers/gpu/drm/ast/ast_dp.c +++ b/drivers/gpu/drm/ast/ast_dp.c @@ -180,6 +180,7 @@ void ast_dp_set_on_off(struct drm_device *dev, bool on) { struct ast_device *ast = to_ast_device(dev); u8 video_on_off = on; + u32 i = 0; // Video On/Off ast_set_index_reg_mask(ast, AST_IO_VGACRI, 0xE3, (u8) ~AST_DP_VIDEO_ENABLE, on); @@ -192,6 +193,8 @@ void ast_dp_set_on_off(struct drm_device *dev, bool on) ASTDP_MIRROR_VIDEO_ENABLE) != video_on_off) { // wait 1 ms mdelay(1); + if (++i > 200) + break; } } } -- GitLab From 61f7fdf8fd00ce33d30ca3fae8d643c0850ce945 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Apr 2024 23:48:59 +0200 Subject: [PATCH 467/989] timers/migration: Fix ignored event due to missing CPU update When a group event is updated with its expiry unchanged but a different CPU, that target change may go unnoticed and the event may be propagated up with a stale CPU value. The following depicts a scenario that has been actually observed: [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = TGRP1:0 (T0) / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T0 / \ 0 (T0) 1 (T1) idle idle 0) The hierarchy has 3 levels. The left part (GRP1:0) is all idle, including CPU 0 and CPU 1 which have a timer each: T0 and T1. They have the same expiry value. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = KTIME_MAX / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T0 / \ 0 (T0) 1 (T1) idle idle 1) The migrator in GRP1:1 handles remotely T0. The event is dequeued from the top and T0 executed. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = KTIME_MAX / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T1 / \ 0 1 (T1) idle idle 2) The migrator in GRP1:1 fetches the next timer for CPU 0 and finds none. But it updates the events from its groups, starting with GRP0:0 which now has T1 as its next event. So far so good. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = KTIME_MAX / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T1 / \ 0 1 (T1) idle idle 3) The migrator in GRP1:1 proceeds upward and updates the events in GRP1:0. The child event TGRP0:0 is found queued with the same expiry as before. And therefore it is left unchanged. However the target CPU is not the same but that fact is ignored so TGRP0:0 still points to CPU 0 when it should point to CPU 1. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = TGRP1:0 (T0) / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T1 / \ 0 1 (T1) idle idle 4) The propagation has reached the top level and TGRP1:0, having TGRP0:0 as its first event, also wrongly points to CPU 0. TGRP1:0 is added to the top level group. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = KTIME_MAX / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T1 / \ 0 1 (T1) idle idle 5) The migrator in GRP1:1 dequeues the next event in top level pointing to CPU 0. But since it actually doesn't see any real event in CPU 0, it early returns. 6) T1 is left unhandled until either CPU 0 or CPU 1 wake up. Some other bad scenario may involve trees with just two levels. Fix this with unconditionally updating the CPU of the child event before considering to early return while updating a queued event with an unchanged expiry value. Fixes: 7ee988770326 ("timers: Implement the hierarchical pull model") Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Reviewed-by: Anna-Maria Behnsen Link: https://lore.kernel.org/r/Zg2Ct6M2RJAYHgCB@localhost.localdomain --- kernel/time/timer_migration.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index c63a0afdcebed..e3075e40cb432 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -762,8 +762,11 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, * queue when the expiry time changed only or when it could be ignored. */ if (timerqueue_node_queued(&evt->nextevt)) { - if ((evt->nextevt.expires == nextexp) && !evt->ignore) + if ((evt->nextevt.expires == nextexp) && !evt->ignore) { + /* Make sure not to miss a new CPU event with the same expiry */ + evt->cpu = first_childevt->cpu; goto check_toplvl; + } if (!timerqueue_del(&group->events, &evt->nextevt)) WRITE_ONCE(group->next_expiry, KTIME_MAX); -- GitLab From 7a96a84bfbee96871bb16c70ee3e93d564e190f4 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 5 Apr 2024 10:53:21 +0200 Subject: [PATCH 468/989] timers/migration: Return early on deactivation Commit 4b6f4c5a67c0 ("timer/migration: Remove buggy early return on deactivation") removed the logic to return early in tmigr_update_events() on deactivation. With this the problem with a not properly updated first global event in a hierarchy containing only a single group was fixed. But when having a look at this code path with a hierarchy with more than a single level, now unnecessary work is done (example is partially copied from the message of the commit mentioned above): [GRP1:0] migrator = GRP0:0 active = GRP0:0 nextevt = T0:0i, T0:1 / \ [GRP0:0] [GRP0:1] migrator = 0 migrator = NONE active = 0 active = NONE nextevt = T0i, T1 nextevt = T2 / \ / \ 0 (T0i) 1 (T1) 2 (T2) 3 active idle idle idle 0) CPU 0 is active thus its event is ignored (the letter 'i') and so are upper levels' events. CPU 1 is idle and has the timer T1 enqueued. CPU 2 also has a timer. The expiry order is T0 (ignored) < T1 < T2 [GRP1:0] migrator = GRP0:0 active = GRP0:0 nextevt = T0:0i, T0:1 / \ [GRP0:0] [GRP0:1] migrator = NONE migrator = NONE active = NONE active = NONE nextevt = T1 nextevt = T2 / \ / \ 0 (T0i) 1 (T1) 2 (T2) 3 idle idle idle idle 1) CPU 0 goes idle without global event queued. Therefore KTIME_MAX is pushed as its next expiry and its own event kept as "ignore". Without this early return the following steps happen in tmigr_update_events() when child = null and group = GRP0:0 : lock(GRP0:0->lock); timerqueue_del(GRP0:0, T0i); unlock(GRP0:0->lock); [GRP1:0] migrator = NONE active = NONE nextevt = T0:0, T0:1 / \ [GRP0:0] [GRP0:1] migrator = NONE migrator = NONE active = NONE active = NONE nextevt = T1 nextevt = T2 / \ / \ 0 (T0i) 1 (T1) 2 (T2) 3 idle idle idle idle 2) The change now propagates up to the top. Then tmigr_update_events() updates the group event of GRP0:0 and executes the following steps (child = GRP0:0 and group = GRP0:0): lock(GRP0:0->lock); lock(GRP1:0->lock); evt = tmigr_next_groupevt(GRP0:0); -> this removes the ignored events in GRP0:0 ... update GRP1:0 group event and timerqueue ... unlock(GRP1:0->lock); unlock(GRP0:0->lock); So the dance in 1) with locking the GRP0:0->lock and removing the T0i from the timerqueue is redundand as this is done nevertheless in 2) when tmigr_next_groupevt(GRP0:0) is executed. Revert commit 4b6f4c5a67c0 ("timer/migration: Remove buggy early return on deactivation") and add a condition into return path to skip the return only, when hierarchy contains a single group. Adapt comments accordingly. Fixes: 4b6f4c5a67c0 ("timer/migration: Remove buggy early return on deactivation") Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/87cyr49on2.fsf@somnus --- kernel/time/timer_migration.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index e3075e40cb432..ccba875d2234f 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -751,6 +751,33 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, first_childevt = evt = data->evt; + /* + * Walking the hierarchy is required in any case when a + * remote expiry was done before. This ensures to not lose + * already queued events in non active groups (see section + * "Required event and timerqueue update after a remote + * expiry" in the documentation at the top). + * + * The two call sites which are executed without a remote expiry + * before, are not prevented from propagating changes through + * the hierarchy by the return: + * - When entering this path by tmigr_new_timer(), @evt->ignore + * is never set. + * - tmigr_inactive_up() takes care of the propagation by + * itself and ignores the return value. But an immediate + * return is possible if there is a parent, sparing group + * locking at this level, because the upper walking call to + * the parent will take care about removing this event from + * within the group and update next_expiry accordingly. + * + * However if there is no parent, ie: the hierarchy has only a + * single level so @group is the top level group, make sure the + * first event information of the group is updated properly and + * also handled properly, so skip this fast return path. + */ + if (evt->ignore && !remote && group->parent) + return true; + raw_spin_lock(&group->lock); childstate.state = 0; -- GitLab From caeb4b0a11b3393e43f7fa8e0a5a18462acc66bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 31 Mar 2024 17:52:12 -0400 Subject: [PATCH 469/989] aio: Fix null ptr deref in aio_complete() wakeup list_del_init_careful() needs to be the last access to the wait queue entry - it effectively unlocks access. Previously, finish_wait() would see the empty list head and skip taking the lock, and then we'd return - but the completion path would still attempt to do the wakeup after the task_struct pointer had been overwritten. Fixes: 71eb6b6b0ba9 ("fs/aio: obey min_nr when doing wakeups") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/linux-fsdevel/CAHTA-ubfwwB51A5Wg5M6H_rPEQK9pNf8FkAGH=vr=FEkyRrtqw@mail.gmail.com/ Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/stable/20240331215212.522544-1-kent.overstreet%40linux.dev Link: https://lore.kernel.org/r/20240331215212.522544-1-kent.overstreet@linux.dev Signed-off-by: Christian Brauner --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index 9cdaa2faa5363..0f4f531c97800 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1202,8 +1202,8 @@ static void aio_complete(struct aio_kiocb *iocb) spin_lock_irqsave(&ctx->wait.lock, flags); list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry) if (avail >= curr->min_nr) { - list_del_init_careful(&curr->w.entry); wake_up_process(curr->w.private); + list_del_init_careful(&curr->w.entry); } spin_unlock_irqrestore(&ctx->wait.lock, flags); } -- GitLab From 07ed11afb68d94eadd4ffc082b97c2331307c5ea Mon Sep 17 00:00:00 2001 From: Alex Constantino Date: Thu, 4 Apr 2024 19:14:48 +0100 Subject: [PATCH 470/989] Revert "drm/qxl: simplify qxl_fence_wait" This reverts commit 5a838e5d5825c85556011478abde708251cc0776. Changes from commit 5a838e5d5825 ("drm/qxl: simplify qxl_fence_wait") would result in a '[TTM] Buffer eviction failed' exception whenever it reached a timeout. Due to a dependency to DMA_FENCE_WARN this also restores some code deleted by commit d72277b6c37d ("dma-buf: nuke DMA_FENCE_TRACE macros v2"). Fixes: 5a838e5d5825 ("drm/qxl: simplify qxl_fence_wait") Link: https://lore.kernel.org/regressions/ZTgydqRlK6WX_b29@eldamar.lan/ Reported-by: Timo Lindfors Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1054514 Signed-off-by: Alex Constantino Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20240404181448.1643-2-dreaming.about.electric.sheep@gmail.com --- drivers/gpu/drm/qxl/qxl_release.c | 50 +++++++++++++++++++++++++++---- include/linux/dma-fence.h | 7 +++++ 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c index 368d26da0d6a2..9febc8b73f09e 100644 --- a/drivers/gpu/drm/qxl/qxl_release.c +++ b/drivers/gpu/drm/qxl/qxl_release.c @@ -58,16 +58,56 @@ static long qxl_fence_wait(struct dma_fence *fence, bool intr, signed long timeout) { struct qxl_device *qdev; + struct qxl_release *release; + int count = 0, sc = 0; + bool have_drawable_releases; unsigned long cur, end = jiffies + timeout; qdev = container_of(fence->lock, struct qxl_device, release_lock); + release = container_of(fence, struct qxl_release, base); + have_drawable_releases = release->type == QXL_RELEASE_DRAWABLE; - if (!wait_event_timeout(qdev->release_event, - (dma_fence_is_signaled(fence) || - (qxl_io_notify_oom(qdev), 0)), - timeout)) - return 0; +retry: + sc++; + + if (dma_fence_is_signaled(fence)) + goto signaled; + + qxl_io_notify_oom(qdev); + + for (count = 0; count < 11; count++) { + if (!qxl_queue_garbage_collect(qdev, true)) + break; + + if (dma_fence_is_signaled(fence)) + goto signaled; + } + + if (dma_fence_is_signaled(fence)) + goto signaled; + + if (have_drawable_releases || sc < 4) { + if (sc > 2) + /* back off */ + usleep_range(500, 1000); + + if (time_after(jiffies, end)) + return 0; + + if (have_drawable_releases && sc > 300) { + DMA_FENCE_WARN(fence, + "failed to wait on release %llu after spincount %d\n", + fence->context & ~0xf0000000, sc); + goto signaled; + } + goto retry; + } + /* + * yeah, original sync_obj_wait gave up after 3 spins when + * have_drawable_releases is not set. + */ +signaled: cur = jiffies; if (time_after(cur, end)) return 0; diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index e06bad467f55e..c3f9bb6602ba2 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -682,4 +682,11 @@ static inline bool dma_fence_is_container(struct dma_fence *fence) return dma_fence_is_array(fence) || dma_fence_is_chain(fence); } +#define DMA_FENCE_WARN(f, fmt, args...) \ + do { \ + struct dma_fence *__ff = (f); \ + pr_warn("f %llu#%llu: " fmt, __ff->context, __ff->seqno,\ + ##args); \ + } while (0) + #endif /* __LINUX_DMA_FENCE_H */ -- GitLab From 24cfd86433c920188ac3f02df8aba6bc4c792f4b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Apr 2024 18:30:14 +0900 Subject: [PATCH 471/989] ata: ahci: Add mask_port_map module parameter Commits 0077a504e1a4 ("ahci: asm1166: correct count of reported ports") and 9815e3961754 ("ahci: asm1064: correct count of reported ports") attempted to limit the ports of the ASM1166 and ASM1064 AHCI controllers to avoid long boot times caused by the fact that these adapters report a port map larger than the number of physical ports. The excess ports are "virtual" to hide port multiplier devices and probing these ports takes time. However, these commits caused a regression for users that do use PMP devices, as the ATA devices connected to the PMP cannot be scanned. These commits have thus been reverted by commit 6cd8adc3e18 ("ahci: asm1064: asm1166: don't limit reported ports") to allow the discovery of devices connected through a port multiplier. But this revert re-introduced the long boot times for users that do not use a port multiplier setup. This patch adds the mask_port_map ahci module parameter to allow users to manually specify port map masks for controllers. In the case of the ASMedia 1166 and 1064 controllers, users that do not have port multiplier devices can mask the excess virtual ports exposed by the controller to speedup port scanning, thus reducing boot time. The mask_port_map parameter accepts 2 different formats: - mask_port_map= This applies the same mask to all AHCI controllers present in the system. This format is convenient for small systems that have only a single AHCI controller. - mask_port_map==,=mask,... This applies the specified masks only to the PCI device listed. The field is a regular PCI device ID (domain:bus:dev.func). This ID can be seen following "ahci" in the kernel messages. E.g. for "ahci 0000:01:00.0: 2/2 ports implemented (port mask 0x3)", the field is "0000:01:00.0". When used, the function ahci_save_initial_config() indicates that a port map mask was applied with the message "masking port_map ...". E.g.: without a mask: modprobe ahci dmesg | grep ahci ... ahci 0000:00:17.0: AHCI vers 0001.0301, 32 command slots, 6 Gbps, SATA mode ahci 0000:00:17.0: (0000:00:17.0) 8/8 ports implemented (port mask 0xff) With a mask: modprobe ahci mask_port_map=0000:00:17.0=0x1 dmesg | grep ahci ... ahci 0000:00:17.0: masking port_map 0xff -> 0x1 ahci 0000:00:17.0: AHCI vers 0001.0301, 32 command slots, 6 Gbps, SATA mode ahci 0000:00:17.0: (0000:00:17.0) 1/8 ports implemented (port mask 0x1) Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel --- drivers/ata/ahci.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 562302e2e57ce..6548f10e61d9c 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -666,6 +666,87 @@ static int mobile_lpm_policy = -1; module_param(mobile_lpm_policy, int, 0644); MODULE_PARM_DESC(mobile_lpm_policy, "Default LPM policy for mobile chipsets"); +static char *ahci_mask_port_map; +module_param_named(mask_port_map, ahci_mask_port_map, charp, 0444); +MODULE_PARM_DESC(mask_port_map, + "32-bits port map masks to ignore controllers ports. " + "Valid values are: " + "\"\" to apply the same mask to all AHCI controller " + "devices, and \"=,=,...\" to " + "specify different masks for the controllers specified, " + "where is the PCI ID of an AHCI controller in the " + "form \"domain:bus:dev.func\""); + +static void ahci_apply_port_map_mask(struct device *dev, + struct ahci_host_priv *hpriv, char *mask_s) +{ + unsigned int mask; + + if (kstrtouint(mask_s, 0, &mask)) { + dev_err(dev, "Invalid port map mask\n"); + return; + } + + hpriv->mask_port_map = mask; +} + +static void ahci_get_port_map_mask(struct device *dev, + struct ahci_host_priv *hpriv) +{ + char *param, *end, *str, *mask_s; + char *name; + + if (!strlen(ahci_mask_port_map)) + return; + + str = kstrdup(ahci_mask_port_map, GFP_KERNEL); + if (!str) + return; + + /* Handle single mask case */ + if (!strchr(str, '=')) { + ahci_apply_port_map_mask(dev, hpriv, str); + goto free; + } + + /* + * Mask list case: parse the parameter to apply the mask only if + * the device name matches. + */ + param = str; + end = param + strlen(param); + while (param && param < end && *param) { + name = param; + param = strchr(name, '='); + if (!param) + break; + + *param = '\0'; + param++; + if (param >= end) + break; + + if (strcmp(dev_name(dev), name) != 0) { + param = strchr(param, ','); + if (param) + param++; + continue; + } + + mask_s = param; + param = strchr(mask_s, ','); + if (param) { + *param = '\0'; + param++; + } + + ahci_apply_port_map_mask(dev, hpriv, mask_s); + } + +free: + kfree(str); +} + static void ahci_pci_save_initial_config(struct pci_dev *pdev, struct ahci_host_priv *hpriv) { @@ -688,6 +769,10 @@ static void ahci_pci_save_initial_config(struct pci_dev *pdev, "Disabling your PATA port. Use the boot option 'ahci.marvell_enable=0' to avoid this.\n"); } + /* Handle port map masks passed as module parameter. */ + if (ahci_mask_port_map) + ahci_get_port_map_mask(&pdev->dev, hpriv); + ahci_save_initial_config(&pdev->dev, hpriv); } -- GitLab From 648dae58a830ecceea3b1bebf68432435980f137 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:12 -0700 Subject: [PATCH 472/989] cxl: Remove checking of iter in cxl_endpoint_get_perf_coordinates() The while() loop in cxl_endpoint_get_perf_coordinates() checks to see if 'iter' is valid as part of the condition breaking out of the loop. is_cxl_root() will stop the loop before the next iteration could go NULL. Remove the iter check. The presence of the iter or removing the iter does not impact the behavior of the code. This is a code clean up and not a bug fix. Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-2-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 2b0cab556072f..6cbde50a742bf 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2197,7 +2197,7 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, * port each iteration. If the parent is cxl root then there is * nothing to gather. */ - while (iter && !is_cxl_root(to_cxl_port(iter->dev.parent))) { + while (!is_cxl_root(to_cxl_port(iter->dev.parent))) { cxl_coordinates_combine(&c, &c, &dport->sw_coord); c.write_latency += dport->link_latency; c.read_latency += dport->link_latency; -- GitLab From 838ae9f45c4e43b4633d8b0ad1fbedff9ecf177d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 30 Mar 2024 07:12:03 -0700 Subject: [PATCH 473/989] nouveau/gsp: Avoid addressing beyond end of rpc->entries Using the end of rpc->entries[] for addressing runs into both compile-time and run-time detection of accessing beyond the end of the array. Use the base pointer instead, since was allocated with the additional bytes for storing the strings. Avoids the following warning in future GCC releases with support for __counted_by: In function 'fortify_memcpy_chk', inlined from 'r535_gsp_rpc_set_registry' at ../drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c:1123:3: ../include/linux/fortify-string.h:553:25: error: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning] 553 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ for this code: strings = (char *)&rpc->entries[NV_GSP_REG_NUM_ENTRIES]; ... memcpy(strings, r535_registry_entries[i].name, name_len); Signed-off-by: Kees Cook Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240330141159.work.063-kees@kernel.org --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index 9994cbd6f1c40..9858c1438aa7f 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -1112,7 +1112,7 @@ r535_gsp_rpc_set_registry(struct nvkm_gsp *gsp) rpc->numEntries = NV_GSP_REG_NUM_ENTRIES; str_offset = offsetof(typeof(*rpc), entries[NV_GSP_REG_NUM_ENTRIES]); - strings = (char *)&rpc->entries[NV_GSP_REG_NUM_ENTRIES]; + strings = (char *)rpc + str_offset; for (i = 0; i < NV_GSP_REG_NUM_ENTRIES; i++) { int name_len = strlen(r535_registry_entries[i].name) + 1; -- GitLab From 185fdb4697cc9684a02f2fab0530ecdd0c2f15d4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Apr 2024 18:02:25 +0200 Subject: [PATCH 474/989] nouveau: fix function cast warning Calling a function through an incompatible pointer type causes breaks kcfi, so clang warns about the assignment: drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c:73:10: error: cast from 'void (*)(const void *)' to 'void (*)(void *)' converts to incompatible function type [-Werror,-Wcast-function-type-strict] 73 | .fini = (void(*)(void *))kfree, Avoid this with a trivial wrapper. Fixes: c39f472e9f14 ("drm/nouveau: remove symlinks, move core/ to nvkm/ (no code changes)") Signed-off-by: Arnd Bergmann Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240404160234.2923554-1-arnd@kernel.org --- drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c index 4bf486b571013..cb05f7f48a98b 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c @@ -66,11 +66,16 @@ of_init(struct nvkm_bios *bios, const char *name) return ERR_PTR(-EINVAL); } +static void of_fini(void *p) +{ + kfree(p); +} + const struct nvbios_source nvbios_of = { .name = "OpenFirmware", .init = of_init, - .fini = (void(*)(void *))kfree, + .fini = of_fini, .read = of_read, .size = of_size, .rw = false, -- GitLab From 0c3b532ad3fbf82884a2e7e83e37c7dcdd4d1d99 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 5 Apr 2024 19:25:21 +0300 Subject: [PATCH 475/989] gpio: wcove: Use -ENOTSUPP consistently The GPIO library expects the drivers to return -ENOTSUPP in some cases and not using analogue POSIX code. Make the driver to follow this. Reviewed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Andy Shevchenko --- drivers/gpio/gpio-wcove.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-wcove.c b/drivers/gpio/gpio-wcove.c index c18b6b47384f1..94ca9d03c0949 100644 --- a/drivers/gpio/gpio-wcove.c +++ b/drivers/gpio/gpio-wcove.c @@ -104,7 +104,7 @@ static inline int to_reg(int gpio, enum ctrl_register type) unsigned int reg = type == CTRL_IN ? GPIO_IN_CTRL_BASE : GPIO_OUT_CTRL_BASE; if (gpio >= WCOVE_GPIO_NUM) - return -EOPNOTSUPP; + return -ENOTSUPP; return reg + gpio; } -- GitLab From ace0ebe5c98d66889f19e0f30e2518d0c58d0e04 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 5 Apr 2024 19:26:22 +0300 Subject: [PATCH 476/989] gpio: crystalcove: Use -ENOTSUPP consistently The GPIO library expects the drivers to return -ENOTSUPP in some cases and not using analogue POSIX code. Make the driver to follow this. Signed-off-by: Andy Shevchenko --- drivers/gpio/gpio-crystalcove.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-crystalcove.c b/drivers/gpio/gpio-crystalcove.c index 1ee62cd58582b..25db014494a4d 100644 --- a/drivers/gpio/gpio-crystalcove.c +++ b/drivers/gpio/gpio-crystalcove.c @@ -92,7 +92,7 @@ static inline int to_reg(int gpio, enum ctrl_register reg_type) case 0x5e: return GPIOPANELCTL; default: - return -EOPNOTSUPP; + return -ENOTSUPP; } } -- GitLab From 10396f4df8b75ff6ab0aa2cd74296565466f2c8d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Apr 2024 13:56:18 -0400 Subject: [PATCH 477/989] nfsd: hold a lighter-weight client reference over CB_RECALL_ANY Currently the CB_RECALL_ANY job takes a cl_rpc_users reference to the client. While a callback job is technically an RPC that counter is really more for client-driven RPCs, and this has the effect of preventing the client from being unhashed until the callback completes. If nfsd decides to send a CB_RECALL_ANY just as the client reboots, we can end up in a situation where the callback can't complete on the (now dead) callback channel, but the new client can't connect because the old client can't be unhashed. This usually manifests as a NFS4ERR_DELAY return on the CREATE_SESSION operation. The job is only holding a reference to the client so it can clear a flag after the RPC completes. Fix this by having CB_RECALL_ANY instead hold a reference to the cl_nfsdfs.cl_ref. Typically we only take that sort of reference when dealing with the nfsdfs info files, but it should work appropriately here to ensure that the nfs4_client doesn't disappear. Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") Reported-by: Vladimir Benes Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5fcd93f7cb8c7..3cef81e196c68 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3042,12 +3042,9 @@ static void nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - spin_lock(&nn->client_lock); clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); - put_client_renew_locked(clp); - spin_unlock(&nn->client_lock); + drop_client(clp); } static int @@ -6616,7 +6613,7 @@ deleg_reaper(struct nfsd_net *nn) list_add(&clp->cl_ra_cblist, &cblist); /* release in nfsd4_cb_recall_any_release */ - atomic_inc(&clp->cl_rpc_users); + kref_get(&clp->cl_nfsdfs.cl_ref); set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); clp->cl_ra_time = ktime_get_boottime_seconds(); } -- GitLab From d3bbc4dfcc8d436b1e27a0bb6a5893b090fc1cab Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 26 Mar 2024 22:23:24 +0100 Subject: [PATCH 478/989] drm/msm: fix the `CRASHDUMP_READ` target of `a6xx_get_shader_block()` Clang 14 in an (essentially) defconfig arm64 build for next-20240326 reports [1]: drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c:843:6: error: variable 'out' set but not used [-Werror,-Wunused-but-set-variable] The variable `out` in these functions is meant to compute the `target` of `CRASHDUMP_READ()`, but in this case only the initial value (`dumper->iova + A6XX_CD_DATA_OFFSET`) was being passed. Thus use `out` as it was intended by Connor [2]. There was an alternative patch at [3] that removed the variable altogether, but that would only use the initial value. Fixes: 64d6255650d4 ("drm/msm: More fully implement devcoredump for a7xx") Closes: https://lore.kernel.org/lkml/CANiq72mjc5t4n25SQvYSrOEhxxpXYPZ4pPzneSJHEnc3qApu2Q@mail.gmail.com/ [1] Link: https://lore.kernel.org/lkml/CACu1E7HhCKMJd6fixZSPiNAz6ekoZnkMTHTcLFVmbZ-9VoLxKg@mail.gmail.com/ [2] Link: https://lore.kernel.org/lkml/20240307093727.1978126-1-colin.i.king@gmail.com/ [3] Signed-off-by: Miguel Ojeda Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/584955/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c index 1f5245fc2cdc6..a847a0f7a73c9 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c @@ -852,7 +852,7 @@ static void a6xx_get_shader_block(struct msm_gpu *gpu, (block->type << 8) | i); in += CRASHDUMP_READ(in, REG_A6XX_HLSQ_DBG_AHB_READ_APERTURE, - block->size, dumper->iova + A6XX_CD_DATA_OFFSET); + block->size, out); out += block->size * sizeof(u32); } -- GitLab From 65291dcfcf8936e1b23cfd7718fdfde7cfaf7706 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 26 Mar 2024 15:32:08 +0100 Subject: [PATCH 479/989] mm/secretmem: fix GUP-fast succeeding on secretmem folios folio_is_secretmem() currently relies on secretmem folios being LRU folios, to save some cycles. However, folios might reside in a folio batch without the LRU flag set, or temporarily have their LRU flag cleared. Consequently, the LRU flag is unreliable for this purpose. In particular, this is the case when secretmem_fault() allocates a fresh page and calls filemap_add_folio()->folio_add_lru(). The folio might be added to the per-cpu folio batch and won't get the LRU flag set until the batch was drained using e.g., lru_add_drain(). Consequently, folio_is_secretmem() might not detect secretmem folios and GUP-fast can succeed in grabbing a secretmem folio, crashing the kernel when we would later try reading/writing to the folio, because the folio has been unmapped from the directmap. Fix it by removing that unreliable check. Link: https://lkml.kernel.org/r/20240326143210.291116-2-david@redhat.com Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas") Signed-off-by: David Hildenbrand Reported-by: xingwei lee Reported-by: yue sun Closes: https://lore.kernel.org/lkml/CABOYnLyevJeravW=QrH0JUPYEcDN160aZFb7kwndm-J2rmz0HQ@mail.gmail.com/ Debugged-by: Miklos Szeredi Tested-by: Miklos Szeredi Reviewed-by: Mike Rapoport (IBM) Cc: Lorenzo Stoakes Cc: Signed-off-by: Andrew Morton --- include/linux/secretmem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h index 35f3a4a8ceb1e..acf7e1a3f3def 100644 --- a/include/linux/secretmem.h +++ b/include/linux/secretmem.h @@ -13,10 +13,10 @@ static inline bool folio_is_secretmem(struct folio *folio) /* * Using folio_mapping() is quite slow because of the actual call * instruction. - * We know that secretmem pages are not compound and LRU so we can + * We know that secretmem pages are not compound, so we can * save a couple of cycles here. */ - if (folio_test_large(folio) || !folio_test_lru(folio)) + if (folio_test_large(folio)) return false; mapping = (struct address_space *) -- GitLab From 8434f9aa6b7e77bc1459d75d1293c3f55bf4687b Mon Sep 17 00:00:00 2001 From: John Sperbeck Date: Sat, 23 Mar 2024 08:29:34 -0700 Subject: [PATCH 480/989] init: open output files from cpio unpacking with O_LARGEFILE If a member of a cpio archive for an initrd or initrams is larger than 2Gb, we'll eventually fail to write to that file when we get to that limit, unless O_LARGEFILE is set. The problem can be seen with this recipe, assuming that BLK_DEV_RAM is not configured: cd /tmp dd if=/dev/zero of=BIGFILE bs=1048576 count=2200 echo BIGFILE | cpio -o -H newc -R root:root > initrd.img kexec -l /boot/vmlinuz-$(uname -r) --initrd=initrd.img --reuse-cmdline kexec -e The console will show 'Initramfs unpacking failed: write error'. With the patch, the error is gone. Link: https://lkml.kernel.org/r/20240323152934.3307391-1-jsperbeck@google.com Signed-off-by: John Sperbeck Cc: Jens Axboe Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- init/initramfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/initramfs.c b/init/initramfs.c index 3127e0bf7bbd1..a298a3854a801 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -367,7 +367,7 @@ static int __init do_name(void) if (S_ISREG(mode)) { int ml = maybe_link(); if (ml >= 0) { - int openflags = O_WRONLY|O_CREAT; + int openflags = O_WRONLY|O_CREAT|O_LARGEFILE; if (ml != 1) openflags |= O_TRUNC; wfile = filp_open(collected, openflags, mode); -- GitLab From 4ed91fa9177b236b73a271f11a333a98f076eb63 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Sat, 23 Mar 2024 15:15:44 +0100 Subject: [PATCH 481/989] mm: vmalloc: bail out early in find_vmap_area() if vmap is not init During the boot the s390 system triggers "spinlock bad magic" messages if the spinlock debugging is enabled: [ 0.465445] BUG: spinlock bad magic on CPU#0, swapper/0 [ 0.465490] lock: single+0x1860/0x1958, .magic: 00000000, .owner: /-1, .owner_cpu: 0 [ 0.466067] CPU: 0 PID: 0 Comm: swapper Not tainted 6.8.0-12955-g8e938e398669 #1 [ 0.466188] Hardware name: QEMU 8561 QEMU (KVM/Linux) [ 0.466270] Call Trace: [ 0.466470] [<00000000011f26c8>] dump_stack_lvl+0x98/0xd8 [ 0.466516] [<00000000001dcc6a>] do_raw_spin_lock+0x8a/0x108 [ 0.466545] [<000000000042146c>] find_vmap_area+0x6c/0x108 [ 0.466572] [<000000000042175a>] find_vm_area+0x22/0x40 [ 0.466597] [<000000000012f152>] __set_memory+0x132/0x150 [ 0.466624] [<0000000001cc0398>] vmem_map_init+0x40/0x118 [ 0.466651] [<0000000001cc0092>] paging_init+0x22/0x68 [ 0.466677] [<0000000001cbbed2>] setup_arch+0x52a/0x708 [ 0.466702] [<0000000001cb6140>] start_kernel+0x80/0x5c8 [ 0.466727] [<0000000000100036>] startup_continue+0x36/0x40 it happens because such system tries to access some vmap areas whereas the vmalloc initialization is not even yet done: [ 0.465490] lock: single+0x1860/0x1958, .magic: 00000000, .owner: /-1, .owner_cpu: 0 [ 0.466067] CPU: 0 PID: 0 Comm: swapper Not tainted 6.8.0-12955-g8e938e398669 #1 [ 0.466188] Hardware name: QEMU 8561 QEMU (KVM/Linux) [ 0.466270] Call Trace: [ 0.466470] dump_stack_lvl (lib/dump_stack.c:117) [ 0.466516] do_raw_spin_lock (kernel/locking/spinlock_debug.c:87 kernel/locking/spinlock_debug.c:115) [ 0.466545] find_vmap_area (mm/vmalloc.c:1059 mm/vmalloc.c:2364) [ 0.466572] find_vm_area (mm/vmalloc.c:3150) [ 0.466597] __set_memory (arch/s390/mm/pageattr.c:360 arch/s390/mm/pageattr.c:393) [ 0.466624] vmem_map_init (./arch/s390/include/asm/set_memory.h:55 arch/s390/mm/vmem.c:660) [ 0.466651] paging_init (arch/s390/mm/init.c:97) [ 0.466677] setup_arch (arch/s390/kernel/setup.c:972) [ 0.466702] start_kernel (init/main.c:899) [ 0.466727] startup_continue (arch/s390/kernel/head64.S:35) [ 0.466811] INFO: lockdep is turned off. ... [ 0.718250] vmalloc init - busy lock init 0000000002871860 [ 0.718328] vmalloc init - busy lock init 00000000028731b8 Some background. It worked before because the lock that is in question was statically defined and initialized. As of now, the locks and data structures are initialized in the vmalloc_init() function. To address that issue add the check whether the "vmap_initialized" variable is set, if not find_vmap_area() bails out on entry returning NULL. Link: https://lkml.kernel.org/r/20240323141544.4150-1-urezki@gmail.com Fixes: 72210662c5a2 ("mm: vmalloc: offload free_vmap_area_lock lock") Signed-off-by: Uladzislau Rezki (Sony) Tested-by: Guenter Roeck Reviewed-by: Baoquan He Acked-by: Heiko Carstens Cc: Christoph Hellwig Cc: Dave Chinner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 22aa63f4ef632..0d77d171b5d9f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2343,6 +2343,9 @@ struct vmap_area *find_vmap_area(unsigned long addr) struct vmap_area *va; int i, j; + if (unlikely(!vmap_initialized)) + return NULL; + /* * An addr_to_node_id(addr) converts an address to a node index * where a VA is located. If VA spans several zones and passed -- GitLab From fc2c22693c608125bbce174c1952eb4db2f8d07f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 28 Mar 2024 15:03:30 +0100 Subject: [PATCH 482/989] mm: vmalloc: fix lockdep warning A lockdep reports a possible deadlock in the find_vmap_area_exceed_addr_lock() function: ============================================ WARNING: possible recursive locking detected 6.9.0-rc1-00060-ged3ccc57b108-dirty #6140 Not tainted -------------------------------------------- drgn/455 is trying to acquire lock: ffff0000c00131d0 (&vn->busy.lock/1){+.+.}-{2:2}, at: find_vmap_area_exceed_addr_lock+0x64/0x124 but task is already holding lock: ffff0000c0011878 (&vn->busy.lock/1){+.+.}-{2:2}, at: find_vmap_area_exceed_addr_lock+0x64/0x124 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&vn->busy.lock/1); lock(&vn->busy.lock/1); *** DEADLOCK *** indeed it can happen if the find_vmap_area_exceed_addr_lock() gets called concurrently because it tries to acquire two nodes locks. It was done to prevent removing a lowest VA found on a previous step. To address this a lowest VA is found first without holding a node lock where it resides. As a last step we check if a VA still there because it can go away, if removed, proceed with next lowest. [akpm@linux-foundation.org: fix comment typos, per Baoquan] Link: https://lkml.kernel.org/r/20240328140330.4747-1-urezki@gmail.com Fixes: 53becf32aec1 ("mm: vmalloc: support multiple nodes in vread_iter") Signed-off-by: Uladzislau Rezki (Sony) Tested-by: Jens Axboe Tested-by: Omar Sandoval Reported-by: Jens Axboe Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 73 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0d77d171b5d9f..68fa001648cc1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -989,6 +989,27 @@ unsigned long vmalloc_nr_pages(void) return atomic_long_read(&nr_vmalloc_pages); } +static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + + addr = (unsigned long)kasan_reset_tag((void *)addr); + + while (n) { + struct vmap_area *va; + + va = rb_entry(n, struct vmap_area, rb_node); + if (addr < va->va_start) + n = n->rb_left; + else if (addr >= va->va_end) + n = n->rb_right; + else + return va; + } + + return NULL; +} + /* Look up the first VA which satisfies addr < va_end, NULL if none. */ static struct vmap_area * __find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) @@ -1025,47 +1046,39 @@ __find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) static struct vmap_node * find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va) { - struct vmap_node *vn, *va_node = NULL; - struct vmap_area *va_lowest; + unsigned long va_start_lowest; + struct vmap_node *vn; int i; - for (i = 0; i < nr_vmap_nodes; i++) { +repeat: + for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; spin_lock(&vn->busy.lock); - va_lowest = __find_vmap_area_exceed_addr(addr, &vn->busy.root); - if (va_lowest) { - if (!va_node || va_lowest->va_start < (*va)->va_start) { - if (va_node) - spin_unlock(&va_node->busy.lock); - - *va = va_lowest; - va_node = vn; - continue; - } - } + *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root); + + if (*va) + if (!va_start_lowest || (*va)->va_start < va_start_lowest) + va_start_lowest = (*va)->va_start; spin_unlock(&vn->busy.lock); } - return va_node; -} - -static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) -{ - struct rb_node *n = root->rb_node; + /* + * Check if found VA exists, it might have gone away. In this case we + * repeat the search because a VA has been removed concurrently and we + * need to proceed to the next one, which is a rare case. + */ + if (va_start_lowest) { + vn = addr_to_node(va_start_lowest); - addr = (unsigned long)kasan_reset_tag((void *)addr); + spin_lock(&vn->busy.lock); + *va = __find_vmap_area(va_start_lowest, &vn->busy.root); - while (n) { - struct vmap_area *va; + if (*va) + return vn; - va = rb_entry(n, struct vmap_area, rb_node); - if (addr < va->va_start) - n = n->rb_left; - else if (addr >= va->va_end) - n = n->rb_right; - else - return va; + spin_unlock(&vn->busy.lock); + goto repeat; } return NULL; -- GitLab From 176517c9310281d00dd3210ab4cc4d3cdc26b17e Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Fri, 29 Mar 2024 18:58:10 +0000 Subject: [PATCH 483/989] selftests/mm: include strings.h for ffsl Got a compilation error on Android for ffsl after 91b80cc5b39f ("selftests: mm: fix map_hugetlb failure on 64K page size systems") included vm_util.h. Link: https://lkml.kernel.org/r/20240329185814.16304-1-edliaw@google.com Fixes: af605d26a8f2 ("selftests/mm: merge util.h into vm_util.h") Signed-off-by: Edward Liaw Reviewed-by: Muhammad Usama Anjum Cc: Axel Rasmussen Cc: David Hildenbrand Cc: "Mike Rapoport (IBM)" Cc: Peter Xu Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/vm_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index c02990bbd56f4..9007c420d52c5 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -3,7 +3,7 @@ #include #include #include -#include /* ffsl() */ +#include /* ffsl() */ #include /* _SC_PAGESIZE */ #define BIT_ULL(nr) (1ULL << (nr)) -- GitLab From 87f0e65cdf762f7c4d7ad1466f0cc7c1381f1996 Mon Sep 17 00:00:00 2001 From: Alexey Makhalov Date: Tue, 2 Apr 2024 16:23:34 -0700 Subject: [PATCH 484/989] MAINTAINERS: change vmware.com addresses to broadcom.com Update all remaining vmware.com email addresses to actual broadcom.com. Add corresponding .mailmap entries for maintainers who contributed in the past as the vmware.com address will start bouncing soon. Maintainership update. Jeff Sipek has left VMware, Nick Shi will be maintaining VMware PTP. Link: https://lkml.kernel.org/r/20240402232334.33167-1-alexey.makhalov@broadcom.com Signed-off-by: Alexey Makhalov Acked-by: Florian Fainelli Acked-by: Ajay Kaher Acked-by: Ronak Doshi Acked-by: Nick Shi Acked-by: Bryan Tan Acked-by: Vishnu Dasa Acked-by: Vishal Bhakta Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- .mailmap | 5 +++++ MAINTAINERS | 46 +++++++++++++++++++++++----------------------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/.mailmap b/.mailmap index 59c9a841bf714..8284692f96107 100644 --- a/.mailmap +++ b/.mailmap @@ -20,6 +20,7 @@ Adam Oldham Adam Radford Adriana Reus Adrian Bunk +Ajay Kaher Akhil P Oommen Alan Cox Alan Cox @@ -36,6 +37,7 @@ Alexei Avshalom Lazar Alexei Starovoitov Alexei Starovoitov Alexei Starovoitov +Alexey Makhalov Alex Hung Alex Shi Alex Shi @@ -110,6 +112,7 @@ Brendan Higgins Brian Avery Brian King Brian Silverman +Bryan Tan Cai Huoqing Can Guo Carl Huang @@ -529,6 +532,7 @@ Rocky Liao Roman Gushchin Roman Gushchin Roman Gushchin +Ronak Doshi Muchun Song Muchun Song Ross Zwisler @@ -651,6 +655,7 @@ Viresh Kumar Viresh Kumar Viresh Kumar Viresh Kumar +Vishnu Dasa Vivek Aknurwar Vivien Didelot Vlad Dogaru diff --git a/MAINTAINERS b/MAINTAINERS index 7c121493f43d0..bd9fbb315e692 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16731,9 +16731,9 @@ F: include/uapi/linux/ppdev.h PARAVIRT_OPS INTERFACE M: Juergen Gross -R: Ajay Kaher -R: Alexey Makhalov -R: VMware PV-Drivers Reviewers +R: Ajay Kaher +R: Alexey Makhalov +R: Broadcom internal kernel review list L: virtualization@lists.linux.dev L: x86@kernel.org S: Supported @@ -23652,9 +23652,9 @@ S: Supported F: drivers/misc/vmw_balloon.c VMWARE HYPERVISOR INTERFACE -M: Ajay Kaher -M: Alexey Makhalov -R: VMware PV-Drivers Reviewers +M: Ajay Kaher +M: Alexey Makhalov +R: Broadcom internal kernel review list L: virtualization@lists.linux.dev L: x86@kernel.org S: Supported @@ -23663,34 +23663,34 @@ F: arch/x86/include/asm/vmware.h F: arch/x86/kernel/cpu/vmware.c VMWARE PVRDMA DRIVER -M: Bryan Tan -M: Vishnu Dasa -R: VMware PV-Drivers Reviewers +M: Bryan Tan +M: Vishnu Dasa +R: Broadcom internal kernel review list L: linux-rdma@vger.kernel.org S: Supported F: drivers/infiniband/hw/vmw_pvrdma/ VMWARE PVSCSI DRIVER -M: Vishal Bhakta -R: VMware PV-Drivers Reviewers +M: Vishal Bhakta +R: Broadcom internal kernel review list L: linux-scsi@vger.kernel.org S: Supported F: drivers/scsi/vmw_pvscsi.c F: drivers/scsi/vmw_pvscsi.h VMWARE VIRTUAL PTP CLOCK DRIVER -M: Jeff Sipek -R: Ajay Kaher -R: Alexey Makhalov -R: VMware PV-Drivers Reviewers +M: Nick Shi +R: Ajay Kaher +R: Alexey Makhalov +R: Broadcom internal kernel review list L: netdev@vger.kernel.org S: Supported F: drivers/ptp/ptp_vmw.c VMWARE VMCI DRIVER -M: Bryan Tan -M: Vishnu Dasa -R: VMware PV-Drivers Reviewers +M: Bryan Tan +M: Vishnu Dasa +R: Broadcom internal kernel review list L: linux-kernel@vger.kernel.org S: Supported F: drivers/misc/vmw_vmci/ @@ -23705,16 +23705,16 @@ F: drivers/input/mouse/vmmouse.c F: drivers/input/mouse/vmmouse.h VMWARE VMXNET3 ETHERNET DRIVER -M: Ronak Doshi -R: VMware PV-Drivers Reviewers +M: Ronak Doshi +R: Broadcom internal kernel review list L: netdev@vger.kernel.org S: Supported F: drivers/net/vmxnet3/ VMWARE VSOCK VMCI TRANSPORT DRIVER -M: Bryan Tan -M: Vishnu Dasa -R: VMware PV-Drivers Reviewers +M: Bryan Tan +M: Vishnu Dasa +R: Broadcom internal kernel review list L: linux-kernel@vger.kernel.org S: Supported F: net/vmw_vsock/vmci_transport* -- GitLab From 04c35ab3bdae7fefbd7c7a7355f29fa03a035221 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 3 Apr 2024 23:21:30 +0200 Subject: [PATCH 485/989] x86/mm/pat: fix VM_PAT handling in COW mappings PAT handling won't do the right thing in COW mappings: the first PTE (or, in fact, all PTEs) can be replaced during write faults to point at anon folios. Reliably recovering the correct PFN and cachemode using follow_phys() from PTEs will not work in COW mappings. Using follow_phys(), we might just get the address+protection of the anon folio (which is very wrong), or fail on swap/nonswap entries, failing follow_phys() and triggering a WARN_ON_ONCE() in untrack_pfn() and track_pfn_copy(), not properly calling free_pfn_range(). In free_pfn_range(), we either wouldn't call memtype_free() or would call it with the wrong range, possibly leaking memory. To fix that, let's update follow_phys() to refuse returning anon folios, and fallback to using the stored PFN inside vma->vm_pgoff for COW mappings if we run into that. We will now properly handle untrack_pfn() with COW mappings, where we don't need the cachemode. We'll have to fail fork()->track_pfn_copy() if the first page was replaced by an anon folio, though: we'd have to store the cachemode in the VMA to make this work, likely growing the VMA size. For now, lets keep it simple and let track_pfn_copy() just fail in that case: it would have failed in the past with swap/nonswap entries already, and it would have done the wrong thing with anon folios. Simple reproducer to trigger the WARN_ON_ONCE() in untrack_pfn(): <--- C reproducer ---> #include #include #include #include int main(void) { struct io_uring_params p = {}; int ring_fd; size_t size; char *map; ring_fd = io_uring_setup(1, &p); if (ring_fd < 0) { perror("io_uring_setup"); return 1; } size = p.sq_off.array + p.sq_entries * sizeof(unsigned); /* Map the submission queue ring MAP_PRIVATE */ map = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, ring_fd, IORING_OFF_SQ_RING); if (map == MAP_FAILED) { perror("mmap"); return 1; } /* We have at least one page. Let's COW it. */ *map = 0; pause(); return 0; } <--- C reproducer ---> On a system with 16 GiB RAM and swap configured: # ./iouring & # memhog 16G # killall iouring [ 301.552930] ------------[ cut here ]------------ [ 301.553285] WARNING: CPU: 7 PID: 1402 at arch/x86/mm/pat/memtype.c:1060 untrack_pfn+0xf4/0x100 [ 301.553989] Modules linked in: binfmt_misc nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_g [ 301.558232] CPU: 7 PID: 1402 Comm: iouring Not tainted 6.7.5-100.fc38.x86_64 #1 [ 301.558772] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebu4 [ 301.559569] RIP: 0010:untrack_pfn+0xf4/0x100 [ 301.559893] Code: 75 c4 eb cf 48 8b 43 10 8b a8 e8 00 00 00 3b 6b 28 74 b8 48 8b 7b 30 e8 ea 1a f7 000 [ 301.561189] RSP: 0018:ffffba2c0377fab8 EFLAGS: 00010282 [ 301.561590] RAX: 00000000ffffffea RBX: ffff9208c8ce9cc0 RCX: 000000010455e047 [ 301.562105] RDX: 07fffffff0eb1e0a RSI: 0000000000000000 RDI: ffff9208c391d200 [ 301.562628] RBP: 0000000000000000 R08: ffffba2c0377fab8 R09: 0000000000000000 [ 301.563145] R10: ffff9208d2292d50 R11: 0000000000000002 R12: 00007fea890e0000 [ 301.563669] R13: 0000000000000000 R14: ffffba2c0377fc08 R15: 0000000000000000 [ 301.564186] FS: 0000000000000000(0000) GS:ffff920c2fbc0000(0000) knlGS:0000000000000000 [ 301.564773] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 301.565197] CR2: 00007fea88ee8a20 CR3: 00000001033a8000 CR4: 0000000000750ef0 [ 301.565725] PKRU: 55555554 [ 301.565944] Call Trace: [ 301.566148] [ 301.566325] ? untrack_pfn+0xf4/0x100 [ 301.566618] ? __warn+0x81/0x130 [ 301.566876] ? untrack_pfn+0xf4/0x100 [ 301.567163] ? report_bug+0x171/0x1a0 [ 301.567466] ? handle_bug+0x3c/0x80 [ 301.567743] ? exc_invalid_op+0x17/0x70 [ 301.568038] ? asm_exc_invalid_op+0x1a/0x20 [ 301.568363] ? untrack_pfn+0xf4/0x100 [ 301.568660] ? untrack_pfn+0x65/0x100 [ 301.568947] unmap_single_vma+0xa6/0xe0 [ 301.569247] unmap_vmas+0xb5/0x190 [ 301.569532] exit_mmap+0xec/0x340 [ 301.569801] __mmput+0x3e/0x130 [ 301.570051] do_exit+0x305/0xaf0 ... Link: https://lkml.kernel.org/r/20240403212131.929421-3-david@redhat.com Signed-off-by: David Hildenbrand Reported-by: Wupeng Ma Closes: https://lkml.kernel.org/r/20240227122814.3781907-1-mawupeng1@huawei.com Fixes: b1a86e15dc03 ("x86, pat: remove the dependency on 'vm_pgoff' in track/untrack pfn vma routines") Fixes: 5899329b1910 ("x86: PAT: implement track/untrack of pfnmap regions for x86 - v3") Acked-by: Ingo Molnar Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton --- arch/x86/mm/pat/memtype.c | 49 ++++++++++++++++++++++++++++----------- mm/memory.c | 4 ++++ 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 0d72183b5dd02..36b603d0cddef 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -947,6 +947,38 @@ static void free_pfn_range(u64 paddr, unsigned long size) memtype_free(paddr, paddr + size); } +static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, + pgprot_t *pgprot) +{ + unsigned long prot; + + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT)); + + /* + * We need the starting PFN and cachemode used for track_pfn_remap() + * that covered the whole VMA. For most mappings, we can obtain that + * information from the page tables. For COW mappings, we might now + * suddenly have anon folios mapped and follow_phys() will fail. + * + * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to + * detect the PFN. If we need the cachemode as well, we're out of luck + * for now and have to fail fork(). + */ + if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) { + if (pgprot) + *pgprot = __pgprot(prot); + return 0; + } + if (is_cow_mapping(vma->vm_flags)) { + if (pgprot) + return -EINVAL; + *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; + return 0; + } + WARN_ON_ONCE(1); + return -EINVAL; +} + /* * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). @@ -957,20 +989,13 @@ static void free_pfn_range(u64 paddr, unsigned long size) int track_pfn_copy(struct vm_area_struct *vma) { resource_size_t paddr; - unsigned long prot; unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; if (vma->vm_flags & VM_PAT) { - /* - * reserve the whole chunk covered by vma. We need the - * starting address and protection from pte. - */ - if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { - WARN_ON_ONCE(1); + if (get_pat_info(vma, &paddr, &pgprot)) return -EINVAL; - } - pgprot = __pgprot(prot); + /* reserve the whole chunk covered by vma. */ return reserve_pfn_range(paddr, vma_size, &pgprot, 1); } @@ -1045,7 +1070,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size, bool mm_wr_locked) { resource_size_t paddr; - unsigned long prot; if (vma && !(vma->vm_flags & VM_PAT)) return; @@ -1053,11 +1077,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, /* free the chunk starting from pfn or the whole chunk */ paddr = (resource_size_t)pfn << PAGE_SHIFT; if (!paddr && !size) { - if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { - WARN_ON_ONCE(1); + if (get_pat_info(vma, &paddr, NULL)) return; - } - size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); diff --git a/mm/memory.c b/mm/memory.c index 904f70b994985..d2155ced45f8f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5973,6 +5973,10 @@ int follow_phys(struct vm_area_struct *vma, goto out; pte = ptep_get(ptep); + /* Never return PFNs of anon folios in COW mappings. */ + if (vm_normal_folio(vma, address, pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; -- GitLab From a6c1d9cb9a68bfa4512248419c4f4d880d19fe90 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 1 Apr 2024 17:14:58 -0700 Subject: [PATCH 486/989] stackdepot: rename pool_index to pool_index_plus_1 Commit 3ee34eabac2a ("lib/stackdepot: fix first entry having a 0-handle") changed the meaning of the pool_index field to mean "the pool index plus 1". This made the code accessing this field less self-documenting, as well as causing debuggers such as drgn to not be able to easily remain compatible with both old and new kernels, because they typically do that by testing for presence of the new field. Because stackdepot is a debugging tool, we should make sure that it is debugger friendly. Therefore, give the field a different name to improve readability as well as enabling debugger backwards compatibility. This is needed in 6.9, which would otherwise become an odd release with the new semantics and old name so debuggers wouldn't recognize the new semantics there. Fixes: 3ee34eabac2a ("lib/stackdepot: fix first entry having a 0-handle") Link: https://lkml.kernel.org/r/20240402001500.53533-1-pcc@google.com Link: https://linux-review.googlesource.com/id/Ib3e70c36c1d230dd0a118dc22649b33e768b9f88 Signed-off-by: Peter Collingbourne Acked-by: Vlastimil Babka Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Acked-by: Oscar Salvador Cc: Andrey Konovalov Cc: Michal Hocko Cc: Omar Sandoval Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 7 +++---- lib/stackdepot.c | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 3c6caa5abc7c4..e9ec32fb97d4a 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -44,10 +44,9 @@ typedef u32 depot_stack_handle_t; union handle_parts { depot_stack_handle_t handle; struct { - /* pool_index is offset by 1 */ - u32 pool_index : DEPOT_POOL_INDEX_BITS; - u32 offset : DEPOT_OFFSET_BITS; - u32 extra : STACK_DEPOT_EXTRA_BITS; + u32 pool_index_plus_1 : DEPOT_POOL_INDEX_BITS; + u32 offset : DEPOT_OFFSET_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; }; }; diff --git a/lib/stackdepot.c b/lib/stackdepot.c index af6cc19a20033..68c97387aa54e 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -330,7 +330,7 @@ static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size) stack = current_pool + pool_offset; /* Pre-initialize handle once. */ - stack->handle.pool_index = pool_index + 1; + stack->handle.pool_index_plus_1 = pool_index + 1; stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN; stack->handle.extra = 0; INIT_LIST_HEAD(&stack->hash_list); @@ -441,7 +441,7 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) const int pools_num_cached = READ_ONCE(pools_num); union handle_parts parts = { .handle = handle }; void *pool; - u32 pool_index = parts.pool_index - 1; + u32 pool_index = parts.pool_index_plus_1 - 1; size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; -- GitLab From 9dc23cba0927d09cb481da064c8413eb9df42e2b Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Thu, 28 Mar 2024 09:02:45 +0100 Subject: [PATCH 487/989] drm/msm/adreno: Set highest_bank_bit for A619 The default highest_bank_bit of 15 didn't seem to cause issues so far but downstream defines it to be 14. But similar to [0] leaving it on 14 (or 15 for that matter) causes some corruption issues with some resolutions with DisplayPort, like 1920x1200. So set it to 13 for now so that there's no screen corruption. [0] commit 6a0dbcd20ef2 ("drm/msm/a6xx: set highest_bank_bit to 13 for a610") Fixes: b7616b5c69e6 ("drm/msm/adreno: Add A619 support") Signed-off-by: Luca Weiss Patchwork: https://patchwork.freedesktop.org/patch/585215/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 0674aca0f8a3f..cf0b1de1c0712 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1377,6 +1377,10 @@ static void a6xx_calc_ubwc_config(struct adreno_gpu *gpu) if (adreno_is_a618(gpu)) gpu->ubwc_config.highest_bank_bit = 14; + if (adreno_is_a619(gpu)) + /* TODO: Should be 14 but causes corruption at e.g. 1920x1200 on DP */ + gpu->ubwc_config.highest_bank_bit = 13; + if (adreno_is_a619_holi(gpu)) gpu->ubwc_config.highest_bank_bit = 13; -- GitLab From a6c4162d844dae4dbfea1bf9ecffcb852d3ed615 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 Apr 2024 18:01:02 +0300 Subject: [PATCH 488/989] bcachefs: fix ! vs ~ typo in __clear_bit_le64() The ! was obviously intended to be ~. As it is, this function does the equivalent to: "addr[bit / 64] = 0;". Fixes: 27fcec6c27ca ("bcachefs: Clear recovery_passes_required as they complete without errors") Signed-off-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index b7e7c29278fc0..de639e8a3ab53 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -795,7 +795,7 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr) static inline void __clear_bit_le64(size_t bit, __le64 *addr) { - addr[bit / 64] &= !cpu_to_le64(BIT_ULL(bit % 64)); + addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64)); } static inline bool test_bit_le64(size_t bit, __le64 *addr) -- GitLab From cf979fca9a05d7d0b116257e5c4dc12b6bb7eb3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Apr 2024 16:21:18 -0400 Subject: [PATCH 489/989] bcachefs: fix rand_delete unit test Signed-off-by: Kent Overstreet --- fs/bcachefs/tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index b3fe9fc577470..bfec656f94c07 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)); ret = bkey_err(k); if (ret) goto err; -- GitLab From 752e3c53de0fa3b7d817a83050b6699b8e9c6ec9 Mon Sep 17 00:00:00 2001 From: Adam Goldman Date: Mon, 25 Mar 2024 07:38:41 +0900 Subject: [PATCH 490/989] firewire: ohci: mask bus reset interrupts between ISR and bottom half In the FireWire OHCI interrupt handler, if a bus reset interrupt has occurred, mask bus reset interrupts until bus_reset_work has serviced and cleared the interrupt. Normally, we always leave bus reset interrupts masked. We infer the bus reset from the self-ID interrupt that happens shortly thereafter. A scenario where we unmask bus reset interrupts was introduced in 2008 in a007bb857e0b26f5d8b73c2ff90782d9c0972620: If OHCI_PARAM_DEBUG_BUSRESETS (8) is set in the debug parameter bitmask, we will unmask bus reset interrupts so we can log them. irq_handler logs the bus reset interrupt. However, we can't clear the bus reset event flag in irq_handler, because we won't service the event until later. irq_handler exits with the event flag still set. If the corresponding interrupt is still unmasked, the first bus reset will usually freeze the system due to irq_handler being called again each time it exits. This freeze can be reproduced by loading firewire_ohci with "modprobe firewire_ohci debug=-1" (to enable all debugging output). Apparently there are also some cases where bus_reset_work will get called soon enough to clear the event, and operation will continue normally. This freeze was first reported a few months after a007bb85 was committed, but until now it was never fixed. The debug level could safely be set to -1 through sysfs after the module was loaded, but this would be ineffectual in logging bus reset interrupts since they were only unmasked during initialization. irq_handler will now leave the event flag set but mask bus reset interrupts, so irq_handler won't be called again and there will be no freeze. If OHCI_PARAM_DEBUG_BUSRESETS is enabled, bus_reset_work will unmask the interrupt after servicing the event, so future interrupts will be caught as desired. As a side effect to this change, OHCI_PARAM_DEBUG_BUSRESETS can now be enabled through sysfs in addition to during initial module loading. However, when enabled through sysfs, logging of bus reset interrupts will be effective only starting with the second bus reset, after bus_reset_work has executed. Signed-off-by: Adam Goldman Signed-off-by: Takashi Sakamoto --- drivers/firewire/ohci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 7bc71f4be64a0..38d19410a2be6 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -2060,6 +2060,8 @@ static void bus_reset_work(struct work_struct *work) ohci->generation = generation; reg_write(ohci, OHCI1394_IntEventClear, OHCI1394_busReset); + if (param_debug & OHCI_PARAM_DEBUG_BUSRESETS) + reg_write(ohci, OHCI1394_IntMaskSet, OHCI1394_busReset); if (ohci->quirks & QUIRK_RESET_PACKET) ohci->request_generation = generation; @@ -2125,12 +2127,14 @@ static irqreturn_t irq_handler(int irq, void *data) return IRQ_NONE; /* - * busReset and postedWriteErr must not be cleared yet + * busReset and postedWriteErr events must not be cleared yet * (OHCI 1.1 clauses 7.2.3.2 and 13.2.8.1) */ reg_write(ohci, OHCI1394_IntEventClear, event & ~(OHCI1394_busReset | OHCI1394_postedWriteErr)); log_irqs(ohci, event); + if (event & OHCI1394_busReset) + reg_write(ohci, OHCI1394_IntMaskClear, OHCI1394_busReset); if (event & OHCI1394_selfIDComplete) queue_work(selfid_workqueue, &ohci->bus_reset_work); -- GitLab From 97a54ef596c3fd24ec2b227ba8aaf2cf5415e779 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Thu, 15 Feb 2024 15:39:43 +0100 Subject: [PATCH 491/989] scsi: target: Fix SELinux error when systemd-modules loads the target module If the systemd-modules service loads the target module, the credentials of that userspace process will be used to validate the access to the target db directory. SELinux will prevent it, reporting an error like the following: kernel: audit: type=1400 audit(1676301082.205:4): avc: denied { read } for pid=1020 comm="systemd-modules" name="target" dev="dm-3" ino=4657583 scontext=system_u:system_r:systemd_modules_load_t:s0 tcontext=system_u:object_r:targetd_etc_rw_t:s0 tclass=dir permissive=0 Fix the error by using the kernel credentials to access the db directory Signed-off-by: Maurizio Lombardi Link: https://lore.kernel.org/r/20240215143944.847184-2-mlombard@redhat.com Reviewed-by: Mike Christie Signed-off-by: Martin K. Petersen --- drivers/target/target_core_configfs.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index c1fbcdd161826..c40217f44b1bc 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -3672,6 +3672,8 @@ static int __init target_core_init_configfs(void) { struct configfs_subsystem *subsys = &target_core_fabrics; struct t10_alua_lu_gp *lu_gp; + struct cred *kern_cred; + const struct cred *old_cred; int ret; pr_debug("TARGET_CORE[0]: Loading Generic Kernel Storage" @@ -3748,11 +3750,21 @@ static int __init target_core_init_configfs(void) if (ret < 0) goto out; + /* We use the kernel credentials to access the target directory */ + kern_cred = prepare_kernel_cred(&init_task); + if (!kern_cred) { + ret = -ENOMEM; + goto out; + } + old_cred = override_creds(kern_cred); target_init_dbroot(); + revert_creds(old_cred); + put_cred(kern_cred); return 0; out: + target_xcopy_release_pt(); configfs_unregister_subsystem(subsys); core_dev_release_virtual_lun0(); rd_module_exit(); -- GitLab From 358e919a351f2ea4b412e7dac6b1c23ec10bd4f5 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Tue, 2 Apr 2024 11:55:12 +0800 Subject: [PATCH 492/989] scsi: hisi_sas: Handle the NCQ error returned by D2H frame We find that some disks use D2H frame instead of SDB frame to return NCQ error. Currently, only the I/O corresponding to the D2H frame is processed in this scenario, which does not meet the processing requirements of the NCQ error scenario. So we set dev_status to HISI_SAS_DEV_NCQ_ERR and abort all I/Os of the disk in this scenario. Co-developed-by: Xingui Yang Signed-off-by: Xingui Yang Signed-off-by: Xiang Chen Link: https://lore.kernel.org/r/20240402035513.2024241-2-chenxiang66@hisilicon.com Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 7d2a33514538c..34f96cc35342b 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -2244,7 +2244,15 @@ slot_err_v3_hw(struct hisi_hba *hisi_hba, struct sas_task *task, case SAS_PROTOCOL_SATA | SAS_PROTOCOL_STP: if ((dw0 & CMPLT_HDR_RSPNS_XFRD_MSK) && (sipc_rx_err_type & RX_FIS_STATUS_ERR_MSK)) { - ts->stat = SAS_PROTO_RESPONSE; + if (task->ata_task.use_ncq) { + struct domain_device *device = task->dev; + struct hisi_sas_device *sas_dev = device->lldd_dev; + + sas_dev->dev_status = HISI_SAS_DEV_NCQ_ERR; + slot->abort = 1; + } else { + ts->stat = SAS_PROTO_RESPONSE; + } } else if (dma_rx_err_type & RX_DATA_LEN_UNDERFLOW_MSK) { ts->residual = trans_tx_fail_type; ts->stat = SAS_DATA_UNDERRUN; -- GitLab From 0098c55e0881f0b32591f2110410d5c8b7f9bd5a Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Tue, 2 Apr 2024 11:55:13 +0800 Subject: [PATCH 493/989] scsi: hisi_sas: Modify the deadline for ata_wait_after_reset() We found that the second parameter of function ata_wait_after_reset() is incorrectly used. We call smp_ata_check_ready_type() to poll the device type until the 30s timeout, so the correct deadline should be (jiffies + 30000). Fixes: 3c2673a09cf1 ("scsi: hisi_sas: Fix SATA devices missing issue during I_T nexus reset") Co-developed-by: xiabing Signed-off-by: xiabing Co-developed-by: Yihang Li Signed-off-by: Yihang Li Signed-off-by: Xiang Chen Link: https://lore.kernel.org/r/20240402035513.2024241-3-chenxiang66@hisilicon.com Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 097dfe4b620dc..35f8e00850d6c 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1797,7 +1797,7 @@ static int hisi_sas_debug_I_T_nexus_reset(struct domain_device *device) if (dev_is_sata(device)) { struct ata_link *link = &device->sata_dev.ap->link; - rc = ata_wait_after_reset(link, HISI_SAS_WAIT_PHYUP_TIMEOUT, + rc = ata_wait_after_reset(link, jiffies + HISI_SAS_WAIT_PHYUP_TIMEOUT, smp_ata_check_ready_type); } else { msleep(2000); -- GitLab From 4406e4176f47177f5e51b4cc7e6a7a2ff3dbfbbd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 2 Apr 2024 12:56:54 +0300 Subject: [PATCH 494/989] scsi: qla2xxx: Fix off by one in qla_edif_app_getstats() The app_reply->elem[] array is allocated earlier in this function and it has app_req.num_ports elements. Thus this > comparison needs to be >= to prevent memory corruption. Fixes: 7878f22a2e03 ("scsi: qla2xxx: edif: Add getfcinfo and statistic bsgs") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/5c125b2f-92dd-412b-9b6f-fc3a3207bd60@moroto.mountain Reviewed-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_edif.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_edif.c b/drivers/scsi/qla2xxx/qla_edif.c index 26e6b3e3af431..dcde55c8ee5de 100644 --- a/drivers/scsi/qla2xxx/qla_edif.c +++ b/drivers/scsi/qla2xxx/qla_edif.c @@ -1100,7 +1100,7 @@ qla_edif_app_getstats(scsi_qla_host_t *vha, struct bsg_job *bsg_job) list_for_each_entry_safe(fcport, tf, &vha->vp_fcports, list) { if (fcport->edif.enable) { - if (pcnt > app_req.num_ports) + if (pcnt >= app_req.num_ports) break; app_reply->elem[pcnt].rekey_count = -- GitLab From 978e5c19dfefc271e5550efba92fcef0d3f62864 Mon Sep 17 00:00:00 2001 From: Alexey Izbyshev Date: Fri, 5 Apr 2024 15:55:51 +0300 Subject: [PATCH 495/989] io_uring: Fix io_cqring_wait() not restoring sigmask on get_timespec64() failure This bug was introduced in commit 950e79dd7313 ("io_uring: minor io_cqring_wait() optimization"), which was made in preparation for adc8682ec690 ("io_uring: Add support for napi_busy_poll"). The latter got reverted in cb3182167325 ("Revert "io_uring: Add support for napi_busy_poll""), so simply undo the former as well. Cc: stable@vger.kernel.org Fixes: 950e79dd7313 ("io_uring: minor io_cqring_wait() optimization") Signed-off-by: Alexey Izbyshev Link: https://lore.kernel.org/r/20240405125551.237142-1-izbyshev@ispras.ru Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4521c2b66b98d..c170a2b8d2cf2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2602,19 +2602,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (__io_cqring_events_user(ctx) >= min_events) return 0; - if (sig) { -#ifdef CONFIG_COMPAT - if (in_compat_syscall()) - ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, - sigsz); - else -#endif - ret = set_user_sigmask(sig, sigsz); - - if (ret) - return ret; - } - init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); @@ -2633,6 +2620,19 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, io_napi_adjust_timeout(ctx, &iowq, &ts); } + if (sig) { +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, + sigsz); + else +#endif + ret = set_user_sigmask(sig, sigsz); + + if (ret) + return ret; + } + io_napi_busy_loop(ctx, &iowq); trace_io_uring_cqring_wait(ctx, min_events); -- GitLab From beaa51b36012fad5a4d3c18b88a617aea7a9b96d Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 4 Apr 2024 12:32:53 -0400 Subject: [PATCH 496/989] blk-iocost: avoid out of bounds shift UBSAN catches undefined behavior in blk-iocost, where sometimes iocg->delay is shifted right by a number that is too large, resulting in undefined behavior on some architectures. [ 186.556576] ------------[ cut here ]------------ UBSAN: shift-out-of-bounds in block/blk-iocost.c:1366:23 shift exponent 64 is too large for 64-bit type 'u64' (aka 'unsigned long long') CPU: 16 PID: 0 Comm: swapper/16 Tainted: G S E N 6.9.0-0_fbk700_debug_rc2_kbuilder_0_gc85af715cac0 #1 Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A23 12/08/2020 Call Trace: dump_stack_lvl+0x8f/0xe0 __ubsan_handle_shift_out_of_bounds+0x22c/0x280 iocg_kick_delay+0x30b/0x310 ioc_timer_fn+0x2fb/0x1f80 __run_timer_base+0x1b6/0x250 ... Avoid that undefined behavior by simply taking the "delay = 0" branch if the shift is too large. I am not sure what the symptoms of an undefined value delay will be, but I suspect it could be more than a little annoying to debug. Signed-off-by: Rik van Riel Cc: Tejun Heo Cc: Josef Bacik Cc: Jens Axboe Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20240404123253.0f58010f@imladris.surriel.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 9a85bfbbc45a0..baa20c85799d5 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1347,7 +1347,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); - u64 tdelta, delay, new_delay; + u64 tdelta, delay, new_delay, shift; s64 vover, vover_pct; u32 hwa; @@ -1362,8 +1362,9 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) /* calculate the current delay in effect - 1/2 every second */ tdelta = now->now - iocg->delay_at; - if (iocg->delay) - delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC); + shift = div64_u64(tdelta, USEC_PER_SEC); + if (iocg->delay && shift < BITS_PER_LONG) + delay = iocg->delay >> shift; else delay = 0; -- GitLab From 4539f91f2a801c0c028c252bffae56030cfb2cae Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 3 Apr 2024 22:38:01 +0200 Subject: [PATCH 497/989] net: openvswitch: fix unwanted error log on timeout policy probing On startup, ovs-vswitchd probes different datapath features including support for timeout policies. While probing, it tries to execute certain operations with OVS_PACKET_ATTR_PROBE or OVS_FLOW_ATTR_PROBE attributes set. These attributes tell the openvswitch module to not log any errors when they occur as it is expected that some of the probes will fail. For some reason, setting the timeout policy ignores the PROBE attribute and logs a failure anyway. This is causing the following kernel log on each re-start of ovs-vswitchd: kernel: Failed to associated timeout policy `ovs_test_tp' Fix that by using the same logging macro that all other messages are using. The message will still be printed at info level when needed and will be rate limited, but with a net rate limiter instead of generic printk one. The nf_ct_set_timeout() itself will still print some info messages, but at least this change makes logging in openvswitch module more consistent. Fixes: 06bd2bdf19d2 ("openvswitch: Add timeout support to ct action") Signed-off-by: Ilya Maximets Acked-by: Eelco Chaudron Link: https://lore.kernel.org/r/20240403203803.2137962-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- net/openvswitch/conntrack.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 3019a4406ca4f..74b63cdb59923 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1380,8 +1380,9 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (ct_info.timeout[0]) { if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto, ct_info.timeout)) - pr_info_ratelimited("Failed to associated timeout " - "policy `%s'\n", ct_info.timeout); + OVS_NLERR(log, + "Failed to associated timeout policy '%s'", + ct_info.timeout); else ct_info.nf_ct_timeout = rcu_dereference( nf_ct_timeout_find(ct_info.ct)->timeout); -- GitLab From 38a15d0a50e0a43778561a5861403851f0b0194c Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Thu, 4 Apr 2024 09:57:40 +0200 Subject: [PATCH 498/989] u64_stats: fix u64_stats_init() for lockdep when used repeatedly in one file Fix bogus lockdep warnings if multiple u64_stats_sync variables are initialized in the same file. With CONFIG_LOCKDEP, seqcount_init() is a macro which declares: static struct lock_class_key __key; Since u64_stats_init() is a function (albeit an inline one), all calls within the same file end up using the same instance, effectively treating them all as a single lock-class. Fixes: 9464ca650008 ("net: make u64_stats_init() a function") Closes: https://lore.kernel.org/netdev/ea1567d9-ce66-45e6-8168-ac40a47d1821@roeck-us.net/ Signed-off-by: Petr Tesarik Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240404075740.30682-1-petr@tesarici.cz Signed-off-by: Jakub Kicinski --- include/linux/u64_stats_sync.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index ffe48e69b3f3a..457879938fc19 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -135,10 +135,11 @@ static inline void u64_stats_inc(u64_stats_t *p) p->v++; } -static inline void u64_stats_init(struct u64_stats_sync *syncp) -{ - seqcount_init(&syncp->seq); -} +#define u64_stats_init(syncp) \ + do { \ + struct u64_stats_sync *__s = (syncp); \ + seqcount_init(&__s->seq); \ + } while (0) static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { -- GitLab From 237f3cf13b20db183d3706d997eedc3c49eacd44 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Apr 2024 20:27:38 +0000 Subject: [PATCH 499/989] xsk: validate user input for XDP_{UMEM|COMPLETION}_FILL_RING MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzbot reported an illegal copy in xsk_setsockopt() [1] Make sure to validate setsockopt() @optlen parameter. [1] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in xsk_setsockopt+0x909/0xa40 net/xdp/xsk.c:1420 Read of size 4 at addr ffff888028c6cde3 by task syz-executor.0/7549 CPU: 0 PID: 7549 Comm: syz-executor.0 Not tainted 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] copy_from_sockptr include/linux/sockptr.h:55 [inline] xsk_setsockopt+0x909/0xa40 net/xdp/xsk.c:1420 do_sock_setsockopt+0x3af/0x720 net/socket.c:2311 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 RIP: 0033:0x7fb40587de69 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fb40665a0c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00007fb4059abf80 RCX: 00007fb40587de69 RDX: 0000000000000005 RSI: 000000000000011b RDI: 0000000000000006 RBP: 00007fb4058ca47a R08: 0000000000000002 R09: 0000000000000000 R10: 0000000020001980 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007fb4059abf80 R15: 00007fff57ee4d08 Allocated by task 7549: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:370 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:387 kasan_kmalloc include/linux/kasan.h:211 [inline] __do_kmalloc_node mm/slub.c:3966 [inline] __kmalloc+0x233/0x4a0 mm/slub.c:3979 kmalloc include/linux/slab.h:632 [inline] __cgroup_bpf_run_filter_setsockopt+0xd2f/0x1040 kernel/bpf/cgroup.c:1869 do_sock_setsockopt+0x6b4/0x720 net/socket.c:2293 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 The buggy address belongs to the object at ffff888028c6cde0 which belongs to the cache kmalloc-8 of size 8 The buggy address is located 1 bytes to the right of allocated 2-byte region [ffff888028c6cde0, ffff888028c6cde2) The buggy address belongs to the physical page: page:ffffea0000a31b00 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888028c6c9c0 pfn:0x28c6c anon flags: 0xfff00000000800(slab|node=0|zone=1|lastcpupid=0x7ff) page_type: 0xffffffff() raw: 00fff00000000800 ffff888014c41280 0000000000000000 dead000000000001 raw: ffff888028c6c9c0 0000000080800057 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x112cc0(GFP_USER|__GFP_NOWARN|__GFP_NORETRY), pid 6648, tgid 6644 (syz-executor.0), ts 133906047828, free_ts 133859922223 set_page_owner include/linux/page_owner.h:31 [inline] post_alloc_hook+0x1ea/0x210 mm/page_alloc.c:1533 prep_new_page mm/page_alloc.c:1540 [inline] get_page_from_freelist+0x33ea/0x3580 mm/page_alloc.c:3311 __alloc_pages+0x256/0x680 mm/page_alloc.c:4569 __alloc_pages_node include/linux/gfp.h:238 [inline] alloc_pages_node include/linux/gfp.h:261 [inline] alloc_slab_page+0x5f/0x160 mm/slub.c:2175 allocate_slab mm/slub.c:2338 [inline] new_slab+0x84/0x2f0 mm/slub.c:2391 ___slab_alloc+0xc73/0x1260 mm/slub.c:3525 __slab_alloc mm/slub.c:3610 [inline] __slab_alloc_node mm/slub.c:3663 [inline] slab_alloc_node mm/slub.c:3835 [inline] __do_kmalloc_node mm/slub.c:3965 [inline] __kmalloc_node+0x2db/0x4e0 mm/slub.c:3973 kmalloc_node include/linux/slab.h:648 [inline] __vmalloc_area_node mm/vmalloc.c:3197 [inline] __vmalloc_node_range+0x5f9/0x14a0 mm/vmalloc.c:3392 __vmalloc_node mm/vmalloc.c:3457 [inline] vzalloc+0x79/0x90 mm/vmalloc.c:3530 bpf_check+0x260/0x19010 kernel/bpf/verifier.c:21162 bpf_prog_load+0x1667/0x20f0 kernel/bpf/syscall.c:2895 __sys_bpf+0x4ee/0x810 kernel/bpf/syscall.c:5631 __do_sys_bpf kernel/bpf/syscall.c:5738 [inline] __se_sys_bpf kernel/bpf/syscall.c:5736 [inline] __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:5736 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 page last free pid 6650 tgid 6647 stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1140 [inline] free_unref_page_prepare+0x95d/0xa80 mm/page_alloc.c:2346 free_unref_page_list+0x5a3/0x850 mm/page_alloc.c:2532 release_pages+0x2117/0x2400 mm/swap.c:1042 tlb_batch_pages_flush mm/mmu_gather.c:98 [inline] tlb_flush_mmu_free mm/mmu_gather.c:293 [inline] tlb_flush_mmu+0x34d/0x4e0 mm/mmu_gather.c:300 tlb_finish_mmu+0xd4/0x200 mm/mmu_gather.c:392 exit_mmap+0x4b6/0xd40 mm/mmap.c:3300 __mmput+0x115/0x3c0 kernel/fork.c:1345 exit_mm+0x220/0x310 kernel/exit.c:569 do_exit+0x99e/0x27e0 kernel/exit.c:865 do_group_exit+0x207/0x2c0 kernel/exit.c:1027 get_signal+0x176e/0x1850 kernel/signal.c:2907 arch_do_signal_or_restart+0x96/0x860 arch/x86/kernel/signal.c:310 exit_to_user_mode_loop kernel/entry/common.c:105 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:201 [inline] syscall_exit_to_user_mode+0xc9/0x360 kernel/entry/common.c:212 do_syscall_64+0x10a/0x240 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Memory state around the buggy address: ffff888028c6cc80: fa fc fc fc fa fc fc fc fa fc fc fc fa fc fc fc ffff888028c6cd00: fa fc fc fc fa fc fc fc 00 fc fc fc 06 fc fc fc >ffff888028c6cd80: fa fc fc fc fa fc fc fc fa fc fc fc 02 fc fc fc ^ ffff888028c6ce00: fa fc fc fc fa fc fc fc fa fc fc fc fa fc fc fc ffff888028c6ce80: fa fc fc fc fa fc fc fc fa fc fc fc fa fc fc fc Fixes: 423f38329d26 ("xsk: add umem fill queue support and mmap") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: "Björn Töpel" Cc: Magnus Karlsson Cc: Maciej Fijalkowski Cc: Jonathan Lemon Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240404202738.3634547-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3404d076a8a3e..727aa20be4bde 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -1417,6 +1417,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xsk_queue **q; int entries; + if (optlen < sizeof(entries)) + return -EINVAL; if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; -- GitLab From b377c66ae3509ccea596512d6afb4777711c4870 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 5 Apr 2024 16:46:37 +0200 Subject: [PATCH 500/989] x86/retpoline: Add NOENDBR annotation to the SRSO dummy return thunk srso_alias_untrain_ret() is special code, even if it is a dummy which is called in the !SRSO case, so annotate it like its real counterpart, to address the following objtool splat: vmlinux.o: warning: objtool: .export_symbol+0x2b290: data relocation to !ENDBR: srso_alias_untrain_ret+0x0 Fixes: 4535e1a4174c ("x86/bugs: Fix the SRSO mitigation on Zen3/4") Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240405144637.17908-1-bp@kernel.org --- arch/x86/lib/retpoline.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 0795b3464058b..e674ccf720b9f 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -229,6 +229,7 @@ SYM_CODE_END(srso_return_thunk) /* Dummy for the alternative in CALL_UNTRAIN_RET. */ SYM_CODE_START(srso_alias_untrain_ret) ANNOTATE_UNRET_SAFE + ANNOTATE_NOENDBR ret int3 SYM_FUNC_END(srso_alias_untrain_ret) -- GitLab From 374b3d38feff4a1cb4ecadace9bd915ffd91fe4b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Apr 2024 22:23:29 -0400 Subject: [PATCH 501/989] bcachefs: Fix BCH_IOCTL_FSCK_OFFLINE for encrypted filesystems To open an encrypted filesystem, we use request_key() to get the encryption key from the user's keyring - but request_key() needs to happen in the context of the process that invoked the ioctl. This easily fixed by using bch2_fs_open() in nostart mode. Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 94 +++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index cbfa6459bdbce..1e033df330b0d 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -134,42 +134,38 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg struct fsck_thread { struct thread_with_stdio thr; struct bch_fs *c; - char **devs; - size_t nr_devs; struct bch_opts opts; }; static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) { struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); - if (thr->devs) - for (size_t i = 0; i < thr->nr_devs; i++) - kfree(thr->devs[i]); - kfree(thr->devs); kfree(thr); } static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) { struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); - - if (IS_ERR(c)) - return PTR_ERR(c); + struct bch_fs *c = thr->c; - int ret = 0; - if (test_bit(BCH_FS_errors_fixed, &c->flags)) - ret |= 1; - if (test_bit(BCH_FS_error, &c->flags)) - ret |= 4; + int ret = PTR_ERR_OR_ZERO(c); + if (ret) + return ret; - bch2_fs_stop(c); + ret = bch2_fs_start(thr->c); + if (ret) + goto err; - if (ret & 1) + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); - if (ret & 4) + ret |= 1; + } + if (test_bit(BCH_FS_error, &c->flags)) { bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); - + ret |= 4; + } +err: + bch2_fs_stop(c); return ret; } @@ -182,7 +178,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a { struct bch_ioctl_fsck_offline arg; struct fsck_thread *thr = NULL; - u64 *devs = NULL; + darray_str(devs) = {}; long ret = 0; if (copy_from_user(&arg, user_arg, sizeof(arg))) @@ -194,29 +190,32 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) || - !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) || - !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) { - ret = -ENOMEM; - goto err; - } + for (size_t i = 0; i < arg.nr_devs; i++) { + u64 dev_u64; + ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); + if (ret) + goto err; - thr->opts = bch2_opts_empty(); - thr->nr_devs = arg.nr_devs; + char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); + ret = PTR_ERR_OR_ZERO(dev_str); + if (ret) + goto err; - if (copy_from_user(devs, &user_arg->devs[0], - array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) { - ret = -EINVAL; - goto err; + ret = darray_push(&devs, dev_str); + if (ret) { + kfree(dev_str); + goto err; + } } - for (size_t i = 0; i < arg.nr_devs; i++) { - thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX); - ret = PTR_ERR_OR_ZERO(thr->devs[i]); - if (ret) - goto err; + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; } + thr->opts = bch2_opts_empty(); + if (arg.opts) { char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); @@ -230,15 +229,22 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); + /* We need request_key() to be called before we punt to kthread: */ + opt_set(thr->opts, nostart, true); + + thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops); -err: - if (ret < 0) { - if (thr) - bch2_fsck_thread_exit(&thr->thr); - pr_err("ret %s", bch2_err_str(ret)); - } - kfree(devs); +out: + darray_for_each(devs, i) + kfree(*i); + darray_exit(&devs); return ret; +err: + if (thr) + bch2_fsck_thread_exit(&thr->thr); + pr_err("ret %s", bch2_err_str(ret)); + goto out; } static long bch2_global_ioctl(unsigned cmd, void __user *arg) -- GitLab From 05801b6526156aefe55c0440fab877109c9a89c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Apr 2024 22:30:30 -0400 Subject: [PATCH 502/989] bcachefs: Disable errors=panic for BCH_IOCTL_FSCK_OFFLINE BCH_IOCTL_FSCK_OFFLINE allows the userspace fsck tool to use the kernel implementation of fsck - primarily when the kernel version is a better version match. It should look and act exactly like the normal userspace fsck that the user expected to be invoking, so errors should never result in a kernel panic. We may want to consider further restricting errors=panic - it's only intended for debugging in controlled test environments, it should have no purpose it normal usage. Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 1e033df330b0d..72781aad6ba70 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -234,6 +234,10 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + if (!IS_ERR(thr->c) && + thr->c->opts.errors == BCH_ON_ERROR_panic) + thr->c->opts.errors = BCH_ON_ERROR_ro; + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops); out: darray_for_each(devs, i) -- GitLab From 6088234ce83acec4aaf56ecc0e9525bac18b4295 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Apr 2024 23:27:27 -0400 Subject: [PATCH 503/989] bcachefs: JOURNAL_SPACE_LOW "bcachefs; Fix deadlock in bch2_btree_update_start()" was a significant performance regression (nearly 50%) on multithreaded random writes with fio. The reason is that the journal watermark checks multiple things, including the state of the btree write buffer, and on multithreaded update heavy workloads we're bottleneked on write buffer flushing - we don't want kicknig off btree updates to depend on the state of the write buffer. This isn't strictly correct; the interior btree update path does do write buffer updates, but it's a tiny fraction of total accounting updates and we're more concerned with space in the journal itself. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/btree_update_interior.c | 18 +++++++----------- fs/bcachefs/journal_reclaim.c | 2 ++ fs/bcachefs/journal_types.h | 1 + fs/bcachefs/util.h | 8 ++++++++ 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 581edcb0911bf..4ae7890f07c6e 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -659,7 +659,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, commit_flags |= BCH_WATERMARK_reclaim; if (ck->journal.seq != journal_last_seq(j) || - j->watermark == BCH_WATERMARK_stripe) + !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; ret = bch2_btree_iter_traverse(&b_iter) ?: diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 24fbd5be70a20..a4a63e363047d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1125,18 +1125,14 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (watermark < c->journal.watermark) { - struct journal_res res = { 0 }; - unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK; + if (watermark < BCH_WATERMARK_reclaim && + test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) { + if (flags & BCH_TRANS_COMMIT_journal_reclaim) + return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock); - if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < BCH_WATERMARK_reclaim) - journal_flags |= JOURNAL_RES_GET_NONBLOCK; - - ret = drop_locks_do(trans, - bch2_journal_res_get(&c->journal, &res, 1, journal_flags)); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - ret = -BCH_ERR_journal_reclaim_would_deadlock; + bch2_trans_unlock(trans); + wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)); + ret = bch2_trans_relock(trans); if (ret) return ERR_PTR(ret); } diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ab811c0dad26a..04a577848b015 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -67,6 +67,8 @@ void bch2_journal_set_watermark(struct journal *j) track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) trace_and_count(c, journal_full, c); + mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin); + swap(watermark, j->watermark); if (watermark > j->watermark) journal_wake(j); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 8c053cb64ca5e..b5161b5d76a00 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -134,6 +134,7 @@ enum journal_flags { JOURNAL_STARTED, JOURNAL_MAY_SKIP_FLUSH, JOURNAL_NEED_FLUSH_WRITE, + JOURNAL_SPACE_LOW, }; /* Reasons we may fail to get a journal reservation: */ diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index de639e8a3ab53..5cf885b09986a 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -788,6 +788,14 @@ static inline int copy_from_user_errcode(void *to, const void __user *from, unsi #endif +static inline void mod_bit(long nr, volatile unsigned long *addr, bool v) +{ + if (v) + set_bit(nr, addr); + else + clear_bit(nr, addr); +} + static inline void __set_bit_le64(size_t bit, __le64 *addr) { addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); -- GitLab From aa98e70fc6c97f39a7bd68cb1e641ca50d4f9423 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 5 Apr 2024 14:23:18 +0700 Subject: [PATCH 504/989] Documentation: filesystems: Add bcachefs toctree Commit eb386617be4bdf ("bcachefs: Errcode tracepoint, documentation") adds initial bcachefs documentation (private error codes) but without any table of contents tree for the filesystem docs, hence Sphinx warns: Documentation/filesystems/bcachefs/errorcodes.rst: WARNING: document isn't included in any toctree Add bcachefs toctree to fix above warning. Fixes: eb386617be4b ("bcachefs: Errcode tracepoint, documentation") Signed-off-by: Bagas Sanjaya Signed-off-by: Kent Overstreet --- Documentation/filesystems/bcachefs/index.rst | 11 +++++++++++ Documentation/filesystems/index.rst | 1 + 2 files changed, 12 insertions(+) create mode 100644 Documentation/filesystems/bcachefs/index.rst diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst new file mode 100644 index 0000000000000..e2bd61ccd96ff --- /dev/null +++ b/Documentation/filesystems/bcachefs/index.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +bcachefs Documentation +====================== + +.. toctree:: + :maxdepth: 2 + :numbered: + + errorcodes diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 0ea1e44fa0282..1f9b4c905a6a7 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -69,6 +69,7 @@ Documentation for filesystem implementations. afs autofs autofs-mount-control + bcachefs/index befs bfs btrfs -- GitLab From 7d83cf53c77c8fdc0a8033e1472ac62745fac2ac Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 5 Apr 2024 14:23:19 +0700 Subject: [PATCH 505/989] MAINTAINERS: Add entry for bcachefs documentation Now that bcachefs docs exist in Documentation/filesystems/bcachefs/, cover it in MAINTAINERS entry for the filesystem. Signed-off-by: Bagas Sanjaya Signed-off-by: Kent Overstreet --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7c121493f43d0..c0091a206fd20 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3573,6 +3573,7 @@ S: Supported C: irc://irc.oftc.net/bcache T: git https://evilpiepirate.org/git/bcachefs.git F: fs/bcachefs/ +F: Documentation/filesystems/bcachefs/ BDISP ST MEDIA DRIVER M: Fabien Dessenne -- GitLab From 2d793e9315e335ef299024fe8bf7742c9c974b33 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 6 Apr 2024 16:19:20 +0200 Subject: [PATCH 506/989] bcachefs: Rename struct field swap to prevent macro naming collision The struct field swap can collide with the swap() macro defined in linux/minmax.h. Rename the struct field to prevent such collisions. Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 4ce5e957a6e91..0f955c3c76a7b 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -115,7 +115,7 @@ static void swap_bytes(void *a, void *b, size_t n) struct wrapper { cmp_func_t cmp; - swap_func_t swap; + swap_func_t swap_func; }; /* @@ -125,7 +125,7 @@ struct wrapper { static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) { if (swap_func == SWAP_WRAPPER) { - ((const struct wrapper *)priv)->swap(a, b, (int)size); + ((const struct wrapper *)priv)->swap_func(a, b, (int)size); return; } @@ -174,7 +174,7 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, int i, c, r; /* called from 'sort' without swap function, let's pick the default */ - if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) swap_func = NULL; if (!swap_func) { @@ -227,7 +227,7 @@ void eytzinger0_sort(void *base, size_t n, size_t size, { struct wrapper w = { .cmp = cmp_func, - .swap = swap_func, + .swap_func = swap_func, }; return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); -- GitLab From 30e615a2ce6601d85729caefd8ac15634f848e59 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Apr 2024 21:45:46 -0400 Subject: [PATCH 507/989] bcachefs: Fix gap buffer bug in bch2_journal_key_insert_take() Multiple bug fixes for journal iters: - When the journal keys gap buffer is resized, we have to adjust the iterators for moving the gap to the end - We don't want to rewind iterators to point to the key we just inserted if it's not for the correct btree/level Also, add some new assertions. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 55 ++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 5cbcbfe85235b..a6413a8747d3c 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -130,12 +130,30 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); } +static void journal_iter_verify(struct journal_iter *iter) +{ + struct journal_keys *keys = iter->keys; + size_t gap_size = keys->size - keys->nr; + + BUG_ON(iter->idx >= keys->gap && + iter->idx < keys->gap + gap_size); + + if (iter->idx < keys->size) { + struct journal_key *k = keys->data + iter->idx; + + int cmp = cmp_int(k->btree_id, iter->btree_id) ?: + cmp_int(k->level, iter->level); + BUG_ON(cmp < 0); + } +} + static void journal_iters_fix(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; /* The key we just inserted is immediately before the gap: */ size_t gap_end = keys->gap + (keys->size - keys->nr); - struct btree_and_journal_iter *iter; + struct journal_key *new_key = &keys->data[keys->gap - 1]; + struct journal_iter *iter; /* * If an iterator points one after the key we just inserted, decrement @@ -143,9 +161,14 @@ static void journal_iters_fix(struct bch_fs *c) * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will * handle that: */ - list_for_each_entry(iter, &c->journal_iters, journal.list) - if (iter->journal.idx == gap_end) - iter->journal.idx = keys->gap - 1; + list_for_each_entry(iter, &c->journal_iters, list) { + journal_iter_verify(iter); + if (iter->idx == gap_end && + new_key->btree_id == iter->btree_id && + new_key->level == iter->level) + iter->idx = keys->gap - 1; + journal_iter_verify(iter); + } } static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) @@ -192,7 +215,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, if (idx > keys->gap) idx -= keys->size - keys->nr; + size_t old_gap = keys->gap; + if (keys->nr == keys->size) { + journal_iters_move_gap(c, old_gap, keys->size); + old_gap = keys->size; + struct journal_keys new_keys = { .nr = keys->nr, .size = max_t(size_t, keys->size, 8) * 2, @@ -216,7 +244,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, keys->gap = keys->nr; } - journal_iters_move_gap(c, keys->gap, idx); + journal_iters_move_gap(c, old_gap, idx); move_gap(keys, idx); @@ -301,16 +329,21 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { - struct journal_key *k = iter->keys->data + iter->idx; + journal_iter_verify(iter); + + while (iter->idx < iter->keys->size) { + struct journal_key *k = iter->keys->data + iter->idx; + + int cmp = cmp_int(k->btree_id, iter->btree_id) ?: + cmp_int(k->level, iter->level); + if (cmp > 0) + break; + BUG_ON(cmp); - while (k < iter->keys->data + iter->keys->size && - k->btree_id == iter->btree_id && - k->level == iter->level) { if (!k->overwritten) return bkey_i_to_s_c(k->k); bch2_journal_iter_advance(iter); - k = iter->keys->data + iter->idx; } return bkey_s_c_null; @@ -330,6 +363,8 @@ static void bch2_journal_iter_init(struct bch_fs *c, iter->level = level; iter->keys = &c->journal_keys; iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); + + journal_iter_verify(iter); } static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) -- GitLab From 84471d01c92c33b3f4cedfe319639ecf7f8fc4c5 Mon Sep 17 00:00:00 2001 From: Vitaly Rodionov Date: Fri, 5 Apr 2024 22:06:35 +0100 Subject: [PATCH 508/989] ALSA: hda/realtek: Add quirk for HP SnowWhite laptops Add support for HP SnowWhite laptops with CS35L51 amplifiers on I2C bus connected to Realtek codec. Signed-off-by: Vitaly Rodionov Message-ID: <20240405210635.22193-1-vitalyr@opensource.cirrus.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index cdcb28aa9d7bf..d6940bc4ec393 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10084,6 +10084,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8ca7, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8cdd, "HP Spectre", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8cde, "HP Spectre", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x103c, 0x8cdf, "HP SnowWhite", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8ce0, "HP SnowWhite", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8cf5, "HP ZBook Studio 16", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), -- GitLab From 0b6f0ff01a4a8c1b66c600263465976d57dcc1a3 Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Sat, 6 Apr 2024 21:20:09 +0800 Subject: [PATCH 509/989] ALSA: hda/tas2781: correct the register for pow calibrated data Calibrated data was written into an incorrect register, which cause speaker protection sometimes malfuctions Fixes: 5be27f1e3ec9 ("ALSA: hda/tas2781: Add tas2781 HDA driver") Signed-off-by: Shenghao Ding Cc: Message-ID: <20240406132010.341-1-shenghao-ding@ti.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_hda_i2c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/tas2781_hda_i2c.c b/sound/pci/hda/tas2781_hda_i2c.c index 48dae33393050..75f7674c66ee7 100644 --- a/sound/pci/hda/tas2781_hda_i2c.c +++ b/sound/pci/hda/tas2781_hda_i2c.c @@ -514,10 +514,10 @@ static int tas2563_save_calibration(struct tasdevice_priv *tas_priv) static void tas2781_apply_calib(struct tasdevice_priv *tas_priv) { static const unsigned char page_array[CALIB_MAX] = { - 0x17, 0x18, 0x18, 0x0d, 0x18 + 0x17, 0x18, 0x18, 0x13, 0x18, }; static const unsigned char rgno_array[CALIB_MAX] = { - 0x74, 0x0c, 0x14, 0x3c, 0x7c + 0x74, 0x0c, 0x14, 0x70, 0x7c, }; unsigned char *data; int i, j, rc; -- GitLab From 059a49aa2e25c58f90b50151f109dd3c4cdb3a47 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 3 Apr 2024 08:43:12 -0700 Subject: [PATCH 510/989] virtio_net: Do not send RSS key if it is not supported There is a bug when setting the RSS options in virtio_net that can break the whole machine, getting the kernel into an infinite loop. Running the following command in any QEMU virtual machine with virtionet will reproduce this problem: # ethtool -X eth0 hfunc toeplitz This is how the problem happens: 1) ethtool_set_rxfh() calls virtnet_set_rxfh() 2) virtnet_set_rxfh() calls virtnet_commit_rss_command() 3) virtnet_commit_rss_command() populates 4 entries for the rss scatter-gather 4) Since the command above does not have a key, then the last scatter-gatter entry will be zeroed, since rss_key_size == 0. sg_buf_size = vi->rss_key_size; 5) This buffer is passed to qemu, but qemu is not happy with a buffer with zero length, and do the following in virtqueue_map_desc() (QEMU function): if (!sz) { virtio_error(vdev, "virtio: zero sized buffers are not allowed"); 6) virtio_error() (also QEMU function) set the device as broken vdev->broken = true; 7) Qemu bails out, and do not repond this crazy kernel. 8) The kernel is waiting for the response to come back (function virtnet_send_command()) 9) The kernel is waiting doing the following : while (!virtqueue_get_buf(vi->cvq, &tmp) && !virtqueue_is_broken(vi->cvq)) cpu_relax(); 10) None of the following functions above is true, thus, the kernel loops here forever. Keeping in mind that virtqueue_is_broken() does not look at the qemu `vdev->broken`, so, it never realizes that the vitio is broken at QEMU side. Fix it by not sending RSS commands if the feature is not available in the device. Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.") Cc: stable@vger.kernel.org Cc: qemu-devel@nongnu.org Signed-off-by: Breno Leitao Reviewed-by: Heng Qi Reviewed-by: Xuan Zhuo Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index c22d1118a1333..115c3c5414f2a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3807,6 +3807,7 @@ static int virtnet_set_rxfh(struct net_device *dev, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); + bool update = false; int i; if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE && @@ -3814,13 +3815,28 @@ static int virtnet_set_rxfh(struct net_device *dev, return -EOPNOTSUPP; if (rxfh->indir) { + if (!vi->has_rss) + return -EOPNOTSUPP; + for (i = 0; i < vi->rss_indir_table_size; ++i) vi->ctrl->rss.indirection_table[i] = rxfh->indir[i]; + update = true; } - if (rxfh->key) + + if (rxfh->key) { + /* If either _F_HASH_REPORT or _F_RSS are negotiated, the + * device provides hash calculation capabilities, that is, + * hash_key is configured. + */ + if (!vi->has_rss && !vi->has_rss_hash_report) + return -EOPNOTSUPP; + memcpy(vi->ctrl->rss.key, rxfh->key, vi->rss_key_size); + update = true; + } - virtnet_commit_rss_command(vi); + if (update) + virtnet_commit_rss_command(vi); return 0; } @@ -4729,13 +4745,15 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT)) vi->has_rss_hash_report = true; - if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS)) { vi->has_rss = true; - if (vi->has_rss || vi->has_rss_hash_report) { vi->rss_indir_table_size = virtio_cread16(vdev, offsetof(struct virtio_net_config, rss_max_indirection_table_length)); + } + + if (vi->has_rss || vi->has_rss_hash_report) { vi->rss_key_size = virtio_cread8(vdev, offsetof(struct virtio_net_config, rss_max_key_size)); -- GitLab From bccb798e07f8bb8b91212fe8ed1e421685449076 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Thu, 4 Apr 2024 16:54:27 +0530 Subject: [PATCH 511/989] octeontx2-pf: Fix transmit scheduler resource leak Inorder to support shaping and scheduling, Upon class creation Netdev driver allocates trasmit schedulers. The previous patch which added support for Round robin scheduling has a bug due to which driver is not freeing transmit schedulers post class deletion. This patch fixes the same. Fixes: 47a9656f168a ("octeontx2-pf: htb offload support for Round Robin scheduling") Signed-off-by: Hariprasad Kelam Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/qos.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c index 1e77bbf5d22a1..1723e9912ae07 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c @@ -382,6 +382,7 @@ static void otx2_qos_read_txschq_cfg_tl(struct otx2_qos_node *parent, otx2_qos_read_txschq_cfg_tl(node, cfg); cnt = cfg->static_node_pos[node->level]; cfg->schq_contig_list[node->level][cnt] = node->schq; + cfg->schq_index_used[node->level][cnt] = true; cfg->schq_contig[node->level]++; cfg->static_node_pos[node->level]++; otx2_qos_read_txschq_cfg_schq(node, cfg); -- GitLab From 09e913f5826936c0f6632d6d0d35a36295fac7cc Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 26 Mar 2024 12:04:56 +0800 Subject: [PATCH 512/989] bcachefs: fix the count of nr_freed_pcpu after changing bc->freed_nonpcpu list When allocating bkey_cached from bc->freed_pcpu list, it missed decreasing the count of nr_freed_pcpu which would cause the mismatch between the value of nr_freed_pcpu and the list items. This problem also exists in moving new bkey_cached to bc->freed_pcpu list. If these happened, the bug info may appear in bch2_fs_btree_key_cache_exit by the follow code: BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); Fixes: c65c13f0eac6 ("bcachefs: Run btree key cache shrinker less aggressively") Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 4ae7890f07c6e..88a3582a32757 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -169,6 +169,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, } else { mutex_lock(&bc->lock); list_move_tail(&ck->list, &bc->freed_pcpu); + bc->nr_freed_pcpu++; mutex_unlock(&bc->lock); } } @@ -245,6 +246,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (!list_empty(&bc->freed_pcpu)) { ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_pcpu--; } mutex_unlock(&bc->lock); } -- GitLab From fec50db7033ea478773b159e0e2efb135270e3b7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 7 Apr 2024 13:22:46 -0700 Subject: [PATCH 513/989] Linux 6.9-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4bef6323c47de..e1bf12891cb0e 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 9 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- GitLab From b897b148ee30c7fca995e6d15cf791f52993920b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 7 Apr 2024 16:20:17 -0400 Subject: [PATCH 514/989] bcachefs: fix bch2_get_acl() transaction restart handling bch2_acl_from_disk() uses allocate_dropping_locks, and can thus return a transaction restart - this wasn't handled. Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 3640f417cce11..5c180fdc3efbd 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -281,7 +281,6 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; - struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; struct bkey_s_c k; int ret; @@ -290,28 +289,27 @@ retry: ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, inode_inum(inode), &search, 0); - if (ret) { - if (!bch2_err_matches(ret, ENOENT)) - acl = ERR_PTR(ret); - goto out; - } + if (ret) + goto err; k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); - if (ret) { - acl = ERR_PTR(ret); - goto out; - } + if (ret) + goto err; - xattr = bkey_s_c_to_xattr(k); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); + le16_to_cpu(xattr.v->x_val_len)); + ret = PTR_ERR_OR_ZERO(acl); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; - if (!IS_ERR(acl)) + if (ret) + acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL; + + if (!IS_ERR_OR_NULL(acl)) set_cached_acl(&inode->v, type, acl); -out: - if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart)) - goto retry; bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); -- GitLab From 8b8ace080319a866f5dfe9da8e665ae51d971c54 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 7 Apr 2024 20:59:10 +0800 Subject: [PATCH 515/989] block: fix q->blkg_list corruption during disk rebind Multiple gendisk instances can allocated/added for single request queue in case of disk rebind. blkg may still stay in q->blkg_list when calling blkcg_init_disk() for rebind, then q->blkg_list becomes corrupted. Fix the list corruption issue by: - add blkg_init_queue() to initialize q->blkg_list & q->blkcg_mutex only - move calling blkg_init_queue() into blk_alloc_queue() The list corruption should be started since commit f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()") which delays removing blkg from q->blkg_list into blkg_free_workfn(). Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()") Fixes: 1059699f87eb ("block: move blkcg initialization/destroy into disk allocation/release handler") Cc: Yu Kuai Cc: Tejun Heo Signed-off-by: Ming Lei Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240407125910.4053377-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 ++++++--- block/blk-cgroup.h | 2 ++ block/blk-core.c | 2 ++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bdbb557feb5a0..059467086b131 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1409,6 +1409,12 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) return 0; } +void blkg_init_queue(struct request_queue *q) +{ + INIT_LIST_HEAD(&q->blkg_list); + mutex_init(&q->blkcg_mutex); +} + int blkcg_init_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; @@ -1416,9 +1422,6 @@ int blkcg_init_disk(struct gendisk *disk) bool preloaded; int ret; - INIT_LIST_HEAD(&q->blkg_list); - mutex_init(&q->blkcg_mutex); - new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 78b74106bf10c..90b3959d88cfa 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -189,6 +189,7 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; +void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); @@ -482,6 +483,7 @@ struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } diff --git a/block/blk-core.c b/block/blk-core.c index a16b5abdbbf56..3a6f5603fb44b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -442,6 +442,8 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) init_waitqueue_head(&q->mq_freeze_wq); mutex_init(&q->mq_freeze_lock); + blkg_init_queue(q); + /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. -- GitLab From b561ea56a26415bf44ce8ca6a8e625c7c390f1ea Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 7 Apr 2024 21:19:31 +0800 Subject: [PATCH 516/989] block: allow device to have both virt_boundary_mask and max segment size When one stacking device is over one device with virt_boundary_mask and another one with max segment size, the stacking device have both limits set. This way is allowed before d690cb8ae14b ("block: add an API to atomically update queue limits"). Relax the limit so that we won't break such kind of stacking setting. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218687 Reported-by: janpieter.sollie@edpnet.be Fixes: d690cb8ae14b ("block: add an API to atomically update queue limits") Link: https://lore.kernel.org/linux-block/ZfGl8HzUpiOxCLm3@fedora/ Cc: Christoph Hellwig Cc: Mike Snitzer Cc: dm-devel@lists.linux.dev Cc: Song Liu Cc: linux-raid@vger.kernel.org Signed-off-by: Ming Lei Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20240407131931.4055231-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index cdbaef159c4bc..d2731843f2fcc 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -182,17 +182,13 @@ static int blk_validate_limits(struct queue_limits *lim) return -EINVAL; /* - * Devices that require a virtual boundary do not support scatter/gather - * I/O natively, but instead require a descriptor list entry for each - * page (which might not be identical to the Linux PAGE_SIZE). Because - * of that they are not limited by our notion of "segment size". + * Stacking device may have both virtual boundary and max segment + * size limit, so allow this setting now, and long-term the two + * might need to move out of stacking limits since we have immutable + * bvec and lower layer bio splitting is supposed to handle the two + * correctly. */ - if (lim->virt_boundary_mask) { - if (WARN_ON_ONCE(lim->max_segment_size && - lim->max_segment_size != UINT_MAX)) - return -EINVAL; - lim->max_segment_size = UINT_MAX; - } else { + if (!lim->virt_boundary_mask) { /* * The maximum segment size has an odd historic 64k default that * drivers probably should override. Just like the I/O size we -- GitLab From 8358a76cfb47c9a5af627a0c4e7168aa14fa25f6 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2024 11:41:55 -0700 Subject: [PATCH 517/989] clk: Remove prepare_lock hold assertion in __clk_release() Removing this assertion lets us move the kref_put() call outside the prepare_lock section. We don't need to hold the prepare_lock here to free memory and destroy the clk_core structure. We've already unlinked the clk from the clk tree and by the time the release function runs nothing holds a reference to the clk_core anymore so anything with the pointer can't access the memory that's being freed anyway. Way back in commit 496eadf821c2 ("clk: Use lockdep asserts to find missing hold of prepare_lock") we didn't need to have this assertion either. Fixes: 496eadf821c2 ("clk: Use lockdep asserts to find missing hold of prepare_lock") Cc: Krzysztof Kozlowski Reviewed-by: Douglas Anderson Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240325184204.745706-2-sboyd@kernel.org --- drivers/clk/clk.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 25371c91a58fe..fed20641822cd 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -4375,8 +4375,6 @@ static void __clk_release(struct kref *ref) { struct clk_core *core = container_of(ref, struct clk_core, ref); - lockdep_assert_held(&prepare_lock); - clk_core_free_parent_map(core); kfree_const(core->name); kfree(core); -- GitLab From 6f63af7511e7058f3fa4ad5b8102210741c9f947 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2024 11:41:56 -0700 Subject: [PATCH 518/989] clk: Don't hold prepare_lock when calling kref_put() We don't need to hold the prepare_lock when dropping a ref on a struct clk_core. The release function is only freeing memory and any code with a pointer reference has already unlinked anything pointing to the clk_core. This reduces the holding area of the prepare_lock a bit. Note that we also don't call free_clk() with the prepare_lock held. There isn't any reason to do that. Reviewed-by: Douglas Anderson Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240325184204.745706-3-sboyd@kernel.org --- drivers/clk/clk.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index fed20641822cd..e950d0cf6b2fd 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -4470,7 +4470,8 @@ void clk_unregister(struct clk *clk) if (ops == &clk_nodrv_ops) { pr_err("%s: unregistered clock: %s\n", __func__, clk->core->name); - goto unlock; + clk_prepare_unlock(); + return; } /* * Assign empty clock ops for consumers that might still hold @@ -4504,11 +4505,10 @@ void clk_unregister(struct clk *clk) if (clk->core->protect_count) pr_warn("%s: unregistering protected clock: %s\n", __func__, clk->core->name); + clk_prepare_unlock(); kref_put(&clk->core->ref, __clk_release); free_clk(clk); -unlock: - clk_prepare_unlock(); } EXPORT_SYMBOL_GPL(clk_unregister); @@ -4667,13 +4667,11 @@ void __clk_put(struct clk *clk) if (clk->min_rate > 0 || clk->max_rate < ULONG_MAX) clk_set_rate_range_nolock(clk, 0, ULONG_MAX); - owner = clk->core->owner; - kref_put(&clk->core->ref, __clk_release); - clk_prepare_unlock(); + owner = clk->core->owner; + kref_put(&clk->core->ref, __clk_release); module_put(owner); - free_clk(clk); } -- GitLab From 9d05ae531c2cff20d5d527f04e28d28e04379929 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2024 11:41:57 -0700 Subject: [PATCH 519/989] clk: Initialize struct clk_core kref earlier Initialize this kref once we allocate memory for the struct clk_core so that we can reuse the release function to free any memory associated with the structure. This mostly consolidates code, but also clarifies that the kref lifetime exists once the container structure (struct clk_core) is allocated instead of leaving it in a half-baked state for most of __clk_core_init(). Reviewed-by: Douglas Anderson Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240325184204.745706-4-sboyd@kernel.org --- drivers/clk/clk.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index e950d0cf6b2fd..8b35ca32f45eb 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -3981,8 +3981,6 @@ static int __clk_core_init(struct clk_core *core) } clk_core_reparent_orphans_nolock(); - - kref_init(&core->ref); out: clk_pm_runtime_put(core); unlock: @@ -4211,6 +4209,16 @@ static void clk_core_free_parent_map(struct clk_core *core) kfree(core->parents); } +/* Free memory allocated for a struct clk_core */ +static void __clk_release(struct kref *ref) +{ + struct clk_core *core = container_of(ref, struct clk_core, ref); + + clk_core_free_parent_map(core); + kfree_const(core->name); + kfree(core); +} + static struct clk * __clk_register(struct device *dev, struct device_node *np, struct clk_hw *hw) { @@ -4231,6 +4239,8 @@ __clk_register(struct device *dev, struct device_node *np, struct clk_hw *hw) goto fail_out; } + kref_init(&core->ref); + core->name = kstrdup_const(init->name, GFP_KERNEL); if (!core->name) { ret = -ENOMEM; @@ -4285,12 +4295,10 @@ __clk_register(struct device *dev, struct device_node *np, struct clk_hw *hw) hw->clk = NULL; fail_create_clk: - clk_core_free_parent_map(core); fail_parents: fail_ops: - kfree_const(core->name); fail_name: - kfree(core); + kref_put(&core->ref, __clk_release); fail_out: return ERR_PTR(ret); } @@ -4370,16 +4378,6 @@ int of_clk_hw_register(struct device_node *node, struct clk_hw *hw) } EXPORT_SYMBOL_GPL(of_clk_hw_register); -/* Free memory allocated for a clock. */ -static void __clk_release(struct kref *ref) -{ - struct clk_core *core = container_of(ref, struct clk_core, ref); - - clk_core_free_parent_map(core); - kfree_const(core->name); - kfree(core); -} - /* * Empty clk_ops for unregistered clocks. These are used temporarily * after clk_unregister() was called on a clock and until last clock -- GitLab From e581cf5d216289ef292d1a4036d53ce90e122469 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2024 11:41:58 -0700 Subject: [PATCH 520/989] clk: Get runtime PM before walking tree during disable_unused Doug reported [1] the following hung task: INFO: task swapper/0:1 blocked for more than 122 seconds. Not tainted 5.15.149-21875-gf795ebc40eb8 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:swapper/0 state:D stack: 0 pid: 1 ppid: 0 flags:0x00000008 Call trace: __switch_to+0xf4/0x1f4 __schedule+0x418/0xb80 schedule+0x5c/0x10c rpm_resume+0xe0/0x52c rpm_resume+0x178/0x52c __pm_runtime_resume+0x58/0x98 clk_pm_runtime_get+0x30/0xb0 clk_disable_unused_subtree+0x58/0x208 clk_disable_unused_subtree+0x38/0x208 clk_disable_unused_subtree+0x38/0x208 clk_disable_unused_subtree+0x38/0x208 clk_disable_unused_subtree+0x38/0x208 clk_disable_unused+0x4c/0xe4 do_one_initcall+0xcc/0x2d8 do_initcall_level+0xa4/0x148 do_initcalls+0x5c/0x9c do_basic_setup+0x24/0x30 kernel_init_freeable+0xec/0x164 kernel_init+0x28/0x120 ret_from_fork+0x10/0x20 INFO: task kworker/u16:0:9 blocked for more than 122 seconds. Not tainted 5.15.149-21875-gf795ebc40eb8 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:kworker/u16:0 state:D stack: 0 pid: 9 ppid: 2 flags:0x00000008 Workqueue: events_unbound deferred_probe_work_func Call trace: __switch_to+0xf4/0x1f4 __schedule+0x418/0xb80 schedule+0x5c/0x10c schedule_preempt_disabled+0x2c/0x48 __mutex_lock+0x238/0x488 __mutex_lock_slowpath+0x1c/0x28 mutex_lock+0x50/0x74 clk_prepare_lock+0x7c/0x9c clk_core_prepare_lock+0x20/0x44 clk_prepare+0x24/0x30 clk_bulk_prepare+0x40/0xb0 mdss_runtime_resume+0x54/0x1c8 pm_generic_runtime_resume+0x30/0x44 __genpd_runtime_resume+0x68/0x7c genpd_runtime_resume+0x108/0x1f4 __rpm_callback+0x84/0x144 rpm_callback+0x30/0x88 rpm_resume+0x1f4/0x52c rpm_resume+0x178/0x52c __pm_runtime_resume+0x58/0x98 __device_attach+0xe0/0x170 device_initial_probe+0x1c/0x28 bus_probe_device+0x3c/0x9c device_add+0x644/0x814 mipi_dsi_device_register_full+0xe4/0x170 devm_mipi_dsi_device_register_full+0x28/0x70 ti_sn_bridge_probe+0x1dc/0x2c0 auxiliary_bus_probe+0x4c/0x94 really_probe+0xcc/0x2c8 __driver_probe_device+0xa8/0x130 driver_probe_device+0x48/0x110 __device_attach_driver+0xa4/0xcc bus_for_each_drv+0x8c/0xd8 __device_attach+0xf8/0x170 device_initial_probe+0x1c/0x28 bus_probe_device+0x3c/0x9c deferred_probe_work_func+0x9c/0xd8 process_one_work+0x148/0x518 worker_thread+0x138/0x350 kthread+0x138/0x1e0 ret_from_fork+0x10/0x20 The first thread is walking the clk tree and calling clk_pm_runtime_get() to power on devices required to read the clk hardware via struct clk_ops::is_enabled(). This thread holds the clk prepare_lock, and is trying to runtime PM resume a device, when it finds that the device is in the process of resuming so the thread schedule()s away waiting for the device to finish resuming before continuing. The second thread is runtime PM resuming the same device, but the runtime resume callback is calling clk_prepare(), trying to grab the prepare_lock waiting on the first thread. This is a classic ABBA deadlock. To properly fix the deadlock, we must never runtime PM resume or suspend a device with the clk prepare_lock held. Actually doing that is near impossible today because the global prepare_lock would have to be dropped in the middle of the tree, the device runtime PM resumed/suspended, and then the prepare_lock grabbed again to ensure consistency of the clk tree topology. If anything changes with the clk tree in the meantime, we've lost and will need to start the operation all over again. Luckily, most of the time we're simply incrementing or decrementing the runtime PM count on an active device, so we don't have the chance to schedule away with the prepare_lock held. Let's fix this immediate problem that can be triggered more easily by simply booting on Qualcomm sc7180. Introduce a list of clk_core structures that have been registered, or are in the process of being registered, that require runtime PM to operate. Iterate this list and call clk_pm_runtime_get() on each of them without holding the prepare_lock during clk_disable_unused(). This way we can be certain that the runtime PM state of the devices will be active and resumed so we can't schedule away while walking the clk tree with the prepare_lock held. Similarly, call clk_pm_runtime_put() without the prepare_lock held to properly drop the runtime PM reference. We remove the calls to clk_pm_runtime_{get,put}() in this path because they're superfluous now that we know the devices are runtime resumed. Reported-by: Douglas Anderson Closes: https://lore.kernel.org/all/20220922084322.RFC.2.I375b6b9e0a0a5348962f004beb3dafee6a12dfbb@changeid/ [1] Closes: https://issuetracker.google.com/328070191 Cc: Marek Szyprowski Cc: Ulf Hansson Cc: Krzysztof Kozlowski Fixes: 9a34b45397e5 ("clk: Add support for runtime PM") Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240325184204.745706-5-sboyd@kernel.org Reviewed-by: Douglas Anderson --- drivers/clk/clk.c | 117 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 105 insertions(+), 12 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 8b35ca32f45eb..9b907cb3ef613 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -37,6 +37,10 @@ static HLIST_HEAD(clk_root_list); static HLIST_HEAD(clk_orphan_list); static LIST_HEAD(clk_notifier_list); +/* List of registered clks that use runtime PM */ +static HLIST_HEAD(clk_rpm_list); +static DEFINE_MUTEX(clk_rpm_list_lock); + static const struct hlist_head *all_lists[] = { &clk_root_list, &clk_orphan_list, @@ -59,6 +63,7 @@ struct clk_core { struct clk_hw *hw; struct module *owner; struct device *dev; + struct hlist_node rpm_node; struct device_node *of_node; struct clk_core *parent; struct clk_parent_map *parents; @@ -122,6 +127,89 @@ static void clk_pm_runtime_put(struct clk_core *core) pm_runtime_put_sync(core->dev); } +/** + * clk_pm_runtime_get_all() - Runtime "get" all clk provider devices + * + * Call clk_pm_runtime_get() on all runtime PM enabled clks in the clk tree so + * that disabling unused clks avoids a deadlock where a device is runtime PM + * resuming/suspending and the runtime PM callback is trying to grab the + * prepare_lock for something like clk_prepare_enable() while + * clk_disable_unused_subtree() holds the prepare_lock and is trying to runtime + * PM resume/suspend the device as well. + * + * Context: Acquires the 'clk_rpm_list_lock' and returns with the lock held on + * success. Otherwise the lock is released on failure. + * + * Return: 0 on success, negative errno otherwise. + */ +static int clk_pm_runtime_get_all(void) +{ + int ret; + struct clk_core *core, *failed; + + /* + * Grab the list lock to prevent any new clks from being registered + * or unregistered until clk_pm_runtime_put_all(). + */ + mutex_lock(&clk_rpm_list_lock); + + /* + * Runtime PM "get" all the devices that are needed for the clks + * currently registered. Do this without holding the prepare_lock, to + * avoid the deadlock. + */ + hlist_for_each_entry(core, &clk_rpm_list, rpm_node) { + ret = clk_pm_runtime_get(core); + if (ret) { + failed = core; + pr_err("clk: Failed to runtime PM get '%s' for clk '%s'\n", + dev_name(failed->dev), failed->name); + goto err; + } + } + + return 0; + +err: + hlist_for_each_entry(core, &clk_rpm_list, rpm_node) { + if (core == failed) + break; + + clk_pm_runtime_put(core); + } + mutex_unlock(&clk_rpm_list_lock); + + return ret; +} + +/** + * clk_pm_runtime_put_all() - Runtime "put" all clk provider devices + * + * Put the runtime PM references taken in clk_pm_runtime_get_all() and release + * the 'clk_rpm_list_lock'. + */ +static void clk_pm_runtime_put_all(void) +{ + struct clk_core *core; + + hlist_for_each_entry(core, &clk_rpm_list, rpm_node) + clk_pm_runtime_put(core); + mutex_unlock(&clk_rpm_list_lock); +} + +static void clk_pm_runtime_init(struct clk_core *core) +{ + struct device *dev = core->dev; + + if (dev && pm_runtime_enabled(dev)) { + core->rpm_enabled = true; + + mutex_lock(&clk_rpm_list_lock); + hlist_add_head(&core->rpm_node, &clk_rpm_list); + mutex_unlock(&clk_rpm_list_lock); + } +} + /*** locking ***/ static void clk_prepare_lock(void) { @@ -1381,9 +1469,6 @@ static void __init clk_unprepare_unused_subtree(struct clk_core *core) if (core->flags & CLK_IGNORE_UNUSED) return; - if (clk_pm_runtime_get(core)) - return; - if (clk_core_is_prepared(core)) { trace_clk_unprepare(core); if (core->ops->unprepare_unused) @@ -1392,8 +1477,6 @@ static void __init clk_unprepare_unused_subtree(struct clk_core *core) core->ops->unprepare(core->hw); trace_clk_unprepare_complete(core); } - - clk_pm_runtime_put(core); } static void __init clk_disable_unused_subtree(struct clk_core *core) @@ -1409,9 +1492,6 @@ static void __init clk_disable_unused_subtree(struct clk_core *core) if (core->flags & CLK_OPS_PARENT_ENABLE) clk_core_prepare_enable(core->parent); - if (clk_pm_runtime_get(core)) - goto unprepare_out; - flags = clk_enable_lock(); if (core->enable_count) @@ -1436,8 +1516,6 @@ static void __init clk_disable_unused_subtree(struct clk_core *core) unlock_out: clk_enable_unlock(flags); - clk_pm_runtime_put(core); -unprepare_out: if (core->flags & CLK_OPS_PARENT_ENABLE) clk_core_disable_unprepare(core->parent); } @@ -1453,6 +1531,7 @@ __setup("clk_ignore_unused", clk_ignore_unused_setup); static int __init clk_disable_unused(void) { struct clk_core *core; + int ret; if (clk_ignore_unused) { pr_warn("clk: Not disabling unused clocks\n"); @@ -1461,6 +1540,13 @@ static int __init clk_disable_unused(void) pr_info("clk: Disabling unused clocks\n"); + ret = clk_pm_runtime_get_all(); + if (ret) + return ret; + /* + * Grab the prepare lock to keep the clk topology stable while iterating + * over clks. + */ clk_prepare_lock(); hlist_for_each_entry(core, &clk_root_list, child_node) @@ -1477,6 +1563,8 @@ static int __init clk_disable_unused(void) clk_prepare_unlock(); + clk_pm_runtime_put_all(); + return 0; } late_initcall_sync(clk_disable_unused); @@ -4214,6 +4302,12 @@ static void __clk_release(struct kref *ref) { struct clk_core *core = container_of(ref, struct clk_core, ref); + if (core->rpm_enabled) { + mutex_lock(&clk_rpm_list_lock); + hlist_del(&core->rpm_node); + mutex_unlock(&clk_rpm_list_lock); + } + clk_core_free_parent_map(core); kfree_const(core->name); kfree(core); @@ -4253,9 +4347,8 @@ __clk_register(struct device *dev, struct device_node *np, struct clk_hw *hw) } core->ops = init->ops; - if (dev && pm_runtime_enabled(dev)) - core->rpm_enabled = true; core->dev = dev; + clk_pm_runtime_init(core); core->of_node = np; if (dev && dev->driver) core->owner = dev->driver->owner; -- GitLab From 9d1e795f754db1ac3344528b7af0b17b8146f321 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2024 11:41:59 -0700 Subject: [PATCH 521/989] clk: Get runtime PM before walking tree for clk_summary Similar to the previous commit, we should make sure that all devices are runtime resumed before printing the clk_summary through debugfs. Failure to do so would result in a deadlock if the thread is resuming a device to print clk state and that device is also runtime resuming in another thread, e.g the screen is turning on and the display driver is starting up. We remove the calls to clk_pm_runtime_{get,put}() in this path because they're superfluous now that we know the devices are runtime resumed. This also squashes a bug where the return value of clk_pm_runtime_get() wasn't checked, leading to an RPM count underflow on error paths. Fixes: 1bb294a7981c ("clk: Enable/Disable runtime PM for clk_summary") Cc: Taniya Das Cc: Douglas Anderson Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240325184204.745706-6-sboyd@kernel.org Reviewed-by: Douglas Anderson --- drivers/clk/clk.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 9b907cb3ef613..8cca52be993f4 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -3340,9 +3340,7 @@ static void clk_summary_show_subtree(struct seq_file *s, struct clk_core *c, { struct clk_core *child; - clk_pm_runtime_get(c); clk_summary_show_one(s, c, level); - clk_pm_runtime_put(c); hlist_for_each_entry(child, &c->children, child_node) clk_summary_show_subtree(s, child, level + 1); @@ -3352,11 +3350,15 @@ static int clk_summary_show(struct seq_file *s, void *data) { struct clk_core *c; struct hlist_head **lists = s->private; + int ret; seq_puts(s, " enable prepare protect duty hardware connection\n"); seq_puts(s, " clock count count count rate accuracy phase cycle enable consumer id\n"); seq_puts(s, "---------------------------------------------------------------------------------------------------------------------------------------------\n"); + ret = clk_pm_runtime_get_all(); + if (ret) + return ret; clk_prepare_lock(); @@ -3365,6 +3367,7 @@ static int clk_summary_show(struct seq_file *s, void *data) clk_summary_show_subtree(s, c, 0); clk_prepare_unlock(); + clk_pm_runtime_put_all(); return 0; } @@ -3412,8 +3415,14 @@ static int clk_dump_show(struct seq_file *s, void *data) struct clk_core *c; bool first_node = true; struct hlist_head **lists = s->private; + int ret; + + ret = clk_pm_runtime_get_all(); + if (ret) + return ret; seq_putc(s, '{'); + clk_prepare_lock(); for (; *lists; lists++) { @@ -3426,6 +3435,7 @@ static int clk_dump_show(struct seq_file *s, void *data) } clk_prepare_unlock(); + clk_pm_runtime_put_all(); seq_puts(s, "}\n"); return 0; -- GitLab From 22e1992cf7b034db5325660e98c41ca5afa5f519 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 28 Mar 2024 10:21:47 +1000 Subject: [PATCH 522/989] vhost: Add smp_rmb() in vhost_vq_avail_empty() A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by Will. Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M \ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_vq_avail_empty(). When tx_can_batch() returns true, it means there's still pending tx buffers. Since it might read indices, so it still can bypass the smp_rmb() in vhost_get_vq_desc(). Note that it should be safe until vq->avail_idx is changed by commit 275bf960ac697 ("vhost: better detection of available buffers"). Fixes: 275bf960ac69 ("vhost: better detection of available buffers") Cc: # v4.11+ Reported-by: Yihuang Yu Suggested-by: Will Deacon Signed-off-by: Gavin Shan Acked-by: Jason Wang Message-Id: <20240328002149.1141302-2-gshan@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- drivers/vhost/vhost.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12a..29df65b2ebf2e 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2799,9 +2799,19 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) r = vhost_get_avail_idx(vq, &avail_idx); if (unlikely(r)) return false; + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Since we have updated avail_idx, the following + * call to vhost_get_vq_desc() will read available + * ring entries. Make sure that read happens after + * the avail_idx read. + */ + smp_rmb(); + return false; + } - return vq->avail_idx == vq->last_avail_idx; + return true; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); -- GitLab From df9ace7647d4123209395bb9967e998d5758c645 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 28 Mar 2024 10:21:48 +1000 Subject: [PATCH 523/989] vhost: Add smp_rmb() in vhost_enable_notify() A smp_rmb() has been missed in vhost_enable_notify(), inspired by Will. Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M \ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_enable_notify(). When it returns true, it means there's still pending tx buffers. Since it might read indices, so it still can bypass the smp_rmb() in vhost_get_vq_desc(). Note that it should be safe until vq->avail_idx is changed by commit d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()"). Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()") Cc: # v5.18+ Reported-by: Yihuang Yu Suggested-by: Will Deacon Signed-off-by: Gavin Shan Acked-by: Jason Wang Message-Id: <20240328002149.1141302-3-gshan@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- drivers/vhost/vhost.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 29df65b2ebf2e..32686c79c41d6 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2848,9 +2848,19 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) &vq->avail->idx, r); return false; } + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Since we have updated avail_idx, the following + * call to vhost_get_vq_desc() will read available + * ring entries. Make sure that read happens after + * the avail_idx read. + */ + smp_rmb(); + return true; + } - return vq->avail_idx != vq->last_avail_idx; + return false; } EXPORT_SYMBOL_GPL(vhost_enable_notify); -- GitLab From ffe6176b7f53ca0c99355f13e14a33a40cf49406 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 31 Mar 2024 10:43:48 +0200 Subject: [PATCH 524/989] virtio: store owner from modules with register_virtio_driver() Modules registering driver with register_virtio_driver() might forget to set .owner field. i2c-virtio.c for example has it missing. The field is used by some other kernel parts for reference counting (try_module_get()), so it is expected that drivers will set it. Solve the problem by moving this task away from the drivers to the core virtio code, just like we did for platform_driver in commit 9447057eaff8 ("platform_device: use a macro instead of platform_driver_register"). Fixes: 3cfc88380413 ("i2c: virtio: add a virtio i2c frontend driver") Cc: "Jie Deng" Signed-off-by: Krzysztof Kozlowski Message-Id: <20240331-module-owner-virtio-v2-1-98f04bfaf46a@linaro.org> Signed-off-by: Michael S. Tsirkin --- Documentation/driver-api/virtio/writing_virtio_drivers.rst | 1 - drivers/virtio/virtio.c | 6 ++++-- include/linux/virtio.h | 7 +++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/driver-api/virtio/writing_virtio_drivers.rst b/Documentation/driver-api/virtio/writing_virtio_drivers.rst index e14c58796d250..e5de6f5d061a7 100644 --- a/Documentation/driver-api/virtio/writing_virtio_drivers.rst +++ b/Documentation/driver-api/virtio/writing_virtio_drivers.rst @@ -97,7 +97,6 @@ like this:: static struct virtio_driver virtio_dummy_driver = { .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, .id_table = id_table, .probe = virtio_dummy_probe, .remove = virtio_dummy_remove, diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index f173587893cb3..9510c551dce86 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -362,14 +362,16 @@ static const struct bus_type virtio_bus = { .remove = virtio_dev_remove, }; -int register_virtio_driver(struct virtio_driver *driver) +int __register_virtio_driver(struct virtio_driver *driver, struct module *owner) { /* Catch this early. */ BUG_ON(driver->feature_table_size && !driver->feature_table); driver->driver.bus = &virtio_bus; + driver->driver.owner = owner; + return driver_register(&driver->driver); } -EXPORT_SYMBOL_GPL(register_virtio_driver); +EXPORT_SYMBOL_GPL(__register_virtio_driver); void unregister_virtio_driver(struct virtio_driver *driver) { diff --git a/include/linux/virtio.h b/include/linux/virtio.h index b0201747a263a..26c4325aa3734 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -170,7 +170,7 @@ size_t virtio_max_dma_size(const struct virtio_device *vdev); /** * struct virtio_driver - operations for a virtio I/O driver - * @driver: underlying device driver (populate name and owner). + * @driver: underlying device driver (populate name). * @id_table: the ids serviced by this driver. * @feature_table: an array of feature numbers supported by this driver. * @feature_table_size: number of entries in the feature table array. @@ -208,7 +208,10 @@ static inline struct virtio_driver *drv_to_virtio(struct device_driver *drv) return container_of(drv, struct virtio_driver, driver); } -int register_virtio_driver(struct virtio_driver *drv); +/* use a macro to avoid include chaining to get THIS_MODULE */ +#define register_virtio_driver(drv) \ + __register_virtio_driver(drv, THIS_MODULE) +int __register_virtio_driver(struct virtio_driver *drv, struct module *owner); void unregister_virtio_driver(struct virtio_driver *drv); /* module_virtio_driver() - Helper macro for drivers that don't do -- GitLab From 2855c2a7820bc8198ae937a9a67dbdc3990e9d2c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 2 Apr 2024 17:21:43 -0400 Subject: [PATCH 525/989] vhost-vdpa: change ioctl # for VDPA_GET_VRING_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VDPA_GET_VRING_SIZE by mistake uses the already occupied ioctl # 0x80 and we never noticed - it happens to work because the direction and size are different, but confuses tools such as perf which like to look at just the number, and breaks the extra robustness of the ioctl numbering macros. To fix, sort the entries and renumber the ioctl - not too late since it wasn't in any released kernels yet. Cc: Arnaldo Carvalho de Melo Reported-by: Namhyung Kim Fixes: 1496c47065f9 ("vhost-vdpa: uapi to support reporting per vq size") Cc: "Zhu Lingshan" Signed-off-by: Michael S. Tsirkin Message-Id: <41c1c5489688abe5bfef9f7cf15584e3fb872ac5.1712092759.git.mst@redhat.com> Reviewed-by: Eugenio Pérez Reviewed-by: Zhu Lingshan Reviewed-by: Stefano Garzarella Acked-by: Jason Wang --- include/uapi/linux/vhost.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index bea6973906134..b95dd84eef2db 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -179,12 +179,6 @@ /* Get the config size */ #define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) -/* Get the count of all virtqueues */ -#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) - -/* Get the number of virtqueue groups. */ -#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) - /* Get the number of address spaces. */ #define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) @@ -228,10 +222,17 @@ #define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ struct vhost_vring_state) + +/* Get the count of all virtqueues */ +#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) + +/* Get the number of virtqueue groups. */ +#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) + /* Get the queue size of a specific virtqueue. * userspace set the vring index in vhost_vring_state.index * kernel set the queue size in vhost_vring_state.num */ -#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x80, \ +#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ struct vhost_vring_state) #endif -- GitLab From 76f408535aab39c33e0a1dcada9fba5631c65595 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Mon, 11 Mar 2024 16:21:09 +0800 Subject: [PATCH 526/989] vhost: correct misleading printing information Guest moved avail idx not used idx when we need to print log if '(vq->avail_idx - last_avail_idx) > vq->num', so fix it. Signed-off-by: Xianting Tian Message-Id: <20240311082109.46773-1-xianting.tian@linux.alibaba.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 32686c79c41d6..8995730ce0bfc 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2515,7 +2515,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, vq->avail_idx = vhost16_to_cpu(vq, avail_idx); if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { - vq_err(vq, "Guest moved used index from %u to %u", + vq_err(vq, "Guest moved avail index from %u to %u", last_avail_idx, vq->avail_idx); return -EFAULT; } -- GitLab From f0cf7ffcd02953c72fed5995378805883d16203e Mon Sep 17 00:00:00 2001 From: "Wachowski, Karol" Date: Tue, 2 Apr 2024 12:49:22 +0200 Subject: [PATCH 527/989] accel/ivpu: Check return code of ipc->lock init Return value of drmm_mutex_init(ipc->lock) was unchecked. Fixes: 5d7422cfb498 ("accel/ivpu: Add IPC driver and JSM messages") Cc: # v6.3+ Signed-off-by: Wachowski, Karol Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-2-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_ipc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_ipc.c b/drivers/accel/ivpu/ivpu_ipc.c index 04ac4b9840fbe..56ff067f63e29 100644 --- a/drivers/accel/ivpu/ivpu_ipc.c +++ b/drivers/accel/ivpu/ivpu_ipc.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include @@ -501,7 +501,11 @@ int ivpu_ipc_init(struct ivpu_device *vdev) spin_lock_init(&ipc->cons_lock); INIT_LIST_HEAD(&ipc->cons_list); INIT_LIST_HEAD(&ipc->cb_msg_list); - drmm_mutex_init(&vdev->drm, &ipc->lock); + ret = drmm_mutex_init(&vdev->drm, &ipc->lock); + if (ret) { + ivpu_err(vdev, "Failed to initialize ipc->lock, ret %d\n", ret); + goto err_free_rx; + } ivpu_ipc_reset(vdev); return 0; -- GitLab From e3caadf1f9dfc9d62b5ffc3bd73ebac0c8f26b3f Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:23 +0200 Subject: [PATCH 528/989] accel/ivpu: Remove d3hot_after_power_off WA Always enter D3hot after entering D0i3 an all platforms. This minimizes power usage. Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-3-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.c | 20 ++++++++++---------- drivers/accel/ivpu/ivpu_drv.h | 3 +-- drivers/accel/ivpu/ivpu_hw_37xx.c | 4 +--- drivers/accel/ivpu/ivpu_pm.c | 7 ++----- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 39f6d1b98fd6a..303d92753387e 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include @@ -387,12 +387,15 @@ int ivpu_shutdown(struct ivpu_device *vdev) { int ret; - ivpu_prepare_for_reset(vdev); + /* Save PCI state before powering down as it sometimes gets corrupted if NPU hangs */ + pci_save_state(to_pci_dev(vdev->drm.dev)); ret = ivpu_hw_power_down(vdev); if (ret) ivpu_warn(vdev, "Failed to power down HW: %d\n", ret); + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); + return ret; } @@ -560,11 +563,11 @@ static int ivpu_dev_init(struct ivpu_device *vdev) /* Power up early so the rest of init code can access VPU registers */ ret = ivpu_hw_power_up(vdev); if (ret) - goto err_power_down; + goto err_shutdown; ret = ivpu_mmu_global_context_init(vdev); if (ret) - goto err_power_down; + goto err_shutdown; ret = ivpu_mmu_init(vdev); if (ret) @@ -601,10 +604,8 @@ err_mmu_rctx_fini: ivpu_mmu_reserved_context_fini(vdev); err_mmu_gctx_fini: ivpu_mmu_global_context_fini(vdev); -err_power_down: - ivpu_hw_power_down(vdev); - if (IVPU_WA(d3hot_after_power_off)) - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); +err_shutdown: + ivpu_shutdown(vdev); err_xa_destroy: xa_destroy(&vdev->db_xa); xa_destroy(&vdev->submitted_jobs_xa); @@ -628,9 +629,8 @@ static void ivpu_bo_unbind_all_user_contexts(struct ivpu_device *vdev) static void ivpu_dev_fini(struct ivpu_device *vdev) { ivpu_pm_disable(vdev); + ivpu_prepare_for_reset(vdev); ivpu_shutdown(vdev); - if (IVPU_WA(d3hot_after_power_off)) - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); ivpu_jobs_abort_all(vdev); ivpu_job_done_consumer_fini(vdev); diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h index 7be0500d9bb89..bb4374d0eaecc 100644 --- a/drivers/accel/ivpu/ivpu_drv.h +++ b/drivers/accel/ivpu/ivpu_drv.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #ifndef __IVPU_DRV_H__ @@ -90,7 +90,6 @@ struct ivpu_wa_table { bool punit_disabled; bool clear_runtime_mem; - bool d3hot_after_power_off; bool interrupt_clear_with_0; bool disable_clock_relinquish; bool disable_d0i3_msg; diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c index 9a0c9498baba2..5e2865f9f7d6d 100644 --- a/drivers/accel/ivpu/ivpu_hw_37xx.c +++ b/drivers/accel/ivpu/ivpu_hw_37xx.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include "ivpu_drv.h" @@ -75,7 +75,6 @@ static void ivpu_hw_wa_init(struct ivpu_device *vdev) { vdev->wa.punit_disabled = false; vdev->wa.clear_runtime_mem = false; - vdev->wa.d3hot_after_power_off = true; REGB_WR32(VPU_37XX_BUTTRESS_INTERRUPT_STAT, BUTTRESS_ALL_IRQ_MASK); if (REGB_RD32(VPU_37XX_BUTTRESS_INTERRUPT_STAT) == BUTTRESS_ALL_IRQ_MASK) { @@ -86,7 +85,6 @@ static void ivpu_hw_wa_init(struct ivpu_device *vdev) IVPU_PRINT_WA(punit_disabled); IVPU_PRINT_WA(clear_runtime_mem); - IVPU_PRINT_WA(d3hot_after_power_off); IVPU_PRINT_WA(interrupt_clear_with_0); } diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 7cce1c928a7f4..9cbd7af6576b4 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include @@ -58,15 +58,12 @@ static int ivpu_suspend(struct ivpu_device *vdev) { int ret; - /* Save PCI state before powering down as it sometimes gets corrupted if NPU hangs */ - pci_save_state(to_pci_dev(vdev->drm.dev)); + ivpu_prepare_for_reset(vdev); ret = ivpu_shutdown(vdev); if (ret) ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret); - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); - return ret; } -- GitLab From 3534eacbf101f6e66105f03d869a03893407c384 Mon Sep 17 00:00:00 2001 From: "Wachowski, Karol" Date: Tue, 2 Apr 2024 12:49:24 +0200 Subject: [PATCH 529/989] accel/ivpu: Fix PCI D0 state entry in resume In case of failed power up we end up left in PCI D3hot state making it impossible to access NPU registers on retry. Enter D0 state on retry before proceeding with power up sequence. Fixes: 28083ff18d3f ("accel/ivpu: Fix DevTLB errors on suspend/resume and recovery") Cc: # v6.8+ Signed-off-by: Wachowski, Karol Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-4-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 9cbd7af6576b4..325b82f8d971c 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -71,10 +71,10 @@ static int ivpu_resume(struct ivpu_device *vdev) { int ret; - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0); +retry: pci_restore_state(to_pci_dev(vdev->drm.dev)); + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0); -retry: ret = ivpu_hw_power_up(vdev); if (ret) { ivpu_err(vdev, "Failed to power up HW: %d\n", ret); -- GitLab From 875bc9cd1b33eb027a5663f5e6878a43d98e9a16 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:25 +0200 Subject: [PATCH 530/989] accel/ivpu: Put NPU back to D3hot after failed resume Put NPU in D3hot after ivpu_resume() fails to power up the device. This will assure that D3->D0 power cycle will be performed before the next resume and also will minimize power usage in this corner case. Fixes: 28083ff18d3f ("accel/ivpu: Fix DevTLB errors on suspend/resume and recovery") Cc: # v6.8+ Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-5-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 325b82f8d971c..ba51781b58960 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -97,6 +97,7 @@ err_mmu_disable: ivpu_mmu_disable(vdev); err_power_down: ivpu_hw_power_down(vdev); + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); if (!ivpu_fw_is_cold_boot(vdev)) { ivpu_pm_prepare_cold_boot(vdev); -- GitLab From 3556f922612caf4c9b97cf7337626f8342b3dea3 Mon Sep 17 00:00:00 2001 From: "Wachowski, Karol" Date: Tue, 2 Apr 2024 12:49:26 +0200 Subject: [PATCH 531/989] accel/ivpu: Improve clarity of MMU error messages This patch improves readability and clarity of MMU error messages. Previously, the error strings were somewhat confusing and could lead to ambiguous interpretations, making it difficult to diagnose issues. Signed-off-by: Wachowski, Karol Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-6-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_mmu.c b/drivers/accel/ivpu/ivpu_mmu.c index 91bd640655ab3..2e46b322c4505 100644 --- a/drivers/accel/ivpu/ivpu_mmu.c +++ b/drivers/accel/ivpu/ivpu_mmu.c @@ -278,7 +278,7 @@ static const char *ivpu_mmu_event_to_str(u32 cmd) case IVPU_MMU_EVT_F_VMS_FETCH: return "Fetch of VMS caused external abort"; default: - return "Unknown CMDQ command"; + return "Unknown event"; } } @@ -286,15 +286,15 @@ static const char *ivpu_mmu_cmdq_err_to_str(u32 err) { switch (err) { case IVPU_MMU_CERROR_NONE: - return "No CMDQ Error"; + return "No error"; case IVPU_MMU_CERROR_ILL: return "Illegal command"; case IVPU_MMU_CERROR_ABT: - return "External abort on CMDQ read"; + return "External abort on command queue read"; case IVPU_MMU_CERROR_ATC_INV_SYNC: return "Sync failed to complete ATS invalidation"; default: - return "Unknown CMDQ Error"; + return "Unknown error"; } } -- GitLab From c52c35e5b404b95a5bcff39af9be1b9293be3434 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:27 +0200 Subject: [PATCH 532/989] accel/ivpu: Return max freq for DRM_IVPU_PARAM_CORE_CLOCK_RATE DRM_IVPU_PARAM_CORE_CLOCK_RATE returns current NPU frequency which could be 0 if device was sleeping. This value isn't really useful to the user space, so return max freq instead which can be used to estimate NPU performance. Fixes: c39dc15191c4 ("accel/ivpu: Read clock rate only if device is up") Cc: # v6.7 Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-7-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.c | 18 +----------------- drivers/accel/ivpu/ivpu_hw.h | 6 ++++++ drivers/accel/ivpu/ivpu_hw_37xx.c | 7 ++++--- drivers/accel/ivpu/ivpu_hw_40xx.c | 6 ++++++ 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 303d92753387e..77283daaedd11 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -131,22 +131,6 @@ static int ivpu_get_capabilities(struct ivpu_device *vdev, struct drm_ivpu_param return 0; } -static int ivpu_get_core_clock_rate(struct ivpu_device *vdev, u64 *clk_rate) -{ - int ret; - - ret = ivpu_rpm_get_if_active(vdev); - if (ret < 0) - return ret; - - *clk_rate = ret ? ivpu_hw_reg_pll_freq_get(vdev) : 0; - - if (ret) - ivpu_rpm_put(vdev); - - return 0; -} - static int ivpu_get_param_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct ivpu_file_priv *file_priv = file->driver_priv; @@ -170,7 +154,7 @@ static int ivpu_get_param_ioctl(struct drm_device *dev, void *data, struct drm_f args->value = vdev->platform; break; case DRM_IVPU_PARAM_CORE_CLOCK_RATE: - ret = ivpu_get_core_clock_rate(vdev, &args->value); + args->value = ivpu_hw_ratio_to_freq(vdev, vdev->hw->pll.max_ratio); break; case DRM_IVPU_PARAM_NUM_CONTEXTS: args->value = ivpu_get_context_count(vdev); diff --git a/drivers/accel/ivpu/ivpu_hw.h b/drivers/accel/ivpu/ivpu_hw.h index b2909168a0a69..094c659d2800b 100644 --- a/drivers/accel/ivpu/ivpu_hw.h +++ b/drivers/accel/ivpu/ivpu_hw.h @@ -21,6 +21,7 @@ struct ivpu_hw_ops { u32 (*profiling_freq_get)(struct ivpu_device *vdev); void (*profiling_freq_drive)(struct ivpu_device *vdev, bool enable); u32 (*reg_pll_freq_get)(struct ivpu_device *vdev); + u32 (*ratio_to_freq)(struct ivpu_device *vdev, u32 ratio); u32 (*reg_telemetry_offset_get)(struct ivpu_device *vdev); u32 (*reg_telemetry_size_get)(struct ivpu_device *vdev); u32 (*reg_telemetry_enable_get)(struct ivpu_device *vdev); @@ -130,6 +131,11 @@ static inline u32 ivpu_hw_reg_pll_freq_get(struct ivpu_device *vdev) return vdev->hw->ops->reg_pll_freq_get(vdev); }; +static inline u32 ivpu_hw_ratio_to_freq(struct ivpu_device *vdev, u32 ratio) +{ + return vdev->hw->ops->ratio_to_freq(vdev, ratio); +} + static inline u32 ivpu_hw_reg_telemetry_offset_get(struct ivpu_device *vdev) { return vdev->hw->ops->reg_telemetry_offset_get(vdev); diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c index 5e2865f9f7d6d..bd25e2d9fb0f4 100644 --- a/drivers/accel/ivpu/ivpu_hw_37xx.c +++ b/drivers/accel/ivpu/ivpu_hw_37xx.c @@ -803,12 +803,12 @@ static void ivpu_hw_37xx_profiling_freq_drive(struct ivpu_device *vdev, bool ena /* Profiling freq - is a debug feature. Unavailable on VPU 37XX. */ } -static u32 ivpu_hw_37xx_pll_to_freq(u32 ratio, u32 config) +static u32 ivpu_hw_37xx_ratio_to_freq(struct ivpu_device *vdev, u32 ratio) { u32 pll_clock = PLL_REF_CLK_FREQ * ratio; u32 cpu_clock; - if ((config & 0xff) == PLL_RATIO_4_3) + if ((vdev->hw->config & 0xff) == PLL_RATIO_4_3) cpu_clock = pll_clock * 2 / 4; else cpu_clock = pll_clock * 2 / 5; @@ -827,7 +827,7 @@ static u32 ivpu_hw_37xx_reg_pll_freq_get(struct ivpu_device *vdev) if (!ivpu_is_silicon(vdev)) return PLL_SIMULATION_FREQ; - return ivpu_hw_37xx_pll_to_freq(pll_curr_ratio, vdev->hw->config); + return ivpu_hw_37xx_ratio_to_freq(vdev, pll_curr_ratio); } static u32 ivpu_hw_37xx_reg_telemetry_offset_get(struct ivpu_device *vdev) @@ -1050,6 +1050,7 @@ const struct ivpu_hw_ops ivpu_hw_37xx_ops = { .profiling_freq_get = ivpu_hw_37xx_profiling_freq_get, .profiling_freq_drive = ivpu_hw_37xx_profiling_freq_drive, .reg_pll_freq_get = ivpu_hw_37xx_reg_pll_freq_get, + .ratio_to_freq = ivpu_hw_37xx_ratio_to_freq, .reg_telemetry_offset_get = ivpu_hw_37xx_reg_telemetry_offset_get, .reg_telemetry_size_get = ivpu_hw_37xx_reg_telemetry_size_get, .reg_telemetry_enable_get = ivpu_hw_37xx_reg_telemetry_enable_get, diff --git a/drivers/accel/ivpu/ivpu_hw_40xx.c b/drivers/accel/ivpu/ivpu_hw_40xx.c index e4eddbf5d11c2..b0b88d4c89264 100644 --- a/drivers/accel/ivpu/ivpu_hw_40xx.c +++ b/drivers/accel/ivpu/ivpu_hw_40xx.c @@ -980,6 +980,11 @@ static u32 ivpu_hw_40xx_reg_pll_freq_get(struct ivpu_device *vdev) return PLL_RATIO_TO_FREQ(pll_curr_ratio); } +static u32 ivpu_hw_40xx_ratio_to_freq(struct ivpu_device *vdev, u32 ratio) +{ + return PLL_RATIO_TO_FREQ(ratio); +} + static u32 ivpu_hw_40xx_reg_telemetry_offset_get(struct ivpu_device *vdev) { return REGB_RD32(VPU_40XX_BUTTRESS_VPU_TELEMETRY_OFFSET); @@ -1230,6 +1235,7 @@ const struct ivpu_hw_ops ivpu_hw_40xx_ops = { .profiling_freq_get = ivpu_hw_40xx_profiling_freq_get, .profiling_freq_drive = ivpu_hw_40xx_profiling_freq_drive, .reg_pll_freq_get = ivpu_hw_40xx_reg_pll_freq_get, + .ratio_to_freq = ivpu_hw_40xx_ratio_to_freq, .reg_telemetry_offset_get = ivpu_hw_40xx_reg_telemetry_offset_get, .reg_telemetry_size_get = ivpu_hw_40xx_reg_telemetry_size_get, .reg_telemetry_enable_get = ivpu_hw_40xx_reg_telemetry_enable_get, -- GitLab From 0d298e23292b7a5b58c5589fe33b96e95363214f Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:28 +0200 Subject: [PATCH 533/989] accel/ivpu: Fix missed error message after VPU rename Change "VPU" to "NPU" in ivpu_suspend() so it matches all other error messages. Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-8-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index ba51781b58960..4f5ea466731ff 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -62,7 +62,7 @@ static int ivpu_suspend(struct ivpu_device *vdev) ret = ivpu_shutdown(vdev); if (ret) - ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret); + ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret); return ret; } -- GitLab From fd7726e75968b27fe98534ccbf47ccd6fef686f3 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:29 +0200 Subject: [PATCH 534/989] accel/ivpu: Fix deadlock in context_xa ivpu_device->context_xa is locked both in kernel thread and IRQ context. It requires XA_FLAGS_LOCK_IRQ flag to be passed during initialization otherwise the lock could be acquired from a thread and interrupted by an IRQ that locks it for the second time causing the deadlock. This deadlock was reported by lockdep and observed in internal tests. Fixes: 35b137630f08 ("accel/ivpu: Introduce a new DRM driver for Intel VPU") Cc: # v6.3+ Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-9-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 77283daaedd11..51d3f1a55d024 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -517,7 +517,7 @@ static int ivpu_dev_init(struct ivpu_device *vdev) vdev->context_xa_limit.min = IVPU_USER_CONTEXT_MIN_SSID; vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID; atomic64_set(&vdev->unique_id_counter, 0); - xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC); + xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1); xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1); lockdep_set_class(&vdev->submitted_jobs_xa.xa_lock, &submitted_jobs_xa_lock_class_key); -- GitLab From e9d47b7b31563a6524b9f64ea70ed0289cc4d9c4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Apr 2024 18:36:45 +0200 Subject: [PATCH 535/989] lib: checksum: hide unused expected_csum_ipv6_magic[] When CONFIG_NET is disabled, an extra warning shows up for this unused variable: lib/checksum_kunit.c:218:18: error: 'expected_csum_ipv6_magic' defined but not used [-Werror=unused-const-variable=] Replace the #ifdef with an IS_ENABLED() check that makes the compiler's dead-code-elimination take care of the link failure. Fixes: f24a70106dc1 ("lib: checksum: Fix build with CONFIG_NET=n") Suggested-by: Christophe Leroy Acked-by: Palmer Dabbelt Acked-by: Jakub Kicinski Signed-off-by: Arnd Bergmann Reviewed-by: Simon Horman Tested-by: Simon Horman # build-tested Signed-off-by: David S. Miller --- lib/checksum_kunit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/checksum_kunit.c b/lib/checksum_kunit.c index bf70850035c76..404dba36bae38 100644 --- a/lib/checksum_kunit.c +++ b/lib/checksum_kunit.c @@ -594,13 +594,15 @@ static void test_ip_fast_csum(struct kunit *test) static void test_csum_ipv6_magic(struct kunit *test) { -#if defined(CONFIG_NET) const struct in6_addr *saddr; const struct in6_addr *daddr; unsigned int len; unsigned char proto; __wsum csum; + if (!IS_ENABLED(CONFIG_NET)) + return; + const int daddr_offset = sizeof(struct in6_addr); const int len_offset = sizeof(struct in6_addr) + sizeof(struct in6_addr); const int proto_offset = sizeof(struct in6_addr) + sizeof(struct in6_addr) + @@ -618,7 +620,6 @@ static void test_csum_ipv6_magic(struct kunit *test) CHECK_EQ(to_sum16(expected_csum_ipv6_magic[i]), csum_ipv6_magic(saddr, daddr, len, proto, csum)); } -#endif /* !CONFIG_NET */ } static struct kunit_case __refdata checksum_test_cases[] = { -- GitLab From be121ffb384f53e966ee7299ffccc6eeb61bc73d Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Wed, 3 Apr 2024 12:03:46 +0300 Subject: [PATCH 536/989] RDMA/mlx5: Fix port number for counter query in multi-port configuration Set the correct port when querying PPCNT in multi-port configuration. Distinguish between cases where switchdev mode was enabled to multi-port configuration and don't overwrite the queried port to 1 in multi-port case. Fixes: 74b30b3ad5ce ("RDMA/mlx5: Set local port to one when accessing counters") Signed-off-by: Michael Guralnik Link: https://lore.kernel.org/r/9bfcc8ade958b760a51408c3ad654a01b11f7d76.1712134988.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mad.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 0c3c4e64812c5..3e43687a7f6f7 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -188,7 +188,8 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, mdev = dev->mdev; mdev_port_num = 1; } - if (MLX5_CAP_GEN(dev->mdev, num_ports) == 1) { + if (MLX5_CAP_GEN(dev->mdev, num_ports) == 1 && + !mlx5_core_mp_enabled(mdev)) { /* set local port to one for Function-Per-Port HCA. */ mdev = dev->mdev; mdev_port_num = 1; -- GitLab From eaac25d026a14be4fe97683103f2a3ae76bff7bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 5 Apr 2024 09:20:41 +0200 Subject: [PATCH 537/989] MAINTAINERS: Drop Li Yang as their email address stopped working MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When sending a patch to (among others) Li Yang the nxp MTA replied that the address doesn't exist and so the mail couldn't be delivered. The error code was 550, so at least technically that's not a temporal issue. Signed-off-by: Uwe Kleine-König Signed-off-by: David S. Miller --- MAINTAINERS | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 75381386fe4c3..f389bf6d78a3c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2191,7 +2191,6 @@ N: mxs ARM/FREESCALE LAYERSCAPE ARM ARCHITECTURE M: Shawn Guo -M: Li Yang L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git @@ -8523,7 +8522,6 @@ S: Maintained F: drivers/video/fbdev/fsl-diu-fb.* FREESCALE DMA DRIVER -M: Li Yang M: Zhang Wei L: linuxppc-dev@lists.ozlabs.org S: Maintained @@ -8688,10 +8686,9 @@ F: drivers/soc/fsl/qe/tsa.h F: include/dt-bindings/soc/cpm1-fsl,tsa.h FREESCALE QUICC ENGINE UCC ETHERNET DRIVER -M: Li Yang L: netdev@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org -S: Maintained +S: Orphan F: drivers/net/ethernet/freescale/ucc_geth* FREESCALE QUICC ENGINE UCC HDLC DRIVER @@ -8708,10 +8705,9 @@ S: Maintained F: drivers/tty/serial/ucc_uart.c FREESCALE SOC DRIVERS -M: Li Yang L: linuxppc-dev@lists.ozlabs.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -S: Maintained +S: Orphan F: Documentation/devicetree/bindings/misc/fsl,dpaa2-console.yaml F: Documentation/devicetree/bindings/soc/fsl/ F: drivers/soc/fsl/ @@ -8745,10 +8741,9 @@ F: Documentation/devicetree/bindings/sound/fsl,qmc-audio.yaml F: sound/soc/fsl/fsl_qmc_audio.c FREESCALE USB PERIPHERAL DRIVERS -M: Li Yang L: linux-usb@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org -S: Maintained +S: Orphan F: drivers/usb/gadget/udc/fsl* FREESCALE USB PHY DRIVER -- GitLab From d8a6213d70accb403b82924a1c229e733433a5ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Apr 2024 10:30:34 +0000 Subject: [PATCH 538/989] geneve: fix header validation in geneve[6]_xmit_skb syzbot is able to trigger an uninit-value in geneve_xmit() [1] Problem : While most ip tunnel helpers (like ip_tunnel_get_dsfield()) uses skb_protocol(skb, true), pskb_inet_may_pull() is only using skb->protocol. If anything else than ETH_P_IPV6 or ETH_P_IP is found in skb->protocol, pskb_inet_may_pull() does nothing at all. If a vlan tag was provided by the caller (af_packet in the syzbot case), the network header might not point to the correct location, and skb linear part could be smaller than expected. Add skb_vlan_inet_prepare() to perform a complete mac validation. Use this in geneve for the moment, I suspect we need to adopt this more broadly. v4 - Jakub reported v3 broke l2_tos_ttl_inherit.sh selftest - Only call __vlan_get_protocol() for vlan types. Link: https://lore.kernel.org/netdev/20240404100035.3270a7d5@kernel.org/ v2,v3 - Addressed Sabrina comments on v1 and v2 Link: https://lore.kernel.org/netdev/Zg1l9L2BNoZWZDZG@hog/ [1] BUG: KMSAN: uninit-value in geneve_xmit_skb drivers/net/geneve.c:910 [inline] BUG: KMSAN: uninit-value in geneve_xmit+0x302d/0x5420 drivers/net/geneve.c:1030 geneve_xmit_skb drivers/net/geneve.c:910 [inline] geneve_xmit+0x302d/0x5420 drivers/net/geneve.c:1030 __netdev_start_xmit include/linux/netdevice.h:4903 [inline] netdev_start_xmit include/linux/netdevice.h:4917 [inline] xmit_one net/core/dev.c:3531 [inline] dev_hard_start_xmit+0x247/0xa20 net/core/dev.c:3547 __dev_queue_xmit+0x348d/0x52c0 net/core/dev.c:4335 dev_queue_xmit include/linux/netdevice.h:3091 [inline] packet_xmit+0x9c/0x6c0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3081 [inline] packet_sendmsg+0x8bb0/0x9ef0 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 __sys_sendto+0x685/0x830 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2199 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was created at: slab_post_alloc_hook mm/slub.c:3804 [inline] slab_alloc_node mm/slub.c:3845 [inline] kmem_cache_alloc_node+0x613/0xc50 mm/slub.c:3888 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:577 __alloc_skb+0x35b/0x7a0 net/core/skbuff.c:668 alloc_skb include/linux/skbuff.h:1318 [inline] alloc_skb_with_frags+0xc8/0xbf0 net/core/skbuff.c:6504 sock_alloc_send_pskb+0xa81/0xbf0 net/core/sock.c:2795 packet_alloc_skb net/packet/af_packet.c:2930 [inline] packet_snd net/packet/af_packet.c:3024 [inline] packet_sendmsg+0x722d/0x9ef0 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 __sys_sendto+0x685/0x830 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2199 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 CPU: 0 PID: 5033 Comm: syz-executor346 Not tainted 6.9.0-rc1-syzkaller-00005-g928a87efa423 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/29/2024 Fixes: d13f048dd40e ("net: geneve: modify IP header check in geneve6_xmit_skb and geneve_xmit_skb") Reported-by: syzbot+9ee20ec1de7b3168db09@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/000000000000d19c3a06152f9ee4@google.com/ Signed-off-by: Eric Dumazet Cc: Phillip Potter Cc: Sabrina Dubroca Reviewed-by: Sabrina Dubroca Reviewed-by: Phillip Potter Signed-off-by: David S. Miller --- drivers/net/geneve.c | 4 ++-- include/net/ip_tunnels.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 2f6739fe78af2..6c2835086b57e 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -822,7 +822,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!pskb_inet_may_pull(skb)) + if (!skb_vlan_inet_prepare(skb)) return -EINVAL; if (!gs4) @@ -929,7 +929,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!pskb_inet_may_pull(skb)) + if (!skb_vlan_inet_prepare(skb)) return -EINVAL; if (!gs6) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 5cd64bb2104df..c286cc2e766ee 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -361,6 +361,39 @@ static inline bool pskb_inet_may_pull(struct sk_buff *skb) return pskb_network_may_pull(skb, nhlen); } +/* Variant of pskb_inet_may_pull(). + */ +static inline bool skb_vlan_inet_prepare(struct sk_buff *skb) +{ + int nhlen = 0, maclen = ETH_HLEN; + __be16 type = skb->protocol; + + /* Essentially this is skb_protocol(skb, true) + * And we get MAC len. + */ + if (eth_type_vlan(type)) + type = __vlan_get_protocol(skb, type, &maclen); + + switch (type) { +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + nhlen = sizeof(struct ipv6hdr); + break; +#endif + case htons(ETH_P_IP): + nhlen = sizeof(struct iphdr); + break; + } + /* For ETH_P_IPV6/ETH_P_IP we make sure to pull + * a base network header in skb->head. + */ + if (!pskb_may_pull(skb, maclen + nhlen)) + return false; + + skb_set_network_header(skb, maclen); + return true; +} + static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; -- GitLab From 58effa3476536215530c9ec4910ffc981613b413 Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Fri, 5 Apr 2024 13:16:06 +0200 Subject: [PATCH 539/989] s390/ism: fix receive message buffer allocation Since [1], dma_alloc_coherent() does not accept requests for GFP_COMP anymore, even on archs that may be able to fulfill this. Functionality that relied on the receive buffer being a compound page broke at that point: The SMC-D protocol, that utilizes the ism device driver, passes receive buffers to the splice processor in a struct splice_pipe_desc with a single entry list of struct pages. As the buffer is no longer a compound page, the splice processor now rejects requests to handle more than a page worth of data. Replace dma_alloc_coherent() and allocate a buffer with folio_alloc and create a DMA map for it with dma_map_page(). Since only receive buffers on ISM devices use DMA, qualify the mapping as FROM_DEVICE. Since ISM devices are available on arch s390, only and on that arch all DMA is coherent, there is no need to introduce and export some kind of dma_sync_to_cpu() method to be called by the SMC-D protocol layer. Analogously, replace dma_free_coherent by a two step dma_unmap_page, then folio_put to free the receive buffer. [1] https://lore.kernel.org/all/20221113163535.884299-1-hch@lst.de/ Fixes: c08004eede4b ("s390/ism: don't pass bogus GFP_ flags to dma_alloc_coherent") Signed-off-by: Gerd Bayer Signed-off-by: David S. Miller --- drivers/s390/net/ism_drv.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 2c8e964425dc3..affb05521e146 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include "ism.h" @@ -292,13 +294,15 @@ out: static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); - dma_free_coherent(&ism->pdev->dev, dmb->dmb_len, - dmb->cpu_addr, dmb->dma_addr); + dma_unmap_page(&ism->pdev->dev, dmb->dma_addr, dmb->dmb_len, + DMA_FROM_DEVICE); + folio_put(virt_to_folio(dmb->cpu_addr)); } static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { unsigned long bit; + int rc; if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev)) return -EINVAL; @@ -315,14 +319,30 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) test_and_set_bit(dmb->sba_idx, ism->sba_bitmap)) return -EINVAL; - dmb->cpu_addr = dma_alloc_coherent(&ism->pdev->dev, dmb->dmb_len, - &dmb->dma_addr, - GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_NORETRY); - if (!dmb->cpu_addr) - clear_bit(dmb->sba_idx, ism->sba_bitmap); + dmb->cpu_addr = + folio_address(folio_alloc(GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | __GFP_NORETRY, + get_order(dmb->dmb_len))); - return dmb->cpu_addr ? 0 : -ENOMEM; + if (!dmb->cpu_addr) { + rc = -ENOMEM; + goto out_bit; + } + dmb->dma_addr = dma_map_page(&ism->pdev->dev, + virt_to_page(dmb->cpu_addr), 0, + dmb->dmb_len, DMA_FROM_DEVICE); + if (dma_mapping_error(&ism->pdev->dev, dmb->dma_addr)) { + rc = -ENOMEM; + goto out_free; + } + + return 0; + +out_free: + kfree(dmb->cpu_addr); +out_bit: + clear_bit(dmb->sba_idx, ism->sba_bitmap); + return rc; } int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, -- GitLab From b45d0d01da542be280d935d87b71465028cdcb1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bernhard=20Rosenkr=C3=A4nzer?= Date: Fri, 29 Mar 2024 16:28:00 +0100 Subject: [PATCH 540/989] platform/x86: acer-wmi: Add support for Acer PH18-71 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Acer Predator PH18-71 to acer_quirks with predator_v4 to support mode button and fan speed sensor. Signed-off-by: Bernhard Rosenkränzer Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240329152800.29393-1-bero@baylibre.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/acer-wmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c index ee2e164f86b9c..38c932df6446a 100644 --- a/drivers/platform/x86/acer-wmi.c +++ b/drivers/platform/x86/acer-wmi.c @@ -597,6 +597,15 @@ static const struct dmi_system_id acer_quirks[] __initconst = { }, .driver_data = &quirk_acer_predator_v4, }, + { + .callback = dmi_matched, + .ident = "Acer Predator PH18-71", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "Predator PH18-71"), + }, + .driver_data = &quirk_acer_predator_v4, + }, { .callback = set_force_caps, .ident = "Acer Aspire Switch 10E SW3-016", -- GitLab From 7ac10c7d728d75bc9daaa8fade3c7a3273b9a9ff Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 5 Apr 2024 16:55:11 -0700 Subject: [PATCH 541/989] bnxt_en: Fix possible memory leak in bnxt_rdma_aux_device_init() If ulp = kzalloc() fails, the allocated edev will leak because it is not properly assigned and the cleanup path will not be able to free it. Fix it by assigning it properly immediately after allocation. Fixes: 303432211324 ("bnxt_en: Remove runtime interrupt vector allocation") Reviewed-by: Andy Gospodarek Signed-off-by: Vikas Gupta Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index 93f9bd55020f2..a5f9c9090a6b0 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -392,12 +392,13 @@ void bnxt_rdma_aux_device_init(struct bnxt *bp) if (!edev) goto aux_dev_uninit; + aux_priv->edev = edev; + ulp = kzalloc(sizeof(*ulp), GFP_KERNEL); if (!ulp) goto aux_dev_uninit; edev->ulp_tbl = ulp; - aux_priv->edev = edev; bp->edev = edev; bnxt_set_edev_info(edev, bp); -- GitLab From b5ea7d33ba2a42b95b4298d08d2af9cdeeaf0090 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 5 Apr 2024 16:55:12 -0700 Subject: [PATCH 542/989] bnxt_en: Fix error recovery for RoCE ulp client Since runtime MSIXs vector allocation/free has been removed, the L2 driver needs to repopulate the MSIX entries for the ulp client as the irq table may change during the recovery process. Fixes: 303432211324 ("bnxt_en: Remove runtime interrupt vector allocation") Reviewed-by: Andy Gospodarek Signed-off-by: Vikas Gupta Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index a5f9c9090a6b0..195c02dc06830 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -210,6 +210,9 @@ void bnxt_ulp_start(struct bnxt *bp, int err) if (err) return; + if (edev->ulp_tbl->msix_requested) + bnxt_fill_msix_vecs(bp, edev->msix_entries); + if (aux_priv) { struct auxiliary_device *adev; -- GitLab From faa12ca245585379d612736a4b5e98e88481ea59 Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Fri, 5 Apr 2024 16:55:13 -0700 Subject: [PATCH 543/989] bnxt_en: Reset PTP tx_avail after possible firmware reset It is possible that during error recovery and firmware reset, there is a pending TX PTP packet waiting for the timestamp. We need to reset this condition so that after recovery, the tx_avail count for PTP is reset back to the initial value. Otherwise, we may not accept any PTP TX timestamps after recovery. Fixes: 118612d519d8 ("bnxt_en: Add PTP clock APIs, ioctls, and ethtool methods") Reviewed-by: Kalesh AP Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 493b724848c8f..57e61f9631678 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -11758,6 +11758,8 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) /* VF-reps may need to be re-opened after the PF is re-opened */ if (BNXT_PF(bp)) bnxt_vf_reps_open(bp); + if (bp->ptp_cfg) + atomic_set(&bp->ptp_cfg->tx_avail, BNXT_MAX_TX_TS); bnxt_ptp_init_rtc(bp, true); bnxt_ptp_cfg_tstamp_filters(bp); bnxt_cfg_usr_fltrs(bp); -- GitLab From 3c89a068bfd0698a5478f4cf39493595ef757d5e Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 8 Apr 2024 09:02:23 +0200 Subject: [PATCH 544/989] PM: s2idle: Make sure CPUs will wakeup directly on resume s2idle works like a regular suspend with freezing processes and freezing devices. All CPUs except the control CPU go into idle. Once this is completed the control CPU kicks all other CPUs out of idle, so that they reenter the idle loop and then enter s2idle state. The control CPU then issues an swait() on the suspend state and therefore enters the idle loop as well. Due to being kicked out of idle, the other CPUs leave their NOHZ states, which means the tick is active and the corresponding hrtimer is programmed to the next jiffie. On entering s2idle the CPUs shut down their local clockevent device to prevent wakeups. The last CPU which enters s2idle shuts down its local clockevent and freezes timekeeping. On resume, one of the CPUs receives the wakeup interrupt, unfreezes timekeeping and its local clockevent and starts the resume process. At that point all other CPUs are still in s2idle with their clockevents switched off. They only resume when they are kicked by another CPU or after resuming devices and then receiving a device interrupt. That means there is no guarantee that all CPUs will wakeup directly on resume. As a consequence there is no guarantee that timers which are queued on those CPUs and should expire directly after resume, are handled. Also timer list timers which are remotely queued to one of those CPUs after resume will not result in a reprogramming IPI as the tick is active. Queueing a hrtimer will also not result in a reprogramming IPI because the first hrtimer event is already in the past. The recent introduction of the timer pull model (7ee988770326 ("timers: Implement the hierarchical pull model")) amplifies this problem, if the current migrator is one of the non woken up CPUs. When a non pinned timer list timer is queued and the queuing CPU goes idle, it relies on the still suspended migrator CPU to expire the timer which will happen by chance. The problem exists since commit 8d89835b0467 ("PM: suspend: Do not pause cpuidle in the suspend-to-idle path"). There the cpuidle_pause() call which in turn invoked a wakeup for all idle CPUs was moved to a later point in the resume process. This might not be reached or reached very late because it waits on a timer of a still suspended CPU. Address this by kicking all CPUs out of idle after the control CPU returns from swait() so that they resume their timers and restore consistent system state. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218641 Fixes: 8d89835b0467 ("PM: suspend: Do not pause cpuidle in the suspend-to-idle path") Signed-off-by: Anna-Maria Behnsen Reviewed-by: Thomas Gleixner Tested-by: Mario Limonciello Cc: 5.16+ # 5.16+ Acked-by: Peter Zijlstra (Intel) Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index e3ae93bbcb9b5..09f8397bae15f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -106,6 +106,12 @@ static void s2idle_enter(void) swait_event_exclusive(s2idle_wait_head, s2idle_state == S2IDLE_STATE_WAKE); + /* + * Kick all CPUs to ensure that they resume their timers and restore + * consistent system state. + */ + wake_up_all_idle_cpus(); + cpus_read_unlock(); raw_spin_lock_irq(&s2idle_lock); -- GitLab From 5ce344beaca688f4cdea07045e0b8f03dc537e74 Mon Sep 17 00:00:00 2001 From: Adam Dunlap Date: Mon, 18 Mar 2024 16:09:27 -0700 Subject: [PATCH 545/989] x86/apic: Force native_apic_mem_read() to use the MOV instruction When done from a virtual machine, instructions that touch APIC memory must be emulated. By convention, MMIO accesses are typically performed via io.h helpers such as readl() or writeq() to simplify instruction emulation/decoding (ex: in KVM hosts and SEV guests) [0]. Currently, native_apic_mem_read() does not follow this convention, allowing the compiler to emit instructions other than the MOV instruction generated by readl(). In particular, when the kernel is compiled with clang and run as a SEV-ES or SEV-SNP guest, the compiler would emit a TESTL instruction which is not supported by the SEV-ES emulator, causing a boot failure in that environment. It is likely the same problem would happen in a TDX guest as that uses the same instruction emulator as SEV-ES. To make sure all emulators can emulate APIC memory reads via MOV, use the readl() function in native_apic_mem_read(). It is expected that any emulator would support MOV in any addressing mode as it is the most generic and is what is usually emitted currently. The TESTL instruction is emitted when native_apic_mem_read() is inlined into apic_mem_wait_icr_idle(). The emulator comes from insn_decode_mmio() in arch/x86/lib/insn-eval.c. It's not worth it to extend insn_decode_mmio() to support more instructions since, in theory, the compiler could choose to output nearly any instruction for such reads which would bloat the emulator beyond reason. [0] https://lore.kernel.org/all/20220405232939.73860-12-kirill.shutemov@linux.intel.com/ [ bp: Massage commit message, fix typos. ] Signed-off-by: Adam Dunlap Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Thomas Gleixner Reviewed-by: Ard Biesheuvel Tested-by: Kevin Loughlin Cc: Link: https://lore.kernel.org/r/20240318230927.2191933-1-acdunlap@google.com --- arch/x86/include/asm/apic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 94ce0f7c9d3a2..e6ab0cf15ed57 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -13,6 +13,7 @@ #include #include #include +#include #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -98,7 +99,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v) static inline u32 native_apic_mem_read(u32 reg) { - return *((volatile u32 *)(APIC_BASE + reg)); + return readl((void __iomem *)(APIC_BASE + reg)); } static inline void native_apic_mem_eoi(void) -- GitLab From dfe073f8714dc8022b5578510e2288e5292adeb5 Mon Sep 17 00:00:00 2001 From: Minda Chen Date: Mon, 8 Apr 2024 09:29:42 +0800 Subject: [PATCH 546/989] net: stmmac: mmc_core: Add GMAC LPI statistics XGMAC MMC has already added LPI statistics. GMAC MMC lack of these statistics. Add register definition and reading the LPI statistics from registers. Signed-off-by: Minda Chen Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/mmc_core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c index 7eb477faa75a3..b0db5f4e8fe86 100644 --- a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c @@ -79,6 +79,12 @@ #define MMC_RX_FIFO_OVERFLOW 0xd4 #define MMC_RX_VLAN_FRAMES_GB 0xd8 #define MMC_RX_WATCHDOG_ERROR 0xdc + +#define MMC_TX_LPI_USEC 0xec +#define MMC_TX_LPI_TRAN 0xf0 +#define MMC_RX_LPI_USEC 0xf4 +#define MMC_RX_LPI_TRAN 0xf8 + /* IPC*/ #define MMC_RX_IPC_INTR_MASK 0x100 #define MMC_RX_IPC_INTR 0x108 @@ -283,6 +289,8 @@ static void dwmac_mmc_read(void __iomem *mmcaddr, struct stmmac_counters *mmc) mmc->mmc_tx_excessdef += readl(mmcaddr + MMC_TX_EXCESSDEF); mmc->mmc_tx_pause_frame += readl(mmcaddr + MMC_TX_PAUSE_FRAME); mmc->mmc_tx_vlan_frame_g += readl(mmcaddr + MMC_TX_VLAN_FRAME_G); + mmc->mmc_tx_lpi_usec += readl(mmcaddr + MMC_TX_LPI_USEC); + mmc->mmc_tx_lpi_tran += readl(mmcaddr + MMC_TX_LPI_TRAN); /* MMC RX counter registers */ mmc->mmc_rx_framecount_gb += readl(mmcaddr + MMC_RX_FRAMECOUNT_GB); @@ -316,6 +324,9 @@ static void dwmac_mmc_read(void __iomem *mmcaddr, struct stmmac_counters *mmc) mmc->mmc_rx_fifo_overflow += readl(mmcaddr + MMC_RX_FIFO_OVERFLOW); mmc->mmc_rx_vlan_frames_gb += readl(mmcaddr + MMC_RX_VLAN_FRAMES_GB); mmc->mmc_rx_watchdog_error += readl(mmcaddr + MMC_RX_WATCHDOG_ERROR); + mmc->mmc_rx_lpi_usec += readl(mmcaddr + MMC_RX_LPI_USEC); + mmc->mmc_rx_lpi_tran += readl(mmcaddr + MMC_RX_LPI_TRAN); + /* IPv4 */ mmc->mmc_rx_ipv4_gd += readl(mmcaddr + MMC_RX_IPV4_GD); mmc->mmc_rx_ipv4_hderr += readl(mmcaddr + MMC_RX_IPV4_HDERR); -- GitLab From ff20393bdc4537c5e044e3002d7f25a45f0d0f98 Mon Sep 17 00:00:00 2001 From: Minda Chen Date: Mon, 8 Apr 2024 09:29:43 +0800 Subject: [PATCH 547/989] net: stmmac: mmc_core: Add GMAC mmc tx/rx missing statistics The missing statistics including Rx_Receive_Error_Packets and Tx_OSize_Packets_Good. Signed-off-by: Minda Chen Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/mmc.h | 2 ++ drivers/net/ethernet/stmicro/stmmac/mmc_core.c | 4 ++++ drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 2 ++ 3 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/mmc.h b/drivers/net/ethernet/stmicro/stmmac/mmc.h index dff02d75d5197..5d1ea3e07459a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/mmc.h +++ b/drivers/net/ethernet/stmicro/stmmac/mmc.h @@ -52,6 +52,7 @@ struct stmmac_counters { unsigned int mmc_tx_excessdef; unsigned int mmc_tx_pause_frame; unsigned int mmc_tx_vlan_frame_g; + unsigned int mmc_tx_oversize_g; unsigned int mmc_tx_lpi_usec; unsigned int mmc_tx_lpi_tran; @@ -80,6 +81,7 @@ struct stmmac_counters { unsigned int mmc_rx_fifo_overflow; unsigned int mmc_rx_vlan_frames_gb; unsigned int mmc_rx_watchdog_error; + unsigned int mmc_rx_error; unsigned int mmc_rx_lpi_usec; unsigned int mmc_rx_lpi_tran; unsigned int mmc_rx_discard_frames_gb; diff --git a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c index b0db5f4e8fe86..0fab842902a85 100644 --- a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c @@ -53,6 +53,7 @@ #define MMC_TX_EXCESSDEF 0x6c #define MMC_TX_PAUSE_FRAME 0x70 #define MMC_TX_VLAN_FRAME_G 0x74 +#define MMC_TX_OVERSIZE_G 0x78 /* MMC RX counter registers */ #define MMC_RX_FRAMECOUNT_GB 0x80 @@ -79,6 +80,7 @@ #define MMC_RX_FIFO_OVERFLOW 0xd4 #define MMC_RX_VLAN_FRAMES_GB 0xd8 #define MMC_RX_WATCHDOG_ERROR 0xdc +#define MMC_RX_ERROR 0xe0 #define MMC_TX_LPI_USEC 0xec #define MMC_TX_LPI_TRAN 0xf0 @@ -289,6 +291,7 @@ static void dwmac_mmc_read(void __iomem *mmcaddr, struct stmmac_counters *mmc) mmc->mmc_tx_excessdef += readl(mmcaddr + MMC_TX_EXCESSDEF); mmc->mmc_tx_pause_frame += readl(mmcaddr + MMC_TX_PAUSE_FRAME); mmc->mmc_tx_vlan_frame_g += readl(mmcaddr + MMC_TX_VLAN_FRAME_G); + mmc->mmc_tx_oversize_g += readl(mmcaddr + MMC_TX_OVERSIZE_G); mmc->mmc_tx_lpi_usec += readl(mmcaddr + MMC_TX_LPI_USEC); mmc->mmc_tx_lpi_tran += readl(mmcaddr + MMC_TX_LPI_TRAN); @@ -324,6 +327,7 @@ static void dwmac_mmc_read(void __iomem *mmcaddr, struct stmmac_counters *mmc) mmc->mmc_rx_fifo_overflow += readl(mmcaddr + MMC_RX_FIFO_OVERFLOW); mmc->mmc_rx_vlan_frames_gb += readl(mmcaddr + MMC_RX_VLAN_FRAMES_GB); mmc->mmc_rx_watchdog_error += readl(mmcaddr + MMC_RX_WATCHDOG_ERROR); + mmc->mmc_rx_error += readl(mmcaddr + MMC_RX_ERROR); mmc->mmc_rx_lpi_usec += readl(mmcaddr + MMC_RX_LPI_USEC); mmc->mmc_rx_lpi_tran += readl(mmcaddr + MMC_RX_LPI_TRAN); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index e1537a57815f3..542e2633a6f52 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -212,6 +212,7 @@ static const struct stmmac_stats stmmac_mmc[] = { STMMAC_MMC_STAT(mmc_tx_excessdef), STMMAC_MMC_STAT(mmc_tx_pause_frame), STMMAC_MMC_STAT(mmc_tx_vlan_frame_g), + STMMAC_MMC_STAT(mmc_tx_oversize_g), STMMAC_MMC_STAT(mmc_tx_lpi_usec), STMMAC_MMC_STAT(mmc_tx_lpi_tran), STMMAC_MMC_STAT(mmc_rx_framecount_gb), @@ -238,6 +239,7 @@ static const struct stmmac_stats stmmac_mmc[] = { STMMAC_MMC_STAT(mmc_rx_fifo_overflow), STMMAC_MMC_STAT(mmc_rx_vlan_frames_gb), STMMAC_MMC_STAT(mmc_rx_watchdog_error), + STMMAC_MMC_STAT(mmc_rx_error), STMMAC_MMC_STAT(mmc_rx_lpi_usec), STMMAC_MMC_STAT(mmc_rx_lpi_tran), STMMAC_MMC_STAT(mmc_rx_discard_frames_gb), -- GitLab From c1d11fc2c8320871b40730991071dd0a0b405bc8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2024 09:46:01 +0200 Subject: [PATCH 548/989] irqflags: Explicitly ignore lockdep_hrtimer_exit() argument When building with 'make W=1' but CONFIG_TRACE_IRQFLAGS=n, the unused argument to lockdep_hrtimer_exit() causes a warning: kernel/time/hrtimer.c:1655:14: error: variable 'expires_in_hardirq' set but not used [-Werror=unused-but-set-variable] This is intentional behavior, so add a cast to void to shut up the warning. Fixes: 73d20564e0dc ("hrtimer: Don't dereference the hrtimer pointer after the callback") Reported-by: kernel test robot Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240408074609.3170807-1-arnd@kernel.org Closes: https://lore.kernel.org/oe-kbuild-all/202311191229.55QXHVc6-lkp@intel.com/ --- include/linux/irqflags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 147feebd508ca..3f003d5fde534 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -114,7 +114,7 @@ do { \ # define lockdep_softirq_enter() do { } while (0) # define lockdep_softirq_exit() do { } while (0) # define lockdep_hrtimer_enter(__hrtimer) false -# define lockdep_hrtimer_exit(__context) do { } while (0) +# define lockdep_hrtimer_exit(__context) do { (void)(__context); } while (0) # define lockdep_posixtimer_enter() do { } while (0) # define lockdep_posixtimer_exit() do { } while (0) # define lockdep_irq_work_enter(__work) do { } while (0) -- GitLab From fa1f51162338b3e2f520d4bfedc42b3b2e00da6d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 19 Mar 2024 19:20:50 +0100 Subject: [PATCH 549/989] locking: Make rwsem_assert_held_write_nolockdep() build with PREEMPT_RT=y The commit cited below broke the build for PREEMPT_RT because rwsem_assert_held_write_nolockdep() passes a struct rw_semaphore but rw_base_assert_held_write() expects struct rwbase_rt. Fixing the type alone leads to the problem that WARN_ON() is not found because bug.h is missing. In order to resolve this: - Keep the assert (WARN_ON()) in rwsem.h (not rwbase_rt.h) - Make rwsem_assert_held_write_nolockdep() do the implementation specific (rw_base) writer check. - Replace the "inline" with __always_inline which was used before. Fixes: f70405afc99b1 ("locking: Add rwsem_assert_held() and rwsem_assert_held_write()") Reported-by: Clark Williams Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Waiman Long Link: https://lore.kernel.org/r/20240319182050.U4AzUF3I@linutronix.de --- include/linux/rwbase_rt.h | 4 ++-- include/linux/rwsem.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/rwbase_rt.h b/include/linux/rwbase_rt.h index 29c4e4f243e47..f2394a409c9d5 100644 --- a/include/linux/rwbase_rt.h +++ b/include/linux/rwbase_rt.h @@ -31,9 +31,9 @@ static __always_inline bool rw_base_is_locked(const struct rwbase_rt *rwb) return atomic_read(&rwb->readers) != READER_BIAS; } -static inline void rw_base_assert_held_write(const struct rwbase_rt *rwb) +static __always_inline bool rw_base_is_write_locked(const struct rwbase_rt *rwb) { - WARN_ON(atomic_read(&rwb->readers) != WRITER_BIAS); + return atomic_read(&rwb->readers) == WRITER_BIAS; } static __always_inline bool rw_base_is_contended(const struct rwbase_rt *rwb) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 4f1c18992f768..c8b543d428b0a 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -167,14 +167,14 @@ static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem) return rw_base_is_locked(&sem->rwbase); } -static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) +static __always_inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!rwsem_is_locked(sem)); } -static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) +static __always_inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) { - rw_base_assert_held_write(sem); + WARN_ON(!rw_base_is_write_locked(&sem->rwbase)); } static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) -- GitLab From d730192ff0246356a2d7e63ff5bd501060670eec Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 6 Apr 2024 13:40:52 +0200 Subject: [PATCH 550/989] ACPI: scan: Do not increase dep_unmet for already met dependencies On the Toshiba Encore WT10-A tablet the BATC battery ACPI device depends on 3 other devices: Name (_DEP, Package (0x03) // _DEP: Dependencies { I2C1, GPO2, GPO0 }) acpi_scan_check_dep() adds all 3 of these to the acpi_dep_list and then before an acpi_device is created for the BATC handle (and thus before acpi_scan_dep_init() runs) acpi_scan_clear_dep() gets called for both GPIO depenencies, with free_when_met not set for the dependencies. Since there is no adev for BATC yet, there also is no dep_unmet to decrement. The only result of acpi_scan_clear_dep() in this case is dep->met getting set. Soon after acpi_scan_clear_dep() has been called for the GPIO dependencies the acpi_device gets created for the BATC handle and acpi_scan_dep_init() runs, this sees 3 dependencies on the acpi_dep_list and initializes unmet_dep to 3. Later when the dependency for I2C1 is met unmet_dep becomes 2, but since the 2 GPIO deps where already met it never becomes 0 causing battery monitoring to not work. Fix this by modifying acpi_scan_dep_init() to not increase dep_met for dependencies which have already been marked as being met. Fixes: 3ba12d8de3fa ("ACPI: scan: Reduce overhead related to devices with dependencies") Signed-off-by: Hans de Goede Cc: 6.5+ # 6.5+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/scan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 7c157bf926956..d1464324de951 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1843,7 +1843,8 @@ static void acpi_scan_dep_init(struct acpi_device *adev) if (dep->honor_dep) adev->flags.honor_deps = 1; - adev->dep_unmet++; + if (!dep->met) + adev->dep_unmet++; } } } -- GitLab From 8ab58f6841b19423231c5db3378691ec80c778f8 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 14 Mar 2024 16:49:43 +0100 Subject: [PATCH 551/989] gpu: host1x: Do not setup DMA for virtual devices The host1x devices are virtual compound devices and do not perform DMA accesses themselves, so they do not need to be set up for DMA. Ideally we would also not need to set up DMA masks for the virtual devices, but we currently still need those for legacy support on old hardware. Tested-by: Jon Hunter Acked-by: Jon Hunter Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20240314154943.2487549-1-thierry.reding@gmail.com --- drivers/gpu/host1x/bus.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c index 783975d1384fc..7c52757a89db9 100644 --- a/drivers/gpu/host1x/bus.c +++ b/drivers/gpu/host1x/bus.c @@ -351,11 +351,6 @@ static int host1x_device_uevent(const struct device *dev, return 0; } -static int host1x_dma_configure(struct device *dev) -{ - return of_dma_configure(dev, dev->of_node, true); -} - static const struct dev_pm_ops host1x_device_pm_ops = { .suspend = pm_generic_suspend, .resume = pm_generic_resume, @@ -369,7 +364,6 @@ const struct bus_type host1x_bus_type = { .name = "host1x", .match = host1x_device_match, .uevent = host1x_device_uevent, - .dma_configure = host1x_dma_configure, .pm = &host1x_device_pm_ops, }; @@ -458,8 +452,6 @@ static int host1x_device_add(struct host1x *host1x, device->dev.bus = &host1x_bus_type; device->dev.parent = host1x->dev; - of_dma_configure(&device->dev, host1x->dev->of_node, true); - device->dev.dma_parms = &device->dma_parms; dma_set_max_seg_size(&device->dev, UINT_MAX); -- GitLab From aca1a5287ea328fd1f7e2bfa6806646486d86a70 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 28 Mar 2024 09:25:40 +0530 Subject: [PATCH 552/989] ACPI: bus: allow _UID matching for integer zero Commit b2b32a173881 ("ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types") added _UID matching support for both integer and string types, which satisfies NULL @uid2 argument for string types using inversion, but this logic prevents _UID comparision in case the argument is integer 0, which may result in false positives. Fix this using _Generic(), which will allow NULL @uid2 argument for string types as well as _UID matching for all possible integer values. Fixes: b2b32a173881 ("ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types") Signed-off-by: Raag Jadav [ rjw: Comment adjustment ] Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 5de954e2b18aa..e7796f373d0da 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -911,17 +911,19 @@ static inline bool acpi_int_uid_match(struct acpi_device *adev, u64 uid2) * acpi_dev_hid_uid_match - Match device by supplied HID and UID * @adev: ACPI device to match. * @hid2: Hardware ID of the device. - * @uid2: Unique ID of the device, pass 0 or NULL to not check _UID. + * @uid2: Unique ID of the device, pass NULL to not check _UID. * * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2 * will be treated as a match. If user wants to validate @uid2, it should be * done before calling this function. * - * Returns: %true if matches or @uid2 is 0 or NULL, %false otherwise. + * Returns: %true if matches or @uid2 is NULL, %false otherwise. */ #define acpi_dev_hid_uid_match(adev, hid2, uid2) \ (acpi_dev_hid_match(adev, hid2) && \ - (!(uid2) || acpi_dev_uid_match(adev, uid2))) + /* Distinguish integer 0 from NULL @uid2 */ \ + (_Generic(uid2, ACPI_STR_TYPES(!(uid2)), default: 0) || \ + acpi_dev_uid_match(adev, uid2))) void acpi_dev_clear_dependencies(struct acpi_device *supplier); bool acpi_dev_ready_for_enumeration(const struct acpi_device *device); -- GitLab From 3eadd887dbac1df8f25f701e5d404d1b90fd0fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 4 Apr 2024 23:33:25 +0300 Subject: [PATCH 553/989] drm/client: Fully protect modes[] with dev->mode_config.mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The modes[] array contains pointers to modes on the connectors' mode lists, which are protected by dev->mode_config.mutex. Thus we need to extend modes[] the same protection or by the time we use it the elements may already be pointing to freed/reused memory. Cc: stable@vger.kernel.org Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10583 Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240404203336.10454-2-ville.syrjala@linux.intel.com Reviewed-by: Dmitry Baryshkov Reviewed-by: Jani Nikula Reviewed-by: Thomas Zimmermann --- drivers/gpu/drm/drm_client_modeset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_client_modeset.c b/drivers/gpu/drm/drm_client_modeset.c index 871e4e2129d6d..0683a129b3628 100644 --- a/drivers/gpu/drm/drm_client_modeset.c +++ b/drivers/gpu/drm/drm_client_modeset.c @@ -777,6 +777,7 @@ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int total_modes_count = 0; struct drm_client_offset *offsets; unsigned int connector_count = 0; + /* points to modes protected by mode_config.mutex */ struct drm_display_mode **modes; struct drm_crtc **crtcs; int i, ret = 0; @@ -845,7 +846,6 @@ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, 0, width, height); } - mutex_unlock(&dev->mode_config.mutex); drm_client_modeset_release(client); @@ -875,6 +875,7 @@ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, modeset->y = offset->y; } } + mutex_unlock(&dev->mode_config.mutex); mutex_unlock(&client->modeset_mutex); out: -- GitLab From 5864e479ca4344f3a5df8074524da24c960f440b Mon Sep 17 00:00:00 2001 From: David McFarland Date: Thu, 4 Apr 2024 08:41:45 -0300 Subject: [PATCH 554/989] platform/x86/intel/hid: Don't wake on 5-button releases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If, for example, the power button is configured to suspend, holding it and releasing it after the machine has suspended, will wake the machine. Also on some machines, power button release events are sent during hibernation, even if the button wasn't used to hibernate the machine. This causes hibernation to be aborted. Fixes: 0c4cae1bc00d ("PM: hibernate: Avoid missing wakeup events during hibernation") Signed-off-by: David McFarland Tested-by: Enrik Berkhan Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/878r1tpd6u.fsf_-_@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/hid.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/hid.c b/drivers/platform/x86/intel/hid.c index 7457ca2b27a60..9ffbdc988fe50 100644 --- a/drivers/platform/x86/intel/hid.c +++ b/drivers/platform/x86/intel/hid.c @@ -504,6 +504,7 @@ static void notify_handler(acpi_handle handle, u32 event, void *context) struct platform_device *device = context; struct intel_hid_priv *priv = dev_get_drvdata(&device->dev); unsigned long long ev_index; + struct key_entry *ke; int err; /* @@ -545,11 +546,15 @@ static void notify_handler(acpi_handle handle, u32 event, void *context) if (event == 0xc0 || !priv->array) return; - if (!sparse_keymap_entry_from_scancode(priv->array, event)) { + ke = sparse_keymap_entry_from_scancode(priv->array, event); + if (!ke) { dev_info(&device->dev, "unknown event 0x%x\n", event); return; } + if (ke->type == KE_IGNORE) + return; + wakeup: pm_wakeup_hard_event(&device->dev); -- GitLab From 79ce88064bb04ec62c4e9e4da4614d36906f8a04 Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Fri, 5 Apr 2024 17:56:30 +0530 Subject: [PATCH 555/989] platform/x86/intel/hid: Add Lunar Lake and Arrow Lake support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add INTC107B for Lunar Lake and INTC10CB for Arrow Lake ACPI devices IDs. Signed-off-by: Sumeet Pawnikar Link: https://lore.kernel.org/r/20240405122630.32154-1-sumeet.r.pawnikar@intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/hid.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/intel/hid.c b/drivers/platform/x86/intel/hid.c index 9ffbdc988fe50..c7a8276458640 100644 --- a/drivers/platform/x86/intel/hid.c +++ b/drivers/platform/x86/intel/hid.c @@ -49,6 +49,8 @@ static const struct acpi_device_id intel_hid_ids[] = { {"INTC1076", 0}, {"INTC1077", 0}, {"INTC1078", 0}, + {"INTC107B", 0}, + {"INTC10CB", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, intel_hid_ids); -- GitLab From 592780b8391fe31f129ef4823c1513528f4dcb76 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:13 -0700 Subject: [PATCH 556/989] cxl: Fix retrieving of access_coordinates in PCIe path Current loop in cxl_endpoint_get_perf_coordinates() incorrectly assumes the Root Port (RP) dport is the one with generic port access_coordinate. However those coordinates are one level up in the Host Bridge (HB). Current code causes the computation code to pick up 0s as the coordinates and cause minimal bandwidth to result in 0. Add check to skip RP when combining coordinates. Fixes: 14a6960b3e92 ("cxl: Add helper function that calculate performance data for downstream ports") Reported-by: Jonathan Cameron Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-3-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 6cbde50a742bf..7aadcec4fc64d 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2165,6 +2165,11 @@ int cxl_hb_get_perf_coordinates(struct cxl_port *port, return 0; } +static bool parent_port_is_cxl_root(struct cxl_port *port) +{ + return is_cxl_root(to_cxl_port(port->dev.parent)); +} + /** * cxl_endpoint_get_perf_coordinates - Retrieve performance numbers stored in dports * of CXL path @@ -2184,27 +2189,31 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct cxl_dport *dport; struct pci_dev *pdev; unsigned int bw; + bool is_cxl_root; if (!is_cxl_endpoint(port)) return -EINVAL; - dport = iter->parent_dport; - /* - * Exit the loop when the parent port of the current port is cxl root. - * The iterative loop starts at the endpoint and gathers the - * latency of the CXL link from the current iter to the next downstream - * port each iteration. If the parent is cxl root then there is - * nothing to gather. + * Exit the loop when the parent port of the current iter port is cxl + * root. The iterative loop starts at the endpoint and gathers the + * latency of the CXL link from the current device/port to the connected + * downstream port each iteration. */ - while (!is_cxl_root(to_cxl_port(iter->dev.parent))) { - cxl_coordinates_combine(&c, &c, &dport->sw_coord); + do { + dport = iter->parent_dport; + iter = to_cxl_port(iter->dev.parent); + is_cxl_root = parent_port_is_cxl_root(iter); + + /* + * There's no valid access_coordinate for a root port since RPs do not + * have CDAT and therefore needs to be skipped. + */ + if (!is_cxl_root) + cxl_coordinates_combine(&c, &c, &dport->sw_coord); c.write_latency += dport->link_latency; c.read_latency += dport->link_latency; - - iter = to_cxl_port(iter->dev.parent); - dport = iter->parent_dport; - } + } while (!is_cxl_root); /* Get the calculated PCI paths bandwidth */ pdev = to_pci_dev(port->uport_dev->parent); -- GitLab From 51293c565cf4b8d57c154efadb57b17866c74bcb Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:14 -0700 Subject: [PATCH 557/989] cxl: Fix incorrect region perf data calculation Current math in cxl_region_perf_data_calculate divides the latency by 1000 every time the function gets called. This causes the region latency to be divided by 1000 per memory device and the math is incorrect. This is user visible as the latency access_coordinate exposed via sysfs will show incorrect latency data. Normalize values from CDAT to nanoseconds. Adjust sub-nanoseconds latency to at least 1. Remove adjustment of perf numbers from the generic target since hmat handling code has already normalized those numbers. Now all computation and stored numbers should be in nanoseconds. cxl_hb_get_perf_coordinates() is removed and HB coords are calculated in the port access_coordinate calculation path since it no longer need to be treated special. Fixes: 3d9f4a197230 ("cxl/region: Calculate performance data for a region") Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-4-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 13 +----- drivers/cxl/core/cdat.c | 94 ++++++++++++++++++----------------------- drivers/cxl/core/port.c | 36 ++-------------- drivers/cxl/cxl.h | 2 - 4 files changed, 45 insertions(+), 100 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index af5cb818f84d6..566c387d43852 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -525,22 +525,11 @@ static int get_genport_coordinates(struct device *dev, struct cxl_dport *dport) { struct acpi_device *hb = to_cxl_host_bridge(NULL, dev); u32 uid; - int rc; if (kstrtou32(acpi_device_uid(hb), 0, &uid)) return -EINVAL; - rc = acpi_get_genport_coordinates(uid, dport->hb_coord); - if (rc < 0) - return rc; - - /* Adjust back to picoseconds from nanoseconds */ - for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { - dport->hb_coord[i].read_latency *= 1000; - dport->hb_coord[i].write_latency *= 1000; - } - - return 0; + return acpi_get_genport_coordinates(uid, dport->hb_coord); } static int add_host_bridge_dport(struct device *match, void *arg) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index eddbbe21450ca..f66b493b5d3c5 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -20,6 +20,36 @@ struct dsmas_entry { int qos_class; }; +static u32 cdat_normalize(u16 entry, u64 base, u8 type) +{ + u32 value; + + /* + * Check for invalid and overflow values + */ + if (entry == 0xffff || !entry) + return 0; + else if (base > (UINT_MAX / (entry))) + return 0; + + /* + * CDAT fields follow the format of HMAT fields. See table 5 Device + * Scoped Latency and Bandwidth Information Structure in Coherent Device + * Attribute Table (CDAT) Specification v1.01. + */ + value = entry * base; + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + case ACPI_HMAT_READ_LATENCY: + case ACPI_HMAT_WRITE_LATENCY: + value = DIV_ROUND_UP(value, 1000); + break; + default: + break; + } + return value; +} + static int cdat_dsmas_handler(union acpi_subtable_headers *header, void *arg, const unsigned long end) { @@ -97,7 +127,6 @@ static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, __le16 le_val; u64 val; u16 len; - int rc; len = le16_to_cpu((__force __le16)hdr->length); if (len != size || (unsigned long)hdr + len > end) { @@ -124,10 +153,8 @@ static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, le_base = (__force __le64)dslbis->entry_base_unit; le_val = (__force __le16)dslbis->entry[0]; - rc = check_mul_overflow(le64_to_cpu(le_base), - le16_to_cpu(le_val), &val); - if (rc) - pr_warn("DSLBIS value overflowed.\n"); + val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base), + dslbis->data_type); cxl_access_coordinate_set(&dent->coord, dslbis->data_type, val); @@ -164,7 +191,6 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, struct xarray *dsmas_xa) { struct access_coordinate ep_c; - struct access_coordinate coord[ACCESS_COORDINATE_MAX]; struct dsmas_entry *dent; int valid_entries = 0; unsigned long index; @@ -176,12 +202,6 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, return rc; } - rc = cxl_hb_get_perf_coordinates(port, coord); - if (rc) { - dev_dbg(&port->dev, "Failed to retrieve hb perf coordinates.\n"); - return rc; - } - struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port); if (!cxl_root) @@ -194,18 +214,9 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, int qos_class; cxl_coordinates_combine(&dent->coord, &dent->coord, &ep_c); - /* - * Keeping the host bridge coordinates separate from the dsmas - * coordinates in order to allow calculation of access class - * 0 and 1 for region later. - */ - cxl_coordinates_combine(&coord[ACCESS_COORDINATE_CPU], - &coord[ACCESS_COORDINATE_CPU], - &dent->coord); dent->entries = 1; - rc = cxl_root->ops->qos_class(cxl_root, - &coord[ACCESS_COORDINATE_CPU], - 1, &qos_class); + rc = cxl_root->ops->qos_class(cxl_root, &dent->coord, 1, + &qos_class); if (rc != 1) continue; @@ -461,10 +472,8 @@ static int cdat_sslbis_handler(union acpi_subtable_headers *header, void *arg, le_base = (__force __le64)tbl->sslbis_header.entry_base_unit; le_val = (__force __le16)tbl->entries[i].latency_or_bandwidth; - - if (check_mul_overflow(le64_to_cpu(le_base), - le16_to_cpu(le_val), &val)) - dev_warn(dev, "SSLBIS value overflowed!\n"); + val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base), + sslbis->data_type); xa_for_each(&port->dports, index, dport) { if (dsp_id == ACPI_CDAT_SSLBIS_ANY_PORT || @@ -521,17 +530,13 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_port *port = cxlmd->endpoint; struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); - struct access_coordinate hb_coord[ACCESS_COORDINATE_MAX]; - struct access_coordinate coord; struct range dpa = { .start = cxled->dpa_res->start, .end = cxled->dpa_res->end, }; struct cxl_dpa_perf *perf; - int rc; switch (cxlr->mode) { case CXL_DECODER_RAM: @@ -549,35 +554,16 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, if (!range_contains(&perf->dpa_range, &dpa)) return; - rc = cxl_hb_get_perf_coordinates(port, hb_coord); - if (rc) { - dev_dbg(&port->dev, "Failed to retrieve hb perf coordinates.\n"); - return; - } - for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { - /* Pickup the host bridge coords */ - cxl_coordinates_combine(&coord, &hb_coord[i], &perf->coord); - /* Get total bandwidth and the worst latency for the cxl region */ cxlr->coord[i].read_latency = max_t(unsigned int, cxlr->coord[i].read_latency, - coord.read_latency); + perf->coord.read_latency); cxlr->coord[i].write_latency = max_t(unsigned int, cxlr->coord[i].write_latency, - coord.write_latency); - cxlr->coord[i].read_bandwidth += coord.read_bandwidth; - cxlr->coord[i].write_bandwidth += coord.write_bandwidth; - - /* - * Convert latency to nanosec from picosec to be consistent - * with the resulting latency coordinates computed by the - * HMAT_REPORTING code. - */ - cxlr->coord[i].read_latency = - DIV_ROUND_UP(cxlr->coord[i].read_latency, 1000); - cxlr->coord[i].write_latency = - DIV_ROUND_UP(cxlr->coord[i].write_latency, 1000); + perf->coord.write_latency); + cxlr->coord[i].read_bandwidth += perf->coord.read_bandwidth; + cxlr->coord[i].write_bandwidth += perf->coord.write_bandwidth; } } diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 7aadcec4fc64d..c7c00eb373af6 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2133,38 +2133,6 @@ bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd) } EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, CXL); -/** - * cxl_hb_get_perf_coordinates - Retrieve performance numbers between initiator - * and host bridge - * - * @port: endpoint cxl_port - * @coord: output access coordinates - * - * Return: errno on failure, 0 on success. - */ -int cxl_hb_get_perf_coordinates(struct cxl_port *port, - struct access_coordinate *coord) -{ - struct cxl_port *iter = port; - struct cxl_dport *dport; - - if (!is_cxl_endpoint(port)) - return -EINVAL; - - dport = iter->parent_dport; - while (iter && !is_cxl_root(to_cxl_port(iter->dev.parent))) { - iter = to_cxl_port(iter->dev.parent); - dport = iter->parent_dport; - } - - coord[ACCESS_COORDINATE_LOCAL] = - dport->hb_coord[ACCESS_COORDINATE_LOCAL]; - coord[ACCESS_COORDINATE_CPU] = - dport->hb_coord[ACCESS_COORDINATE_CPU]; - - return 0; -} - static bool parent_port_is_cxl_root(struct cxl_port *port) { return is_cxl_root(to_cxl_port(port->dev.parent)); @@ -2215,6 +2183,10 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, c.read_latency += dport->link_latency; } while (!is_cxl_root); + dport = iter->parent_dport; + /* Retrieve HB coords */ + cxl_coordinates_combine(&c, &c, dport->hb_coord); + /* Get the calculated PCI paths bandwidth */ pdev = to_pci_dev(port->uport_dev->parent); bw = pcie_bandwidth_available(pdev, NULL, NULL, NULL); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 534e25e2f0a48..ed02373ce3d90 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -884,8 +884,6 @@ void cxl_switch_parse_cdat(struct cxl_port *port); int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct access_coordinate *coord); -int cxl_hb_get_perf_coordinates(struct cxl_port *port, - struct access_coordinate *coord); void cxl_region_perf_data_calculate(struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled); -- GitLab From 001c5d19341a39cb683ab0a18ce4b662a09d96a0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:15 -0700 Subject: [PATCH 558/989] cxl: Consolidate dport access_coordinate ->hb_coord and ->sw_coord into ->coord The driver stores access_coordinate for host bridge in ->hb_coord and switch CDAT access_coordinate in ->sw_coord. Since neither of these access_coordinate clobber each other, the variable name can be consolidated into ->coord to simplify the code. Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-5-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 2 +- drivers/cxl/core/cdat.c | 74 +++++++++++++++++++++++------------- drivers/cxl/core/port.c | 48 +++++++++++++++++------ drivers/cxl/cxl.h | 6 +-- drivers/cxl/cxlmem.h | 2 +- tools/testing/cxl/test/cxl.c | 10 +++-- 6 files changed, 94 insertions(+), 48 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 566c387d43852..cb8c155a2c9b3 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -529,7 +529,7 @@ static int get_genport_coordinates(struct device *dev, struct cxl_dport *dport) if (kstrtou32(acpi_device_uid(hb), 0, &uid)) return -EINVAL; - return acpi_get_genport_coordinates(uid, dport->hb_coord); + return acpi_get_genport_coordinates(uid, dport->coord); } static int add_host_bridge_dport(struct device *match, void *arg) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index f66b493b5d3c5..bb83867d9fec9 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -14,7 +14,7 @@ struct dsmas_entry { struct range dpa_range; u8 handle; - struct access_coordinate coord; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; int entries; int qos_class; @@ -88,8 +88,8 @@ static int cdat_dsmas_handler(union acpi_subtable_headers *header, void *arg, return 0; } -static void cxl_access_coordinate_set(struct access_coordinate *coord, - int access, unsigned int val) +static void __cxl_access_coordinate_set(struct access_coordinate *coord, + int access, unsigned int val) { switch (access) { case ACPI_HMAT_ACCESS_LATENCY: @@ -115,6 +115,13 @@ static void cxl_access_coordinate_set(struct access_coordinate *coord, } } +static void cxl_access_coordinate_set(struct access_coordinate *coord, + int access, unsigned int val) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + __cxl_access_coordinate_set(&coord[i], access, val); +} + static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, const unsigned long end) { @@ -156,7 +163,7 @@ static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base), dslbis->data_type); - cxl_access_coordinate_set(&dent->coord, dslbis->data_type, val); + cxl_access_coordinate_set(dent->coord, dslbis->data_type, val); return 0; } @@ -190,13 +197,13 @@ static int cxl_cdat_endpoint_process(struct cxl_port *port, static int cxl_port_perf_data_calculate(struct cxl_port *port, struct xarray *dsmas_xa) { - struct access_coordinate ep_c; + struct access_coordinate ep_c[ACCESS_COORDINATE_MAX]; struct dsmas_entry *dent; int valid_entries = 0; unsigned long index; int rc; - rc = cxl_endpoint_get_perf_coordinates(port, &ep_c); + rc = cxl_endpoint_get_perf_coordinates(port, ep_c); if (rc) { dev_dbg(&port->dev, "Failed to retrieve ep perf coordinates.\n"); return rc; @@ -213,10 +220,11 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, xa_for_each(dsmas_xa, index, dent) { int qos_class; - cxl_coordinates_combine(&dent->coord, &dent->coord, &ep_c); + cxl_coordinates_combine(dent->coord, dent->coord, ep_c); dent->entries = 1; - rc = cxl_root->ops->qos_class(cxl_root, &dent->coord, 1, - &qos_class); + rc = cxl_root->ops->qos_class(cxl_root, + &dent->coord[ACCESS_COORDINATE_CPU], + 1, &qos_class); if (rc != 1) continue; @@ -233,14 +241,17 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, static void update_perf_entry(struct device *dev, struct dsmas_entry *dent, struct cxl_dpa_perf *dpa_perf) { + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + dpa_perf->coord[i] = dent->coord[i]; dpa_perf->dpa_range = dent->dpa_range; - dpa_perf->coord = dent->coord; dpa_perf->qos_class = dent->qos_class; dev_dbg(dev, "DSMAS: dpa: %#llx qos: %d read_bw: %d write_bw %d read_lat: %d write_lat: %d\n", dent->dpa_range.start, dpa_perf->qos_class, - dent->coord.read_bandwidth, dent->coord.write_bandwidth, - dent->coord.read_latency, dent->coord.write_latency); + dent->coord[ACCESS_COORDINATE_CPU].read_bandwidth, + dent->coord[ACCESS_COORDINATE_CPU].write_bandwidth, + dent->coord[ACCESS_COORDINATE_CPU].read_latency, + dent->coord[ACCESS_COORDINATE_CPU].write_latency); } static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds, @@ -477,10 +488,11 @@ static int cdat_sslbis_handler(union acpi_subtable_headers *header, void *arg, xa_for_each(&port->dports, index, dport) { if (dsp_id == ACPI_CDAT_SSLBIS_ANY_PORT || - dsp_id == dport->port_id) - cxl_access_coordinate_set(&dport->sw_coord, + dsp_id == dport->port_id) { + cxl_access_coordinate_set(dport->coord, sslbis->data_type, val); + } } } @@ -502,6 +514,21 @@ void cxl_switch_parse_cdat(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(cxl_switch_parse_cdat, CXL); +static void __cxl_coordinates_combine(struct access_coordinate *out, + struct access_coordinate *c1, + struct access_coordinate *c2) +{ + if (c1->write_bandwidth && c2->write_bandwidth) + out->write_bandwidth = min(c1->write_bandwidth, + c2->write_bandwidth); + out->write_latency = c1->write_latency + c2->write_latency; + + if (c1->read_bandwidth && c2->read_bandwidth) + out->read_bandwidth = min(c1->read_bandwidth, + c2->read_bandwidth); + out->read_latency = c1->read_latency + c2->read_latency; +} + /** * cxl_coordinates_combine - Combine the two input coordinates * @@ -513,15 +540,8 @@ void cxl_coordinates_combine(struct access_coordinate *out, struct access_coordinate *c1, struct access_coordinate *c2) { - if (c1->write_bandwidth && c2->write_bandwidth) - out->write_bandwidth = min(c1->write_bandwidth, - c2->write_bandwidth); - out->write_latency = c1->write_latency + c2->write_latency; - - if (c1->read_bandwidth && c2->read_bandwidth) - out->read_bandwidth = min(c1->read_bandwidth, - c2->read_bandwidth); - out->read_latency = c1->read_latency + c2->read_latency; + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + __cxl_coordinates_combine(&out[i], &c1[i], &c2[i]); } MODULE_IMPORT_NS(CXL); @@ -558,12 +578,12 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, /* Get total bandwidth and the worst latency for the cxl region */ cxlr->coord[i].read_latency = max_t(unsigned int, cxlr->coord[i].read_latency, - perf->coord.read_latency); + perf->coord[i].read_latency); cxlr->coord[i].write_latency = max_t(unsigned int, cxlr->coord[i].write_latency, - perf->coord.write_latency); - cxlr->coord[i].read_bandwidth += perf->coord.read_bandwidth; - cxlr->coord[i].write_bandwidth += perf->coord.write_bandwidth; + perf->coord[i].write_latency); + cxlr->coord[i].read_bandwidth += perf->coord[i].read_bandwidth; + cxlr->coord[i].write_bandwidth += perf->coord[i].write_bandwidth; } } diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c7c00eb373af6..801c4018a1dde 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2133,6 +2133,29 @@ bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd) } EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, CXL); +static void add_latency(struct access_coordinate *c, long latency) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + c[i].write_latency += latency; + c[i].read_latency += latency; + } +} + +static void set_min_bandwidth(struct access_coordinate *c, unsigned int bw) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + c[i].write_bandwidth = min(c[i].write_bandwidth, bw); + c[i].read_bandwidth = min(c[i].read_bandwidth, bw); + } +} + +static void set_access_coordinates(struct access_coordinate *out, + struct access_coordinate *in) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + out[i] = in[i]; +} + static bool parent_port_is_cxl_root(struct cxl_port *port) { return is_cxl_root(to_cxl_port(port->dev.parent)); @@ -2149,9 +2172,15 @@ static bool parent_port_is_cxl_root(struct cxl_port *port) int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct access_coordinate *coord) { - struct access_coordinate c = { - .read_bandwidth = UINT_MAX, - .write_bandwidth = UINT_MAX, + struct access_coordinate c[] = { + { + .read_bandwidth = UINT_MAX, + .write_bandwidth = UINT_MAX, + }, + { + .read_bandwidth = UINT_MAX, + .write_bandwidth = UINT_MAX, + }, }; struct cxl_port *iter = port; struct cxl_dport *dport; @@ -2178,14 +2207,13 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, * have CDAT and therefore needs to be skipped. */ if (!is_cxl_root) - cxl_coordinates_combine(&c, &c, &dport->sw_coord); - c.write_latency += dport->link_latency; - c.read_latency += dport->link_latency; + cxl_coordinates_combine(c, c, dport->coord); + add_latency(c, dport->link_latency); } while (!is_cxl_root); dport = iter->parent_dport; /* Retrieve HB coords */ - cxl_coordinates_combine(&c, &c, dport->hb_coord); + cxl_coordinates_combine(c, c, dport->coord); /* Get the calculated PCI paths bandwidth */ pdev = to_pci_dev(port->uport_dev->parent); @@ -2194,10 +2222,8 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, return -ENXIO; bw /= BITS_PER_BYTE; - c.write_bandwidth = min(c.write_bandwidth, bw); - c.read_bandwidth = min(c.read_bandwidth, bw); - - *coord = c; + set_min_bandwidth(c, bw); + set_access_coordinates(coord, c); return 0; } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index ed02373ce3d90..036d17db68e00 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -663,8 +663,7 @@ struct cxl_rcrb_info { * @rch: Indicate whether this dport was enumerated in RCH or VH mode * @port: reference to cxl_port that contains this downstream port * @regs: Dport parsed register blocks - * @sw_coord: access coordinates (performance) for switch from CDAT - * @hb_coord: access coordinates (performance) from ACPI generic port (host bridge) + * @coord: access coordinates (bandwidth and latency performance attributes) * @link_latency: calculated PCIe downstream latency */ struct cxl_dport { @@ -675,8 +674,7 @@ struct cxl_dport { bool rch; struct cxl_port *port; struct cxl_regs regs; - struct access_coordinate sw_coord; - struct access_coordinate hb_coord[ACCESS_COORDINATE_MAX]; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; long link_latency; }; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 20fb3b35e89e0..36cee9c30cebd 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -401,7 +401,7 @@ enum cxl_devtype { */ struct cxl_dpa_perf { struct range dpa_range; - struct access_coordinate coord; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; int qos_class; }; diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 908e0d0839369..61c69297e7978 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -986,10 +986,12 @@ static void dpa_perf_setup(struct cxl_port *endpoint, struct range *range, { dpa_perf->qos_class = FAKE_QTG_ID; dpa_perf->dpa_range = *range; - dpa_perf->coord.read_latency = 500; - dpa_perf->coord.write_latency = 500; - dpa_perf->coord.read_bandwidth = 1000; - dpa_perf->coord.write_bandwidth = 1000; + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + dpa_perf->coord[i].read_latency = 500; + dpa_perf->coord[i].write_latency = 500; + dpa_perf->coord[i].read_bandwidth = 1000; + dpa_perf->coord[i].write_bandwidth = 1000; + } } static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) -- GitLab From 7bcf809b1e7889ab7e75fe1fcf8f1a98332f36d2 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:16 -0700 Subject: [PATCH 559/989] cxl: Add checks to access_coordinate calculation to fail missing data Jonathan noted that when the coordinates for host bridge and switches can be 0s if no actual data are retrieved and the calculation continues. The resulting number would be inaccurate. Add checks to ensure that the calculation would complete only if the numbers are valid. While not seen in the wild, issue may show up with a BIOS that reported CXL root ports via Generic Ports (via a PCI handle in the SRAT entry). Fixes: 14a6960b3e92 ("cxl: Add helper function that calculate performance data for downstream ports") Reported-by: Jonathan Cameron Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-6-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 801c4018a1dde..762783bb091af 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2141,6 +2141,18 @@ static void add_latency(struct access_coordinate *c, long latency) } } +static bool coordinates_valid(struct access_coordinate *c) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + if (c[i].read_bandwidth && c[i].write_bandwidth && + c[i].read_latency && c[i].write_latency) + continue; + return false; + } + + return true; +} + static void set_min_bandwidth(struct access_coordinate *c, unsigned int bw) { for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { @@ -2206,13 +2218,18 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, * There's no valid access_coordinate for a root port since RPs do not * have CDAT and therefore needs to be skipped. */ - if (!is_cxl_root) + if (!is_cxl_root) { + if (!coordinates_valid(dport->coord)) + return -EINVAL; cxl_coordinates_combine(c, c, dport->coord); + } add_latency(c, dport->link_latency); } while (!is_cxl_root); dport = iter->parent_dport; /* Retrieve HB coords */ + if (!coordinates_valid(dport->coord)) + return -EINVAL; cxl_coordinates_combine(c, c, dport->coord); /* Get the calculated PCI paths bandwidth */ -- GitLab From 0dd50b3e2c3d651ea972c97cff1af67870f3deaf Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 2 Apr 2024 14:43:51 +0200 Subject: [PATCH 560/989] platform/x86: toshiba_acpi: Silence logging for some events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stop logging unknown event / unknown keycode messages on suspend / resume on a Toshiba Portege Z830: 1. The Toshiba Portege Z830 sends a 0x8e event when the power button is pressed, ignore this. 2. The Toshiba Portege Z830 sends a 0xe00 hotkey event on resume from suspend, ignore this. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240402124351.167152-1-hdegoede@redhat.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/toshiba_acpi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 291f14ef67024..77244c9aa60d2 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -264,6 +264,7 @@ static const struct key_entry toshiba_acpi_keymap[] = { { KE_KEY, 0xb32, { KEY_NEXTSONG } }, { KE_KEY, 0xb33, { KEY_PLAYPAUSE } }, { KE_KEY, 0xb5a, { KEY_MEDIA } }, + { KE_IGNORE, 0x0e00, { KEY_RESERVED } }, /* Wake from sleep */ { KE_IGNORE, 0x1430, { KEY_RESERVED } }, /* Wake from sleep */ { KE_IGNORE, 0x1501, { KEY_RESERVED } }, /* Output changed */ { KE_IGNORE, 0x1502, { KEY_RESERVED } }, /* HDMI plugged/unplugged */ @@ -3523,9 +3524,10 @@ static void toshiba_acpi_notify(struct acpi_device *acpi_dev, u32 event) (dev->kbd_mode == SCI_KBD_MODE_ON) ? LED_FULL : LED_OFF); break; + case 0x8e: /* Power button pressed */ + break; case 0x85: /* Unknown */ case 0x8d: /* Unknown */ - case 0x8e: /* Unknown */ case 0x94: /* Unknown */ case 0x95: /* Unknown */ default: -- GitLab From 868adf8a29179c00309ddd8ffe0afa2043f42cb5 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Fri, 29 Mar 2024 07:32:05 -0700 Subject: [PATCH 561/989] platform/x86: intel-vbtn: Use acpi_has_method to check for switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check for a device having virtual buttons is done using acpi_has_method(..."VBDL"). Mimic that for checking virtual switch presence. Signed-off-by: Gwendal Grignou Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240329143206.2977734-2-gwendal@chromium.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vbtn.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/platform/x86/intel/vbtn.c b/drivers/platform/x86/intel/vbtn.c index 084c355c86f5f..48f0ac19d6ddf 100644 --- a/drivers/platform/x86/intel/vbtn.c +++ b/drivers/platform/x86/intel/vbtn.c @@ -258,9 +258,6 @@ static const struct dmi_system_id dmi_switches_allow_list[] = { static bool intel_vbtn_has_switches(acpi_handle handle, bool dual_accel) { - unsigned long long vgbs; - acpi_status status; - /* See dual_accel_detect.h for more info */ if (dual_accel) return false; @@ -268,8 +265,7 @@ static bool intel_vbtn_has_switches(acpi_handle handle, bool dual_accel) if (!dmi_check_system(dmi_switches_allow_list)) return false; - status = acpi_evaluate_integer(handle, "VGBS", NULL, &vgbs); - return ACPI_SUCCESS(status); + return acpi_has_method(handle, "VGBS"); } static int intel_vbtn_probe(struct platform_device *device) -- GitLab From 434e5781d8cd2d0ed512d920c6cdeba4b33a2e81 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Fri, 29 Mar 2024 07:32:06 -0700 Subject: [PATCH 562/989] platform/x86: intel-vbtn: Update tablet mode switch at end of probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ACER Vivobook Flip (TP401NAS) virtual intel switch is implemented as follow: Device (VGBI) { Name (_HID, EisaId ("INT33D6") ... Name (VBDS, Zero) Method (_STA, 0, Serialized) // _STA: Status ... Method (VBDL, 0, Serialized) { PB1E |= 0x20 VBDS |= 0x40 } Method (VGBS, 0, Serialized) { Return (VBDS) /* \_SB_.PCI0.SBRG.EC0_.VGBI.VBDS */ } ... } By default VBDS is set to 0. At boot it is set to clamshell (bit 6 set) only after method VBDL is executed. Since VBDL is now evaluated in the probe routine later, after the device is registered, the retrieved value of VBDS was still 0 ("tablet mode") when setting up the virtual switch. Make sure to evaluate VGBS after VBDL, to ensure the convertible boots in clamshell mode, the expected default. Fixes: 26173179fae1 ("platform/x86: intel-vbtn: Eval VBDL after registering our notifier") Signed-off-by: Gwendal Grignou Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240329143206.2977734-3-gwendal@chromium.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vbtn.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/vbtn.c b/drivers/platform/x86/intel/vbtn.c index 48f0ac19d6ddf..79bb2c801daa9 100644 --- a/drivers/platform/x86/intel/vbtn.c +++ b/drivers/platform/x86/intel/vbtn.c @@ -136,8 +136,6 @@ static int intel_vbtn_input_setup(struct platform_device *device) priv->switches_dev->id.bustype = BUS_HOST; if (priv->has_switches) { - detect_tablet_mode(&device->dev); - ret = input_register_device(priv->switches_dev); if (ret) return ret; @@ -312,6 +310,9 @@ static int intel_vbtn_probe(struct platform_device *device) if (ACPI_FAILURE(status)) dev_err(&device->dev, "Error VBDL failed with ACPI status %d\n", status); } + // Check switches after buttons since VBDL may have side effects. + if (has_switches) + detect_tablet_mode(&device->dev); device_init_wakeup(&device->dev, true); /* -- GitLab From e71c8481692582c70cdfd0996c20cdcc71e425d3 Mon Sep 17 00:00:00 2001 From: Gergo Koteles Date: Wed, 3 Apr 2024 16:34:27 +0200 Subject: [PATCH 563/989] platform/x86: lg-laptop: fix %s null argument warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit W=1 warns about null argument to kprintf: warning: ‘%s’ directive argument is null [-Wformat-overflow=] pr_info("product: %s year: %d\n", product, year); Use "unknown" instead of NULL. Signed-off-by: Gergo Koteles Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://lore.kernel.org/r/33d40e976f08f82b9227d0ecae38c787fcc0c0b2.1712154684.git.soyer@irl.hu Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lg-laptop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/lg-laptop.c b/drivers/platform/x86/lg-laptop.c index ad3c39e9e9f58..e714ee6298dda 100644 --- a/drivers/platform/x86/lg-laptop.c +++ b/drivers/platform/x86/lg-laptop.c @@ -736,7 +736,7 @@ static int acpi_add(struct acpi_device *device) default: year = 2019; } - pr_info("product: %s year: %d\n", product, year); + pr_info("product: %s year: %d\n", product ?: "unknown", year); if (year >= 2019) battery_limit_use_wmbb = 1; -- GitLab From 7b1f6b5aaec0f849e19c3e99d4eea75876853cdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 2 Apr 2024 18:50:03 +0300 Subject: [PATCH 564/989] drm/i915/cdclk: Fix CDCLK programming order when pipes are active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we always reprogram CDCLK from the intel_set_cdclk_pre_plane_update() when using squash/crawl. The code only works correctly for the cd2x update or full modeset cases, and it was simply never updated to deal with squash/crawl. If the CDCLK frequency is increasing we must reprogram it before we do anything else that might depend on the new higher frequency, and conversely we must not decrease the frequency until everything that might still depend on the old higher frequency has been dealt with. Since cdclk_state->pipe is only relevant when doing a cd2x update we can't use it to determine the correct sequence during squash/crawl. To that end introduce cdclk_state->disable_pipes which simply indicates that we must perform the update while the pipes are disable (ie. during intel_set_cdclk_pre_plane_update()). Otherwise we use the same old vs. new CDCLK frequency comparsiong as for cd2x updates. The only remaining problem case is when the voltage_level needs to increase due to a DDI port, but the CDCLK frequency is decreasing (and not all pipes are being disabled). The current approach will not bump the voltage level up until after the port has already been enabled, which is too late. But we'll take care of that case separately. v2: Don't break the "must disable pipes case" v3: Keep the on stack 'pipe' for future use Cc: stable@vger.kernel.org Fixes: d62686ba3b54 ("drm/i915/adl_p: CDCLK crawl support for ADL") Reviewed-by: Uma Shankar Reviewed-by: Gustavo Sousa Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240402155016.13733-2-ville.syrjala@linux.intel.com (cherry picked from commit 3aecee90ac12a351905f12dda7643d5b0676d6ca) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_cdclk.c | 7 +++++-- drivers/gpu/drm/i915/display/intel_cdclk.h | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.c b/drivers/gpu/drm/i915/display/intel_cdclk.c index ed89b86ea625a..4833479e2e178 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.c +++ b/drivers/gpu/drm/i915/display/intel_cdclk.c @@ -2543,7 +2543,7 @@ intel_set_cdclk_pre_plane_update(struct intel_atomic_state *state) if (IS_DG2(i915)) intel_cdclk_pcode_pre_notify(state); - if (pipe == INVALID_PIPE || + if (new_cdclk_state->disable_pipes || old_cdclk_state->actual.cdclk <= new_cdclk_state->actual.cdclk) { drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); @@ -2575,7 +2575,7 @@ intel_set_cdclk_post_plane_update(struct intel_atomic_state *state) if (IS_DG2(i915)) intel_cdclk_pcode_post_notify(state); - if (pipe != INVALID_PIPE && + if (!new_cdclk_state->disable_pipes && old_cdclk_state->actual.cdclk > new_cdclk_state->actual.cdclk) { drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); @@ -3058,6 +3058,7 @@ static struct intel_global_state *intel_cdclk_duplicate_state(struct intel_globa return NULL; cdclk_state->pipe = INVALID_PIPE; + cdclk_state->disable_pipes = false; return &cdclk_state->base; } @@ -3236,6 +3237,8 @@ int intel_modeset_calc_cdclk(struct intel_atomic_state *state) if (ret) return ret; + new_cdclk_state->disable_pipes = true; + drm_dbg_kms(&dev_priv->drm, "Modeset required for cdclk change\n"); } diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.h b/drivers/gpu/drm/i915/display/intel_cdclk.h index 48fd7d39e0cd9..71bc032bfef16 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.h +++ b/drivers/gpu/drm/i915/display/intel_cdclk.h @@ -51,6 +51,9 @@ struct intel_cdclk_state { /* bitmask of active pipes */ u8 active_pipes; + + /* update cdclk with pipes disabled */ + bool disable_pipes; }; int intel_crtc_compute_min_cdclk(const struct intel_crtc_state *crtc_state); -- GitLab From 6154cc9177ccea00c89ce0bf93352e474b819ff2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 2 Apr 2024 18:50:04 +0300 Subject: [PATCH 565/989] drm/i915/cdclk: Fix voltage_level programming edge case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we only consider the relationship of the old and new CDCLK frequencies when determining whether to do the repgramming from intel_set_cdclk_pre_plane_update() or intel_set_cdclk_post_plane_update(). It is technically possible to have a situation where the CDCLK frequency is decreasing, but the voltage_level is increasing due a DDI port. In this case we should bump the voltage level already in intel_set_cdclk_pre_plane_update() (so that the voltage_level will have been increased by the time the port gets enabled), while leaving the CDCLK frequency unchanged (as active planes/etc. may still depend on it). We can then reduce the CDCLK frequency to its final value from intel_set_cdclk_post_plane_update(). In order to handle that correctly we shall construct a suitable amalgam of the old and new cdclk states in intel_set_cdclk_pre_plane_update(). And we can simply call intel_set_cdclk() unconditionally in both places as it will not do anything if nothing actually changes vs. the current hw state. v2: Handle cdclk_state->disable_pipes v3: Only synchronize the cd2x update against the pipe's vblank when the cdclk frequency is changing during the current commit phase (Gustavo) Cc: stable@vger.kernel.org Cc: Gustavo Sousa Reviewed-by: Uma Shankar Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240402155016.13733-3-ville.syrjala@linux.intel.com (cherry picked from commit 34d127e2bdef73a923aa0dcd95cbc3257ad5af52) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_cdclk.c | 37 ++++++++++++++++------ 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.c b/drivers/gpu/drm/i915/display/intel_cdclk.c index 4833479e2e178..f672bfd70d455 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.c +++ b/drivers/gpu/drm/i915/display/intel_cdclk.c @@ -2534,7 +2534,8 @@ intel_set_cdclk_pre_plane_update(struct intel_atomic_state *state) intel_atomic_get_old_cdclk_state(state); const struct intel_cdclk_state *new_cdclk_state = intel_atomic_get_new_cdclk_state(state); - enum pipe pipe = new_cdclk_state->pipe; + struct intel_cdclk_config cdclk_config; + enum pipe pipe; if (!intel_cdclk_changed(&old_cdclk_state->actual, &new_cdclk_state->actual)) @@ -2543,12 +2544,25 @@ intel_set_cdclk_pre_plane_update(struct intel_atomic_state *state) if (IS_DG2(i915)) intel_cdclk_pcode_pre_notify(state); - if (new_cdclk_state->disable_pipes || - old_cdclk_state->actual.cdclk <= new_cdclk_state->actual.cdclk) { - drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); + if (new_cdclk_state->disable_pipes) { + cdclk_config = new_cdclk_state->actual; + pipe = INVALID_PIPE; + } else { + if (new_cdclk_state->actual.cdclk >= old_cdclk_state->actual.cdclk) { + cdclk_config = new_cdclk_state->actual; + pipe = new_cdclk_state->pipe; + } else { + cdclk_config = old_cdclk_state->actual; + pipe = INVALID_PIPE; + } - intel_set_cdclk(i915, &new_cdclk_state->actual, pipe); + cdclk_config.voltage_level = max(new_cdclk_state->actual.voltage_level, + old_cdclk_state->actual.voltage_level); } + + drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); + + intel_set_cdclk(i915, &cdclk_config, pipe); } /** @@ -2566,7 +2580,7 @@ intel_set_cdclk_post_plane_update(struct intel_atomic_state *state) intel_atomic_get_old_cdclk_state(state); const struct intel_cdclk_state *new_cdclk_state = intel_atomic_get_new_cdclk_state(state); - enum pipe pipe = new_cdclk_state->pipe; + enum pipe pipe; if (!intel_cdclk_changed(&old_cdclk_state->actual, &new_cdclk_state->actual)) @@ -2576,11 +2590,14 @@ intel_set_cdclk_post_plane_update(struct intel_atomic_state *state) intel_cdclk_pcode_post_notify(state); if (!new_cdclk_state->disable_pipes && - old_cdclk_state->actual.cdclk > new_cdclk_state->actual.cdclk) { - drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); + new_cdclk_state->actual.cdclk < old_cdclk_state->actual.cdclk) + pipe = new_cdclk_state->pipe; + else + pipe = INVALID_PIPE; - intel_set_cdclk(i915, &new_cdclk_state->actual, pipe); - } + drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); + + intel_set_cdclk(i915, &new_cdclk_state->actual, pipe); } static int intel_pixel_rate_to_cdclk(const struct intel_crtc_state *crtc_state) -- GitLab From 12bcd9108f9d3b8d4b5f4418bd16df4628b6fa8f Mon Sep 17 00:00:00 2001 From: Suraj Kandpal Date: Mon, 1 Apr 2024 11:26:53 +0530 Subject: [PATCH 566/989] drm/i915/hdcp: Fix get remote hdcp capability function HDCP 1.x capability needs to be checked even if setup is not HDCP 2.x capable. --v2 -Assign hdcp_capable and hdcp2_capable to false [Chaitanya] --v3 -Fix variable assignment [Chaitanya] Fixes: 813cca96e4ac ("drm/i915/hdcp: Add new remote capability check shim function") Signed-off-by: Suraj Kandpal Reviewed-by: Chaitanya Kumar Borah Signed-off-by: Animesh Manna Link: https://patchwork.freedesktop.org/patch/msgid/20240401055652.276785-2-suraj.kandpal@intel.com (cherry picked from commit 6809f9246d43f7cb07310ca6a3deb7aa1c0ea938) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp_hdcp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp_hdcp.c b/drivers/gpu/drm/i915/display/intel_dp_hdcp.c index b98a87883fefb..9db43bd81ce2f 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_hdcp.c +++ b/drivers/gpu/drm/i915/display/intel_dp_hdcp.c @@ -691,12 +691,15 @@ int intel_dp_hdcp_get_remote_capability(struct intel_connector *connector, u8 bcaps; int ret; + *hdcp_capable = false; + *hdcp2_capable = false; if (!intel_encoder_is_mst(connector->encoder)) return -EINVAL; ret = _intel_dp_hdcp2_get_capability(aux, hdcp2_capable); if (ret) - return ret; + drm_dbg_kms(&i915->drm, + "HDCP2 DPCD capability read failed err: %d\n", ret); ret = intel_dp_hdcp_read_bcaps(aux, i915, &bcaps); if (ret) -- GitLab From 152191e5e94bba55c938c18688e66c7276b765a7 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 29 Mar 2024 16:53:05 -0700 Subject: [PATCH 567/989] drm/i915/guc: Fix the fix for reset lock confusion The previous fix for the circlular lock splat about the busyness worker wasn't quite complete. Even though the reset-in-progress flag is cleared at the start of intel_uc_reset_finish, the entire function is still inside the reset mutex lock. Not sure why the patch appeared to fix the issue both locally and in CI. However, it is now back again. There is a further complication that the wedge code path within intel_gt_reset() jumps around so much that it results in nested reset_prepare/_finish calls. That is, the call sequence is: intel_gt_reset | reset_prepare | __intel_gt_set_wedged | | reset_prepare | | reset_finish | reset_finish The nested finish means that even if the clear of the in-progress flag was moved to the end of _finish, it would still be clear for the entire second call. Surprisingly, this does not seem to be causing any other problems at present. As an aside, a wedge on fini does not call the finish functions at all. The reset_in_progress flag is left set (twice). So instead of trying to cancel the worker anywhere at all in the reset path, just add a cancel to intel_guc_submission_fini instead. Note that it is not a problem if the worker is still active during a reset. Either it will run before the reset path starts locking things and will simply block the reset code for a tiny amount of time. Or it will run after the locks have been acquired and will early exit due to the try-lock. Also, do not use the reset-in-progress flag to decide whether a synchronous cancel is safe (from a lockdep perspective) or not. Instead, use the actual reset mutex state (both the genuine one and the custom rolled BACKOFF one). Fixes: 0e00a8814eec ("drm/i915/guc: Avoid circular locking issue on busyness flush") Signed-off-by: John Harrison Cc: Zhanjun Dong Cc: John Harrison Cc: Andi Shyti Cc: Daniel Vetter Cc: Daniel Vetter Cc: Rodrigo Vivi Cc: Nirmoy Das Cc: Tvrtko Ursulin Cc: Umesh Nerlige Ramappa Cc: Andrzej Hajda Cc: Matt Roper Cc: Jonathan Cavitt Cc: Prathap Kumar Valsan Cc: Alan Previn Cc: Madhumitha Tolakanahalli Pradeep Cc: Daniele Ceraolo Spurio Cc: Ashutosh Dixit Cc: Dnyaneshwar Bhadane Reviewed-by: Nirmoy Das Reviewed-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240329235306.1559639-1-John.C.Harrison@Intel.com (cherry picked from commit 3563d855312acedcd445a3767f0cb07906f1c26f) Signed-off-by: Rodrigo Vivi --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 23 ++++++++----------- drivers/gpu/drm/i915/gt/uc/intel_uc.c | 4 ++++ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index f3dcae4b9d455..0f83c6d4376ff 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1403,14 +1403,17 @@ static void guc_cancel_busyness_worker(struct intel_guc *guc) * Trying to pass a 'need_sync' or 'in_reset' flag all the way down through * every possible call stack is unfeasible. It would be too intrusive to many * areas that really don't care about the GuC backend. However, there is the - * 'reset_in_progress' flag available, so just use that. + * I915_RESET_BACKOFF flag and the gt->reset.mutex can be tested for is_locked. + * So just use those. Note that testing both is required due to the hideously + * complex nature of the i915 driver's reset code paths. * * And note that in the case of a reset occurring during driver unload - * (wedge_on_fini), skipping the cancel in _prepare (when the reset flag is set - * is fine because there is another cancel in _finish (when the reset flag is - * not). + * (wedged_on_fini), skipping the cancel in reset_prepare/reset_fini (when the + * reset flag/mutex are set) is fine because there is another explicit cancel in + * intel_guc_submission_fini (when the reset flag/mutex are not). */ - if (guc_to_gt(guc)->uc.reset_in_progress) + if (mutex_is_locked(&guc_to_gt(guc)->reset.mutex) || + test_bit(I915_RESET_BACKOFF, &guc_to_gt(guc)->reset.flags)) cancel_delayed_work(&guc->timestamp.work); else cancel_delayed_work_sync(&guc->timestamp.work); @@ -1424,8 +1427,6 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc) unsigned long flags; ktime_t unused; - guc_cancel_busyness_worker(guc); - spin_lock_irqsave(&guc->timestamp.lock, flags); guc_update_pm_timestamp(guc, &unused); @@ -2004,13 +2005,6 @@ void intel_guc_submission_cancel_requests(struct intel_guc *guc) void intel_guc_submission_reset_finish(struct intel_guc *guc) { - /* - * Ensure the busyness worker gets cancelled even on a fatal wedge. - * Note that reset_prepare is not allowed to because it confuses lockdep. - */ - if (guc_submission_initialized(guc)) - guc_cancel_busyness_worker(guc); - /* Reset called during driver load or during wedge? */ if (unlikely(!guc_submission_initialized(guc) || !intel_guc_is_fw_running(guc) || @@ -2136,6 +2130,7 @@ void intel_guc_submission_fini(struct intel_guc *guc) if (!guc->submission_initialized) return; + guc_fini_engine_stats(guc); guc_flush_destroyed_contexts(guc); guc_lrc_desc_pool_destroy_v69(guc); i915_sched_engine_put(guc->sched_engine); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c index 6dfe5d9456c69..399bc319180b0 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c @@ -637,6 +637,10 @@ void intel_uc_reset_finish(struct intel_uc *uc) { struct intel_guc *guc = &uc->guc; + /* + * NB: The wedge code path results in prepare -> prepare -> finish -> finish. + * So this function is sometimes called with the in-progress flag not set. + */ uc->reset_in_progress = false; /* Firmware expected to be running when this function is called */ -- GitLab From e3d4ead4d48c05355bd3b99c8162428f68c3c1a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Apr 2024 00:34:26 +0300 Subject: [PATCH 568/989] drm/i915/psr: Disable PSR when bigjoiner is used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bigjoiner seem to be causing all kinds of grief to the PSR code currently. I don't believe there is any hardware issue but the code simply not handling this correctly. For now just disable PSR when bigjoiner is needed. Cc: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20240404213441.17637-3-ville.syrjala@linux.intel.com Reviewed-by: Arun R Murthy Acked-by: Jouni Högander Signed-off-by: Ville Syrjälä (cherry picked from commit 372fa0c79d3f289f813d8001e0a8a96d1011826c) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_psr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index b6e539f1342c2..aabd018bd7374 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -1422,6 +1422,17 @@ void intel_psr_compute_config(struct intel_dp *intel_dp, return; } + /* + * FIXME figure out what is wrong with PSR+bigjoiner and + * fix it. Presumably something related to the fact that + * PSR is a transcoder level feature. + */ + if (crtc_state->bigjoiner_pipes) { + drm_dbg_kms(&dev_priv->drm, + "PSR disabled due to bigjoiner\n"); + return; + } + if (CAN_PANEL_REPLAY(intel_dp)) crtc_state->has_panel_replay = true; else -- GitLab From 0653d501409eeb9f1deb7e4c12e4d0d2c9f1cba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Apr 2024 00:34:27 +0300 Subject: [PATCH 569/989] drm/i915: Disable port sync when bigjoiner is used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current modeset sequence can't handle port sync and bigjoiner at the same time. Refuse port sync when bigjoiner is needed, at least until we fix the modeset sequence. v2: Add a FIXME (Vandite) Cc: stable@vger.kernel.org Tested-by: Vidya Srinivas Reviewed-by: Vandita Kulkarni Link: https://patchwork.freedesktop.org/patch/msgid/20240404213441.17637-4-ville.syrjala@linux.intel.com Signed-off-by: Ville Syrjälä (cherry picked from commit b37e1347b991459c38c56ec2476087854a4f720b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_ddi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index c587a8efeafcf..c17462b4c2ac1 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -4256,7 +4256,12 @@ static bool m_n_equal(const struct intel_link_m_n *m_n_1, static bool crtcs_port_sync_compatible(const struct intel_crtc_state *crtc_state1, const struct intel_crtc_state *crtc_state2) { + /* + * FIXME the modeset sequence is currently wrong and + * can't deal with bigjoiner + port sync at the same time. + */ return crtc_state1->hw.active && crtc_state2->hw.active && + !crtc_state1->bigjoiner_pipes && !crtc_state2->bigjoiner_pipes && crtc_state1->output_types == crtc_state2->output_types && crtc_state1->output_format == crtc_state2->output_format && crtc_state1->lane_count == crtc_state2->lane_count && -- GitLab From 4a36e46df7aa781c756f09727d37dc2783f1ee75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Apr 2024 00:34:28 +0300 Subject: [PATCH 570/989] drm/i915: Disable live M/N updates when using bigjoiner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All joined pipes share the same transcoder/timing generator. Currently we just do the commits per-pipe, which doesn't really work if we need to change the timings at the same time. For now just disable live M/N updates when bigjoiner is needed. Cc: stable@vger.kernel.org Tested-by: Vidya Srinivas Reviewed-by: Arun R Murthy Link: https://patchwork.freedesktop.org/patch/msgid/20240404213441.17637-5-ville.syrjala@linux.intel.com Signed-off-by: Ville Syrjälä (cherry picked from commit ef79820db723a2a7c229a7251c12859e7e25a247) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index abd62bebc46d0..e583515f9b25a 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -2725,7 +2725,11 @@ intel_dp_drrs_compute_config(struct intel_connector *connector, intel_panel_downclock_mode(connector, &pipe_config->hw.adjusted_mode); int pixel_clock; - if (has_seamless_m_n(connector)) + /* + * FIXME all joined pipes share the same transcoder. + * Need to account for that when updating M/N live. + */ + if (has_seamless_m_n(connector) && !pipe_config->bigjoiner_pipes) pipe_config->update_m_n = true; if (!can_enable_drrs(connector, pipe_config, downclock_mode)) { -- GitLab From dcd8992e47f13afb5c11a61e8d9c141c35e23751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Apr 2024 00:34:29 +0300 Subject: [PATCH 571/989] drm/i915/vrr: Disable VRR when using bigjoiner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All joined pipes share the same transcoder/timing generator. Currently we just do the commits per-pipe, which doesn't really work if we need to change switch between non-VRR and VRR timings generators on the fly, or even when sending the push to the transcoder. For now just disable VRR when bigjoiner is needed. Cc: stable@vger.kernel.org Tested-by: Vidya Srinivas Reviewed-by: Vandita Kulkarni Link: https://patchwork.freedesktop.org/patch/msgid/20240404213441.17637-6-ville.syrjala@linux.intel.com Signed-off-by: Ville Syrjälä (cherry picked from commit f9d5e51db65652dbd8a2102fd7619440e3599fd2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_vrr.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_vrr.c b/drivers/gpu/drm/i915/display/intel_vrr.c index eb5bd07439020..f542ee1db1d97 100644 --- a/drivers/gpu/drm/i915/display/intel_vrr.c +++ b/drivers/gpu/drm/i915/display/intel_vrr.c @@ -117,6 +117,13 @@ intel_vrr_compute_config(struct intel_crtc_state *crtc_state, const struct drm_display_info *info = &connector->base.display_info; int vmin, vmax; + /* + * FIXME all joined pipes share the same transcoder. + * Need to account for that during VRR toggle/push/etc. + */ + if (crtc_state->bigjoiner_pipes) + return; + if (adjusted_mode->flags & DRM_MODE_FLAG_INTERLACE) return; -- GitLab From 0cd01ac5dcb1e18eb18df0f0d05b5de76522a437 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 5 Apr 2024 11:14:13 -0700 Subject: [PATCH 572/989] x86/bugs: Change commas to semicolons in 'spectre_v2' sysfs file Change the format of the 'spectre_v2' vulnerabilities sysfs file slightly by converting the commas to semicolons, so that mitigations for future variants can be grouped together and separated by commas. Signed-off-by: Josh Poimboeuf Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/bugs.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e7ba936d798b8..e8c7146f9ed7c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2695,15 +2695,15 @@ static char *stibp_state(void) switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: - return ", STIBP: disabled"; + return "; STIBP: disabled"; case SPECTRE_V2_USER_STRICT: - return ", STIBP: forced"; + return "; STIBP: forced"; case SPECTRE_V2_USER_STRICT_PREFERRED: - return ", STIBP: always-on"; + return "; STIBP: always-on"; case SPECTRE_V2_USER_PRCTL: case SPECTRE_V2_USER_SECCOMP: if (static_key_enabled(&switch_to_cond_stibp)) - return ", STIBP: conditional"; + return "; STIBP: conditional"; } return ""; } @@ -2712,10 +2712,10 @@ static char *ibpb_state(void) { if (boot_cpu_has(X86_FEATURE_IBPB)) { if (static_key_enabled(&switch_mm_always_ibpb)) - return ", IBPB: always-on"; + return "; IBPB: always-on"; if (static_key_enabled(&switch_mm_cond_ibpb)) - return ", IBPB: conditional"; - return ", IBPB: disabled"; + return "; IBPB: conditional"; + return "; IBPB: disabled"; } return ""; } @@ -2725,11 +2725,11 @@ static char *pbrsb_eibrs_state(void) if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) - return ", PBRSB-eIBRS: SW sequence"; + return "; PBRSB-eIBRS: SW sequence"; else - return ", PBRSB-eIBRS: Vulnerable"; + return "; PBRSB-eIBRS: Vulnerable"; } else { - return ", PBRSB-eIBRS: Not affected"; + return "; PBRSB-eIBRS: Not affected"; } } @@ -2748,9 +2748,9 @@ static ssize_t spectre_v2_show_state(char *buf) return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "", stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "", pbrsb_eibrs_state(), spectre_v2_module_string()); } -- GitLab From 1e3ad78334a69b36e107232e337f9d693dcc9df2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 3 Apr 2024 16:36:44 -0700 Subject: [PATCH 573/989] x86/syscall: Don't force use of indirect calls for system calls Make build a switch statement instead, and the compiler can either decide to generate an indirect jump, or - more likely these days due to mitigations - just a series of conditional branches. Yes, the conditional branches also have branch prediction, but the branch prediction is much more controlled, in that it just causes speculatively running the wrong system call (harmless), rather than speculatively running possibly wrong random less controlled code gadgets. This doesn't mitigate other indirect calls, but the system call indirection is the first and most easily triggered case. Signed-off-by: Linus Torvalds Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf --- arch/x86/entry/common.c | 6 +++--- arch/x86/entry/syscall_32.c | 21 +++++++++++++++++++-- arch/x86/entry/syscall_64.c | 19 +++++++++++++++++-- arch/x86/entry/syscall_x32.c | 10 +++++++--- arch/x86/include/asm/syscall.h | 10 ++++------ 5 files changed, 50 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 6356060caaf31..cea0e2a23b426 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -49,7 +49,7 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) if (likely(unr < NR_syscalls)) { unr = array_index_nospec(unr, NR_syscalls); - regs->ax = sys_call_table[unr](regs); + regs->ax = x64_sys_call(regs, unr); return true; } return false; @@ -66,7 +66,7 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { xnr = array_index_nospec(xnr, X32_NR_syscalls); - regs->ax = x32_sys_call_table[xnr](regs); + regs->ax = x32_sys_call(regs, xnr); return true; } return false; @@ -162,7 +162,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) if (likely(unr < IA32_NR_syscalls)) { unr = array_index_nospec(unr, IA32_NR_syscalls); - regs->ax = ia32_sys_call_table[unr](regs); + regs->ax = ia32_sys_call(regs, unr); } else if (nr != -1) { regs->ax = __ia32_sys_ni_syscall(regs); } diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 8cfc9bc73e7f8..c2235bae17ef6 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -18,8 +18,25 @@ #include #undef __SYSCALL +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. + */ +#ifdef CONFIG_X86_32 #define __SYSCALL(nr, sym) __ia32_##sym, - -__visible const sys_call_ptr_t ia32_sys_call_table[] = { +const sys_call_ptr_t sys_call_table[] = { #include }; +#undef __SYSCALL +#endif + +#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs); + +long ia32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include + default: return __ia32_sys_ni_syscall(regs); + } +}; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index be120eec1fc9f..33b3f09e6f151 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -11,8 +11,23 @@ #include #undef __SYSCALL +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. + */ #define __SYSCALL(nr, sym) __x64_##sym, - -asmlinkage const sys_call_ptr_t sys_call_table[] = { +const sys_call_ptr_t sys_call_table[] = { #include }; +#undef __SYSCALL + +#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); + +long x64_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include + default: return __x64_sys_ni_syscall(regs); + } +}; diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index bdd0e03a1265d..03de4a9321318 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -11,8 +11,12 @@ #include #undef __SYSCALL -#define __SYSCALL(nr, sym) __x64_##sym, +#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); -asmlinkage const sys_call_ptr_t x32_sys_call_table[] = { -#include +long x32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include + default: return __x64_sys_ni_syscall(regs); + } }; diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index f44e2f9ab65d7..3c28f26bfe220 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -16,19 +16,17 @@ #include /* for TS_COMPAT */ #include +/* This is used purely for kernel/trace/trace_syscalls.c */ typedef long (*sys_call_ptr_t)(const struct pt_regs *); extern const sys_call_ptr_t sys_call_table[]; -#if defined(CONFIG_X86_32) -#define ia32_sys_call_table sys_call_table -#else /* * These may not exist, but still put the prototypes in so we * can use IS_ENABLED(). */ -extern const sys_call_ptr_t ia32_sys_call_table[]; -extern const sys_call_ptr_t x32_sys_call_table[]; -#endif +extern long ia32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x64_sys_call(const struct pt_regs *, unsigned int nr); /* * Only the low 32 bits of orig_ax are meaningful, so we return int. -- GitLab From 7390db8aea0d64e9deb28b8e1ce716f5020c7ee5 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 11 Mar 2024 08:56:58 -0700 Subject: [PATCH 574/989] x86/bhi: Add support for clearing branch history at syscall entry Branch History Injection (BHI) attacks may allow a malicious application to influence indirect branch prediction in kernel by poisoning the branch history. eIBRS isolates indirect branch targets in ring0. The BHB can still influence the choice of indirect branch predictor entry, and although branch predictor entries are isolated between modes when eIBRS is enabled, the BHB itself is not isolated between modes. Alder Lake and new processors supports a hardware control BHI_DIS_S to mitigate BHI. For older processors Intel has released a software sequence to clear the branch history on parts that don't support BHI_DIS_S. Add support to execute the software sequence at syscall entry and VMexit to overwrite the branch history. For now, branch history is not cleared at interrupt entry, as malicious applications are not believed to have sufficient control over the registers, since previous register state is cleared at interrupt entry. Researchers continue to poke at this area and it may become necessary to clear at interrupt entry as well in the future. This mitigation is only defined here. It is enabled later. Signed-off-by: Pawan Gupta Co-developed-by: Daniel Sneddon Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Josh Poimboeuf --- arch/x86/entry/common.c | 4 +- arch/x86/entry/entry_64.S | 61 ++++++++++++++++++++++++++++ arch/x86/entry/entry_64_compat.S | 16 ++++++++ arch/x86/include/asm/cpufeatures.h | 3 +- arch/x86/include/asm/nospec-branch.h | 12 ++++++ arch/x86/include/asm/syscall.h | 1 + arch/x86/kvm/vmx/vmenter.S | 2 + 7 files changed, 96 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index cea0e2a23b426..6de50b80702e6 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -189,7 +189,7 @@ static __always_inline bool int80_is_external(void) } /** - * int80_emulation - 32-bit legacy syscall entry + * do_int80_emulation - 32-bit legacy syscall C entry from asm * * This entry point can be used by 32-bit and 64-bit programs to perform * 32-bit system calls. Instances of INT $0x80 can be found inline in @@ -207,7 +207,7 @@ static __always_inline bool int80_is_external(void) * eax: system call number * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6 */ -DEFINE_IDTENTRY_RAW(int80_emulation) +__visible noinstr void do_int80_emulation(struct pt_regs *regs) { int nr; diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 8af2a26b24f6a..1b5be07f86698 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -116,6 +116,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) /* clobbers %rax, make sure it is after saving the syscall nr */ IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY call do_syscall_64 /* returns with IRQs disabled */ @@ -1491,3 +1492,63 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) call make_task_dead SYM_CODE_END(rewind_stack_and_make_dead) .popsection + +/* + * This sequence executes branches in order to remove user branch information + * from the branch history tracker in the Branch Predictor, therefore removing + * user influence on subsequent BTB lookups. + * + * It should be used on parts prior to Alder Lake. Newer parts should use the + * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being + * virtualized on newer hardware the VMM should protect against BHI attacks by + * setting BHI_DIS_S for the guests. + * + * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging + * and not clearing the branch history. The call tree looks like: + * + * call 1 + * call 2 + * call 2 + * call 2 + * call 2 + * call 2 + * ret + * ret + * ret + * ret + * ret + * ret + * + * This means that the stack is non-constant and ORC can't unwind it with %rsp + * alone. Therefore we unconditionally set up the frame pointer, which allows + * ORC to unwind properly. + * + * The alignment is for performance and not for safety, and may be safely + * refactored in the future if needed. + */ +SYM_FUNC_START(clear_bhb_loop) + push %rbp + mov %rsp, %rbp + movl $5, %ecx + ANNOTATE_INTRA_FUNCTION_CALL + call 1f + jmp 5f + .align 64, 0xcc + ANNOTATE_INTRA_FUNCTION_CALL +1: call 2f + RET + .align 64, 0xcc +2: movl $5, %eax +3: jmp 4f + nop +4: sub $1, %eax + jnz 3b + sub $1, %ecx + jnz 1b + RET +5: lfence + pop %rbp + RET +SYM_FUNC_END(clear_bhb_loop) +EXPORT_SYMBOL_GPL(clear_bhb_loop) +STACK_FRAME_NON_STANDARD(clear_bhb_loop) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index eabf48c4d4b4c..c779046cc3fe7 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -92,6 +92,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY /* * SYSENTER doesn't filter flags, so we need to clear NT and AC @@ -206,6 +207,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY movq %rsp, %rdi call do_fast_syscall_32 @@ -276,3 +278,17 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR int3 SYM_CODE_END(entry_SYSCALL_compat) + +/* + * int 0x80 is used by 32 bit mode as a system call entry. Normally idt entries + * point to C routines, however since this is a system call interface the branch + * history needs to be scrubbed to protect against BHI attacks, and that + * scrubbing needs to take place in assembly code prior to entering any C + * routines. + */ +SYM_CODE_START(int80_emulation) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + CLEAR_BRANCH_HISTORY + jmp do_int80_emulation +SYM_CODE_END(int80_emulation) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index a38f8f9ba6572..9b94a86f46003 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -461,11 +461,12 @@ /* * Extended auxiliary flags: Linux defined - for features scattered in various - * CPUID levels like 0x80000022, etc. + * CPUID levels like 0x80000022, etc and Linux defined features. * * Reuse free bits when adding new feature flags! */ #define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ +#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ /* * BUG word(s) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 170c89ed22fcd..aea40204278b6 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -326,6 +326,14 @@ ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF .endm +#ifdef CONFIG_X86_64 +.macro CLEAR_BRANCH_HISTORY + ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP +.endm +#else +#define CLEAR_BRANCH_HISTORY +#endif + #else /* __ASSEMBLY__ */ #define ANNOTATE_RETPOLINE_SAFE \ @@ -368,6 +376,10 @@ extern void srso_alias_return_thunk(void); extern void entry_untrain_ret(void); extern void entry_ibpb(void); +#ifdef CONFIG_X86_64 +extern void clear_bhb_loop(void); +#endif + extern void (*x86_return_thunk)(void); extern void __warn_thunk(void); diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 3c28f26bfe220..2fc7bc3863ff6 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -125,6 +125,7 @@ static inline int syscall_get_arch(struct task_struct *task) } bool do_syscall_64(struct pt_regs *regs, int nr); +void do_int80_emulation(struct pt_regs *regs); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 2bfbf758d0611..0f3593e10c57d 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -275,6 +275,8 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) call vmx_spec_ctrl_restore_host + CLEAR_BRANCH_HISTORY + /* Put return value in AX */ mov %_ASM_BX, %_ASM_AX -- GitLab From 0f4a837615ff925ba62648d280a861adf1582df7 Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Wed, 13 Mar 2024 09:47:57 -0700 Subject: [PATCH 575/989] x86/bhi: Define SPEC_CTRL_BHI_DIS_S Newer processors supports a hardware control BHI_DIS_S to mitigate Branch History Injection (BHI). Setting BHI_DIS_S protects the kernel from userspace BHI attacks without having to manually overwrite the branch history. Define MSR_SPEC_CTRL bit BHI_DIS_S and its enumeration CPUID.BHI_CTRL. Mitigation is enabled later. Signed-off-by: Daniel Sneddon Signed-off-by: Pawan Gupta Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Josh Poimboeuf --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 5 ++++- arch/x86/kernel/cpu/scattered.c | 1 + arch/x86/kvm/reverse_cpuid.h | 3 ++- 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 9b94a86f46003..4085090704299 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -467,6 +467,7 @@ */ #define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ #define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ +#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ /* * BUG word(s) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 05956bd8bacf5..24615e8269988 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -61,10 +61,13 @@ #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ #define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) +#define SPEC_CTRL_BHI_DIS_S_SHIFT 10 /* Disable Branch History Injection behavior */ +#define SPEC_CTRL_BHI_DIS_S BIT(SPEC_CTRL_BHI_DIS_S_SHIFT) /* A mask for bits which the kernel toggles when controlling mitigations */ #define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \ - | SPEC_CTRL_RRSBA_DIS_S) + | SPEC_CTRL_RRSBA_DIS_S \ + | SPEC_CTRL_BHI_DIS_S) #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index a515328d9d7d8..af5aa2c754c22 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -28,6 +28,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, + { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index aadefcaa9561d..da9880d74a0bc 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -52,7 +52,7 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_IPRED_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 1) #define KVM_X86_FEATURE_RRSBA_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 2) #define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3) -#define X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) +#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) /* CPUID level 0x80000007 (EDX). */ @@ -126,6 +126,7 @@ static __always_inline u32 __feature_translate(int x86_feature) KVM_X86_TRANSLATE_FEATURE(CONSTANT_TSC); KVM_X86_TRANSLATE_FEATURE(PERFMON_V2); KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL); + KVM_X86_TRANSLATE_FEATURE(BHI_CTRL); default: return x86_feature; } -- GitLab From be482ff9500999f56093738f9219bbabc729d163 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 11 Mar 2024 08:57:03 -0700 Subject: [PATCH 576/989] x86/bhi: Enumerate Branch History Injection (BHI) bug Mitigation for BHI is selected based on the bug enumeration. Add bits needed to enumerate BHI bug. Signed-off-by: Pawan Gupta Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Josh Poimboeuf --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 4 ++++ arch/x86/kernel/cpu/common.c | 24 ++++++++++++++++-------- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4085090704299..8edd7a28869a8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -517,4 +517,5 @@ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ #define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ +#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 24615e8269988..e72c2b8729579 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -166,6 +166,10 @@ * are restricted to targets in * kernel. */ +#define ARCH_CAP_BHI_NO BIT(20) /* + * CPU is not affected by Branch + * History Injection. + */ #define ARCH_CAP_PBRSB_NO BIT(24) /* * Not susceptible to Post-Barrier * Return Stack Buffer Predictions. diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5c1e6d6be267a..754d91857d634 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1120,6 +1120,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SPECTRE_V2 BIT(8) #define NO_MMIO BIT(9) #define NO_EIBRS_PBRSB BIT(10) +#define NO_BHI BIT(11) #define VULNWL(vendor, family, model, whitelist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) @@ -1182,18 +1183,18 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI), /* Zhaoxin Family 7 */ - VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), - VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI), + VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI), {} }; @@ -1435,6 +1436,13 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (vulnerable_to_rfds(ia32_cap)) setup_force_cpu_bug(X86_BUG_RFDS); + /* When virtualized, eIBRS could be hidden, assume vulnerable */ + if (!(ia32_cap & ARCH_CAP_BHI_NO) && + !cpu_matches(cpu_vuln_whitelist, NO_BHI) && + (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || + boot_cpu_has(X86_FEATURE_HYPERVISOR))) + setup_force_cpu_bug(X86_BUG_BHI); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; -- GitLab From ec9404e40e8f36421a2b66ecb76dc2209fe7f3ef Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 11 Mar 2024 08:57:05 -0700 Subject: [PATCH 577/989] x86/bhi: Add BHI mitigation knob Branch history clearing software sequences and hardware control BHI_DIS_S were defined to mitigate Branch History Injection (BHI). Add cmdline spectre_bhi={on|off|auto} to control BHI mitigation: auto - Deploy the hardware mitigation BHI_DIS_S, if available. on - Deploy the hardware mitigation BHI_DIS_S, if available, otherwise deploy the software sequence at syscall entry and VMexit. off - Turn off BHI mitigation. The default is auto mode which does not deploy the software sequence mitigation. This is because of the hardening done in the syscall dispatch path, which is the likely target of BHI. Signed-off-by: Pawan Gupta Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Josh Poimboeuf --- Documentation/admin-guide/hw-vuln/spectre.rst | 45 ++++++++-- .../admin-guide/kernel-parameters.txt | 11 +++ arch/x86/Kconfig | 25 ++++++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/bugs.c | 90 ++++++++++++++++++- 5 files changed, 165 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index cce768afec6be..7cb99b09827ce 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -138,11 +138,10 @@ associated with the source address of the indirect branch. Specifically, the BHB might be shared across privilege levels even in the presence of Enhanced IBRS. -Currently the only known real-world BHB attack vector is via -unprivileged eBPF. Therefore, it's highly recommended to not enable -unprivileged eBPF, especially when eIBRS is used (without retpolines). -For a full mitigation against BHB attacks, it's recommended to use -retpolines (or eIBRS combined with retpolines). +Previously the only known real-world BHB attack vector was via unprivileged +eBPF. Further research has found attacks that don't require unprivileged eBPF. +For a full mitigation against BHB attacks it is recommended to set BHI_DIS_S or +use the BHB clearing sequence. Attack scenarios ---------------- @@ -430,6 +429,21 @@ The possible values in this file are: 'PBRSB-eIBRS: Not affected' CPU is not affected by PBRSB =========================== ======================================================= + - Branch History Injection (BHI) protection status: + +.. list-table:: + + * - BHI: Not affected + - System is not affected + * - BHI: Retpoline + - System is protected by retpoline + * - BHI: BHI_DIS_S + - System is protected by BHI_DIS_S + * - BHI: SW loop + - System is protected by software clearing sequence + * - BHI: Syscall hardening + - Syscalls are hardened against BHI + Full mitigation might require a microcode update from the CPU vendor. When the necessary microcode is not available, the kernel will report vulnerability. @@ -484,7 +498,11 @@ Spectre variant 2 Systems which support enhanced IBRS (eIBRS) enable IBRS protection once at boot, by setting the IBRS bit, and they're automatically protected against - Spectre v2 variant attacks. + some Spectre v2 variant attacks. The BHB can still influence the choice of + indirect branch predictor entry, and although branch predictor entries are + isolated between modes when eIBRS is enabled, the BHB itself is not isolated + between modes. Systems which support BHI_DIS_S will set it to protect against + BHI attacks. On Intel's enhanced IBRS systems, this includes cross-thread branch target injections on SMT systems (STIBP). In other words, Intel eIBRS enables @@ -638,6 +656,21 @@ kernel command line. spectre_v2=off. Spectre variant 1 mitigations cannot be disabled. + spectre_bhi= + + [X86] Control mitigation of Branch History Injection + (BHI) vulnerability. Syscalls are hardened against BHI + regardless of this setting. This setting affects the deployment + of the HW BHI control and the SW BHB clearing sequence. + + on + unconditionally enable. + off + unconditionally disable. + auto + enable if hardware mitigation + control(BHI_DIS_S) is available. + For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt Mitigation selection guide diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bb884c14b2f67..2dbe60c1db225 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6063,6 +6063,17 @@ sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/admin-guide/laptops/sonypi.rst + spectre_bhi= [X86] Control mitigation of Branch History Injection + (BHI) vulnerability. Syscalls are hardened against BHI + reglardless of this setting. This setting affects the + deployment of the HW BHI control and the SW BHB + clearing sequence. + + on - unconditionally enable. + off - unconditionally disable. + auto - (default) enable only if hardware mitigation + control(BHI_DIS_S) is available. + spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. The default operation protects the kernel from diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4fff6ed46e902..29a7b69a13166 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2633,6 +2633,31 @@ config MITIGATION_RFDS stored in floating point, vector and integer registers. See also +choice + prompt "Clear branch history" + depends on CPU_SUP_INTEL + default SPECTRE_BHI_AUTO + help + Enable BHI mitigations. BHI attacks are a form of Spectre V2 attacks + where the branch history buffer is poisoned to speculatively steer + indirect branches. + See + +config SPECTRE_BHI_ON + bool "on" + help + Equivalent to setting spectre_bhi=on command line parameter. +config SPECTRE_BHI_OFF + bool "off" + help + Equivalent to setting spectre_bhi=off command line parameter. +config SPECTRE_BHI_AUTO + bool "auto" + help + Equivalent to setting spectre_bhi=auto command line parameter. + +endchoice + endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8edd7a28869a8..a2ee9a00e4a7e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -468,6 +468,7 @@ #define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ #define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ +#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ /* * BUG word(s) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e8c7146f9ed7c..1ab2750298887 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1607,6 +1607,74 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ dump_stack(); } +/* + * Set BHI_DIS_S to prevent indirect branches in kernel to be influenced by + * branch history in userspace. Not needed if BHI_NO is set. + */ +static bool __init spec_ctrl_bhi_dis(void) +{ + if (!boot_cpu_has(X86_FEATURE_BHI_CTRL)) + return false; + + x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW); + + return true; +} + +enum bhi_mitigations { + BHI_MITIGATION_OFF, + BHI_MITIGATION_ON, + BHI_MITIGATION_AUTO, +}; + +static enum bhi_mitigations bhi_mitigation __ro_after_init = + IS_ENABLED(CONFIG_SPECTRE_BHI_ON) ? BHI_MITIGATION_ON : + IS_ENABLED(CONFIG_SPECTRE_BHI_OFF) ? BHI_MITIGATION_OFF : + BHI_MITIGATION_AUTO; + +static int __init spectre_bhi_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + bhi_mitigation = BHI_MITIGATION_OFF; + else if (!strcmp(str, "on")) + bhi_mitigation = BHI_MITIGATION_ON; + else if (!strcmp(str, "auto")) + bhi_mitigation = BHI_MITIGATION_AUTO; + else + pr_err("Ignoring unknown spectre_bhi option (%s)", str); + + return 0; +} +early_param("spectre_bhi", spectre_bhi_parse_cmdline); + +static void __init bhi_select_mitigation(void) +{ + if (bhi_mitigation == BHI_MITIGATION_OFF) + return; + + /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */ + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && + !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) + return; + + if (spec_ctrl_bhi_dis()) + return; + + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + if (bhi_mitigation == BHI_MITIGATION_AUTO) + return; + + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); + pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n"); +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -1718,6 +1786,9 @@ static void __init spectre_v2_select_mitigation(void) mode == SPECTRE_V2_RETPOLINE) spec_ctrl_disable_kernel_rrsba(); + if (boot_cpu_has(X86_BUG_BHI)) + bhi_select_mitigation(); + spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); @@ -2733,6 +2804,21 @@ static char *pbrsb_eibrs_state(void) } } +static const char * const spectre_bhi_state(void) +{ + if (!boot_cpu_has_bug(X86_BUG_BHI)) + return "; BHI: Not affected"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW)) + return "; BHI: BHI_DIS_S"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) + return "; BHI: SW loop"; + else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) + return "; BHI: Retpoline"; + + return "; BHI: Vulnerable (Syscall hardening enabled)"; +} + static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) @@ -2745,13 +2831,15 @@ static ssize_t spectre_v2_show_state(char *buf) spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); - return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", + return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_state(), boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "", stibp_state(), boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "", pbrsb_eibrs_state(), + spectre_bhi_state(), + /* this should always be at the end */ spectre_v2_module_string()); } -- GitLab From 95a6ccbdc7199a14b71ad8901cb788ba7fb5167b Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 11 Mar 2024 08:57:09 -0700 Subject: [PATCH 578/989] x86/bhi: Mitigate KVM by default BHI mitigation mode spectre_bhi=auto does not deploy the software mitigation by default. In a cloud environment, it is a likely scenario where userspace is trusted but the guests are not trusted. Deploying system wide mitigation in such cases is not desirable. Update the auto mode to unconditionally mitigate against malicious guests. Deploy the software sequence at VMexit in auto mode also, when hardware mitigation is not available. Unlike the force =on mode, software sequence is not deployed at syscalls in auto mode. Suggested-by: Alexandre Chartre Signed-off-by: Pawan Gupta Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Josh Poimboeuf --- Documentation/admin-guide/hw-vuln/spectre.rst | 7 +++++-- Documentation/admin-guide/kernel-parameters.txt | 5 +++-- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/nospec-branch.h | 5 +++++ arch/x86/kernel/cpu/bugs.c | 9 ++++++++- arch/x86/kvm/vmx/vmenter.S | 2 +- 6 files changed, 23 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 7cb99b09827ce..b70b1d8bd8e65 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -439,10 +439,12 @@ The possible values in this file are: - System is protected by retpoline * - BHI: BHI_DIS_S - System is protected by BHI_DIS_S - * - BHI: SW loop + * - BHI: SW loop; KVM SW loop - System is protected by software clearing sequence * - BHI: Syscall hardening - Syscalls are hardened against BHI + * - BHI: Syscall hardening; KVM: SW loop + - System is protected from userspace attacks by syscall hardening; KVM is protected by software clearing sequence Full mitigation might require a microcode update from the CPU vendor. When the necessary microcode is not available, the kernel will @@ -669,7 +671,8 @@ kernel command line. unconditionally disable. auto enable if hardware mitigation - control(BHI_DIS_S) is available. + control(BHI_DIS_S) is available, otherwise + enable alternate mitigation in KVM. For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2dbe60c1db225..4fa46302f4368 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6071,8 +6071,9 @@ on - unconditionally enable. off - unconditionally disable. - auto - (default) enable only if hardware mitigation - control(BHI_DIS_S) is available. + auto - (default) enable hardware mitigation + (BHI_DIS_S) if available, otherwise enable + alternate mitigation in KVM. spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index a2ee9a00e4a7e..3c7434329661c 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -469,6 +469,7 @@ #define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ #define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ +#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */ /* * BUG word(s) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index aea40204278b6..ff5f1ecc7d1e6 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -330,8 +330,13 @@ .macro CLEAR_BRANCH_HISTORY ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP .endm + +.macro CLEAR_BRANCH_HISTORY_VMEXIT + ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT +.endm #else #define CLEAR_BRANCH_HISTORY +#define CLEAR_BRANCH_HISTORY_VMEXIT #endif #else /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1ab2750298887..295463707e681 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1668,9 +1668,14 @@ static void __init bhi_select_mitigation(void) if (!IS_ENABLED(CONFIG_X86_64)) return; + /* Mitigate KVM by default */ + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); + pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n"); + if (bhi_mitigation == BHI_MITIGATION_AUTO) return; + /* Mitigate syscalls when the mitigation is forced =on */ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n"); } @@ -2811,10 +2816,12 @@ static const char * const spectre_bhi_state(void) else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW)) return "; BHI: BHI_DIS_S"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) - return "; BHI: SW loop"; + return "; BHI: SW loop, KVM: SW loop"; else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) return "; BHI: Retpoline"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) + return "; BHI: Syscall hardening, KVM: SW loop"; return "; BHI: Vulnerable (Syscall hardening enabled)"; } diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 0f3593e10c57d..f6986dee6f8c7 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -275,7 +275,7 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) call vmx_spec_ctrl_restore_host - CLEAR_BRANCH_HISTORY + CLEAR_BRANCH_HISTORY_VMEXIT /* Put return value in AX */ mov %_ASM_BX, %_ASM_AX -- GitLab From ed2e8d49b54d677f3123668a21a57822d679651f Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Wed, 13 Mar 2024 09:49:17 -0700 Subject: [PATCH 579/989] KVM: x86: Add BHI_NO Intel processors that aren't vulnerable to BHI will set MSR_IA32_ARCH_CAPABILITIES[BHI_NO] = 1;. Guests may use this BHI_NO bit to determine if they need to implement BHI mitigations or not. Allow this bit to be passed to the guests. Signed-off-by: Daniel Sneddon Signed-off-by: Pawan Gupta Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Josh Poimboeuf --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 47d9f03b77783..984ea2089efc3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1621,7 +1621,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr) ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR) + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO) static u64 kvm_get_arch_capabilities(void) { -- GitLab From 8db8f6ce556af60ca9a9fd5e826d369ded70fcc7 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Wed, 3 Apr 2024 18:50:03 +0530 Subject: [PATCH 580/989] scsi: ufs: qcom: Add missing interconnect bandwidth values for Gear 5 These entries are necessary to scale the interconnect bandwidth while operating in Gear 5. Cc: Amit Pundir Fixes: 03ce80a1bb86 ("scsi: ufs: qcom: Add support for scaling interconnects") Tested-by: Amit Pundir Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20240403-ufs-icc-fix-v2-1-958412a5eb45@linaro.org Signed-off-by: Martin K. Petersen --- drivers/ufs/host/ufs-qcom.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 06859e17b67b7..7a00004bfd036 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -47,7 +47,7 @@ enum { TSTBUS_MAX, }; -#define QCOM_UFS_MAX_GEAR 4 +#define QCOM_UFS_MAX_GEAR 5 #define QCOM_UFS_MAX_LANE 2 enum { @@ -67,26 +67,32 @@ static const struct __ufs_qcom_bw_table { [MODE_PWM][UFS_PWM_G2][UFS_LANE_1] = { 1844, 1000 }, [MODE_PWM][UFS_PWM_G3][UFS_LANE_1] = { 3688, 1000 }, [MODE_PWM][UFS_PWM_G4][UFS_LANE_1] = { 7376, 1000 }, + [MODE_PWM][UFS_PWM_G5][UFS_LANE_1] = { 14752, 1000 }, [MODE_PWM][UFS_PWM_G1][UFS_LANE_2] = { 1844, 1000 }, [MODE_PWM][UFS_PWM_G2][UFS_LANE_2] = { 3688, 1000 }, [MODE_PWM][UFS_PWM_G3][UFS_LANE_2] = { 7376, 1000 }, [MODE_PWM][UFS_PWM_G4][UFS_LANE_2] = { 14752, 1000 }, + [MODE_PWM][UFS_PWM_G5][UFS_LANE_2] = { 29504, 1000 }, [MODE_HS_RA][UFS_HS_G1][UFS_LANE_1] = { 127796, 1000 }, [MODE_HS_RA][UFS_HS_G2][UFS_LANE_1] = { 255591, 1000 }, [MODE_HS_RA][UFS_HS_G3][UFS_LANE_1] = { 1492582, 102400 }, [MODE_HS_RA][UFS_HS_G4][UFS_LANE_1] = { 2915200, 204800 }, + [MODE_HS_RA][UFS_HS_G5][UFS_LANE_1] = { 5836800, 409600 }, [MODE_HS_RA][UFS_HS_G1][UFS_LANE_2] = { 255591, 1000 }, [MODE_HS_RA][UFS_HS_G2][UFS_LANE_2] = { 511181, 1000 }, [MODE_HS_RA][UFS_HS_G3][UFS_LANE_2] = { 1492582, 204800 }, [MODE_HS_RA][UFS_HS_G4][UFS_LANE_2] = { 2915200, 409600 }, + [MODE_HS_RA][UFS_HS_G5][UFS_LANE_2] = { 5836800, 819200 }, [MODE_HS_RB][UFS_HS_G1][UFS_LANE_1] = { 149422, 1000 }, [MODE_HS_RB][UFS_HS_G2][UFS_LANE_1] = { 298189, 1000 }, [MODE_HS_RB][UFS_HS_G3][UFS_LANE_1] = { 1492582, 102400 }, [MODE_HS_RB][UFS_HS_G4][UFS_LANE_1] = { 2915200, 204800 }, + [MODE_HS_RB][UFS_HS_G5][UFS_LANE_1] = { 5836800, 409600 }, [MODE_HS_RB][UFS_HS_G1][UFS_LANE_2] = { 298189, 1000 }, [MODE_HS_RB][UFS_HS_G2][UFS_LANE_2] = { 596378, 1000 }, [MODE_HS_RB][UFS_HS_G3][UFS_LANE_2] = { 1492582, 204800 }, [MODE_HS_RB][UFS_HS_G4][UFS_LANE_2] = { 2915200, 409600 }, + [MODE_HS_RB][UFS_HS_G5][UFS_LANE_2] = { 5836800, 819200 }, [MODE_MAX][0][0] = { 7643136, 307200 }, }; -- GitLab From 18f06e97692516d28c3cdc577fb5c501d690b303 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Mar 2024 17:15:40 -0700 Subject: [PATCH 581/989] KVM: Add helpers to consolidate gfn_to_pfn_cache's page split check Add a helper to check that the incoming length for a gfn_to_pfn_cache is valid with respect to the cache's GPA and/or HVA. To avoid activating a cache with a bogus GPA, a future fix will fork the page split check in the inner refresh path into activate() and the public rerfresh() APIs, at which point KVM will check the length in three separate places. Deliberately keep the "page offset" logic open coded, as the only other path that consumes the offset, __kvm_gpc_refresh(), already needs to differentiate between GPA-based and HVA-based caches, and it's not obvious that using a helper is a net positive in overall code readability. Note, for GPA-based caches, this has a subtle side effect of using the GPA instead of the resolved HVA in the check() path, but that should be a nop as the HVA offset is derived from the GPA, i.e. the two offsets are identical, barring a KVM bug. Reviewed-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240320001542.3203871-2-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/pfncache.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 4e07112a24c2f..8f2121b5f2a0a 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -57,6 +57,19 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, spin_unlock(&kvm->gpc_lock); } +static bool kvm_gpc_is_valid_len(gpa_t gpa, unsigned long uhva, + unsigned long len) +{ + unsigned long offset = kvm_is_error_gpa(gpa) ? offset_in_page(uhva) : + offset_in_page(gpa); + + /* + * The cached access must fit within a single page. The 'len' argument + * to activate() and refresh() exists only to enforce that. + */ + return offset + len <= PAGE_SIZE; +} + bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(gpc->kvm); @@ -74,7 +87,7 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) if (kvm_is_error_hva(gpc->uhva)) return false; - if (offset_in_page(gpc->uhva) + len > PAGE_SIZE) + if (!kvm_gpc_is_valid_len(gpc->gpa, gpc->uhva, len)) return false; if (!gpc->valid) @@ -247,13 +260,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l if (WARN_ON_ONCE(kvm_is_error_gpa(gpa) == kvm_is_error_hva(uhva))) return -EINVAL; - /* - * The cached acces must fit within a single page. The 'len' argument - * exists only to enforce that. - */ - page_offset = kvm_is_error_gpa(gpa) ? offset_in_page(uhva) : - offset_in_page(gpa); - if (page_offset + len > PAGE_SIZE) + if (!kvm_gpc_is_valid_len(gpa, uhva, len)) return -EINVAL; lockdep_assert_held(&gpc->refresh_lock); @@ -270,6 +277,8 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l old_uhva = PAGE_ALIGN_DOWN(gpc->uhva); if (kvm_is_error_gpa(gpa)) { + page_offset = offset_in_page(uhva); + gpc->gpa = INVALID_GPA; gpc->memslot = NULL; gpc->uhva = PAGE_ALIGN_DOWN(uhva); @@ -279,6 +288,8 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l } else { struct kvm_memslots *slots = kvm_memslots(gpc->kvm); + page_offset = offset_in_page(gpa); + if (gpc->gpa != gpa || gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva)) { gfn_t gfn = gpa_to_gfn(gpa); -- GitLab From 5c9ca4ed890889a2b7c300c4f63f3baf3f63383f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Mar 2024 17:15:41 -0700 Subject: [PATCH 582/989] KVM: Check validity of offset+length of gfn_to_pfn_cache prior to activation When activating a gfn_to_pfn_cache, verify that the offset+length is sane and usable before marking the cache active. Letting __kvm_gpc_refresh() detect the problem results in a cache being marked active without setting the GPA (or any other fields), which in turn results in KVM trying to refresh a cache with INVALID_GPA. Attempting to refresh a cache with INVALID_GPA isn't functionally problematic, but it runs afoul of the sanity check that exactly one of GPA or userspace HVA is valid, i.e. that a cache is either GPA-based or HVA-based. Reported-by: syzbot+106a4f72b0474e1d1b33@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/0000000000005fa5cc0613f1cebd@google.com Fixes: 721f5b0dda78 ("KVM: pfncache: allow a cache to be activated with a fixed (userspace) HVA") Cc: David Woodhouse Cc: Paul Durrant Reviewed-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240320001542.3203871-3-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/pfncache.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 8f2121b5f2a0a..91b0e329006ba 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -245,8 +245,7 @@ out_error: return -EFAULT; } -static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva, - unsigned long len) +static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva) { unsigned long page_offset; bool unmap_old = false; @@ -260,9 +259,6 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l if (WARN_ON_ONCE(kvm_is_error_gpa(gpa) == kvm_is_error_hva(uhva))) return -EINVAL; - if (!kvm_gpc_is_valid_len(gpa, uhva, len)) - return -EINVAL; - lockdep_assert_held(&gpc->refresh_lock); write_lock_irq(&gpc->lock); @@ -365,6 +361,9 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len) guard(mutex)(&gpc->refresh_lock); + if (!kvm_gpc_is_valid_len(gpc->gpa, gpc->uhva, len)) + return -EINVAL; + /* * If the GPA is valid then ignore the HVA, as a cache can be GPA-based * or HVA-based, not both. For GPA-based caches, the HVA will be @@ -372,7 +371,7 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len) */ uhva = kvm_is_error_gpa(gpc->gpa) ? gpc->uhva : KVM_HVA_ERR_BAD; - return __kvm_gpc_refresh(gpc, gpc->gpa, uhva, len); + return __kvm_gpc_refresh(gpc, gpc->gpa, uhva); } void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm) @@ -392,6 +391,9 @@ static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned { struct kvm *kvm = gpc->kvm; + if (!kvm_gpc_is_valid_len(gpa, uhva, len)) + return -EINVAL; + guard(mutex)(&gpc->refresh_lock); if (!gpc->active) { @@ -411,7 +413,7 @@ static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned gpc->active = true; write_unlock_irq(&gpc->lock); } - return __kvm_gpc_refresh(gpc, gpa, uhva, len); + return __kvm_gpc_refresh(gpc, gpa, uhva); } int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) -- GitLab From fc62a4e8dee2d1a9037e8cdeaa52ba67457f7300 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Mar 2024 17:15:42 -0700 Subject: [PATCH 583/989] KVM: Explicitly disallow activatating a gfn_to_pfn_cache with INVALID_GPA Explicit disallow activating a gfn_to_pfn_cache with an error gpa, i.e. INVALID_GPA, to ensure that KVM doesn't mistake a GPA-based cache for an HVA-based cache (KVM uses INVALID_GPA as a magic value to differentiate between GPA-based and HVA-based caches). WARN if KVM attempts to activate a cache with INVALID_GPA, purely so that new caches need to at least consider what to do with a "bad" GPA, as all existing usage of kvm_gpc_activate() guarantees gpa != INVALID_GPA. I.e. removing the WARN in the future is completely reasonable if doing so would yield cleaner/better code overall. Reviewed-by: David Woodhouse Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20240320001542.3203871-4-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/pfncache.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 91b0e329006ba..f618719644e04 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -418,6 +418,13 @@ static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) { + /* + * Explicitly disallow INVALID_GPA so that the magic value can be used + * by KVM to differentiate between GPA-based and HVA-based caches. + */ + if (WARN_ON_ONCE(kvm_is_error_gpa(gpa))) + return -EINVAL; + return __kvm_gpc_activate(gpc, gpa, KVM_HVA_ERR_BAD, len); } -- GitLab From 9e985cbf2942a1bb8fcef9adc2a17d90fd7ca8ee Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Mar 2024 16:58:33 -0800 Subject: [PATCH 584/989] KVM: x86/pmu: Disable support for adaptive PEBS Drop support for virtualizing adaptive PEBS, as KVM's implementation is architecturally broken without an obvious/easy path forward, and because exposing adaptive PEBS can leak host LBRs to the guest, i.e. can leak host kernel addresses to the guest. Bug #1 is that KVM doesn't account for the upper 32 bits of IA32_FIXED_CTR_CTRL when (re)programming fixed counters, e.g fixed_ctrl_field() drops the upper bits, reprogram_fixed_counters() stores local variables as u8s and truncates the upper bits too, etc. Bug #2 is that, because KVM _always_ sets precise_ip to a non-zero value for PEBS events, perf will _always_ generate an adaptive record, even if the guest requested a basic record. Note, KVM will also enable adaptive PEBS in individual *counter*, even if adaptive PEBS isn't exposed to the guest, but this is benign as MSR_PEBS_DATA_CFG is guaranteed to be zero, i.e. the guest will only ever see Basic records. Bug #3 is in perf. intel_pmu_disable_fixed() doesn't clear the upper bits either, i.e. leaves ICL_FIXED_0_ADAPTIVE set, and intel_pmu_enable_fixed() effectively doesn't clear ICL_FIXED_0_ADAPTIVE either. I.e. perf _always_ enables ADAPTIVE counters, regardless of what KVM requests. Bug #4 is that adaptive PEBS *might* effectively bypass event filters set by the host, as "Updated Memory Access Info Group" records information that might be disallowed by userspace via KVM_SET_PMU_EVENT_FILTER. Bug #5 is that KVM doesn't ensure LBR MSRs hold guest values (or at least zeros) when entering a vCPU with adaptive PEBS, which allows the guest to read host LBRs, i.e. host RIPs/addresses, by enabling "LBR Entries" records. Disable adaptive PEBS support as an immediate fix due to the severity of the LBR leak in particular, and because fixing all of the bugs will be non-trivial, e.g. not suitable for backporting to stable kernels. Note! This will break live migration, but trying to make KVM play nice with live migration would be quite complicated, wouldn't be guaranteed to work (i.e. KVM might still kill/confuse the guest), and it's not clear that there are any publicly available VMMs that support adaptive PEBS, let alone live migrate VMs that support adaptive PEBS, e.g. QEMU doesn't support PEBS in any capacity. Link: https://lore.kernel.org/all/20240306230153.786365-1-seanjc@google.com Link: https://lore.kernel.org/all/ZeepGjHCeSfadANM@google.com Fixes: c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS") Cc: stable@vger.kernel.org Cc: Like Xu Cc: Mingwei Zhang Cc: Zhenyu Wang Cc: Zhang Xiong Cc: Lv Zhiyuan Cc: Dapeng Mi Cc: Jim Mattson Acked-by: Like Xu Link: https://lore.kernel.org/r/20240307005833.827147-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c37a89eda90f8..e3315bda62372 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7882,8 +7882,28 @@ static u64 vmx_get_perf_capabilities(void) if (vmx_pebs_supported()) { perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; - if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) - perf_cap &= ~PERF_CAP_PEBS_BASELINE; + + /* + * Disallow adaptive PEBS as it is functionally broken, can be + * used by the guest to read *host* LBRs, and can be used to + * bypass userspace event filters. To correctly and safely + * support adaptive PEBS, KVM needs to: + * + * 1. Account for the ADAPTIVE flag when (re)programming fixed + * counters. + * + * 2. Gain support from perf (or take direct control of counter + * programming) to support events without adaptive PEBS + * enabled for the hardware counter. + * + * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with + * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. + * + * 4. Document which PMU events are effectively exposed to the + * guest via adaptive PEBS, and make adaptive PEBS mutually + * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. + */ + perf_cap &= ~PERF_CAP_PEBS_BASELINE; } return perf_cap; -- GitLab From 992b54bd083c5bee24ff7cc35991388ab08598c4 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 14 Mar 2024 14:29:02 -0700 Subject: [PATCH 585/989] KVM: x86/mmu: x86: Don't overflow lpage_info when checking attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix KVM_SET_MEMORY_ATTRIBUTES to not overflow lpage_info array and trigger KASAN splat, as seen in the private_mem_conversions_test selftest. When memory attributes are set on a GFN range, that range will have specific properties applied to the TDP. A huge page cannot be used when the attributes are inconsistent, so they are disabled for those the specific huge pages. For internal KVM reasons, huge pages are also not allowed to span adjacent memslots regardless of whether the backing memory could be mapped as huge. What GFNs support which huge page sizes is tracked by an array of arrays 'lpage_info' on the memslot, of ‘kvm_lpage_info’ structs. Each index of lpage_info contains a vmalloc allocated array of these for a specific supported page size. The kvm_lpage_info denotes whether a specific huge page (GFN and page size) on the memslot is supported. These arrays include indices for unaligned head and tail huge pages. Preventing huge pages from spanning adjacent memslot is covered by incrementing the count in head and tail kvm_lpage_info when the memslot is allocated, but disallowing huge pages for memory that has mixed attributes has to be done in a more complicated way. During the KVM_SET_MEMORY_ATTRIBUTES ioctl KVM updates lpage_info for each memslot in the range that has mismatched attributes. KVM does this a memslot at a time, and marks a special bit, KVM_LPAGE_MIXED_FLAG, in the kvm_lpage_info for any huge page. This bit is essentially a permanently elevated count. So huge pages will not be mapped for the GFN at that page size if the count is elevated in either case: a huge head or tail page unaligned to the memslot or if KVM_LPAGE_MIXED_FLAG is set because it has mixed attributes. To determine whether a huge page has consistent attributes, the KVM_SET_MEMORY_ATTRIBUTES operation checks an xarray to make sure it consistently has the incoming attribute. Since level - 1 huge pages are aligned to level huge pages, it employs an optimization. As long as the level - 1 huge pages are checked first, it can just check these and assume that if each level - 1 huge page contained within the level sized huge page is not mixed, then the level size huge page is not mixed. This optimization happens in the helper hugepage_has_attrs(). Unfortunately, although the kvm_lpage_info array representing page size 'level' will contain an entry for an unaligned tail page of size level, the array for level - 1 will not contain an entry for each GFN at page size level. The level - 1 array will only contain an index for any unaligned region covered by level - 1 huge page size, which can be a smaller region. So this causes the optimization to overflow the level - 1 kvm_lpage_info and perform a vmalloc out of bounds read. In some cases of head and tail pages where an overflow could happen, callers skip the operation completely as KVM_LPAGE_MIXED_FLAG is not required to prevent huge pages as discussed earlier. But for memslots that are smaller than the 1GB page size, it does call hugepage_has_attrs(). In this case the huge page is both the head and tail page. The issue can be observed simply by compiling the kernel with CONFIG_KASAN_VMALLOC and running the selftest “private_mem_conversions_test”, which produces the output like the following: BUG: KASAN: vmalloc-out-of-bounds in hugepage_has_attrs+0x7e/0x110 Read of size 4 at addr ffffc900000a3008 by task private_mem_con/169 Call Trace: dump_stack_lvl print_report ? __virt_addr_valid ? hugepage_has_attrs ? hugepage_has_attrs kasan_report ? hugepage_has_attrs hugepage_has_attrs kvm_arch_post_set_memory_attributes kvm_vm_ioctl It is a little ambiguous whether the unaligned head page (in the bug case also the tail page) should be expected to have KVM_LPAGE_MIXED_FLAG set. It is not functionally required, as the unaligned head/tail pages will already have their kvm_lpage_info count incremented. The comments imply not setting it on unaligned head pages is intentional, so fix the callers to skip trying to set KVM_LPAGE_MIXED_FLAG in this case, and in doing so not call hugepage_has_attrs(). Cc: stable@vger.kernel.org Fixes: 90b4fe17981e ("KVM: x86: Disallow hugepages when memory attributes are mixed") Signed-off-by: Rick Edgecombe Reviewed-by: Kai Huang Reviewed-by: Chao Peng Link: https://lore.kernel.org/r/20240314212902.2762507-1-rick.p.edgecombe@intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 992e651540e85..18a404b5923f1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7399,7 +7399,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, * by the memslot, KVM can't use a hugepage due to the * misaligned address regardless of memory attributes. */ - if (gfn >= slot->base_gfn) { + if (gfn >= slot->base_gfn && + gfn + nr_pages <= slot->base_gfn + slot->npages) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else -- GitLab From de120e1d692d73c7eefa3278837b1eb68f90728a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Mar 2024 17:36:40 -0800 Subject: [PATCH 586/989] KVM: x86/pmu: Set enable bits for GP counters in PERF_GLOBAL_CTRL at "RESET" Set the enable bits for general purpose counters in IA32_PERF_GLOBAL_CTRL when refreshing the PMU to emulate the MSR's architecturally defined post-RESET behavior. Per Intel's SDM: IA32_PERF_GLOBAL_CTRL: Sets bits n-1:0 and clears the upper bits. and Where "n" is the number of general-purpose counters available in the processor. AMD also documents this behavior for PerfMonV2 CPUs in one of AMD's many PPRs. Do not set any PERF_GLOBAL_CTRL bits if there are no general purpose counters, although a literal reading of the SDM would require the CPU to set either bits 63:0 or 31:0. The intent of the behavior is to globally enable all GP counters; honor the intent, if not the letter of the law. Leaving PERF_GLOBAL_CTRL '0' effectively breaks PMU usage in guests that haven't been updated to work with PMUs that support PERF_GLOBAL_CTRL. This bug was recently exposed when KVM added supported for AMD's PerfMonV2, i.e. when KVM started exposing a vPMU with PERF_GLOBAL_CTRL to guest software that only knew how to program v1 PMUs (that don't support PERF_GLOBAL_CTRL). Failure to emulate the post-RESET behavior results in such guests unknowingly leaving all general purpose counters globally disabled (the entire reason the post-RESET value sets the GP counter enable bits is to maintain backwards compatibility). The bug has likely gone unnoticed because PERF_GLOBAL_CTRL has been supported on Intel CPUs for as long as KVM has existed, i.e. hardly anyone is running guest software that isn't aware of PERF_GLOBAL_CTRL on Intel PMUs. And because up until v6.0, KVM _did_ emulate the behavior for Intel CPUs, although the old behavior was likely dumb luck. Because (a) that old code was also broken in its own way (the history of this code is a comedy of errors), and (b) PERF_GLOBAL_CTRL was documented as having a value of '0' post-RESET in all SDMs before March 2023. Initial vPMU support in commit f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests") *almost* got it right (again likely by dumb luck), but for some reason only set the bits if the guest PMU was advertised as v1: if (pmu->version == 1) { pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1; return; } Commit f19a0c2c2e6a ("KVM: PMU emulation: GLOBAL_CTRL MSR should be enabled on reset") then tried to remedy that goof, presumably because guest PMUs were leaving PERF_GLOBAL_CTRL '0', i.e. weren't enabling counters. pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; That was KVM's behavior up until commit c49467a45fe0 ("KVM: x86/pmu: Don't overwrite the pmu->global_ctrl when refreshing") removed *everything*. However, it did so based on the behavior defined by the SDM , which at the time stated that "Global Perf Counter Controls" is '0' at Power-Up and RESET. But then the March 2023 SDM (325462-079US), stealthily changed its "IA-32 and Intel 64 Processor States Following Power-up, Reset, or INIT" table to say: IA32_PERF_GLOBAL_CTRL: Sets bits n-1:0 and clears the upper bits. Note, kvm_pmu_refresh() can be invoked multiple times, i.e. it's not a "pure" RESET flow. But it can only be called prior to the first KVM_RUN, i.e. the guest will only ever observe the final value. Note #2, KVM has always cleared global_ctrl during refresh (see commit f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests")), i.e. there is no danger of breaking existing setups by clobbering a value set by userspace. Reported-by: Babu Moger Cc: Sandipan Das Cc: Like Xu Cc: Mingwei Zhang Cc: Dapeng Mi Cc: stable@vger.kernel.org Reviewed-by: Dapeng Mi Tested-by: Dapeng Mi Link: https://lore.kernel.org/r/20240309013641.1413400-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index c397b28e3d1b6..a593b03c9aed6 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -775,8 +775,20 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_data_cfg_mask = ~0ull; bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); - if (vcpu->kvm->arch.enable_pmu) - static_call(kvm_x86_pmu_refresh)(vcpu); + if (!vcpu->kvm->arch.enable_pmu) + return; + + static_call(kvm_x86_pmu_refresh)(vcpu); + + /* + * At RESET, both Intel and AMD CPUs set all enable bits for general + * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that + * was written for v1 PMUs don't unknowingly leave GP counters disabled + * in the global controls). Emulate that behavior when refreshing the + * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL. + */ + if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) + pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); } void kvm_pmu_init(struct kvm_vcpu *vcpu) -- GitLab From 08a828249b16f69248652937be7f1b8dab340a22 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Mar 2024 17:36:41 -0800 Subject: [PATCH 587/989] KVM: selftests: Verify post-RESET value of PERF_GLOBAL_CTRL in PMCs test Add a guest assert in the PMU counters test to verify that KVM stuffs the vCPU's post-RESET value to globally enable all general purpose counters. Per Intel's SDM, IA32_PERF_GLOBAL_CTRL: Sets bits n-1:0 and clears the upper bits. and Where "n" is the number of general-purpose counters available in the processor. For the edge case where there are zero GP counters, follow the spirit of the architecture, not the SDM's literal wording, which doesn't account for this possibility and would require the CPU to set _all_ bits in PERF_GLOBAL_CTRL. Reviewed-by: Dapeng Mi Tested-by: Dapeng Mi Link: https://lore.kernel.org/r/20240309013641.1413400-3-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/pmu_counters_test.c | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c index 29609b52f8fa0..26c85815f7e98 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c @@ -416,12 +416,30 @@ static void guest_rd_wr_counters(uint32_t base_msr, uint8_t nr_possible_counters static void guest_test_gp_counters(void) { + uint8_t pmu_version = guest_get_pmu_version(); uint8_t nr_gp_counters = 0; uint32_t base_msr; - if (guest_get_pmu_version()) + if (pmu_version) nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); + /* + * For v2+ PMUs, PERF_GLOBAL_CTRL's architectural post-RESET value is + * "Sets bits n-1:0 and clears the upper bits", where 'n' is the number + * of GP counters. If there are no GP counters, require KVM to leave + * PERF_GLOBAL_CTRL '0'. This edge case isn't covered by the SDM, but + * follow the spirit of the architecture and only globally enable GP + * counters, of which there are none. + */ + if (pmu_version > 1) { + uint64_t global_ctrl = rdmsr(MSR_CORE_PERF_GLOBAL_CTRL); + + if (nr_gp_counters) + GUEST_ASSERT_EQ(global_ctrl, GENMASK_ULL(nr_gp_counters - 1, 0)); + else + GUEST_ASSERT_EQ(global_ctrl, 0); + } + if (this_cpu_has(X86_FEATURE_PDCM) && rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES) base_msr = MSR_IA32_PMC0; -- GitLab From 0ef2dd1f4144bc024d3c98954fa061a102f6481b Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Fri, 15 Mar 2024 10:35:07 -0400 Subject: [PATCH 588/989] KVM: selftests: fix max_guest_memory_test with more that 256 vCPUs max_guest_memory_test uses ucalls to sync with the host, but it also resets the guest RIP back to its initial value in between tests stages. This makes the guest never reach the code which frees the ucall struct and since a fixed pool of 512 ucall structs is used, the test starts to fail when more that 256 vCPUs are used. Fix that by replacing the manual register reset with a loop in the guest code. Signed-off-by: Maxim Levitsky Link: https://lore.kernel.org/r/20240315143507.102629-1-mlevitsk@redhat.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/max_guest_memory_test.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index 6628dc4dda89f..1a6da7389bf1f 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -22,10 +22,11 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) { uint64_t gpa; - for (gpa = start_gpa; gpa < end_gpa; gpa += stride) - *((volatile uint64_t *)gpa) = gpa; - - GUEST_DONE(); + for (;;) { + for (gpa = start_gpa; gpa < end_gpa; gpa += stride) + *((volatile uint64_t *)gpa) = gpa; + GUEST_SYNC(0); + } } struct vcpu_info { @@ -55,7 +56,7 @@ static void rendezvous_with_boss(void) static void run_vcpu(struct kvm_vcpu *vcpu) { vcpu_run(vcpu); - TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); } static void *vcpu_worker(void *data) @@ -64,17 +65,13 @@ static void *vcpu_worker(void *data) struct kvm_vcpu *vcpu = info->vcpu; struct kvm_vm *vm = vcpu->vm; struct kvm_sregs sregs; - struct kvm_regs regs; vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size); - /* Snapshot regs before the first run. */ - vcpu_regs_get(vcpu, ®s); rendezvous_with_boss(); run_vcpu(vcpu); rendezvous_with_boss(); - vcpu_regs_set(vcpu, ®s); vcpu_sregs_get(vcpu, &sregs); #ifdef __x86_64__ /* Toggle CR0.WP to trigger a MMU context reset. */ -- GitLab From 449c0811d8729068646e1a270be72bf2da2e66f3 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 3 Apr 2024 14:33:01 +0200 Subject: [PATCH 589/989] KVM: selftests: fix supported_flags for riscv commit 849c1816436f ("KVM: selftests: fix supported_flags for aarch64") fixed the set-memory-region test for aarch64 by declaring the read-only flag is supported. riscv also supports the read-only flag. Fix it too. Signed-off-by: Andrew Jones Link: https://lore.kernel.org/r/20240403123300.63923-2-ajones@ventanamicro.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/set_memory_region_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 06b43ed23580b..bd57d991e27d8 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -333,7 +333,7 @@ static void test_invalid_memory_region_flags(void) struct kvm_vm *vm; int r, i; -#if defined __aarch64__ || defined __x86_64__ +#if defined __aarch64__ || defined __riscv || defined __x86_64__ supported_flags |= KVM_MEM_READONLY; #endif -- GitLab From 7f2817ef52a1cc3ee0ace35eb8df7a39bd4fc9b7 Mon Sep 17 00:00:00 2001 From: Tao Su Date: Tue, 19 Mar 2024 11:11:11 +0800 Subject: [PATCH 590/989] KVM: VMX: Ignore MKTME KeyID bits when intercepting #PF for allow_smaller_maxphyaddr Use the raw/true host.MAXPHYADDR when deciding whether or not KVM must intercept #PFs when allow_smaller_maxphyaddr is enabled, as any adjustments the kernel makes to boot_cpu_data.x86_phys_bits to account for MKTME KeyID bits do not apply to the guest physical address space. I.e. the KeyID are off-limits for host physical addresses, but are not reserved for GPAs as far as hardware is concerned. Signed-off-by: Tao Su Link: https://lore.kernel.org/r/20240319031111.495006-1-tao1.su@linux.intel.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 65786dbe7d60b..1742c88ba9c80 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -15,6 +15,7 @@ #include "vmx_ops.h" #include "../cpuid.h" #include "run_flags.h" +#include "../mmu.h" #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 @@ -719,7 +720,8 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) if (!enable_ept) return true; - return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; + return allow_smaller_maxphyaddr && + cpuid_maxphyaddr(vcpu) < kvm_get_shadow_phys_bits(); } static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) -- GitLab From ca91259b775f6fd98ae5d23bb4eec101d468ba8d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 25 Mar 2024 15:44:17 -0700 Subject: [PATCH 591/989] scsi: core: Fix handling of SCMD_FAIL_IF_RECOVERING There is code in the SCSI core that sets the SCMD_FAIL_IF_RECOVERING flag but there is no code that clears this flag. Instead of only clearing SCMD_INITIALIZED in scsi_end_request(), clear all flags. It is never necessary to preserve any command flags inside scsi_end_request(). Cc: stable@vger.kernel.org Fixes: 310bcaef6d7e ("scsi: core: Support failing requests while recovering") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240325224417.1477135-1-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 2e28e2360c857..5b3230ef51fe6 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -635,10 +635,9 @@ static bool scsi_end_request(struct request *req, blk_status_t error, if (blk_queue_add_random(q)) add_disk_randomness(req->q->disk); - if (!blk_rq_is_passthrough(req)) { - WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED)); - cmd->flags &= ~SCMD_INITIALIZED; - } + WARN_ON_ONCE(!blk_rq_is_passthrough(req) && + !(cmd->flags & SCMD_INITIALIZED)); + cmd->flags = 0; /* * Calling rcu_barrier() is not necessary here because the -- GitLab From f96f700449b6d190e06272f1cf732ae8e45b73df Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 5 Apr 2024 22:30:39 +0200 Subject: [PATCH 592/989] net: ks8851: Inline ks8851_rx_skb() Both ks8851_rx_skb_par() and ks8851_rx_skb_spi() call netif_rx(skb), inline the netif_rx(skb) call directly into ks8851_common.c and drop the .rx_skb callback and ks8851_rx_skb() wrapper. This removes one indirect call from the driver, no functional change otherwise. Signed-off-by: Marek Vasut Link: https://lore.kernel.org/r/20240405203204.82062-1-marex@denx.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/micrel/ks8851.h | 3 --- drivers/net/ethernet/micrel/ks8851_common.c | 12 +----------- drivers/net/ethernet/micrel/ks8851_par.c | 11 ----------- drivers/net/ethernet/micrel/ks8851_spi.c | 11 ----------- 4 files changed, 1 insertion(+), 36 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.h b/drivers/net/ethernet/micrel/ks8851.h index e5ec0a363aff8..31f75b4a67fd7 100644 --- a/drivers/net/ethernet/micrel/ks8851.h +++ b/drivers/net/ethernet/micrel/ks8851.h @@ -368,7 +368,6 @@ union ks8851_tx_hdr { * @rdfifo: FIFO read callback * @wrfifo: FIFO write callback * @start_xmit: start_xmit() implementation callback - * @rx_skb: rx_skb() implementation callback * @flush_tx_work: flush_tx_work() implementation callback * * The @statelock is used to protect information in the structure which may @@ -423,8 +422,6 @@ struct ks8851_net { struct sk_buff *txp, bool irq); netdev_tx_t (*start_xmit)(struct sk_buff *skb, struct net_device *dev); - void (*rx_skb)(struct ks8851_net *ks, - struct sk_buff *skb); void (*flush_tx_work)(struct ks8851_net *ks); }; diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c index 0bf13b38b8f5b..896d43bb8883d 100644 --- a/drivers/net/ethernet/micrel/ks8851_common.c +++ b/drivers/net/ethernet/micrel/ks8851_common.c @@ -231,16 +231,6 @@ static void ks8851_dbg_dumpkkt(struct ks8851_net *ks, u8 *rxpkt) rxpkt[12], rxpkt[13], rxpkt[14], rxpkt[15]); } -/** - * ks8851_rx_skb - receive skbuff - * @ks: The device state. - * @skb: The skbuff - */ -static void ks8851_rx_skb(struct ks8851_net *ks, struct sk_buff *skb) -{ - ks->rx_skb(ks, skb); -} - /** * ks8851_rx_pkts - receive packets from the host * @ks: The device information. @@ -309,7 +299,7 @@ static void ks8851_rx_pkts(struct ks8851_net *ks) ks8851_dbg_dumpkkt(ks, rxpkt); skb->protocol = eth_type_trans(skb, ks->netdev); - ks8851_rx_skb(ks, skb); + netif_rx(skb); ks->netdev->stats.rx_packets++; ks->netdev->stats.rx_bytes += rxlen; diff --git a/drivers/net/ethernet/micrel/ks8851_par.c b/drivers/net/ethernet/micrel/ks8851_par.c index 2a7f298542670..381b9cd285ebd 100644 --- a/drivers/net/ethernet/micrel/ks8851_par.c +++ b/drivers/net/ethernet/micrel/ks8851_par.c @@ -210,16 +210,6 @@ static void ks8851_wrfifo_par(struct ks8851_net *ks, struct sk_buff *txp, iowrite16_rep(ksp->hw_addr, txp->data, len / 2); } -/** - * ks8851_rx_skb_par - receive skbuff - * @ks: The device state. - * @skb: The skbuff - */ -static void ks8851_rx_skb_par(struct ks8851_net *ks, struct sk_buff *skb) -{ - netif_rx(skb); -} - static unsigned int ks8851_rdreg16_par_txqcr(struct ks8851_net *ks) { return ks8851_rdreg16_par(ks, KS_TXQCR); @@ -298,7 +288,6 @@ static int ks8851_probe_par(struct platform_device *pdev) ks->rdfifo = ks8851_rdfifo_par; ks->wrfifo = ks8851_wrfifo_par; ks->start_xmit = ks8851_start_xmit_par; - ks->rx_skb = ks8851_rx_skb_par; #define STD_IRQ (IRQ_LCI | /* Link Change */ \ IRQ_RXI | /* RX done */ \ diff --git a/drivers/net/ethernet/micrel/ks8851_spi.c b/drivers/net/ethernet/micrel/ks8851_spi.c index 2f803377c9f9d..670c1de966db8 100644 --- a/drivers/net/ethernet/micrel/ks8851_spi.c +++ b/drivers/net/ethernet/micrel/ks8851_spi.c @@ -298,16 +298,6 @@ static unsigned int calc_txlen(unsigned int len) return ALIGN(len + 4, 4); } -/** - * ks8851_rx_skb_spi - receive skbuff - * @ks: The device state - * @skb: The skbuff - */ -static void ks8851_rx_skb_spi(struct ks8851_net *ks, struct sk_buff *skb) -{ - netif_rx(skb); -} - /** * ks8851_tx_work - process tx packet(s) * @work: The work strucutre what was scheduled. @@ -435,7 +425,6 @@ static int ks8851_probe_spi(struct spi_device *spi) ks->rdfifo = ks8851_rdfifo_spi; ks->wrfifo = ks8851_wrfifo_spi; ks->start_xmit = ks8851_start_xmit_spi; - ks->rx_skb = ks8851_rx_skb_spi; ks->flush_tx_work = ks8851_flush_tx_work_spi; #define STD_IRQ (IRQ_LCI | /* Link Change */ \ -- GitLab From be0384bf599cf1eb8d337517feeb732d71f75a6f Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 5 Apr 2024 22:30:40 +0200 Subject: [PATCH 593/989] net: ks8851: Handle softirqs at the end of IRQ thread to fix hang The ks8851_irq() thread may call ks8851_rx_pkts() in case there are any packets in the MAC FIFO, which calls netif_rx(). This netif_rx() implementation is guarded by local_bh_disable() and local_bh_enable(). The local_bh_enable() may call do_softirq() to run softirqs in case any are pending. One of the softirqs is net_rx_action, which ultimately reaches the driver .start_xmit callback. If that happens, the system hangs. The entire call chain is below: ks8851_start_xmit_par from netdev_start_xmit netdev_start_xmit from dev_hard_start_xmit dev_hard_start_xmit from sch_direct_xmit sch_direct_xmit from __dev_queue_xmit __dev_queue_xmit from __neigh_update __neigh_update from neigh_update neigh_update from arp_process.constprop.0 arp_process.constprop.0 from __netif_receive_skb_one_core __netif_receive_skb_one_core from process_backlog process_backlog from __napi_poll.constprop.0 __napi_poll.constprop.0 from net_rx_action net_rx_action from __do_softirq __do_softirq from call_with_stack call_with_stack from do_softirq do_softirq from __local_bh_enable_ip __local_bh_enable_ip from netif_rx netif_rx from ks8851_irq ks8851_irq from irq_thread_fn irq_thread_fn from irq_thread irq_thread from kthread kthread from ret_from_fork The hang happens because ks8851_irq() first locks a spinlock in ks8851_par.c ks8851_lock_par() spin_lock_irqsave(&ksp->lock, ...) and with that spinlock locked, calls netif_rx(). Once the execution reaches ks8851_start_xmit_par(), it calls ks8851_lock_par() again which attempts to claim the already locked spinlock again, and the hang happens. Move the do_softirq() call outside of the spinlock protected section of ks8851_irq() by disabling BHs around the entire spinlock protected section of ks8851_irq() handler. Place local_bh_enable() outside of the spinlock protected section, so that it can trigger do_softirq() without the ks8851_par.c ks8851_lock_par() spinlock being held, and safely call ks8851_start_xmit_par() without attempting to lock the already locked spinlock. Since ks8851_irq() is protected by local_bh_disable()/local_bh_enable() now, replace netif_rx() with __netif_rx() which is not duplicating the local_bh_disable()/local_bh_enable() calls. Fixes: 797047f875b5 ("net: ks8851: Implement Parallel bus operations") Signed-off-by: Marek Vasut Link: https://lore.kernel.org/r/20240405203204.82062-2-marex@denx.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/micrel/ks8851_common.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c index 896d43bb8883d..d4cdf3d4f5525 100644 --- a/drivers/net/ethernet/micrel/ks8851_common.c +++ b/drivers/net/ethernet/micrel/ks8851_common.c @@ -299,7 +299,7 @@ static void ks8851_rx_pkts(struct ks8851_net *ks) ks8851_dbg_dumpkkt(ks, rxpkt); skb->protocol = eth_type_trans(skb, ks->netdev); - netif_rx(skb); + __netif_rx(skb); ks->netdev->stats.rx_packets++; ks->netdev->stats.rx_bytes += rxlen; @@ -330,6 +330,8 @@ static irqreturn_t ks8851_irq(int irq, void *_ks) unsigned long flags; unsigned int status; + local_bh_disable(); + ks8851_lock(ks, &flags); status = ks8851_rdreg16(ks, KS_ISR); @@ -406,6 +408,8 @@ static irqreturn_t ks8851_irq(int irq, void *_ks) if (status & IRQ_LCI) mii_check_link(&ks->mii); + local_bh_enable(); + return IRQ_HANDLED; } -- GitLab From 9c432404b9555c9444cbf6c8feaf52c0d8cad486 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 8 Apr 2024 22:32:08 -0400 Subject: [PATCH 594/989] bcachefs: fix eytzinger0_find_gt() - fix return types: promoting from unsigned to ssize_t does not do what we want here, and was pointless since the rest of the eytzinger code is u32 - nr, not size Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index ee0e2df33322d..24840aee335c0 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -242,8 +242,8 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) = eytzinger0_next((_i), (_size))) /* return greatest node <= @search, or -1 if not found */ -static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) +static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) { unsigned i, n = 0; @@ -256,18 +256,32 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, } while (n < nr); if (n & 1) { - /* @i was greater than @search, return previous node: */ + /* + * @i was greater than @search, return previous node: + * + * if @i was leftmost/smallest element, + * eytzinger0_prev(eytzinger0_first())) returns -1, as expected + */ return eytzinger0_prev(i, nr); } else { return i; } } -static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) +static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) { ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); - return eytzinger0_next(idx, size); + + /* + * if eytitzinger0_find_le() returned -1 - no element was <= search - we + * want to return the first element; next/prev identities mean this work + * as expected + * + * similarly if find_le() returns last element, we should return -1; + * identities mean this all works out: + */ + return eytzinger0_next(idx, nr); } #define eytzinger0_find(base, nr, size, _cmp, search) \ -- GitLab From b46f4eaa4f0ec38909fb0072eea3aeddb32f954e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 5 Apr 2024 15:10:57 -0700 Subject: [PATCH 595/989] af_unix: Clear stale u->oob_skb. syzkaller started to report deadlock of unix_gc_lock after commit 4090fa373f0e ("af_unix: Replace garbage collection algorithm."), but it just uncovers the bug that has been there since commit 314001f0bf92 ("af_unix: Add OOB support"). The repro basically does the following. from socket import * from array import array c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) c1.sendmsg([b'a'], [(SOL_SOCKET, SCM_RIGHTS, array("i", [c2.fileno()]))], MSG_OOB) c2.recv(1) # blocked as no normal data in recv queue c2.close() # done async and unblock recv() c1.close() # done async and trigger GC A socket sends its file descriptor to itself as OOB data and tries to receive normal data, but finally recv() fails due to async close(). The problem here is wrong handling of OOB skb in manage_oob(). When recvmsg() is called without MSG_OOB, manage_oob() is called to check if the peeked skb is OOB skb. In such a case, manage_oob() pops it out of the receive queue but does not clear unix_sock(sk)->oob_skb. This is wrong in terms of uAPI. Let's say we send "hello" with MSG_OOB, and "world" without MSG_OOB. The 'o' is handled as OOB data. When recv() is called twice without MSG_OOB, the OOB data should be lost. >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM, 0) >>> c1.send(b'hello', MSG_OOB) # 'o' is OOB data 5 >>> c1.send(b'world') 5 >>> c2.recv(5) # OOB data is not received b'hell' >>> c2.recv(5) # OOB date is skipped b'world' >>> c2.recv(5, MSG_OOB) # This should return an error b'o' In the same situation, TCP actually returns -EINVAL for the last recv(). Also, if we do not clear unix_sk(sk)->oob_skb, unix_poll() always set EPOLLPRI even though the data has passed through by previous recv(). To avoid these issues, we must clear unix_sk(sk)->oob_skb when dequeuing it from recv queue. The reason why the old GC did not trigger the deadlock is because the old GC relied on the receive queue to detect the loop. When it is triggered, the socket with OOB data is marked as GC candidate because file refcount == inflight count (1). However, after traversing all inflight sockets, the socket still has a positive inflight count (1), thus the socket is excluded from candidates. Then, the old GC lose the chance to garbage-collect the socket. With the old GC, the repro continues to create true garbage that will never be freed nor detected by kmemleak as it's linked to the global inflight list. That's why we couldn't even notice the issue. Fixes: 314001f0bf92 ("af_unix: Add OOB support") Reported-by: syzbot+7f7f201cc2668a8fd169@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7f7f201cc2668a8fd169 Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240405221057.2406-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5b41e2321209a..d032eb5fa6df1 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2665,7 +2665,9 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, } } else if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); - consume_skb(skb); + WRITE_ONCE(u->oob_skb, NULL); + if (!WARN_ON_ONCE(skb_unref(skb))) + kfree_skb(skb); skb = skb_peek(&sk->sk_receive_queue); } } -- GitLab From 718c4fb221dbeff9072810841b949413c5ffc345 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 8 Apr 2024 16:42:43 +1000 Subject: [PATCH 596/989] nouveau: fix devinit paths to only handle display on GSP. This reverts: nouveau/gsp: don't check devinit disable on GSP. and applies a further fix. It turns out the open gpu driver, checks this register, but only for display. Match that behaviour and in the turing path only disable the display block. (ampere already only does displays). Fixes: 5d4e8ae6e57b ("nouveau/gsp: don't check devinit disable on GSP.") Reviewed-by: Danilo Krummrich Signed-off-by: Dave Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20240408064243.2219527-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c | 12 ++++++++---- drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c index 7bcbc4895ec22..271bfa038f5bc 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c @@ -25,6 +25,7 @@ #include #include +#include void gm107_devinit_disable(struct nvkm_devinit *init) @@ -33,10 +34,13 @@ gm107_devinit_disable(struct nvkm_devinit *init) u32 r021c00 = nvkm_rd32(device, 0x021c00); u32 r021c04 = nvkm_rd32(device, 0x021c04); - if (r021c00 & 0x00000001) - nvkm_subdev_disable(device, NVKM_ENGINE_CE, 0); - if (r021c00 & 0x00000004) - nvkm_subdev_disable(device, NVKM_ENGINE_CE, 2); + /* gsp only wants to enable/disable display */ + if (!nvkm_gsp_rm(device->gsp)) { + if (r021c00 & 0x00000001) + nvkm_subdev_disable(device, NVKM_ENGINE_CE, 0); + if (r021c00 & 0x00000004) + nvkm_subdev_disable(device, NVKM_ENGINE_CE, 2); + } if (r021c04 & 0x00000001) nvkm_subdev_disable(device, NVKM_ENGINE_DISP, 0); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c index 11b4c9c274a1a..666eb93b1742c 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c @@ -41,6 +41,7 @@ r535_devinit_new(const struct nvkm_devinit_func *hw, rm->dtor = r535_devinit_dtor; rm->post = hw->post; + rm->disable = hw->disable; ret = nv50_devinit_new_(rm, device, type, inst, pdevinit); if (ret) -- GitLab From 4fe82aedeb8a8cb09bfa60f55ab57b5c10a74ac4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Apr 2024 18:11:09 +0100 Subject: [PATCH 597/989] io_uring/net: restore msg_control on sendzc retry cac9e4418f4cb ("io_uring/net: save msghdr->msg_control for retries") reinstatiates msg_control before every __sys_sendmsg_sock(), since the function can overwrite the value in msghdr. We need to do same for zerocopy sendmsg. Cc: stable@vger.kernel.org Fixes: 493108d95f146 ("io_uring/net: zerocopy sendmsg") Link: https://github.com/axboe/liburing/issues/1067 Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cc1d5d9df0576fa66ddad4420d240a98a020b267.1712596179.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/net.c b/io_uring/net.c index 1e7665ff6ef70..4afb475d41974 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1276,6 +1276,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { kmsg = req->async_data; + kmsg->msg.msg_control_user = sr->msg_control; } else { ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) -- GitLab From 359571c327a726d622786aef3833637dacfd5d38 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Apr 2024 00:02:47 -0400 Subject: [PATCH 598/989] bcachefs: Fix check_topology() when using node scan shoot down journal keys _before_ populating journal keys with pointers to scanned nodes Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6280da1244b55..a1c2c0ffb0cc6 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -544,12 +544,12 @@ reconstruct_root: bch2_btree_root_alloc_fake(c, i, 0); } else { bch2_btree_root_alloc_fake(c, i, 1); + bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); if (ret) break; } - bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); reconstructed_root = true; } -- GitLab From 5ab4beb759c05c74fb385ac5ca0ade5d3db67975 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Apr 2024 00:49:39 -0400 Subject: [PATCH 599/989] bcachefs: Don't scan for btree nodes when we can reconstruct Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 14 ++++++++++++++ fs/bcachefs/btree_gc.c | 11 ++++++++--- fs/bcachefs/btree_node_scan.c | 8 +++++++- fs/bcachefs/recovery.c | 14 -------------- 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 63102992d9556..364ae42022af1 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1535,6 +1535,20 @@ enum btree_id { BTREE_ID_NR }; +static inline bool btree_id_is_alloc(enum btree_id id) +{ + switch (id) { + case BTREE_ID_alloc: + case BTREE_ID_backpointers: + case BTREE_ID_need_discard: + case BTREE_ID_freespace: + case BTREE_ID_bucket_gens: + return true; + default: + return false; + } +} + #define BTREE_MAX_DEPTH 4U /* Btree nodes */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a1c2c0ffb0cc6..d2555da55c6da 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -368,11 +368,16 @@ again: buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); cur = NULL; - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: - bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); if (ret) break; + + if (!btree_id_is_alloc(b->c.btree_id)) { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + break; + } continue; } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 3f33be7e5e5c2..a7d0593b38714 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -133,6 +133,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, if (le64_to_cpu(bn->magic) != bset_magic(c)) return; + if (btree_id_is_alloc(BTREE_NODE_ID(bn))) + return; + rcu_read_lock(); struct found_btree_node n = { .btree_id = BTREE_NODE_ID(bn), @@ -290,7 +293,7 @@ again: found_btree_node_to_text(&buf, c, n); bch_err(c, "%s", buf.buf); printbuf_exit(&buf); - return -1; + return -BCH_ERR_fsck_repair_unimplemented; } } @@ -436,6 +439,9 @@ bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos node_min, struct bpos node_max) { + if (btree_id_is_alloc(btree)) + return 0; + struct find_btree_nodes *f = &c->found_btree_nodes; int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b76c16152579c..0f328aba9760b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -47,20 +47,6 @@ void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) } } -static bool btree_id_is_alloc(enum btree_id id) -{ - switch (id) { - case BTREE_ID_alloc: - case BTREE_ID_backpointers: - case BTREE_ID_need_discard: - case BTREE_ID_freespace: - case BTREE_ID_bucket_gens: - return true; - default: - return false; - } -} - /* for -o reconstruct_alloc: */ static void bch2_reconstruct_alloc(struct bch_fs *c) { -- GitLab From 80e9963fb3b5509dfcabe9652d56bf4b35542055 Mon Sep 17 00:00:00 2001 From: Nianyao Tang Date: Sat, 6 Apr 2024 02:27:37 +0000 Subject: [PATCH 600/989] irqchip/gic-v3-its: Fix VSYNC referencing an unmapped VPE on GIC v4.1 As per the GICv4.1 spec (Arm IHI 0069H, 5.3.19): "A VMAPP with {V, Alloc}=={0, x} is self-synchronizing, This means the ITS command queue does not show the command as consumed until all of its effects are completed." Furthermore, VSYNC is allowed to deliver an SError when referencing a non existent VPE. By these definitions, a VMAPP followed by a VSYNC is a bug, as the later references a VPE that has been unmapped by the former. Fix it by eliding the VSYNC in this scenario. Fixes: 64edfaa9a234 ("irqchip/gic-v4.1: Implement the v4.1 flavour of VMAPP") Signed-off-by: Nianyao Tang Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20240406022737.3898763-1-tangnianyao@huawei.com --- drivers/irqchip/irq-gic-v3-its.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index fca888b36680d..2a537cbfcb077 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -786,6 +786,7 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, struct its_cmd_block *cmd, struct its_cmd_desc *desc) { + struct its_vpe *vpe = valid_vpe(its, desc->its_vmapp_cmd.vpe); unsigned long vpt_addr, vconf_addr; u64 target; bool alloc; @@ -798,6 +799,11 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, if (is_v4_1(its)) { alloc = !atomic_dec_return(&desc->its_vmapp_cmd.vpe->vmapp_count); its_encode_alloc(cmd, alloc); + /* + * Unmapping a VPE is self-synchronizing on GICv4.1, + * no need to issue a VSYNC. + */ + vpe = NULL; } goto out; @@ -832,7 +838,7 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, out: its_fixup_cmd(cmd); - return valid_vpe(its, desc->its_vmapp_cmd.vpe); + return vpe; } static struct its_vpe *its_build_vmapti_cmd(struct its_node *its, -- GitLab From faf23006185e777db18912685922c5ddb2df383f Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Mon, 8 Apr 2024 12:06:43 +0530 Subject: [PATCH 601/989] octeontx2-af: Fix NIX SQ mode and BP config NIX SQ mode and link backpressure configuration is required for all platforms. But in current driver this code is wrongly placed under specific platform check. This patch fixes the issue by moving the code out of platform check. Fixes: 5d9b976d4480 ("octeontx2-af: Support fixed transmit scheduler topology") Signed-off-by: Geetha sowjanya Link: https://lore.kernel.org/r/20240408063643.26288-1-gakula@marvell.com Signed-off-by: Paolo Abeni --- .../ethernet/marvell/octeontx2/af/rvu_nix.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index d39001cdc707e..00af8888e3291 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -4819,18 +4819,18 @@ static int rvu_nix_block_init(struct rvu *rvu, struct nix_hw *nix_hw) */ rvu_write64(rvu, blkaddr, NIX_AF_CFG, rvu_read64(rvu, blkaddr, NIX_AF_CFG) | 0x40ULL); + } - /* Set chan/link to backpressure TL3 instead of TL2 */ - rvu_write64(rvu, blkaddr, NIX_AF_PSE_CHANNEL_LEVEL, 0x01); + /* Set chan/link to backpressure TL3 instead of TL2 */ + rvu_write64(rvu, blkaddr, NIX_AF_PSE_CHANNEL_LEVEL, 0x01); - /* Disable SQ manager's sticky mode operation (set TM6 = 0) - * This sticky mode is known to cause SQ stalls when multiple - * SQs are mapped to same SMQ and transmitting pkts at a time. - */ - cfg = rvu_read64(rvu, blkaddr, NIX_AF_SQM_DBG_CTL_STATUS); - cfg &= ~BIT_ULL(15); - rvu_write64(rvu, blkaddr, NIX_AF_SQM_DBG_CTL_STATUS, cfg); - } + /* Disable SQ manager's sticky mode operation (set TM6 = 0) + * This sticky mode is known to cause SQ stalls when multiple + * SQs are mapped to same SMQ and transmitting pkts at a time. + */ + cfg = rvu_read64(rvu, blkaddr, NIX_AF_SQM_DBG_CTL_STATUS); + cfg &= ~BIT_ULL(15); + rvu_write64(rvu, blkaddr, NIX_AF_SQM_DBG_CTL_STATUS, cfg); ltdefs = rvu->kpu.lt_def; /* Calibrate X2P bus to check if CGX/LBK links are fine */ -- GitLab From 74043489fcb5e5ca4074133582b5b8011b67f9e7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2024 09:42:02 +0200 Subject: [PATCH 602/989] ipv6: fib: hide unused 'pn' variable When CONFIG_IPV6_SUBTREES is disabled, the only user is hidden, causing a 'make W=1' warning: net/ipv6/ip6_fib.c: In function 'fib6_add': net/ipv6/ip6_fib.c:1388:32: error: variable 'pn' set but not used [-Werror=unused-but-set-variable] Add another #ifdef around the variable declaration, matching the other uses in this file. Fixes: 66729e18df08 ("[IPV6] ROUTE: Make sure we have fn->leaf when adding a node on subtree.") Link: https://lore.kernel.org/netdev/20240322131746.904943-1-arnd@kernel.org/ Reviewed-by: David Ahern Signed-off-by: Arnd Bergmann Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240408074219.3030256-1-arnd@kernel.org Signed-off-by: Paolo Abeni --- net/ipv6/ip6_fib.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 7209419cfb0e9..c1f62352a4814 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1385,7 +1385,10 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; - struct fib6_node *fn, *pn = NULL; + struct fib6_node *fn; +#ifdef CONFIG_IPV6_SUBTREES + struct fib6_node *pn = NULL; +#endif int err = -ENOMEM; int allow_create = 1; int replace_required = 0; @@ -1409,9 +1412,9 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, goto out; } +#ifdef CONFIG_IPV6_SUBTREES pn = fn; -#ifdef CONFIG_IPV6_SUBTREES if (rt->fib6_src.plen) { struct fib6_node *sn; -- GitLab From cf1b7201df59fb936f40f4a807433fe3f2ce310a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2024 09:42:03 +0200 Subject: [PATCH 603/989] ipv4/route: avoid unused-but-set-variable warning The log_martians variable is only used in an #ifdef, causing a 'make W=1' warning with gcc: net/ipv4/route.c: In function 'ip_rt_send_redirect': net/ipv4/route.c:880:13: error: variable 'log_martians' set but not used [-Werror=unused-but-set-variable] Change the #ifdef to an equivalent IS_ENABLED() to let the compiler see where the variable is used. Fixes: 30038fc61adf ("net: ip_rt_send_redirect() optimization") Reviewed-by: David Ahern Signed-off-by: Arnd Bergmann Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240408074219.3030256-2-arnd@kernel.org Signed-off-by: Paolo Abeni --- net/ipv4/route.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c8f76f56dc165..d36ace160d426 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -926,13 +926,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->n_redirects; -#ifdef CONFIG_IP_ROUTE_VERBOSE - if (log_martians && + if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians && peer->n_redirects == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); -#endif } out_put_peer: inet_putpeer(peer); -- GitLab From 638441bed666619b4275f68bbca9d1cd731a2063 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 16 Mar 2024 12:30:09 +0300 Subject: [PATCH 604/989] serial: 8250_lpc18xx: disable clks on error in probe() Goto the clean up path to clean up a couple clocks before returning on this error path. Fixes: 0087b9e694ee ("serial: 8250_lpc18xx: Switch to use uart_read_port_properties()") Signed-off-by: Dan Carpenter Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/92646c10-e0b5-4117-a9ac-ce9987d33ce3@moroto.mountain Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_lpc18xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_lpc18xx.c b/drivers/tty/serial/8250/8250_lpc18xx.c index 7984ee05af1da..47e1a056a60c3 100644 --- a/drivers/tty/serial/8250/8250_lpc18xx.c +++ b/drivers/tty/serial/8250/8250_lpc18xx.c @@ -151,7 +151,7 @@ static int lpc18xx_serial_probe(struct platform_device *pdev) ret = uart_read_port_properties(&uart.port); if (ret) - return ret; + goto dis_uart_clk; uart.port.iotype = UPIO_MEM32; uart.port.regshift = 2; -- GitLab From 7dfae6cbadc1ac99e38ad19fb08810b31ff167be Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 17 Mar 2024 22:41:23 +0100 Subject: [PATCH 605/989] serial: 8250_dw: Revert: Do not reclock if already at correct rate Commit e5d6bd25f93d ("serial: 8250_dw: Do not reclock if already at correct rate") breaks the dw UARTs on Intel Bay Trail (BYT) and Cherry Trail (CHT) SoCs. Before this change the RTL8732BS Bluetooth HCI which is found connected over the dw UART on both BYT and CHT boards works properly: Bluetooth: hci0: RTL: examining hci_ver=06 hci_rev=000b lmp_ver=06 lmp_subver=8723 Bluetooth: hci0: RTL: rom_version status=0 version=1 Bluetooth: hci0: RTL: loading rtl_bt/rtl8723bs_fw.bin Bluetooth: hci0: RTL: loading rtl_bt/rtl8723bs_config-OBDA8723.bin Bluetooth: hci0: RTL: cfg_sz 64, total sz 24508 Bluetooth: hci0: RTL: fw version 0x365d462e where as after this change probing it fails: Bluetooth: hci0: RTL: examining hci_ver=06 hci_rev=000b lmp_ver=06 lmp_subver=8723 Bluetooth: hci0: RTL: rom_version status=0 version=1 Bluetooth: hci0: RTL: loading rtl_bt/rtl8723bs_fw.bin Bluetooth: hci0: RTL: loading rtl_bt/rtl8723bs_config-OBDA8723.bin Bluetooth: hci0: RTL: cfg_sz 64, total sz 24508 Bluetooth: hci0: command 0xfc20 tx timeout Bluetooth: hci0: RTL: download fw command failed (-110) Revert the changes to fix this regression. Fixes: e5d6bd25f93d ("serial: 8250_dw: Do not reclock if already at correct rate") Cc: stable@vger.kernel.org Cc: Peter Collingbourne Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Acked-by: Peter Collingbourne Link: https://lore.kernel.org/r/20240317214123.34482-1-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_dw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c index a3acbf0f5da1b..1300c92b8702a 100644 --- a/drivers/tty/serial/8250/8250_dw.c +++ b/drivers/tty/serial/8250/8250_dw.c @@ -356,9 +356,9 @@ static void dw8250_set_termios(struct uart_port *p, struct ktermios *termios, long rate; int ret; + clk_disable_unprepare(d->clk); rate = clk_round_rate(d->clk, newrate); - if (rate > 0 && p->uartclk != rate) { - clk_disable_unprepare(d->clk); + if (rate > 0) { /* * Note that any clock-notifer worker will block in * serial8250_update_uartclk() until we are done. @@ -366,8 +366,8 @@ static void dw8250_set_termios(struct uart_port *p, struct ktermios *termios, ret = clk_set_rate(d->clk, newrate); if (!ret) p->uartclk = rate; - clk_prepare_enable(d->clk); } + clk_prepare_enable(d->clk); dw8250_do_set_termios(p, termios, old); } -- GitLab From 54c4ec5f8c471b7c1137a1f769648549c423c026 Mon Sep 17 00:00:00 2001 From: Emil Kronborg Date: Wed, 20 Mar 2024 12:15:36 +0000 Subject: [PATCH 606/989] serial: mxs-auart: add spinlock around changing cts state The uart_handle_cts_change() function in serial_core expects the caller to hold uport->lock. For example, I have seen the below kernel splat, when the Bluetooth driver is loaded on an i.MX28 board. [ 85.119255] ------------[ cut here ]------------ [ 85.124413] WARNING: CPU: 0 PID: 27 at /drivers/tty/serial/serial_core.c:3453 uart_handle_cts_change+0xb4/0xec [ 85.134694] Modules linked in: hci_uart bluetooth ecdh_generic ecc wlcore_sdio configfs [ 85.143314] CPU: 0 PID: 27 Comm: kworker/u3:0 Not tainted 6.6.3-00021-gd62a2f068f92 #1 [ 85.151396] Hardware name: Freescale MXS (Device Tree) [ 85.156679] Workqueue: hci0 hci_power_on [bluetooth] (...) [ 85.191765] uart_handle_cts_change from mxs_auart_irq_handle+0x380/0x3f4 [ 85.198787] mxs_auart_irq_handle from __handle_irq_event_percpu+0x88/0x210 (...) Cc: stable@vger.kernel.org Fixes: 4d90bb147ef6 ("serial: core: Document and assert lock requirements for irq helpers") Reviewed-by: Frank Li Signed-off-by: Emil Kronborg Link: https://lore.kernel.org/r/20240320121530.11348-1-emil.kronborg@protonmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/mxs-auart.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/mxs-auart.c b/drivers/tty/serial/mxs-auart.c index 4749331fe618c..1e8853eae5042 100644 --- a/drivers/tty/serial/mxs-auart.c +++ b/drivers/tty/serial/mxs-auart.c @@ -1086,11 +1086,13 @@ static void mxs_auart_set_ldisc(struct uart_port *port, static irqreturn_t mxs_auart_irq_handle(int irq, void *context) { - u32 istat; + u32 istat, stat; struct mxs_auart_port *s = context; u32 mctrl_temp = s->mctrl_prev; - u32 stat = mxs_read(s, REG_STAT); + uart_port_lock(&s->port); + + stat = mxs_read(s, REG_STAT); istat = mxs_read(s, REG_INTR); /* ack irq */ @@ -1126,6 +1128,8 @@ static irqreturn_t mxs_auart_irq_handle(int irq, void *context) istat &= ~AUART_INTR_TXIS; } + uart_port_unlock(&s->port); + return IRQ_HANDLED; } -- GitLab From 5555980571cc744cd99b6455e3e388b54519db8f Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 25 Mar 2024 09:16:47 +0200 Subject: [PATCH 607/989] serial: core: Fix regression when runtime PM is not enabled Commit 45a3a8ef8129 ("serial: core: Revert checks for tx runtime PM state") caused a regression for Sun Ultra 60 for the sunsab driver as reported by Nick Bowler . We need to add back the check runtime PM enabled state for serial port controller device, I wrongly assumed earlier we could just remove it. Fixes: 45a3a8ef8129 ("serial: core: Revert checks for tx runtime PM state") Cc: stable Reported-by: Nick Bowler Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20240325071649.27040-1-tony@atomide.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index ff85ebd3a007d..25a83820927a5 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -156,7 +156,7 @@ static void __uart_start(struct uart_state *state) * enabled, serial_port_runtime_resume() calls start_tx() again * after enabling the device. */ - if (pm_runtime_active(&port_dev->dev)) + if (!pm_runtime_enabled(port->dev) || pm_runtime_active(&port_dev->dev)) port->ops->start_tx(port); pm_runtime_mark_last_busy(&port_dev->dev); pm_runtime_put_autosuspend(&port_dev->dev); -- GitLab From 90452456eb69297fe7ae34e56e40d8e47dc9e019 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 4 Apr 2024 01:41:52 +0300 Subject: [PATCH 608/989] serial: 8250_pci: Remove redundant PCI IDs Driver complains that PCI IDs are not needed for some of the LAVA cards: [ 0.297252] serial 0000:04:00.0: Redundant entry in serial pci_table. [ 0.297252] Please send the output of lspci -vv, this [ 0.297252] message (0x1407,0x0120,0x0000,0x0000), the [ 0.297252] manufacturer and name of serial board or [ 0.297252] modem board to . Do as suggested. Reported-by: Jimmy A Closes: https://lore.kernel.org/r/VI1P194MB052751BE157EFE9CEAB75725CE362@VI1P194MB0527.EURP194.PROD.OUTLOOK.COM Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240403224152.945099-1-andy.shevchenko@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_pci.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index 0d35c77fad9eb..e2e4f99f9d347 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -5010,12 +5010,6 @@ static const struct pci_device_id serial_pci_tbl[] = { { PCI_VENDOR_ID_LAVA, PCI_DEVICE_ID_LAVA_QUATRO_B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, pbn_b0_bt_2_115200 }, - { PCI_VENDOR_ID_LAVA, PCI_DEVICE_ID_LAVA_QUATTRO_A, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, - pbn_b0_bt_2_115200 }, - { PCI_VENDOR_ID_LAVA, PCI_DEVICE_ID_LAVA_QUATTRO_B, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, - pbn_b0_bt_2_115200 }, { PCI_VENDOR_ID_LAVA, PCI_DEVICE_ID_LAVA_OCTO_A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, pbn_b0_bt_4_460800 }, -- GitLab From 1be3226445362bfbf461c92a5bcdb1723f2e4907 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Mon, 8 Apr 2024 19:23:43 +1000 Subject: [PATCH 609/989] serial/pmac_zilog: Remove flawed mitigation for rx irq flood The mitigation was intended to stop the irq completely. That may be better than a hard lock-up but it turns out that you get a crash anyway if you're using pmac_zilog as a serial console: ttyPZ0: pmz: rx irq flood ! BUG: spinlock recursion on CPU#0, swapper/0 That's because the pr_err() call in pmz_receive_chars() results in pmz_console_write() attempting to lock a spinlock already locked in pmz_interrupt(). With CONFIG_DEBUG_SPINLOCK=y, this produces a fatal BUG splat. The spinlock in question is the one in struct uart_port. Even when it's not fatal, the serial port rx function ceases to work. Also, the iteration limit doesn't play nicely with QEMU, as can be seen in the bug report linked below. A web search for other reports of the error message "pmz: rx irq flood" didn't produce anything. So I don't think this code is needed any more. Remove it. Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Aneesh Kumar K.V Cc: Naveen N. Rao Cc: Andy Shevchenko Cc: stable@kernel.org Cc: linux-m68k@lists.linux-m68k.org Link: https://github.com/vivier/qemu-m68k/issues/44 Link: https://lore.kernel.org/all/1078874617.9746.36.camel@gaston/ Acked-by: Michael Ellerman Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable Signed-off-by: Finn Thain Link: https://lore.kernel.org/r/e853cf2c762f23101cd2ddec0cc0c2be0e72685f.1712568223.git.fthain@linux-m68k.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/pmac_zilog.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/tty/serial/pmac_zilog.c b/drivers/tty/serial/pmac_zilog.c index 05d97e89511e6..92195f984de1b 100644 --- a/drivers/tty/serial/pmac_zilog.c +++ b/drivers/tty/serial/pmac_zilog.c @@ -210,7 +210,6 @@ static bool pmz_receive_chars(struct uart_pmac_port *uap) { struct tty_port *port; unsigned char ch, r1, drop, flag; - int loops = 0; /* Sanity check, make sure the old bug is no longer happening */ if (uap->port.state == NULL) { @@ -291,24 +290,11 @@ static bool pmz_receive_chars(struct uart_pmac_port *uap) if (r1 & Rx_OVR) tty_insert_flip_char(port, 0, TTY_OVERRUN); next_char: - /* We can get stuck in an infinite loop getting char 0 when the - * line is in a wrong HW state, we break that here. - * When that happens, I disable the receive side of the driver. - * Note that what I've been experiencing is a real irq loop where - * I'm getting flooded regardless of the actual port speed. - * Something strange is going on with the HW - */ - if ((++loops) > 1000) - goto flood; ch = read_zsreg(uap, R0); if (!(ch & Rx_CH_AV)) break; } - return true; - flood: - pmz_interrupt_control(uap, 0); - pmz_error("pmz: rx irq flood !\n"); return true; } -- GitLab From d325a858a53b5816a60447887f7148eace999e00 Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Mon, 8 Apr 2024 12:13:29 +0200 Subject: [PATCH 610/989] MAINTAINERS: mailmap: update Richard Genoud's email address I'm working now at bootlin, so I'll use my bootlin address for kernel development from now on. Update also the yaml file for atmel-serial accordingly. Signed-off-by: Richard Genoud Reviewed-by: Nicolas Ferre Link: https://lore.kernel.org/r/20240408101329.9448-1-richard.genoud@bootlin.com Signed-off-by: Greg Kroah-Hartman --- .mailmap | 1 + Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml | 2 +- MAINTAINERS | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 8284692f96107..71e28f4e0d4a0 100644 --- a/.mailmap +++ b/.mailmap @@ -524,6 +524,7 @@ Rémi Denis-Courmont Ricardo Ribalda Ricardo Ribalda Ricardo Ribalda Delgado Ricardo Ribalda +Richard Genoud Richard Leitner Richard Leitner Richard Leitner diff --git a/Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml b/Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml index 65cb2e5c5eee0..eb2992a447d79 100644 --- a/Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml +++ b/Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml @@ -8,7 +8,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Atmel Universal Synchronous Asynchronous Receiver/Transmitter (USART) maintainers: - - Richard Genoud + - Richard Genoud properties: compatible: diff --git a/MAINTAINERS b/MAINTAINERS index aea47e04c3a52..0dbdc81e46c60 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14363,7 +14363,7 @@ F: drivers/dma/at_xdmac.c F: include/dt-bindings/dma/at91.h MICROCHIP AT91 SERIAL DRIVER -M: Richard Genoud +M: Richard Genoud S: Maintained F: Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml F: drivers/tty/serial/atmel_serial.c -- GitLab From fbbdc255fbee59b4207a5398fdb4f04590681a79 Mon Sep 17 00:00:00 2001 From: Zhenhua Huang Date: Mon, 8 Apr 2024 21:43:57 -0700 Subject: [PATCH 611/989] fs/proc: remove redundant comments from /proc/bootconfig commit 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to /proc/bootconfig") adds bootloader argument comments into /proc/bootconfig. /proc/bootconfig shows boot_command_line[] multiple times following every xbc key value pair, that's duplicated and not necessary. Remove redundant ones. Output before and after the fix is like: key1 = value1 *bootloader argument comments* key2 = value2 *bootloader argument comments* key3 = value3 *bootloader argument comments* ... key1 = value1 key2 = value2 key3 = value3 *bootloader argument comments* ... Link: https://lore.kernel.org/all/20240409044358.1156477-1-paulmck@kernel.org/ Fixes: 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to /proc/bootconfig") Signed-off-by: Zhenhua Huang Signed-off-by: Paul E. McKenney Cc: Cc: Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- fs/proc/bootconfig.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 902b326e1e560..e5635a6b127b0 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) break; dst += ret; } - if (ret >= 0 && boot_command_line[0]) { - ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", - boot_command_line); - if (ret > 0) - dst += ret; - } + } + if (ret >= 0 && boot_command_line[0]) { + ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", + boot_command_line); + if (ret > 0) + dst += ret; } out: kfree(key); -- GitLab From c722cea208789d9e2660992bcd05fb9fac3adb56 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 8 Apr 2024 21:43:58 -0700 Subject: [PATCH 612/989] fs/proc: Skip bootloader comment if no embedded kernel parameters If the "bootconfig" kernel command-line argument was specified or if the kernel was built with CONFIG_BOOT_CONFIG_FORCE, but if there are no embedded kernel parameter, omit the "# Parameters from bootloader:" comment from the /proc/bootconfig file. This will cause automation to fall back to the /proc/cmdline file, which will be identical to the comment in this no-embedded-kernel-parameters case. Link: https://lore.kernel.org/all/20240409044358.1156477-2-paulmck@kernel.org/ Fixes: 8b8ce6c75430 ("fs/proc: remove redundant comments from /proc/bootconfig") Signed-off-by: Masami Hiramatsu Signed-off-by: Paul E. McKenney Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- fs/proc/bootconfig.c | 2 +- include/linux/bootconfig.h | 1 + init/main.c | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index e5635a6b127b0..87dcaae32ff87 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -63,7 +63,7 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) dst += ret; } } - if (ret >= 0 && boot_command_line[0]) { + if (cmdline_has_extra_options() && ret >= 0 && boot_command_line[0]) { ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", boot_command_line); if (ret > 0) diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index ca73940e26df8..e5ee2c694401e 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -10,6 +10,7 @@ #ifdef __KERNEL__ #include #include +bool __init cmdline_has_extra_options(void); #else /* !__KERNEL__ */ /* * NOTE: This is only for tools/bootconfig, because tools/bootconfig will diff --git a/init/main.c b/init/main.c index 2ca52474d0c30..881f6230ee59e 100644 --- a/init/main.c +++ b/init/main.c @@ -487,6 +487,11 @@ static int __init warn_bootconfig(char *str) early_param("bootconfig", warn_bootconfig); +bool __init cmdline_has_extra_options(void) +{ + return extra_command_line || extra_init_args; +} + /* Change NUL term back to "=", to make "param" the whole string. */ static void __init repair_env_string(char *param, char *val) { -- GitLab From 4370b673ccf240bf7587b0cb8e6726a5ccaf1f17 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Thu, 28 Mar 2024 14:27:56 +0000 Subject: [PATCH 613/989] MIPS: scall: Save thread_info.syscall unconditionally on entry thread_info.syscall is used by syscall_get_nr to supply syscall nr over a thread stack frame. Previously, thread_info.syscall is only saved at syscall_trace_enter when syscall tracing is enabled. However rest of the kernel code do expect syscall_get_nr to be available without syscall tracing. The previous design breaks collect_syscall. Move saving process to syscall entry to fix it. Reported-by: Xi Ruoyao Link: https://github.com/util-linux/util-linux/issues/2867 Signed-off-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/ptrace.h | 2 +- arch/mips/kernel/asm-offsets.c | 1 + arch/mips/kernel/ptrace.c | 15 ++++++--------- arch/mips/kernel/scall32-o32.S | 23 +++++++++++++---------- arch/mips/kernel/scall64-n32.S | 3 ++- arch/mips/kernel/scall64-n64.S | 3 ++- arch/mips/kernel/scall64-o32.S | 33 +++++++++++++++++---------------- 7 files changed, 42 insertions(+), 38 deletions(-) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index d14d0e37ad02d..4a2b40ce39e09 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -159,7 +159,7 @@ extern unsigned long exception_ip(struct pt_regs *regs); #define exception_ip(regs) exception_ip(regs) #define profile_pc(regs) instruction_pointer(regs) -extern asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall); +extern asmlinkage long syscall_trace_enter(struct pt_regs *regs); extern asmlinkage void syscall_trace_leave(struct pt_regs *regs); extern void die(const char *, struct pt_regs *) __noreturn; diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index d1b11f66f748f..cb1045ebab062 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -101,6 +101,7 @@ void output_thread_info_defines(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PRE_COUNT, thread_info, preempt_count); OFFSET(TI_REGS, thread_info, regs); + OFFSET(TI_SYSCALL, thread_info, syscall); DEFINE(_THREAD_SIZE, THREAD_SIZE); DEFINE(_THREAD_MASK, THREAD_MASK); DEFINE(_IRQ_STACK_SIZE, IRQ_STACK_SIZE); diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 59288c13b581b..61503a36067e9 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -1317,16 +1317,13 @@ long arch_ptrace(struct task_struct *child, long request, * Notification of system call entry/exit * - triggered by current->work.syscall_trace */ -asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) +asmlinkage long syscall_trace_enter(struct pt_regs *regs) { user_exit(); - current_thread_info()->syscall = syscall; - if (test_thread_flag(TIF_SYSCALL_TRACE)) { if (ptrace_report_syscall_entry(regs)) return -1; - syscall = current_thread_info()->syscall; } #ifdef CONFIG_SECCOMP @@ -1335,7 +1332,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) struct seccomp_data sd; unsigned long args[6]; - sd.nr = syscall; + sd.nr = current_thread_info()->syscall; sd.arch = syscall_get_arch(current); syscall_get_arguments(current, regs, args); for (i = 0; i < 6; i++) @@ -1345,23 +1342,23 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) ret = __secure_computing(&sd); if (ret == -1) return ret; - syscall = current_thread_info()->syscall; } #endif if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[2]); - audit_syscall_entry(syscall, regs->regs[4], regs->regs[5], + audit_syscall_entry(current_thread_info()->syscall, + regs->regs[4], regs->regs[5], regs->regs[6], regs->regs[7]); /* * Negative syscall numbers are mistaken for rejected syscalls, but * won't have had the return value set appropriately, so we do so now. */ - if (syscall < 0) + if (current_thread_info()->syscall < 0) syscall_set_return_value(current, regs, -ENOSYS, 0); - return syscall; + return current_thread_info()->syscall; } /* diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 18dc9b3450561..2c604717e6308 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -77,6 +77,18 @@ loads_done: PTR_WD load_a7, bad_stack_a7 .previous + /* + * syscall number is in v0 unless we called syscall(__NR_###) + * where the real syscall number is in a0 + */ + subu t2, v0, __NR_O32_Linux + bnez t2, 1f /* __NR_syscall at offset 0 */ + LONG_S a0, TI_SYSCALL($28) # Save a0 as syscall number + b 2f +1: + LONG_S v0, TI_SYSCALL($28) # Save v0 as syscall number +2: + lw t0, TI_FLAGS($28) # syscall tracing enabled? li t1, _TIF_WORK_SYSCALL_ENTRY and t0, t1 @@ -114,16 +126,7 @@ syscall_trace_entry: SAVE_STATIC move a0, sp - /* - * syscall number is in v0 unless we called syscall(__NR_###) - * where the real syscall number is in a0 - */ - move a1, v0 - subu t2, v0, __NR_O32_Linux - bnez t2, 1f /* __NR_syscall at offset 0 */ - lw a1, PT_R4(sp) - -1: jal syscall_trace_enter + jal syscall_trace_enter bltz v0, 1f # seccomp failed? Skip syscall diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 97456b2ca7dc3..97788859238c3 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -44,6 +44,8 @@ NESTED(handle_sysn32, PT_SIZE, sp) sd a3, PT_R26(sp) # save a3 for syscall restarting + LONG_S v0, TI_SYSCALL($28) # Store syscall number + li t1, _TIF_WORK_SYSCALL_ENTRY LONG_L t0, TI_FLAGS($28) # syscall tracing enabled? and t0, t1, t0 @@ -72,7 +74,6 @@ syscall_common: n32_syscall_trace_entry: SAVE_STATIC move a0, sp - move a1, v0 jal syscall_trace_enter bltz v0, 1f # seccomp failed? Skip syscall diff --git a/arch/mips/kernel/scall64-n64.S b/arch/mips/kernel/scall64-n64.S index e6264aa62e457..be11ea5cc67e0 100644 --- a/arch/mips/kernel/scall64-n64.S +++ b/arch/mips/kernel/scall64-n64.S @@ -46,6 +46,8 @@ NESTED(handle_sys64, PT_SIZE, sp) sd a3, PT_R26(sp) # save a3 for syscall restarting + LONG_S v0, TI_SYSCALL($28) # Store syscall number + li t1, _TIF_WORK_SYSCALL_ENTRY LONG_L t0, TI_FLAGS($28) # syscall tracing enabled? and t0, t1, t0 @@ -82,7 +84,6 @@ n64_syscall_exit: syscall_trace_entry: SAVE_STATIC move a0, sp - move a1, v0 jal syscall_trace_enter bltz v0, 1f # seccomp failed? Skip syscall diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index d3c2616cba226..7a5abb73e5312 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -79,6 +79,22 @@ loads_done: PTR_WD load_a7, bad_stack_a7 .previous + /* + * absolute syscall number is in v0 unless we called syscall(__NR_###) + * where the real syscall number is in a0 + * note: NR_syscall is the first O32 syscall but the macro is + * only defined when compiling with -mabi=32 (CONFIG_32BIT) + * therefore __NR_O32_Linux is used (4000) + */ + + subu t2, v0, __NR_O32_Linux + bnez t2, 1f /* __NR_syscall at offset 0 */ + LONG_S a0, TI_SYSCALL($28) # Save a0 as syscall number + b 2f +1: + LONG_S v0, TI_SYSCALL($28) # Save v0 as syscall number +2: + li t1, _TIF_WORK_SYSCALL_ENTRY LONG_L t0, TI_FLAGS($28) # syscall tracing enabled? and t0, t1, t0 @@ -113,22 +129,7 @@ trace_a_syscall: sd a7, PT_R11(sp) # For indirect syscalls move a0, sp - /* - * absolute syscall number is in v0 unless we called syscall(__NR_###) - * where the real syscall number is in a0 - * note: NR_syscall is the first O32 syscall but the macro is - * only defined when compiling with -mabi=32 (CONFIG_32BIT) - * therefore __NR_O32_Linux is used (4000) - */ - .set push - .set reorder - subu t1, v0, __NR_O32_Linux - move a1, v0 - bnez t1, 1f /* __NR_syscall at offset 0 */ - ld a1, PT_R4(sp) /* Arg1 for __NR_syscall case */ - .set pop - -1: jal syscall_trace_enter + jal syscall_trace_enter bltz v0, 1f # seccomp failed? Skip syscall -- GitLab From 9cf7ea2eeb745213dc2a04103e426b960e807940 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 4 Apr 2024 17:59:26 +0300 Subject: [PATCH 614/989] serial: core: Clearing the circular buffer before NULLifying it The circular buffer is NULLified in uart_tty_port_shutdown() under the spin lock. However, the PM or other timer based callbacks may still trigger after this event without knowning that buffer pointer is not valid. Since the serial code is a bit inconsistent in checking the buffer state (some rely on the head-tail positions, some on the buffer pointer), it's better to have both aligned, i.e. buffer pointer to be NULL and head-tail possitions to be the same, meaning it's empty. This will prevent asynchronous calls to dereference NULL pointer as reported recently in 8250 case: BUG: kernel NULL pointer dereference, address: 00000cf5 Workqueue: pm pm_runtime_work EIP: serial8250_tx_chars (drivers/tty/serial/8250/8250_port.c:1809) ... ? serial8250_tx_chars (drivers/tty/serial/8250/8250_port.c:1809) __start_tx (drivers/tty/serial/8250/8250_port.c:1551) serial8250_start_tx (drivers/tty/serial/8250/8250_port.c:1654) serial_port_runtime_suspend (include/linux/serial_core.h:667 drivers/tty/serial/serial_port.c:63) __rpm_callback (drivers/base/power/runtime.c:393) ? serial_port_remove (drivers/tty/serial/serial_port.c:50) rpm_suspend (drivers/base/power/runtime.c:447) The proposed change will prevent ->start_tx() to be called during suspend on shut down port. Fixes: 43066e32227e ("serial: port: Don't suspend if the port is still busy") Cc: stable Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202404031607.2e92eebe-lkp@intel.com Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240404150034.41648-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 25a83820927a5..2247efe972503 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -1788,6 +1788,7 @@ static void uart_tty_port_shutdown(struct tty_port *port) * Free the transmit buffer. */ uart_port_lock_irq(uport); + uart_circ_clear(&state->xmit); buf = state->xmit.buf; state->xmit.buf = NULL; uart_port_unlock_irq(uport); -- GitLab From 011d79ef1cfad701c2d8e7e80d8c77523af9c771 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 8 Apr 2024 09:32:34 +0200 Subject: [PATCH 615/989] MAINTAINERS: Change Krzysztof Kozlowski's email address Switch Krzysztof Kozlowski's to @kernel.org account. Link: https://lore.kernel.org/r/20240329174823.74918-1-krzysztof.kozlowski@linaro.org Signed-off-by: Krzysztof Kozlowski Signed-off-by: Arnd Bergmann --- MAINTAINERS | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7c121493f43d0..cb0049030acd2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2708,7 +2708,7 @@ F: sound/soc/rockchip/ N: rockchip ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-samsung-soc@vger.kernel.org @@ -5557,7 +5557,7 @@ F: drivers/cpuidle/cpuidle-big_little.c CPUIDLE DRIVER - ARM EXYNOS M: Daniel Lezcano M: Kukjin Kim -R: Krzysztof Kozlowski +R: Krzysztof Kozlowski L: linux-pm@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -9000,7 +9000,7 @@ F: drivers/i2c/muxes/i2c-mux-gpio.c F: include/linux/platform_data/i2c-mux-gpio.h GENERIC GPIO RESET DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski S: Maintained F: drivers/reset/reset-gpio.c @@ -13295,7 +13295,7 @@ F: drivers/iio/adc/max11205.c MAXIM MAX17040 FAMILY FUEL GAUGE DRIVERS R: Iskren Chernev -R: Krzysztof Kozlowski +R: Krzysztof Kozlowski R: Marek Szyprowski R: Matheus Castello L: linux-pm@vger.kernel.org @@ -13305,7 +13305,7 @@ F: drivers/power/supply/max17040_battery.c MAXIM MAX17042 FAMILY FUEL GAUGE DRIVERS R: Hans de Goede -R: Krzysztof Kozlowski +R: Krzysztof Kozlowski R: Marek Szyprowski R: Sebastian Krzyszkowiak R: Purism Kernel Team @@ -13363,7 +13363,7 @@ F: Documentation/devicetree/bindings/power/supply/maxim,max77976.yaml F: drivers/power/supply/max77976_charger.c MAXIM MUIC CHARGER DRIVERS FOR EXYNOS BASED BOARDS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-pm@vger.kernel.org S: Maintained B: mailto:linux-samsung-soc@vger.kernel.org @@ -13374,7 +13374,7 @@ F: drivers/power/supply/max77693_charger.c MAXIM PMIC AND MUIC DRIVERS FOR EXYNOS BASED BOARDS M: Chanwoo Choi -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained B: mailto:linux-samsung-soc@vger.kernel.org @@ -14156,7 +14156,7 @@ F: mm/mm_init.c F: tools/testing/memblock/ MEMORY CONTROLLER DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained B: mailto:krzysztof.kozlowski@linaro.org @@ -15537,7 +15537,7 @@ F: include/uapi/linux/nexthop.h F: net/ipv4/nexthop.c NFC SUBSYSTEM -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: netdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/nfc/ @@ -15914,7 +15914,7 @@ F: Documentation/devicetree/bindings/regulator/nxp,pf8x00-regulator.yaml F: drivers/regulator/pf8x00-regulator.c NXP PTN5150A CC LOGIC AND EXTCON DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/extcon/extcon-ptn5150.yaml @@ -16525,7 +16525,7 @@ K: of_overlay_remove OPEN FIRMWARE AND FLATTENED DEVICE TREE BINDINGS M: Rob Herring -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Conor Dooley L: devicetree@vger.kernel.org S: Maintained @@ -17483,7 +17483,7 @@ F: Documentation/devicetree/bindings/pinctrl/renesas,* F: drivers/pinctrl/renesas/ PIN CONTROLLER - SAMSUNG -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Sylwester Nawrocki R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@ -19451,7 +19451,7 @@ F: Documentation/devicetree/bindings/sound/samsung* F: sound/soc/samsung/ SAMSUNG EXYNOS PSEUDO RANDOM NUMBER GENERATOR (RNG) DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-crypto@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -19486,7 +19486,7 @@ S: Maintained F: drivers/platform/x86/samsung-laptop.c SAMSUNG MULTIFUNCTION PMIC DEVICE DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -19512,7 +19512,7 @@ F: drivers/media/platform/samsung/s3c-camif/ F: include/media/drv-intf/s3c_camif.h SAMSUNG S3FWRN5 NFC DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski S: Maintained F: Documentation/devicetree/bindings/net/nfc/samsung,s3fwrn5.yaml F: drivers/nfc/s3fwrn5 @@ -19533,7 +19533,7 @@ S: Supported F: drivers/media/i2c/s5k5baf.c SAMSUNG S5P Security SubSystem (SSS) DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Vladimir Zapolskiy L: linux-crypto@vger.kernel.org L: linux-samsung-soc@vger.kernel.org @@ -19555,7 +19555,7 @@ F: Documentation/devicetree/bindings/media/samsung,fimc.yaml F: drivers/media/platform/samsung/exynos4-is/ SAMSUNG SOC CLOCK DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Sylwester Nawrocki M: Chanwoo Choi R: Alim Akhtar @@ -19587,7 +19587,7 @@ F: drivers/net/ethernet/samsung/sxgbe/ SAMSUNG THERMAL DRIVER M: Bartlomiej Zolnierkiewicz -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-pm@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -23782,7 +23782,7 @@ S: Orphan F: drivers/mmc/host/vub300.c W1 DALLAS'S 1-WIRE BUS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski S: Maintained F: Documentation/devicetree/bindings/w1/ F: Documentation/w1/ -- GitLab From fbdd90334a6205e8a99d0bc2dfc738ee438f00bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 5 Apr 2024 09:20:41 +0200 Subject: [PATCH 616/989] MAINTAINERS: Drop Li Yang as their email address stopped working MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When sending a patch to (among others) Li Yang the nxp MTA replied that the address doesn't exist and so the mail couldn't be delivered. The error code was 550, so at least technically that's not a temporal issue. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240405072042.697182-2-u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3c78979fb8197..da6b81a9e3365 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2191,7 +2191,6 @@ N: mxs ARM/FREESCALE LAYERSCAPE ARM ARCHITECTURE M: Shawn Guo -M: Li Yang L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git @@ -8523,7 +8522,6 @@ S: Maintained F: drivers/video/fbdev/fsl-diu-fb.* FREESCALE DMA DRIVER -M: Li Yang M: Zhang Wei L: linuxppc-dev@lists.ozlabs.org S: Maintained @@ -8688,10 +8686,9 @@ F: drivers/soc/fsl/qe/tsa.h F: include/dt-bindings/soc/cpm1-fsl,tsa.h FREESCALE QUICC ENGINE UCC ETHERNET DRIVER -M: Li Yang L: netdev@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org -S: Maintained +S: Orphan F: drivers/net/ethernet/freescale/ucc_geth* FREESCALE QUICC ENGINE UCC HDLC DRIVER @@ -8708,10 +8705,9 @@ S: Maintained F: drivers/tty/serial/ucc_uart.c FREESCALE SOC DRIVERS -M: Li Yang L: linuxppc-dev@lists.ozlabs.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -S: Maintained +S: Orphan F: Documentation/devicetree/bindings/misc/fsl,dpaa2-console.yaml F: Documentation/devicetree/bindings/soc/fsl/ F: drivers/soc/fsl/ @@ -8745,10 +8741,9 @@ F: Documentation/devicetree/bindings/sound/fsl,qmc-audio.yaml F: sound/soc/fsl/fsl_qmc_audio.c FREESCALE USB PERIPHERAL DRIVERS -M: Li Yang L: linux-usb@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org -S: Maintained +S: Orphan F: drivers/usb/gadget/udc/fsl* FREESCALE USB PHY DRIVER -- GitLab From 3461e02066758b78a0731eb71faecfb1eccd0e6c Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 9 Apr 2024 13:36:46 +0200 Subject: [PATCH 617/989] usb: typec: mux: it5205: Fix ChipID value typo The ChipID bytes are read in inverse order: invert the ChipID value defined as IT5205FN_CHIP_ID and used for validating the same. Fixes: 41fe9ea1696c ("usb: typec: mux: Add ITE IT5205 Alternate Mode Passive MUX driver") Signed-off-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20240409113646.305105-1-angelogioacchino.delregno@collabora.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/mux/it5205.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/mux/it5205.c b/drivers/usb/typec/mux/it5205.c index 5535932e42cde..4357cc67a8672 100644 --- a/drivers/usb/typec/mux/it5205.c +++ b/drivers/usb/typec/mux/it5205.c @@ -22,7 +22,7 @@ #include #define IT5205_REG_CHIP_ID(x) (0x4 + (x)) -#define IT5205FN_CHIP_ID 0x35323035 /* "5205" */ +#define IT5205FN_CHIP_ID 0x35303235 /* "5025" -> "5205" */ /* MUX power down register */ #define IT5205_REG_MUXPDR 0x10 -- GitLab From eed04fa96c48790c1cce73c8a248e9d460b088f8 Mon Sep 17 00:00:00 2001 From: Minas Harutyunyan Date: Tue, 9 Apr 2024 12:27:54 +0000 Subject: [PATCH 618/989] usb: dwc2: host: Fix dereference issue in DDMA completion flow. Fixed variable dereference issue in DDMA completion flow. Fixes: b258e4268850 ("usb: dwc2: host: Fix ISOC flow in DDMA mode") CC: stable@vger.kernel.org Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-usb/2024040834-ethically-rumble-701f@gregkh/T/#m4c4b83bef0ebb4b67fe2e0a7d6466cbb6f416e39 Signed-off-by: Minas Harutyunyan Link: https://lore.kernel.org/r/cc826d3ef53c934d8e6d98870f17f3cdc3d2755d.1712665387.git.Minas.Harutyunyan@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/hcd_ddma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc2/hcd_ddma.c b/drivers/usb/dwc2/hcd_ddma.c index 79582b102c7ed..994a78ad084b1 100644 --- a/drivers/usb/dwc2/hcd_ddma.c +++ b/drivers/usb/dwc2/hcd_ddma.c @@ -867,13 +867,15 @@ static int dwc2_cmpl_host_isoc_dma_desc(struct dwc2_hsotg *hsotg, struct dwc2_dma_desc *dma_desc; struct dwc2_hcd_iso_packet_desc *frame_desc; u16 frame_desc_idx; - struct urb *usb_urb = qtd->urb->priv; + struct urb *usb_urb; u16 remain = 0; int rc = 0; if (!qtd->urb) return -EINVAL; + usb_urb = qtd->urb->priv; + dma_sync_single_for_cpu(hsotg->dev, qh->desc_list_dma + (idx * sizeof(struct dwc2_dma_desc)), sizeof(struct dwc2_dma_desc), -- GitLab From 6d029c25b71f2de2838a6f093ce0fa0e69336154 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 9 Apr 2024 15:38:03 +0200 Subject: [PATCH 619/989] selftests/timers/posix_timers: Reimplement check_timer_distribution() check_timer_distribution() runs ten threads in a busy loop and tries to test that the kernel distributes a process posix CPU timer signal to every thread over time. There is not guarantee that this is true even after commit bcb7ee79029d ("posix-timers: Prefer delivery of signals to the current thread") because that commit only avoids waking up the sleeping process leader thread, but that has nothing to do with the actual signal delivery. As the signal is process wide the first thread which observes sigpending and wins the race to lock sighand will deliver the signal. Testing shows that this hangs on a regular base because some threads never win the race. The comment "This primarily tests that the kernel does not favour any one." is wrong. The kernel does favour a thread which hits the timer interrupt when CLOCK_PROCESS_CPUTIME_ID expires. Rewrite the test so it only checks that the group leader sleeping in join() never receives SIGALRM and the thread which burns CPU cycles receives all signals. In older kernels which do not have commit bcb7ee79029d ("posix-timers: Prefer delivery of signals to the current thread") the test-case fails immediately, the very 1st tick wakes the leader up. Otherwise it quickly succeeds after 100 ticks. CI testing wants to use newer selftest versions on stable kernels. In this case the test is guaranteed to fail. So check in the failure case whether the kernel version is less than v6.3 and skip the test result in that case. [ tglx: Massaged change log, renamed the version check helper ] Fixes: e797203fb3ba ("selftests/timers/posix_timers: Test delivery of signals across threads") Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240409133802.GD29396@redhat.com --- tools/testing/selftests/kselftest.h | 13 +++ tools/testing/selftests/timers/posix_timers.c | 103 ++++++++---------- 2 files changed, 60 insertions(+), 56 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 541bf192e30e6..973b18e156b28 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -51,6 +51,7 @@ #include #include #include +#include #endif #ifndef ARRAY_SIZE @@ -388,4 +389,16 @@ static inline __printf(1, 2) int ksft_exit_skip(const char *msg, ...) exit(KSFT_SKIP); } +static inline int ksft_min_kernel_version(unsigned int min_major, + unsigned int min_minor) +{ + unsigned int major, minor; + struct utsname info; + + if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2) + ksft_exit_fail_msg("Can't parse kernel version\n"); + + return major > min_major || (major == min_major && minor >= min_minor); +} + #endif /* __KSELFTEST_H */ diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index d49dd3ffd0d96..d86a0e00711e4 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -184,80 +184,71 @@ static int check_timer_create(int which) return 0; } -int remain; -__thread int got_signal; +static pthread_t ctd_thread; +static volatile int ctd_count, ctd_failed; -static void *distribution_thread(void *arg) +static void ctd_sighandler(int sig) { - while (__atomic_load_n(&remain, __ATOMIC_RELAXED)); - return NULL; + if (pthread_self() != ctd_thread) + ctd_failed = 1; + ctd_count--; } -static void distribution_handler(int nr) +static void *ctd_thread_func(void *arg) { - if (!__atomic_exchange_n(&got_signal, 1, __ATOMIC_RELAXED)) - __atomic_fetch_sub(&remain, 1, __ATOMIC_RELAXED); -} - -/* - * Test that all running threads _eventually_ receive CLOCK_PROCESS_CPUTIME_ID - * timer signals. This primarily tests that the kernel does not favour any one. - */ -static int check_timer_distribution(void) -{ - int err, i; - timer_t id; - const int nthreads = 10; - pthread_t threads[nthreads]; struct itimerspec val = { .it_value.tv_sec = 0, .it_value.tv_nsec = 1000 * 1000, .it_interval.tv_sec = 0, .it_interval.tv_nsec = 1000 * 1000, }; + timer_t id; - remain = nthreads + 1; /* worker threads + this thread */ - signal(SIGALRM, distribution_handler); - err = timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); - if (err < 0) { - ksft_perror("Can't create timer"); - return -1; - } - err = timer_settime(id, 0, &val, NULL); - if (err < 0) { - ksft_perror("Can't set timer"); - return -1; - } + /* 1/10 seconds to ensure the leader sleeps */ + usleep(10000); - for (i = 0; i < nthreads; i++) { - err = pthread_create(&threads[i], NULL, distribution_thread, - NULL); - if (err) { - ksft_print_msg("Can't create thread: %s (%d)\n", - strerror(errno), errno); - return -1; - } - } + ctd_count = 100; + if (timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id)) + return "Can't create timer\n"; + if (timer_settime(id, 0, &val, NULL)) + return "Can't set timer\n"; - /* Wait for all threads to receive the signal. */ - while (__atomic_load_n(&remain, __ATOMIC_RELAXED)); + while (ctd_count > 0 && !ctd_failed) + ; - for (i = 0; i < nthreads; i++) { - err = pthread_join(threads[i], NULL); - if (err) { - ksft_print_msg("Can't join thread: %s (%d)\n", - strerror(errno), errno); - return -1; - } - } + if (timer_delete(id)) + return "Can't delete timer\n"; - if (timer_delete(id)) { - ksft_perror("Can't delete timer"); - return -1; - } + return NULL; +} + +/* + * Test that only the running thread receives the timer signal. + */ +static int check_timer_distribution(void) +{ + const char *errmsg; - ksft_test_result_pass("check_timer_distribution\n"); + signal(SIGALRM, ctd_sighandler); + + errmsg = "Can't create thread\n"; + if (pthread_create(&ctd_thread, NULL, ctd_thread_func, NULL)) + goto err; + + errmsg = "Can't join thread\n"; + if (pthread_join(ctd_thread, (void **)&errmsg) || errmsg) + goto err; + + if (!ctd_failed) + ksft_test_result_pass("check signal distribution\n"); + else if (ksft_min_kernel_version(6, 3)) + ksft_test_result_fail("check signal distribution\n"); + else + ksft_test_result_skip("check signal distribution (old kernel)\n"); return 0; +err: + ksft_print_msg(errmsg); + return -1; } int main(int argc, char **argv) -- GitLab From d7a62d0a9a17e97ce2d4d40431094e09956a0568 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 9 Apr 2024 17:46:23 +0200 Subject: [PATCH 620/989] compiler.h: Add missing quote in macro comment Add a missing doublequote in the __is_constexpr() macro comment. Signed-off-by: Thorsten Blum Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c00cc6c0878a1..8c252e073bd81 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -268,7 +268,7 @@ static inline void *offset_to_ptr(const int *off) * - When one operand is a null pointer constant (i.e. when x is an integer * constant expression) and the other is an object pointer (i.e. our * third operand), the conditional operator returns the type of the - * object pointer operand (i.e. "int *). Here, within the sizeof(), we + * object pointer operand (i.e. "int *"). Here, within the sizeof(), we * would then get: * sizeof(*((int *)(...)) == sizeof(int) == 4 * - When one operand is a void pointer (i.e. when x is not an integer -- GitLab From 4710e4fc3e2afa6c092db2c65eef365b0836d6db Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 9 Mar 2024 18:15:45 +0100 Subject: [PATCH 621/989] KVM: SVM: Remove a useless zeroing of allocated memory Remove KVM's unnecessary zeroing of memory when allocating the pages array in sev_pin_memory() via __vmalloc(), as the array is only used to hold kernel pointers. The kmalloc() path for "small" regions doesn't zero the array, and if KVM leaks state and/or accesses uninitialized data, then the kernel has bigger problems. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/c7619a3d3cbb36463531a7c73ccbde9db587986c.1710004509.git.christophe.jaillet@wanadoo.fr [sean: massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 61a7531d41b01..759581bb2128d 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -434,7 +434,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT); else pages = kmalloc(size, GFP_KERNEL_ACCOUNT); -- GitLab From 19597a71a0c8603f8c8599686ac9b0ff4b846716 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:26 -0800 Subject: [PATCH 622/989] KVM: SVM: Create a stack frame in __svm_vcpu_run() for unwinding Unconditionally create a stack frame in __svm_vcpu_run() to play nice with unwinding via frame pointers, at least until the point where RBP is loaded with the guest's value. Don't bother conditioning the code on CONFIG_FRAME_POINTER=y, as RBP needs to be saved and restored anyways (due to it being clobbered with the guest's value); omitting the "MOV RSP, RBP" is not worth the extra #ifdef. Creating a stack frame will allow removing the OBJECT_FILES_NON_STANDARD tag from vmenter.S once __svm_sev_es_vcpu_run() is fixed to not stomp all over RBP for no reason. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 187018c424bfb..82637abf0b7ea 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -99,6 +99,7 @@ */ SYM_FUNC_START(__svm_vcpu_run) push %_ASM_BP + mov %_ASM_SP, %_ASM_BP #ifdef CONFIG_X86_64 push %r15 push %r14 -- GitLab From 7774c8f32e99b1f314c0df7c204a897792b4f378 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:27 -0800 Subject: [PATCH 623/989] KVM: SVM: Wrap __svm_sev_es_vcpu_run() with #ifdef CONFIG_KVM_AMD_SEV Compile (and link) __svm_sev_es_vcpu_run() if and only if SEV support is actually enabled. This will allow dropping non-existent 32-bit "support" from __svm_sev_es_vcpu_run() without causing undue confusion. Intentionally don't provide a stub (but keep the declaration), as any sane compiler, even with things like KASAN enabled, should eliminate the call to __svm_sev_es_vcpu_run() since sev_es_guest() unconditionally returns "false" if CONFIG_KVM_AMD_SEV=n. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 82637abf0b7ea..6f57d496867a1 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -291,6 +291,7 @@ SYM_FUNC_START(__svm_vcpu_run) SYM_FUNC_END(__svm_vcpu_run) +#ifdef CONFIG_KVM_AMD_SEV /** * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode * @svm: struct vcpu_svm * @@ -389,3 +390,4 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) _ASM_EXTABLE(1b, 3b) SYM_FUNC_END(__svm_sev_es_vcpu_run) +#endif /* CONFIG_KVM_AMD_SEV */ -- GitLab From 331282fdb15edaf1beb1d27a64d3f65a34d7394d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:28 -0800 Subject: [PATCH 624/989] KVM: SVM: Drop 32-bit "support" from __svm_sev_es_vcpu_run() Drop 32-bit "support" from __svm_sev_es_vcpu_run(), as SEV/SEV-ES firmly 64-bit only. The "support" was purely the result of bad copy+paste from __svm_vcpu_run(), which in turn was slightly less bad copy+paste from __vmx_vcpu_run(). Opportunistically convert to unadulterated register accesses so that it's easier (but still not easy) to follow which registers hold what arguments, and when. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 44 +++++++++++--------------------------- 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 6f57d496867a1..c057866a459b6 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -298,17 +298,12 @@ SYM_FUNC_END(__svm_vcpu_run) * @spec_ctrl_intercepted: bool */ SYM_FUNC_START(__svm_sev_es_vcpu_run) - push %_ASM_BP -#ifdef CONFIG_X86_64 + push %rbp push %r15 push %r14 push %r13 push %r12 -#else - push %edi - push %esi -#endif - push %_ASM_BX + push %rbx /* * Save variables needed after vmexit on the stack, in inverse @@ -316,39 +311,31 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) */ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ - push %_ASM_ARG2 + push %rsi /* Save @svm. */ - push %_ASM_ARG1 - -.ifnc _ASM_ARG1, _ASM_DI - /* - * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX - * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL. - */ - mov %_ASM_ARG1, %_ASM_DI -.endif + push %rdi /* Clobbers RAX, RCX, RDX. */ RESTORE_GUEST_SPEC_CTRL /* Get svm->current_vmcb->pa into RAX. */ - mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX - mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX + mov SVM_current_vmcb(%rdi), %rax + mov KVM_VMCB_pa(%rax), %rax /* Enter guest mode */ sti -1: vmrun %_ASM_AX +1: vmrun %rax 2: cli /* Pop @svm to RDI, guest registers have been saved already. */ - pop %_ASM_DI + pop %rdi #ifdef CONFIG_MITIGATION_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif /* Clobbers RAX, RCX, RDX. */ @@ -364,26 +351,21 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) UNTRAIN_RET_VM /* "Pop" @spec_ctrl_intercepted. */ - pop %_ASM_BX + pop %rbx - pop %_ASM_BX + pop %rbx -#ifdef CONFIG_X86_64 pop %r12 pop %r13 pop %r14 pop %r15 -#else - pop %esi - pop %edi -#endif - pop %_ASM_BP + pop %rbp RET RESTORE_GUEST_SPEC_CTRL_BODY RESTORE_HOST_SPEC_CTRL_BODY -3: cmpb $0, _ASM_RIP(kvm_rebooting) +3: cmpb $0, kvm_rebooting(%rip) jne 2b ud2 -- GitLab From 87e8e360a05fd29465691aeac179bcf585600c59 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:29 -0800 Subject: [PATCH 625/989] KVM: SVM: Clobber RAX instead of RBX when discarding spec_ctrl_intercepted POP @spec_ctrl_intercepted into RAX instead of RBX when discarding it from the stack so that __svm_sev_es_vcpu_run() doesn't modify any non-volatile registers. __svm_sev_es_vcpu_run() doesn't return a value, and RAX is already are clobbered multiple times in the #VMEXIT path. This will allowing using the host save area to save/restore non-volatile registers in __svm_sev_es_vcpu_run(). Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index c057866a459b6..db94fb6f610ae 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -350,8 +350,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) */ UNTRAIN_RET_VM - /* "Pop" @spec_ctrl_intercepted. */ - pop %rbx + /* "Pop" and discard @spec_ctrl_intercepted. */ + pop %rax pop %rbx -- GitLab From c92be2fd8edf7b300a758c185fe032fd0257b886 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:30 -0800 Subject: [PATCH 626/989] KVM: SVM: Save/restore non-volatile GPRs in SEV-ES VMRUN via host save area Use the host save area to save/restore non-volatile (callee-saved) registers in __svm_sev_es_vcpu_run() to take advantage of hardware loading all registers from the save area on #VMEXIT. KVM still needs to save the registers it wants restored, but the loads are handled automatically by hardware. Aside from less assembly code, letting hardware do the restoration means stack frames are preserved for the entirety of __svm_sev_es_vcpu_run(). Opportunistically add a comment to call out why @svm needs to be saved across VMRUN->#VMEXIT, as it's not easy to decipher that from the macro hell. Cc: Tom Lendacky Cc: Michael Roth Cc: Alexey Kardashevskiy Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 17 +++++++++------- arch/x86/kvm/svm/svm.h | 3 ++- arch/x86/kvm/svm/vmenter.S | 41 +++++++++++++++++++++----------------- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d1a9f99516358..9aaf83c8d57df 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1503,6 +1503,11 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } +static struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) +{ + return page_address(sd->save_area) + 0x400; +} + static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1519,12 +1524,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * or subsequent vmload of host save area. */ vmsave(sd->save_area_pa); - if (sev_es_guest(vcpu->kvm)) { - struct sev_es_save_area *hostsa; - hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); - - sev_es_prepare_switch_to_guest(hostsa); - } + if (sev_es_guest(vcpu->kvm)) + sev_es_prepare_switch_to_guest(sev_es_host_save_area(sd)); if (tsc_scaling) __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); @@ -4101,6 +4102,7 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted) { + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); guest_state_enter_irqoff(); @@ -4108,7 +4110,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in amd_clear_divider(); if (sev_es_guest(vcpu->kvm)) - __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted); + __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted, + sev_es_host_save_area(sd)); else __svm_vcpu_run(svm, spec_ctrl_intercepted); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7f1fbd874c458..33878efdebc82 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -698,7 +698,8 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); /* vmenter.S */ -void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); +void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted, + struct sev_es_save_area *hostsa); void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); #define DEFINE_KVM_GHCB_ACCESSORS(field) \ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index db94fb6f610ae..7f627d535afc5 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -292,23 +292,35 @@ SYM_FUNC_START(__svm_vcpu_run) SYM_FUNC_END(__svm_vcpu_run) #ifdef CONFIG_KVM_AMD_SEV + + +#ifdef CONFIG_X86_64 +#define SEV_ES_GPRS_BASE 0x300 +#define SEV_ES_RBX (SEV_ES_GPRS_BASE + __VCPU_REGS_RBX * WORD_SIZE) +#define SEV_ES_RBP (SEV_ES_GPRS_BASE + __VCPU_REGS_RBP * WORD_SIZE) +#define SEV_ES_R12 (SEV_ES_GPRS_BASE + __VCPU_REGS_R12 * WORD_SIZE) +#define SEV_ES_R13 (SEV_ES_GPRS_BASE + __VCPU_REGS_R13 * WORD_SIZE) +#define SEV_ES_R14 (SEV_ES_GPRS_BASE + __VCPU_REGS_R14 * WORD_SIZE) +#define SEV_ES_R15 (SEV_ES_GPRS_BASE + __VCPU_REGS_R15 * WORD_SIZE) +#endif + /** * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode * @svm: struct vcpu_svm * * @spec_ctrl_intercepted: bool */ SYM_FUNC_START(__svm_sev_es_vcpu_run) - push %rbp - push %r15 - push %r14 - push %r13 - push %r12 - push %rbx - /* - * Save variables needed after vmexit on the stack, in inverse - * order compared to when they are needed. + * Save non-volatile (callee-saved) registers to the host save area. + * Except for RAX and RSP, all GPRs are restored on #VMEXIT, but not + * saved on VMRUN. */ + mov %rbp, SEV_ES_RBP (%rdx) + mov %r15, SEV_ES_R15 (%rdx) + mov %r14, SEV_ES_R14 (%rdx) + mov %r13, SEV_ES_R13 (%rdx) + mov %r12, SEV_ES_R12 (%rdx) + mov %rbx, SEV_ES_RBX (%rdx) /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ push %rsi @@ -316,7 +328,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) /* Save @svm. */ push %rdi - /* Clobbers RAX, RCX, RDX. */ + /* Clobbers RAX, RCX, RDX (@hostsa). */ RESTORE_GUEST_SPEC_CTRL /* Get svm->current_vmcb->pa into RAX. */ @@ -338,7 +350,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif - /* Clobbers RAX, RCX, RDX. */ + /* Clobbers RAX, RCX, RDX, consumes RDI (@svm). */ RESTORE_HOST_SPEC_CTRL /* @@ -353,13 +365,6 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) /* "Pop" and discard @spec_ctrl_intercepted. */ pop %rax - pop %rbx - - pop %r12 - pop %r13 - pop %r14 - pop %r15 - pop %rbp RET RESTORE_GUEST_SPEC_CTRL_BODY -- GitLab From adac42bf42c1608f23938c03e3ca53fa6c87f337 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:31 -0800 Subject: [PATCH 627/989] KVM: SVM: Save/restore args across SEV-ES VMRUN via host save area Use the host save area to preserve volatile registers that are used in __svm_sev_es_vcpu_run() to access function parameters after #VMEXIT. Like saving/restoring non-volatile registers, there's no reason not to take advantage of hardware restoring registers on #VMEXIT, as doing so shaves a few instructions and the save area is going to be accessed no matter what. Converting all register save/restore code to use the host save area also make it easier to follow the SEV-ES VMRUN flow in its entirety, as opposed to having a mix of stack-based versus host save area save/restore. Add a parameter to RESTORE_HOST_SPEC_CTRL_BODY so that the SEV-ES path doesn't need to write @spec_ctrl_intercepted to memory just to play nice with the common macro. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 7f627d535afc5..d1613395dd393 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -67,7 +67,7 @@ "", X86_FEATURE_V_SPEC_CTRL 901: .endm -.macro RESTORE_HOST_SPEC_CTRL_BODY +.macro RESTORE_HOST_SPEC_CTRL_BODY spec_ctrl_intercepted:req 900: /* Same for after vmexit. */ mov $MSR_IA32_SPEC_CTRL, %ecx @@ -76,7 +76,7 @@ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL, * if it was not intercepted during guest execution. */ - cmpb $0, (%_ASM_SP) + cmpb $0, \spec_ctrl_intercepted jnz 998f rdmsr movl %eax, SVM_spec_ctrl(%_ASM_DI) @@ -269,7 +269,7 @@ SYM_FUNC_START(__svm_vcpu_run) RET RESTORE_GUEST_SPEC_CTRL_BODY - RESTORE_HOST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY (%_ASM_SP) 10: cmpb $0, _ASM_RIP(kvm_rebooting) jne 2b @@ -298,6 +298,8 @@ SYM_FUNC_END(__svm_vcpu_run) #define SEV_ES_GPRS_BASE 0x300 #define SEV_ES_RBX (SEV_ES_GPRS_BASE + __VCPU_REGS_RBX * WORD_SIZE) #define SEV_ES_RBP (SEV_ES_GPRS_BASE + __VCPU_REGS_RBP * WORD_SIZE) +#define SEV_ES_RSI (SEV_ES_GPRS_BASE + __VCPU_REGS_RSI * WORD_SIZE) +#define SEV_ES_RDI (SEV_ES_GPRS_BASE + __VCPU_REGS_RDI * WORD_SIZE) #define SEV_ES_R12 (SEV_ES_GPRS_BASE + __VCPU_REGS_R12 * WORD_SIZE) #define SEV_ES_R13 (SEV_ES_GPRS_BASE + __VCPU_REGS_R13 * WORD_SIZE) #define SEV_ES_R14 (SEV_ES_GPRS_BASE + __VCPU_REGS_R14 * WORD_SIZE) @@ -322,11 +324,12 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov %r12, SEV_ES_R12 (%rdx) mov %rbx, SEV_ES_RBX (%rdx) - /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ - push %rsi - - /* Save @svm. */ - push %rdi + /* + * Save volatile registers that hold arguments that are needed after + * #VMEXIT (RDI=@svm and RSI=@spec_ctrl_intercepted). + */ + mov %rdi, SEV_ES_RDI (%rdx) + mov %rsi, SEV_ES_RSI (%rdx) /* Clobbers RAX, RCX, RDX (@hostsa). */ RESTORE_GUEST_SPEC_CTRL @@ -342,15 +345,12 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) 2: cli - /* Pop @svm to RDI, guest registers have been saved already. */ - pop %rdi - #ifdef CONFIG_MITIGATION_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif - /* Clobbers RAX, RCX, RDX, consumes RDI (@svm). */ + /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */ RESTORE_HOST_SPEC_CTRL /* @@ -362,13 +362,10 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) */ UNTRAIN_RET_VM - /* "Pop" and discard @spec_ctrl_intercepted. */ - pop %rax - RET RESTORE_GUEST_SPEC_CTRL_BODY - RESTORE_HOST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY %sil 3: cmpb $0, kvm_rebooting(%rip) jne 2b -- GitLab From 4367a75887ec8d68932cd84ea9cffe24d7a55fa0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:32 -0800 Subject: [PATCH 628/989] KVM: SVM: Create a stack frame in __svm_sev_es_vcpu_run() Now that KVM uses the host save area to context switch RBP, i.e. preserves RBP for the entirety of __svm_sev_es_vcpu_run(), create a stack frame using the standared FRAME_{BEGIN,END} macros. Note, __svm_sev_es_vcpu_run() is subtly not a leaf function as it can call into ibpb_feature() via UNTRAIN_RET_VM. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index d1613395dd393..a0c8eb37d3e1c 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include "kvm-asm-offsets.h" @@ -312,6 +313,8 @@ SYM_FUNC_END(__svm_vcpu_run) * @spec_ctrl_intercepted: bool */ SYM_FUNC_START(__svm_sev_es_vcpu_run) + FRAME_BEGIN + /* * Save non-volatile (callee-saved) registers to the host save area. * Except for RAX and RSP, all GPRs are restored on #VMEXIT, but not @@ -362,6 +365,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) */ UNTRAIN_RET_VM + FRAME_END RET RESTORE_GUEST_SPEC_CTRL_BODY -- GitLab From 27ca867042affef7eba692d3bac7b51ee85e36ad Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:33 -0800 Subject: [PATCH 629/989] KVM: x86: Stop compiling vmenter.S with OBJECT_FILES_NON_STANDARD Stop compiling vmenter.S with OBJECT_FILES_NON_STANDARD to skip objtool's stack validation now that __svm_vcpu_run() and __svm_sev_es_vcpu_run() create stack frames (though the former's effectiveness is dubious). Note, due to a quirk in how OBJECT_FILES_NON_STANDARD was handled by the build system prior to commit bf48d9b756b9 ("kbuild: change tool coverage variables to take the path relative to $(obj)"), vmx/vmenter.S got lumped in with svm/vmenter.S. __vmx_vcpu_run() already plays nice with frame pointers, i.e. it was collateral damage when commit 7f4b5cde2409 ("kvm: Disable objtool frame pointer checking for vmenter.S") added the OBJECT_FILES_NON_STANDARD hack-a-fix. Link: https://lore.kernel.org/all/20240217055504.2059803-1-masahiroy@kernel.org Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a88bb14266b69..addc44fc7187d 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -3,11 +3,6 @@ ccflags-y += -I $(srctree)/arch/x86/kvm ccflags-$(CONFIG_KVM_WERROR) += -Werror -ifeq ($(CONFIG_FRAME_POINTER),y) -OBJECT_FILES_NON_STANDARD_vmx/vmenter.o := y -OBJECT_FILES_NON_STANDARD_svm/vmenter.o := y -endif - include $(srctree)/virt/kvm/Makefile.kvm kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ -- GitLab From 4c08f01934ab67d1d283d5cbaa52b923abcfe4cd Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Sun, 7 Apr 2024 22:28:02 -0400 Subject: [PATCH 630/989] drm/vmwgfx: Enable DMA mappings with SEV Enable DMA mappings in vmwgfx after TTM has been fixed in commit 3bf3710e3718 ("drm/ttm: Add a generic TTM memcpy move for page-based iomem") This enables full guest-backed memory support and in particular allows usage of screen targets as the presentation mechanism. Signed-off-by: Zack Rusin Reported-by: Ye Li Tested-by: Ye Li Fixes: 3b0d6458c705 ("drm/vmwgfx: Refuse DMA operation when SEV encryption is active") Cc: Broadcom internal kernel review list Cc: dri-devel@lists.freedesktop.org Cc: # v6.6+ Reviewed-by: Martin Krastev Link: https://patchwork.freedesktop.org/patch/msgid/20240408022802.358641-1-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index c7d90f96d16a6..0a304706e0132 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -666,11 +666,12 @@ static int vmw_dma_select_mode(struct vmw_private *dev_priv) [vmw_dma_map_populate] = "Caching DMA mappings.", [vmw_dma_map_bind] = "Giving up DMA mappings early."}; - /* TTM currently doesn't fully support SEV encryption. */ - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) - return -EINVAL; - - if (vmw_force_coherent) + /* + * When running with SEV we always want dma mappings, because + * otherwise ttm tt pool pages will bounce through swiotlb running + * out of available space. + */ + if (vmw_force_coherent || cc_platform_has(CC_ATTR_MEM_ENCRYPT)) dev_priv->map_mode = vmw_dma_alloc_coherent; else if (vmw_restrict_iommu) dev_priv->map_mode = vmw_dma_map_bind; -- GitLab From 05a2f07db8883b027c0b4a475fcc586278922b8d Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 5 Mar 2024 12:27:27 +0100 Subject: [PATCH 631/989] tools/power turbostat: read RAPL counters via perf Some of the future Intel platforms will require reading the RAPL counters via perf and not MSR. On current platforms we can still read them using both ways. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 786 +++++++++++++++++++++----- 1 file changed, 649 insertions(+), 137 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a380829c58903..283dffb987b59 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -37,6 +37,7 @@ #include #include #include +#include #define UNUSED(x) (void)(x) @@ -60,6 +61,7 @@ enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; +enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; struct msr_counter { unsigned int msr_num; @@ -958,6 +960,175 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi #define MAX_ADDED_THREAD_COUNTERS 24 #define BITMASK_SIZE 32 +/* Indexes used to map data read from perf and MSRs into global variables */ +enum rapl_rci_index { + RAPL_RCI_INDEX_ENERGY_PKG = 0, + RAPL_RCI_INDEX_ENERGY_CORES = 1, + RAPL_RCI_INDEX_DRAM = 2, + RAPL_RCI_INDEX_GFX = 3, + RAPL_RCI_INDEX_PKG_PERF_STATUS = 4, + RAPL_RCI_INDEX_DRAM_PERF_STATUS = 5, + RAPL_RCI_INDEX_CORE_ENERGY = 6, + NUM_RAPL_COUNTERS, +}; + +enum rapl_unit { + RAPL_UNIT_INVALID, + RAPL_UNIT_JOULES, + RAPL_UNIT_WATTS, +}; + +struct rapl_counter_info_t { + unsigned long long data[NUM_RAPL_COUNTERS]; + enum rapl_source source[NUM_RAPL_COUNTERS]; + unsigned long long flags[NUM_RAPL_COUNTERS]; + double scale[NUM_RAPL_COUNTERS]; + enum rapl_unit unit[NUM_RAPL_COUNTERS]; + + union { + /* Active when source == RAPL_SOURCE_MSR */ + struct { + unsigned long long msr[NUM_RAPL_COUNTERS]; + unsigned long long msr_mask[NUM_RAPL_COUNTERS]; + int msr_shift[NUM_RAPL_COUNTERS]; + }; + }; + + int fd_perf; +}; + +/* struct rapl_counter_info_t for each RAPL domain */ +struct rapl_counter_info_t *rapl_counter_info_perdomain; + +#define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1) + +struct rapl_counter_arch_info { + int feature_mask; /* Mask for testing if the counter is supported on host */ + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned long long msr_mask; + int msr_shift; /* Positive mean shift right, negative mean shift left */ + double *platform_rapl_msr_scale; /* Scale applied to values read by MSR (platform dependent, filled at runtime) */ + unsigned int rci_index; /* Maps data from perf counters to global variables */ + unsigned long long bic; + double compat_scale; /* Some counters require constant scaling to be in the same range as other, similar ones */ + unsigned long long flags; +}; + +static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { + { + .feature_mask = RAPL_PKG, + .perf_subsys = "power", + .perf_name = "energy-pkg", + .msr = MSR_PKG_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, + .bic = BIC_PkgWatt | BIC_Pkg_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_AMD_F17H, + .perf_subsys = "power", + .perf_name = "energy-pkg", + .msr = MSR_PKG_ENERGY_STAT, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, + .bic = BIC_PkgWatt | BIC_Pkg_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_CORE_ENERGY_STATUS, + .perf_subsys = "power", + .perf_name = "energy-cores", + .msr = MSR_PP0_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_CORES, + .bic = BIC_CorWatt | BIC_Cor_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_DRAM, + .perf_subsys = "power", + .perf_name = "energy-ram", + .msr = MSR_DRAM_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_dram_energy_units, + .rci_index = RAPL_RCI_INDEX_DRAM, + .bic = BIC_RAMWatt | BIC_RAM_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_GFX, + .perf_subsys = "power", + .perf_name = "energy-gpu", + .msr = MSR_PP1_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_GFX, + .bic = BIC_GFXWatt | BIC_GFX_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_PKG_PERF_STATUS, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_PKG_PERF_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_time_units, + .rci_index = RAPL_RCI_INDEX_PKG_PERF_STATUS, + .bic = BIC_PKG__, + .compat_scale = 100.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_DRAM_PERF_STATUS, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_DRAM_PERF_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_time_units, + .rci_index = RAPL_RCI_INDEX_DRAM_PERF_STATUS, + .bic = BIC_RAM__, + .compat_scale = 100.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_AMD_F17H, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_CORE_ENERGY_STAT, + .msr_mask = 0xFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_CORE_ENERGY, + .bic = BIC_CorWatt | BIC_Cor_J, + .compat_scale = 1.0, + .flags = 0, + }, +}; + +struct rapl_counter { + unsigned long long raw_value; + enum rapl_unit unit; + double scale; +}; + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -984,7 +1155,7 @@ struct core_data { unsigned long long c7; unsigned long long mc6_us; /* duplicate as per-core for now, even though per module */ unsigned int core_temp_c; - unsigned int core_energy; /* MSR_CORE_ENERGY_STAT */ + struct rapl_counter core_energy; /* MSR_CORE_ENERGY_STAT */ unsigned int core_id; unsigned long long core_throt_cnt; unsigned long long counter[MAX_ADDED_COUNTERS]; @@ -1009,12 +1180,12 @@ struct pkg_data { unsigned int gfx_mhz; unsigned int gfx_act_mhz; unsigned int package_id; - unsigned long long energy_pkg; /* MSR_PKG_ENERGY_STATUS */ - unsigned long long energy_dram; /* MSR_DRAM_ENERGY_STATUS */ - unsigned long long energy_cores; /* MSR_PP0_ENERGY_STATUS */ - unsigned long long energy_gfx; /* MSR_PP1_ENERGY_STATUS */ - unsigned long long rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ - unsigned long long rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ + struct rapl_counter energy_pkg; /* MSR_PKG_ENERGY_STATUS */ + struct rapl_counter energy_dram; /* MSR_DRAM_ENERGY_STATUS */ + struct rapl_counter energy_cores; /* MSR_PP0_ENERGY_STATUS */ + struct rapl_counter energy_gfx; /* MSR_PP1_ENERGY_STATUS */ + struct rapl_counter rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ + struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; unsigned long long counter[MAX_ADDED_COUNTERS]; @@ -1403,6 +1574,21 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) return 0; } +int probe_msr(int cpu, off_t offset) +{ + ssize_t retval; + unsigned long long dummy; + + assert(!no_msr); + + retval = pread(get_msr_fd(cpu), &dummy, sizeof(dummy), offset); + + if (retval != sizeof(dummy)) + return 1; + + return 0; +} + #define MAX_DEFERRED 16 char *deferred_add_names[MAX_DEFERRED]; char *deferred_skip_names[MAX_DEFERRED]; @@ -1755,7 +1941,11 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "c7: %016llX\n", c->c7); outp += sprintf(outp, "DTS: %dC\n", c->core_temp_c); outp += sprintf(outp, "cpu_throt_count: %016llX\n", c->core_throt_cnt); - outp += sprintf(outp, "Joules: %0X\n", c->core_energy); + + const unsigned long long energy_value = c->core_energy.raw_value * c->core_energy.scale; + const double energy_scale = c->core_energy.scale; + if (c->core_energy.unit == RAPL_UNIT_JOULES) + outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { outp += @@ -1785,12 +1975,12 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "pc10: %016llX\n", p->pc10); outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi); outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi); - outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg); - outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores); - outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx); - outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram); - outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status); - outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status); + outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg.raw_value); + outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores.raw_value); + outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx.raw_value); + outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram.raw_value); + outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status.raw_value); + outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status.raw_value); outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { @@ -1805,6 +1995,23 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p return 0; } +double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desired_unit, double interval) +{ + assert(desired_unit != RAPL_UNIT_INVALID); + + /* + * For now we don't expect anything other than joules, + * so just simplify the logic. + */ + assert(c->unit == RAPL_UNIT_JOULES); + + const double scaled = c->raw_value * c->scale; + + if (desired_unit == RAPL_UNIT_WATTS) + return scaled / interval; + return scaled; +} + /* * column formatting convention & formats */ @@ -1998,9 +2205,11 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&c->core_energy, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&c->core_energy, RAPL_UNIT_JOULES, interval_float)); /* print per-package data only for 1st core in package */ if (!is_cpu_first_core_in_package(t, c, p)) @@ -2072,34 +2281,40 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_PkgWatt)) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float); - + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_GFXWatt)) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_RAMWatt)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - p->energy_dram * rapl_dram_energy_units / interval_float); + rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_Pkg_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_GFX_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_RAM_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_dram * rapl_dram_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_PKG__)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); + rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_RAM__)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); + rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float)); /* UncMHz */ if (DO_BIC(BIC_UNCORE_MHZ)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz); @@ -2208,12 +2423,13 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->gfx_mhz = new->gfx_mhz; old->gfx_act_mhz = new->gfx_act_mhz; - old->energy_pkg = new->energy_pkg - old->energy_pkg; - old->energy_cores = new->energy_cores - old->energy_cores; - old->energy_gfx = new->energy_gfx - old->energy_gfx; - old->energy_dram = new->energy_dram - old->energy_dram; - old->rapl_pkg_perf_status = new->rapl_pkg_perf_status - old->rapl_pkg_perf_status; - old->rapl_dram_perf_status = new->rapl_dram_perf_status - old->rapl_dram_perf_status; + old->energy_pkg.raw_value = new->energy_pkg.raw_value - old->energy_pkg.raw_value; + old->energy_cores.raw_value = new->energy_cores.raw_value - old->energy_cores.raw_value; + old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value; + old->energy_dram.raw_value = new->energy_dram.raw_value - old->energy_dram.raw_value; + old->rapl_pkg_perf_status.raw_value = new->rapl_pkg_perf_status.raw_value - old->rapl_pkg_perf_status.raw_value; + old->rapl_dram_perf_status.raw_value = + new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2237,7 +2453,7 @@ void delta_core(struct core_data *new, struct core_data *old) old->core_throt_cnt = new->core_throt_cnt; old->mc6_us = new->mc6_us - old->mc6_us; - DELTA_WRAP32(new->core_energy, old->core_energy); + DELTA_WRAP32(new->core_energy.raw_value, old->core_energy.raw_value); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2364,6 +2580,13 @@ int delta_cpu(struct thread_data *t, struct core_data *c, return retval; } +void rapl_counter_clear(struct rapl_counter *c) +{ + c->raw_value = 0; + c->scale = 0.0; + c->unit = RAPL_UNIT_INVALID; +} + void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; @@ -2391,7 +2614,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data c->c7 = 0; c->mc6_us = 0; c->core_temp_c = 0; - c->core_energy = 0; + rapl_counter_clear(&c->core_energy); c->core_throt_cnt = 0; p->pkg_wtd_core_c0 = 0; @@ -2412,12 +2635,12 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->cpu_lpi = 0; p->sys_lpi = 0; - p->energy_pkg = 0; - p->energy_dram = 0; - p->energy_cores = 0; - p->energy_gfx = 0; - p->rapl_pkg_perf_status = 0; - p->rapl_dram_perf_status = 0; + rapl_counter_clear(&p->energy_pkg); + rapl_counter_clear(&p->energy_dram); + rapl_counter_clear(&p->energy_cores); + rapl_counter_clear(&p->energy_gfx); + rapl_counter_clear(&p->rapl_pkg_perf_status); + rapl_counter_clear(&p->rapl_dram_perf_status); p->pkg_temp_c = 0; p->gfx_rc6_ms = 0; @@ -2434,6 +2657,20 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->counter[i] = 0; } +void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src) +{ + /* Copy unit and scale from src if dst is not initialized */ + if (dst->unit == RAPL_UNIT_INVALID) { + dst->unit = src->unit; + dst->scale = src->scale; + } + + assert(dst->unit == src->unit); + assert(dst->scale == src->scale); + + dst->raw_value += src->raw_value; +} + int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; @@ -2480,7 +2717,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c); average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt); - average.cores.core_energy += c->core_energy; + rapl_counter_accumulate(&average.cores.core_energy, &c->core_energy); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2515,10 +2752,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.cpu_lpi = p->cpu_lpi; average.packages.sys_lpi = p->sys_lpi; - average.packages.energy_pkg += p->energy_pkg; - average.packages.energy_dram += p->energy_dram; - average.packages.energy_cores += p->energy_cores; - average.packages.energy_gfx += p->energy_gfx; + rapl_counter_accumulate(&average.packages.energy_pkg, &p->energy_pkg); + rapl_counter_accumulate(&average.packages.energy_dram, &p->energy_dram); + rapl_counter_accumulate(&average.packages.energy_cores, &p->energy_cores); + rapl_counter_accumulate(&average.packages.energy_gfx, &p->energy_gfx); average.packages.gfx_rc6_ms = p->gfx_rc6_ms; average.packages.uncore_mhz = p->uncore_mhz; @@ -2527,8 +2764,8 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); - average.packages.rapl_pkg_perf_status += p->rapl_pkg_perf_status; - average.packages.rapl_dram_perf_status += p->rapl_dram_perf_status; + rapl_counter_accumulate(&average.packages.rapl_pkg_perf_status, &p->rapl_pkg_perf_status); + rapl_counter_accumulate(&average.packages.rapl_dram_perf_status, &p->rapl_dram_perf_status); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0)) @@ -2797,25 +3034,53 @@ struct amperf_group_fd { int mperf; }; -static unsigned int read_perf_counter_info(const char *const path, const char *const parse_format) +static int read_perf_counter_info(const char *const path, const char *const parse_format, void *value_ptr) { int fdmt; - char buf[16]; - unsigned int v; + int bytes_read; + char buf[64]; + int ret = -1; fdmt = open(path, O_RDONLY, 0); - if (fdmt == -1) - errx(1, "Failed to read perf counter info %s\n", path); + if (fdmt == -1) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } - if (read(fdmt, buf, sizeof(buf)) <= 0) - return 0; + bytes_read = read(fdmt, buf, sizeof(buf) - 1); + if (bytes_read <= 0 || bytes_read >= (int)sizeof(buf)) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } - buf[sizeof(buf) - 1] = '\0'; + buf[bytes_read] = '\0'; - if (sscanf(buf, parse_format, &v) != 1) - errx(1, "Failed to parse perf counter info %s\n", path); + if (sscanf(buf, parse_format, value_ptr) != 1) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } + ret = 0; + +cleanup_and_exit: close(fdmt); + return ret; +} + +static unsigned int read_perf_counter_info_n(const char *const path, const char *const parse_format) +{ + unsigned int v; + int status; + + status = read_perf_counter_info(path, parse_format, &v); + if (status) + v = -1; return v; } @@ -2825,7 +3090,7 @@ static unsigned read_msr_type(void) const char *const path = "/sys/bus/event_source/devices/msr/type"; const char *const format = "%u"; - return read_perf_counter_info(path, format); + return read_perf_counter_info_n(path, format); } static unsigned read_aperf_config(void) @@ -2833,7 +3098,7 @@ static unsigned read_aperf_config(void) const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; const char *const format = "event=%x"; - return read_perf_counter_info(path, format); + return read_perf_counter_info_n(path, format); } static unsigned read_mperf_config(void) @@ -2841,7 +3106,60 @@ static unsigned read_mperf_config(void) const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; const char *const format = "event=%x"; - return read_perf_counter_info(path, format); + return read_perf_counter_info_n(path, format); +} + +static unsigned read_perf_type(const char *subsys) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/type"; + const char *const format = "%u"; + char path[128]; + + snprintf(path, sizeof(path), path_format, subsys); + + return read_perf_counter_info_n(path, format); +} + +static unsigned read_rapl_config(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s"; + const char *const format = "event=%x"; + char path[128]; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + return read_perf_counter_info_n(path, format); +} + +static unsigned read_perf_rapl_unit(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.unit"; + const char *const format = "%s"; + char path[128]; + char unit_buffer[16]; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + read_perf_counter_info(path, format, &unit_buffer); + if (strcmp("Joules", unit_buffer) == 0) + return RAPL_UNIT_JOULES; + + return RAPL_UNIT_INVALID; +} + +static double read_perf_rapl_scale(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.scale"; + const char *const format = "%lf"; + char path[128]; + double scale; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + if (read_perf_counter_info(path, format, &scale)) + return 0.0; + + return scale; } static struct amperf_group_fd open_amperf_fd(int cpu) @@ -2961,6 +3279,99 @@ retry: return 0; } +size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_RAPL_COUNTERS; ++i) + if (rci->source[i] == RAPL_SOURCE_PERF) + ++ret; + + return ret; +} + +void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx) +{ + rc->raw_value = rci->data[idx]; + rc->unit = rci->unit[idx]; + rc->scale = rci->scale[idx]; +} + +int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data *p) +{ + unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain]; + + if (debug) + fprintf(stderr, "get_rapl_counters: cpu%d domain%d\n", cpu, domain); + + assert(rapl_counter_info_perdomain); + + /* + * If we have any perf counters to read, read them all now, in bulk + */ + if (rci->fd_perf != -1) { + size_t num_perf_counters = rapl_counter_info_count_perf(rci); + const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long); + const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data)); + if (actual_read_size != expected_read_size) + err(-1, "get_rapl_counters: failed to read perf_data (%zu %zu)", expected_read_size, + actual_read_size); + } + + for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) { + switch (rci->source[i]) { + case RAPL_SOURCE_NONE: + break; + + case RAPL_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(rci->fd_perf != -1); + + if (debug) + fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n", + i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]); + + rci->data[i] = perf_data[pi]; + + ++pi; + break; + + case RAPL_SOURCE_MSR: + if (debug) + fprintf(stderr, "Reading rapl counter via msr at %u\n", i); + + assert(!no_msr); + if (rci->flags[i] & RAPL_COUNTER_FLAG_USE_MSR_SUM) { + if (get_msr_sum(cpu, rci->msr[i], &rci->data[i])) + return -13 - i; + } else { + if (get_msr(cpu, rci->msr[i], &rci->data[i])) + return -13 - i; + } + + rci->data[i] &= rci->msr_mask[i]; + if (rci->msr_shift[i] >= 0) + rci->data[i] >>= abs(rci->msr_shift[i]); + else + rci->data[i] <<= abs(rci->msr_shift[i]); + + break; + } + } + + _Static_assert(NUM_RAPL_COUNTERS == 7); + write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG); + write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES); + write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM); + write_rapl_counter(&p->energy_gfx, rci, RAPL_RCI_INDEX_GFX); + write_rapl_counter(&p->rapl_pkg_perf_status, rci, RAPL_RCI_INDEX_PKG_PERF_STATUS); + write_rapl_counter(&p->rapl_dram_perf_status, rci, RAPL_RCI_INDEX_DRAM_PERF_STATUS); + write_rapl_counter(&c->core_energy, rci, RAPL_RCI_INDEX_CORE_ENERGY); + + return 0; +} + /* * get_counters(...) * migrate to cpu @@ -2972,6 +3383,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) unsigned long long msr; struct msr_counter *mp; int i; + int status; if (cpu_migrate(cpu)) { fprintf(outf, "get_counters: Could not migrate to CPU %d\n", cpu); @@ -3029,6 +3441,12 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (!is_cpu_first_thread_in_core(t, c, p)) goto done; + if (platform->has_per_core_rapl) { + status = get_rapl_counters(cpu, c->core_id, c, p); + if (status != 0) + return status; + } + if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3)) return -6; @@ -3069,12 +3487,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (DO_BIC(BIC_CORE_THROT_CNT)) get_core_throt_cnt(cpu, &c->core_throt_cnt); - if ((platform->rapl_msrs & RAPL_AMD_F17H) && !no_msr) { - if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr)) - return -14; - c->core_energy = msr & 0xFFFFFFFF; - } - for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &c->counter[i])) return -10; @@ -3134,42 +3546,10 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (DO_BIC(BIC_SYS_LPI)) p->sys_lpi = cpuidle_cur_sys_lpi_us; - if (!no_msr) { - if (platform->rapl_msrs & RAPL_PKG) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) - return -13; - p->energy_pkg = msr; - } - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { - if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) - return -14; - p->energy_cores = msr; - } - if (platform->rapl_msrs & RAPL_DRAM) { - if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) - return -15; - p->energy_dram = msr; - } - if (platform->rapl_msrs & RAPL_GFX) { - if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) - return -16; - p->energy_gfx = msr; - } - if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) - return -16; - p->rapl_pkg_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) - return -16; - p->rapl_dram_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_AMD_F17H) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) - return -13; - p->energy_pkg = msr; - } + if (!platform->has_per_core_rapl) { + status = get_rapl_counters(cpu, p->package_id, c, p); + if (status != 0) + return status; } if (DO_BIC(BIC_PkgTmp)) { @@ -3714,6 +4094,21 @@ void free_fd_instr_count_percpu(void) fd_instr_count_percpu = NULL; } +void free_fd_rapl_percpu(void) +{ + if (!rapl_counter_info_perdomain) + return; + + const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + + for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + if (rapl_counter_info_perdomain[domain_id].fd_perf != -1) + close(rapl_counter_info_perdomain[domain_id].fd_perf); + } + + free(rapl_counter_info_perdomain); +} + void free_all_buffers(void) { int i; @@ -3757,6 +4152,7 @@ void free_all_buffers(void) free_fd_percpu(); free_fd_instr_count_percpu(); free_fd_amperf_percpu(); + free_fd_rapl_percpu(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -4091,12 +4487,14 @@ static void update_effective_set(bool startup) } void linux_perf_init(void); +void rapl_perf_init(void); void re_initialize(void) { free_all_buffers(); setup_all_buffers(false); linux_perf_init(); + rapl_perf_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -5315,31 +5713,18 @@ void rapl_probe_intel(void) unsigned long long msr; unsigned int time_unit; double tdp; + const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt; + const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J; - if (rapl_joules) { - if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) - BIC_PRESENT(BIC_Pkg_J); - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) - BIC_PRESENT(BIC_Cor_J); - if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) - BIC_PRESENT(BIC_RAM_J); - if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) - BIC_PRESENT(BIC_GFX_J); - } else { - if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) - BIC_PRESENT(BIC_PkgWatt); - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) - BIC_PRESENT(BIC_CorWatt); - if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) - BIC_PRESENT(BIC_RAMWatt); - if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) - BIC_PRESENT(BIC_GFXWatt); - } + if (rapl_joules) + bic_enabled &= ~bic_watt_bits; + else + bic_enabled &= ~bic_joules_bits; - if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) - BIC_PRESENT(BIC_PKG__); - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) - BIC_PRESENT(BIC_RAM__); + if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS)) + bic_enabled &= ~BIC_PKG__; + if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS)) + bic_enabled &= ~BIC_RAM__; /* units on package 0, verify later other packages match */ if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr)) @@ -5373,14 +5758,13 @@ void rapl_probe_amd(void) { unsigned long long msr; double tdp; + const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt; + const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - } + if (rapl_joules) + bic_enabled &= ~bic_watt_bits; + else + bic_enabled &= ~bic_joules_bits; if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr)) return; @@ -5885,6 +6269,48 @@ bool is_aperf_access_required(void) || BIC_IS_ENABLED(BIC_IPC); } +int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, + double *scale_, enum rapl_unit *unit_) +{ + if (no_perf) + return -1; + + const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name); + if (scale == 0.0) + return -1; + + const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name); + if (unit == RAPL_UNIT_INVALID) + return -1; + + const unsigned rapl_type = read_perf_type(cai->perf_subsys); + const unsigned rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); + + const int fd_counter = + open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP); + if (fd_counter == -1) + return -1; + + /* If it's the first counter opened, make it a group descriptor */ + if (rci->fd_perf == -1) + rci->fd_perf = fd_counter; + + *scale_ = scale; + *unit_ = unit; + return fd_counter; +} + +int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, + double *scale, enum rapl_unit *unit) +{ + int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit); + + if (debug) + fprintf(stderr, "add_rapl_perf_counter: %d (cpu: %d)\n", ret, cpu); + + return ret; +} + /* * Linux-perf manages the HW instructions-retired counter * by enabling when requested, and hiding rollover @@ -5908,17 +6334,101 @@ void linux_perf_init(void) } } -static int has_amperf_access_via_msr(void) +void rapl_perf_init(void) { - unsigned long long dummy; + const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + bool *domain_visited = calloc(num_domains, sizeof(bool)); + + rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain)); + if (rapl_counter_info_perdomain == NULL) + err(-1, "calloc rapl_counter_info_percpu"); + /* + * Initialize rapl_counter_info_percpu + */ + for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id]; + rci->fd_perf = -1; + for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) { + rci->data[i] = 0; + rci->source[i] = RAPL_SOURCE_NONE; + } + } + + /* + * Open/probe the counters + * If can't get it via perf, fallback to MSR + */ + for (size_t i = 0; i < ARRAY_SIZE(rapl_counter_arch_infos); ++i) { + + const struct rapl_counter_arch_info *const cai = &rapl_counter_arch_infos[i]; + bool has_counter = 0; + double scale; + enum rapl_unit unit; + int next_domain; + + memset(domain_visited, 0, num_domains * sizeof(*domain_visited)); + + for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) { + + if (cpu_is_not_allowed(cpu)) + continue; + + /* Skip already seen and handled RAPL domains */ + next_domain = + platform->has_per_core_rapl ? cpus[cpu].physical_core_id : cpus[cpu].physical_package_id; + + if (domain_visited[next_domain]) + continue; + + domain_visited[next_domain] = 1; + + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain]; + + /* Check if the counter is enabled and accessible */ + if (BIC_IS_ENABLED(cai->bic) && (platform->rapl_msrs & cai->feature_mask)) { + + /* Use perf API for this counter */ + if (!no_perf && cai->perf_name + && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) { + rci->source[cai->rci_index] = RAPL_SOURCE_PERF; + rci->scale[cai->rci_index] = scale * cai->compat_scale; + rci->unit[cai->rci_index] = unit; + rci->flags[cai->rci_index] = cai->flags; + + /* Use MSR for this counter */ + } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { + rci->source[cai->rci_index] = RAPL_SOURCE_MSR; + rci->msr[cai->rci_index] = cai->msr; + rci->msr_mask[cai->rci_index] = cai->msr_mask; + rci->msr_shift[cai->rci_index] = cai->msr_shift; + rci->unit[cai->rci_index] = RAPL_UNIT_JOULES; + rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale; + rci->flags[cai->rci_index] = cai->flags; + } + } + + if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE) + has_counter = 1; + } + + /* If any CPU has access to the counter, make it present */ + if (has_counter) + BIC_PRESENT(cai->bic); + } + + free(domain_visited); +} + +static int has_amperf_access_via_msr(void) +{ if (no_msr) return 0; - if (get_msr(base_cpu, MSR_IA32_APERF, &dummy)) + if (probe_msr(base_cpu, MSR_IA32_APERF)) return 0; - if (get_msr(base_cpu, MSR_IA32_MPERF, &dummy)) + if (probe_msr(base_cpu, MSR_IA32_MPERF)) return 0; return 1; @@ -6696,6 +7206,7 @@ bool is_msr_access_required(void) || BIC_IS_ENABLED(BIC_Pkgpc8) || BIC_IS_ENABLED(BIC_Pkgpc9) || BIC_IS_ENABLED(BIC_Pkgpc10) + /* TODO: Multiplex access with perf */ || BIC_IS_ENABLED(BIC_CorWatt) || BIC_IS_ENABLED(BIC_Cor_J) || BIC_IS_ENABLED(BIC_PkgWatt) @@ -6749,6 +7260,7 @@ void turbostat_init() probe_pm_features(); set_amperf_source(); linux_perf_init(); + rapl_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); -- GitLab From 17d1ea136be86f53be0461b0c33daf6b58e6cbf7 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 7 Mar 2024 17:00:35 +0100 Subject: [PATCH 632/989] tools/power turbostat: Add selftests Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- .../testing/selftests/turbostat/defcolumns.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100755 tools/testing/selftests/turbostat/defcolumns.py diff --git a/tools/testing/selftests/turbostat/defcolumns.py b/tools/testing/selftests/turbostat/defcolumns.py new file mode 100755 index 0000000000000..70d3b7780311e --- /dev/null +++ b/tools/testing/selftests/turbostat/defcolumns.py @@ -0,0 +1,59 @@ +#!/bin/env python3 + +import subprocess +from shutil import which + +turbostat = which('turbostat') +if turbostat is None: + print('Could not find turbostat binary') + exit(1) + +timeout = which('timeout') +if timeout is None: + print('Could not find timeout binary') + exit(1) + +proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +# +# By default --list reports also "usec" and "Time_Of_Day_Seconds" columns +# which are only visible when running with --debug. +# +expected_columns_debug = proc_turbostat.stdout.replace(b',', b'\t').strip() +expected_columns = expected_columns_debug.replace(b'usec\t', b'').replace(b'Time_Of_Day_Seconds\t', b'').replace(b'X2APIC\t', b'').replace(b'APIC\t', b'') + +# +# Run turbostat with no options for 10 seconds and send SIGINT +# +timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '1s'] +turbostat_argv = [turbostat, '-i', '0.250'] + +print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) +actual_columns = proc_turbostat.stdout.split(b'\n')[0] +if expected_columns != actual_columns: + print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}') + exit(1) +print('OK') + +# +# Same, but with --debug +# +turbostat_argv.append('--debug') + +print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) +actual_columns = proc_turbostat.stdout.split(b'\n')[0] +if expected_columns_debug != actual_columns: + print(f'turbostat column check failed\n{expected_columns_debug=}\n{actual_columns=}') + exit(1) +print('OK') -- GitLab From bb5db22c13125b38b0740e19c18ae94f8e5a0eb6 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 12 Mar 2024 11:19:15 +0800 Subject: [PATCH 633/989] tools/power/turbostat: Enable MSR_CORE_C1_RES support for ICX Enable Core C1 hardware residency counter (MSR_CORE_C1_RES) on ICX. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 283dffb987b59..372f67a70d8ab 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -664,6 +664,7 @@ static const struct platform_features icx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, + .has_msr_core_c1_res = 1, .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, -- GitLab From 4e2bbbf78cf7144204214fd0bd7cca309acd8f89 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 12 Mar 2024 14:23:37 +0800 Subject: [PATCH 634/989] tools/power/turbostat: Cache graphics sysfs path Graphics drivers (i915/Xe) have different sysfs knobs on different platforms, and it is possible that different sysfs knobs fit into the same turbostat columns. Instead of specifying different sysfs knobs every time, detect them once and cache the path for future use. No functional change. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 45 +++++++++++++++++++-------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 372f67a70d8ab..78db4f65a2374 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -276,6 +276,19 @@ bool no_msr; bool no_perf; enum amperf_source amperf_source; +enum gfx_sysfs_idx { + GFX_rc6, + GFX_MHz, + GFX_ACTMHz, + GFX_MAX +}; + +struct gfx_sysfs_info { + const char *path; +}; + +static struct gfx_sysfs_info gfx_info[GFX_MAX]; + int get_msr(int cpu, off_t offset, unsigned long long *msr); /* Model specific support Start */ @@ -4620,7 +4633,7 @@ int snapshot_gfx_rc6_ms(void) FILE *fp; int retval; - fp = fopen_or_die("/sys/class/drm/card0/power/rc6_residency_ms", "r"); + fp = fopen_or_die(gfx_info[GFX_rc6].path, "r"); retval = fscanf(fp, "%lld", &gfx_cur_rc6_ms); if (retval != 1) @@ -4645,9 +4658,7 @@ int snapshot_gfx_mhz(void) int retval; if (fp == NULL) { - fp = fopen("/sys/class/drm/card0/gt_cur_freq_mhz", "r"); - if (!fp) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r"); + fp = fopen_or_die(gfx_info[GFX_MHz].path, "r"); } else { rewind(fp); fflush(fp); @@ -4674,9 +4685,7 @@ int snapshot_gfx_act_mhz(void) int retval; if (fp == NULL) { - fp = fopen("/sys/class/drm/card0/gt_act_freq_mhz", "r"); - if (!fp) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r"); + fp = fopen_or_die(gfx_info[GFX_ACTMHz].path, "r"); } else { rewind(fp); fflush(fp); @@ -5338,14 +5347,24 @@ probe_cluster: static void probe_graphics(void) { if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) - BIC_PRESENT(BIC_GFX_rc6); + gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms"; - if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) - BIC_PRESENT(BIC_GFXMHz); + if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt_cur_freq_mhz"; + else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz"; + + + if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz"; + else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz"; - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + if (gfx_info[GFX_rc6].path) + BIC_PRESENT(BIC_GFX_rc6); + if (gfx_info[GFX_MHz].path) + BIC_PRESENT(BIC_GFXMHz); + if (gfx_info[GFX_ACTMHz].path) BIC_PRESENT(BIC_GFXACTMHz); } -- GitLab From de39d38c06eb047954c5ad20a3f9acb6d3c78498 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 13 Mar 2024 10:12:19 +0800 Subject: [PATCH 635/989] tools/power/turbostat: Unify graphics sysfs snapshots Graphics sysfs snapshots share similar logic. Combine them into one function to avoid code duplication. No functional change. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 109 ++++++++------------------ 1 file changed, 34 insertions(+), 75 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 78db4f65a2374..e5b6161fef487 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -251,11 +251,8 @@ char *output_buffer, *outp; unsigned int do_dts; unsigned int do_ptm; unsigned int do_ipc; -unsigned long long gfx_cur_rc6_ms; unsigned long long cpuidle_cur_cpu_lpi_us; unsigned long long cpuidle_cur_sys_lpi_us; -unsigned int gfx_cur_mhz; -unsigned int gfx_act_mhz; unsigned int tj_max; unsigned int tj_max_override; double rapl_power_units, rapl_time_units; @@ -285,6 +282,9 @@ enum gfx_sysfs_idx { struct gfx_sysfs_info { const char *path; + FILE *fp; + unsigned int val; + unsigned long long val_ull; }; static struct gfx_sysfs_info gfx_info[GFX_MAX]; @@ -3573,17 +3573,17 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) } if (DO_BIC(BIC_GFX_rc6)) - p->gfx_rc6_ms = gfx_cur_rc6_ms; + p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; /* n.b. assume die0 uncore frequency applies to whole package */ if (DO_BIC(BIC_UNCORE_MHZ)) p->uncore_mhz = get_uncore_mhz(p->package_id, 0); if (DO_BIC(BIC_GFXMHz)) - p->gfx_mhz = gfx_cur_mhz; + p->gfx_mhz = gfx_info[GFX_MHz].val; if (DO_BIC(BIC_GFXACTMHz)) - p->gfx_act_mhz = gfx_act_mhz; + p->gfx_act_mhz = gfx_info[GFX_ACTMHz].val; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &p->counter[i])) @@ -4621,81 +4621,40 @@ int snapshot_proc_interrupts(void) } /* - * snapshot_gfx_rc6_ms() + * snapshot_graphics() * - * record snapshot of - * /sys/class/drm/card0/power/rc6_residency_ms + * record snapshot of specified graphics sysfs knob * * return 1 if config change requires a restart, else return 0 */ -int snapshot_gfx_rc6_ms(void) +int snapshot_graphics(int idx) { FILE *fp; int retval; - fp = fopen_or_die(gfx_info[GFX_rc6].path, "r"); - - retval = fscanf(fp, "%lld", &gfx_cur_rc6_ms); - if (retval != 1) - err(1, "GFX rc6"); - - fclose(fp); - - return 0; -} - -/* - * snapshot_gfx_mhz() - * - * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz - * when /sys/class/drm/card0/gt_cur_freq_mhz is not available. - * - * return 1 if config change requires a restart, else return 0 - */ -int snapshot_gfx_mhz(void) -{ - static FILE *fp; - int retval; - - if (fp == NULL) { - fp = fopen_or_die(gfx_info[GFX_MHz].path, "r"); - } else { - rewind(fp); - fflush(fp); - } - - retval = fscanf(fp, "%d", &gfx_cur_mhz); - if (retval != 1) - err(1, "GFX MHz"); - - return 0; -} - -/* - * snapshot_gfx_cur_mhz() - * - * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz - * when /sys/class/drm/card0/gt_act_freq_mhz is not available. - * - * return 1 if config change requires a restart, else return 0 - */ -int snapshot_gfx_act_mhz(void) -{ - static FILE *fp; - int retval; - - if (fp == NULL) { - fp = fopen_or_die(gfx_info[GFX_ACTMHz].path, "r"); - } else { - rewind(fp); - fflush(fp); + switch (idx) { + case GFX_rc6: + fp = fopen_or_die(gfx_info[idx].path, "r"); + retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull); + if (retval != 1) + err(1, "rc6"); + fclose(fp); + return 0; + case GFX_MHz: + case GFX_ACTMHz: + if (gfx_info[idx].fp == NULL) { + gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); + } else { + rewind(gfx_info[idx].fp); + fflush(gfx_info[idx].fp); + } + retval = fscanf(gfx_info[idx].fp, "%d", &gfx_info[idx].val); + if (retval != 1) + err(1, "MHz"); + return 0; + default: + return -EINVAL; } - - retval = fscanf(fp, "%d", &gfx_act_mhz); - if (retval != 1) - err(1, "GFX ACT MHz"); - - return 0; } /* @@ -4760,13 +4719,13 @@ int snapshot_proc_sysfs_files(void) return 1; if (DO_BIC(BIC_GFX_rc6)) - snapshot_gfx_rc6_ms(); + snapshot_graphics(GFX_rc6); if (DO_BIC(BIC_GFXMHz)) - snapshot_gfx_mhz(); + snapshot_graphics(GFX_MHz); if (DO_BIC(BIC_GFXACTMHz)) - snapshot_gfx_act_mhz(); + snapshot_graphics(GFX_ACTMHz); if (DO_BIC(BIC_CPU_LPI)) snapshot_cpu_lpi_us(); -- GitLab From 60add818ab2543b7e4f2bfeaacf2504743c1eb50 Mon Sep 17 00:00:00 2001 From: Justin Ernst Date: Tue, 2 Apr 2024 13:40:29 -0400 Subject: [PATCH 636/989] tools/power/turbostat: Fix uncore frequency file string Running turbostat on a 16 socket HPE Scale-up Compute 3200 (SapphireRapids) fails with: turbostat: /sys/devices/system/cpu/intel_uncore_frequency/package_010_die_00/current_freq_khz: open failed: No such file or directory We observe the sysfs uncore frequency directories named: ... package_09_die_00/ package_10_die_00/ package_11_die_00/ ... package_15_die_00/ The culprit is an incorrect sprintf format string "package_0%d_die_0%d" used with each instance of reading uncore frequency files. uncore-frequency-common.c creates the sysfs directory with the format "package_%02d_die_%02d". Once the package value reaches double digits, the formats diverge. Change each instance of "package_0%d_die_0%d" to "package_%02d_die_%02d". [lenb: deleted the probe part of this patch, as it was already fixed] Signed-off-by: Justin Ernst Reviewed-by: Thomas Renninger Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e5b6161fef487..016a5c7dc9bf3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2939,7 +2939,7 @@ unsigned long long get_uncore_mhz(int package, int die) { char path[128]; - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/current_freq_khz", package, + sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package, die); return (snapshot_sysfs_counter(path) / 1000); -- GitLab From ff81dade48608363136d52bb2493a6df76458b28 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Wed, 10 Apr 2024 01:35:28 +0800 Subject: [PATCH 637/989] io-uring: correct typo in comment for IOU_F_TWQ_LAZY_WAKE The 'r' key is near to 't' key, that makes 'with' to be 'wirh' ? :) Signed-off-by: Haiyue Wang Link: https://lore.kernel.org/r/20240409173531.846714-1-haiyue.wang@intel.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 05df0e399d7c0..ac333ea81d319 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -13,7 +13,7 @@ enum { * A hint to not wake right away but delay until there are enough of * tw's queued to match the number of CQEs the task is waiting for. * - * Must not be used wirh requests generating more than one CQE. + * Must not be used with requests generating more than one CQE. * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. */ IOU_F_TWQ_LAZY_WAKE = 1, -- GitLab From 68879386180c0efd5a11e800b0525a01068c9457 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 26 Mar 2024 14:39:20 +0900 Subject: [PATCH 638/989] btrfs: zoned: do not flag ZEROOUT on non-dirty extent buffer Btrfs clears the content of an extent buffer marked as EXTENT_BUFFER_ZONED_ZEROOUT before the bio submission. This mechanism is introduced to prevent a write hole of an extent buffer, which is once allocated, marked dirty, but turns out unnecessary and cleaned up within one transaction operation. Currently, btrfs_clear_buffer_dirty() marks the extent buffer as EXTENT_BUFFER_ZONED_ZEROOUT, and skips the entry function. If this call happens while the buffer is under IO (with the WRITEBACK flag set, without the DIRTY flag), we can add the ZEROOUT flag and clear the buffer's content just before a bio submission. As a result: 1) it can lead to adding faulty delayed reference item which leads to a FS corrupted (EUCLEAN) error, and 2) it writes out cleared tree node on disk The former issue is previously discussed in [1]. The corruption happens when it runs a delayed reference update. So, on-disk data is safe. [1] https://lore.kernel.org/linux-btrfs/3f4f2a0ff1a6c818050434288925bdcf3cd719e5.1709124777.git.naohiro.aota@wdc.com/ The latter one can reach on-disk data. But, as that node is already processed by btrfs_clear_buffer_dirty(), that will be invalidated in the next transaction commit anyway. So, the chance of hitting the corruption is relatively small. Anyway, we should skip flagging ZEROOUT on a non-DIRTY extent buffer, to keep the content under IO intact. Fixes: aa6313e6ff2b ("btrfs: zoned: don't clear dirty flag of extent buffer") CC: stable@vger.kernel.org # 6.8 Link: https://lore.kernel.org/linux-btrfs/oadvdekkturysgfgi4qzuemd57zudeasynswurjxw3ocdfsef6@sjyufeugh63f/ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 61594eaf1f896..df3fe36126f99 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4154,7 +4154,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, * The actual zeroout of the buffer will happen later in * btree_csum_one_bio. */ - if (btrfs_is_zoned(fs_info)) { + if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); return; } -- GitLab From 073bda7a541731f41ed08f32d286394236c74005 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 26 Mar 2024 14:39:21 +0900 Subject: [PATCH 639/989] btrfs: zoned: add ASSERT and WARN for EXTENT_BUFFER_ZONED_ZEROOUT handling Add an ASSERT to catch a faulty delayed reference item resulting from prematurely cleared extent buffer. Also, add a WARN to detect if we try to dirty a ZEROOUT buffer again, which is suspicious as its update will be lost. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 ++++++++ fs/btrfs/extent_io.c | 1 + 2 files changed, 9 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index beedd6ed64d39..257d044bca915 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3464,6 +3464,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root_id != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_ref generic_ref = { 0 }; + /* + * Assert that the extent buffer is not cleared due to + * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer + * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for + * detail. + */ + ASSERT(btrfs_header_bytenr(buf) != 0); + btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, buf->start, buf->len, parent, btrfs_header_owner(buf)); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index df3fe36126f99..b18034f2ab804 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4193,6 +4193,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { bool subpage = eb->fs_info->nodesize < PAGE_SIZE; -- GitLab From 1db7959aacd905e6487d0478ac01d89f86eb1e51 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Mar 2024 09:16:46 +1030 Subject: [PATCH 640/989] btrfs: do not wait for short bulk allocation [BUG] There is a recent report that when memory pressure is high (including cached pages), btrfs can spend most of its time on memory allocation in btrfs_alloc_page_array() for compressed read/write. [CAUSE] For btrfs_alloc_page_array() we always go alloc_pages_bulk_array(), and even if the bulk allocation failed (fell back to single page allocation) we still retry but with extra memalloc_retry_wait(). If the bulk alloc only returned one page a time, we would spend a lot of time on the retry wait. The behavior was introduced in commit 395cb57e8560 ("btrfs: wait between incomplete batch memory allocations"). [FIX] Although the commit mentioned that other filesystems do the wait, it's not the case at least nowadays. All the mainlined filesystems only call memalloc_retry_wait() if they failed to allocate any page (not only for bulk allocation). If there is any progress, they won't call memalloc_retry_wait() at all. For example, xfs_buf_alloc_pages() would only call memalloc_retry_wait() if there is no allocation progress at all, and the call is not for metadata readahead. So I don't believe we should call memalloc_retry_wait() unconditionally for short allocation. Call memalloc_retry_wait() if it fails to allocate any page for tree block allocation (which goes with __GFP_NOFAIL and may not need the special handling anyway), and reduce the latency for btrfs_alloc_page_array(). Reported-by: Julian Taylor Tested-by: Julian Taylor Link: https://lore.kernel.org/all/8966c095-cbe7-4d22-9784-a647d1bf27c3@1und1.de/ Fixes: 395cb57e8560 ("btrfs: wait between incomplete batch memory allocations") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Sweet Tea Dorminy Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b18034f2ab804..2776112dbdf8d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -681,31 +681,21 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, gfp_t extra_gfp) { + const gfp_t gfp = GFP_NOFS | extra_gfp; unsigned int allocated; for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(GFP_NOFS | extra_gfp, - nr_pages, page_array); - - if (allocated == nr_pages) - return 0; - - /* - * During this iteration, no page could be allocated, even - * though alloc_pages_bulk_array() falls back to alloc_page() - * if it could not bulk-allocate. So we must be out of memory. - */ - if (allocated == last) { + allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array); + if (unlikely(allocated == last)) { + /* No progress, fail and do cleanup. */ for (int i = 0; i < allocated; i++) { __free_page(page_array[i]); page_array[i] = NULL; } return -ENOMEM; } - - memalloc_retry_wait(GFP_NOFS); } return 0; } -- GitLab From 60b703c71fa80de0c2e14af66e57e234019b7da2 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 2 Apr 2024 12:17:16 +0200 Subject: [PATCH 641/989] zonefs: Use str_plural() to fix Coccinelle warning Fixes the following Coccinelle/coccicheck warning reported by string_choices.cocci: opportunity for str_plural(zgroup->g_nr_zones) Signed-off-by: Thorsten Blum Signed-off-by: Damien Le Moal --- fs/zonefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index c6a124e8d565f..964fa7f240033 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1048,7 +1048,7 @@ static int zonefs_init_zgroup(struct super_block *sb, zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", zonefs_zgroup_name(ztype), zgroup->g_nr_zones, - zgroup->g_nr_zones > 1 ? "s" : ""); + str_plural(zgroup->g_nr_zones)); return 0; } -- GitLab From 9b31152fd74eeb10a20345909e542fef6f1d98e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Apr 2024 18:50:27 -0400 Subject: [PATCH 642/989] bcachefs: btree_node_scan: Respect member.data_allowed If a device wasn't used for btree nodes, no need to scan for them. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_node_scan.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index a7d0593b38714..556f76f5c84e1 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -216,6 +216,9 @@ static int read_btree_nodes(struct find_btree_nodes *f) closure_init_stack(&cl); for_each_online_member(c, ca) { + if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) + continue; + struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); struct task_struct *t; -- GitLab From 6e45a30fe5e7cf5d42ac07262a3d97644f23dc68 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Tue, 9 Apr 2024 23:53:00 +0000 Subject: [PATCH 643/989] fs/9p: remove erroneous nlink init from legacy stat2inode In 9p2000 legacy mode, stat2inode initializes nlink to 1, which is redundant with what alloc_inode should have already set. 9p2000.u overrides this with extensions if present in the stat structure, and 9p2000.L incorporates nlink into its stat structure. At the very least this probably messes with directory nlink accounting in legacy mode. Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index c5b4d3631c47e..47bd77199e20c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1064,8 +1064,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct v9fs_session_info *v9ses = sb->s_fs_info; struct v9fs_inode *v9inode = V9FS_I(inode); - set_nlink(inode, 1); - inode_set_atime(inode, stat->atime, 0); inode_set_mtime(inode, stat->mtime, 0); inode_set_ctime(inode, stat->mtime, 0); -- GitLab From 6309863b31dd80317cd7d6824820b44e254e2a9c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Apr 2024 08:28:43 +0000 Subject: [PATCH 644/989] net: add copy_safe_from_sockptr() helper copy_from_sockptr() helper is unsafe, unless callers did the prior check against user provided optlen. Too many callers get this wrong, lets add a helper to fix them and avoid future copy/paste bugs. Instead of : if (optlen < sizeof(opt)) { err = -EINVAL; break; } if (copy_from_sockptr(&opt, optval, sizeof(opt)) { err = -EFAULT; break; } Use : err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240408082845.3957374-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/sockptr.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index 307961b41541a..317200cd3a603 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -50,11 +50,36 @@ static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, return 0; } +/* Deprecated. + * This is unsafe, unless caller checked user provided optlen. + * Prefer copy_safe_from_sockptr() instead. + */ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) { return copy_from_sockptr_offset(dst, src, 0, size); } +/** + * copy_safe_from_sockptr: copy a struct from sockptr + * @dst: Destination address, in kernel space. This buffer must be @ksize + * bytes long. + * @ksize: Size of @dst struct. + * @optval: Source address. (in user or kernel space) + * @optlen: Size of @optval data. + * + * Returns: + * * -EINVAL: @optlen < @ksize + * * -EFAULT: access to userspace failed. + * * 0 : @ksize bytes were copied + */ +static inline int copy_safe_from_sockptr(void *dst, size_t ksize, + sockptr_t optval, unsigned int optlen) +{ + if (optlen < ksize) + return -EINVAL; + return copy_from_sockptr(dst, optval, ksize); +} + static inline int copy_struct_from_sockptr(void *dst, size_t ksize, sockptr_t src, size_t usize) { -- GitLab From 138b787804f4a10417618e8d1e6e2700539fd88c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Apr 2024 08:28:44 +0000 Subject: [PATCH 645/989] mISDN: fix MISDN_TIME_STAMP handling syzbot reports one unsafe call to copy_from_sockptr() [1] Use copy_safe_from_sockptr() instead. [1] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in data_sock_setsockopt+0x46c/0x4cc drivers/isdn/mISDN/socket.c:417 Read of size 4 at addr ffff0000c6d54083 by task syz-executor406/6167 CPU: 1 PID: 6167 Comm: syz-executor406 Not tainted 6.8.0-rc7-syzkaller-g707081b61156 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call trace: dump_backtrace+0x1b8/0x1e4 arch/arm64/kernel/stacktrace.c:291 show_stack+0x2c/0x3c arch/arm64/kernel/stacktrace.c:298 __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd0/0x124 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:377 [inline] print_report+0x178/0x518 mm/kasan/report.c:488 kasan_report+0xd8/0x138 mm/kasan/report.c:601 __asan_report_load_n_noabort+0x1c/0x28 mm/kasan/report_generic.c:391 copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] copy_from_sockptr include/linux/sockptr.h:55 [inline] data_sock_setsockopt+0x46c/0x4cc drivers/isdn/mISDN/socket.c:417 do_sock_setsockopt+0x2a0/0x4e0 net/socket.c:2311 __sys_setsockopt+0x128/0x1a8 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __arm64_sys_setsockopt+0xb8/0xd4 net/socket.c:2340 __invoke_syscall arch/arm64/kernel/syscall.c:34 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:48 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:133 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:152 el0_svc+0x54/0x168 arch/arm64/kernel/entry-common.c:712 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 Fixes: 1b2b03f8e514 ("Add mISDN core files") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Karsten Keil Link: https://lore.kernel.org/r/20240408082845.3957374-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/isdn/mISDN/socket.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 2776ca5fc33f3..b215b28cad7b7 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -401,23 +401,23 @@ data_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } static int data_sock_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int len) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0, opt = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %d, %x, optval, %d)\n", __func__, sock, - level, optname, len); + level, optname, optlen); lock_sock(sk); switch (optname) { case MISDN_TIME_STAMP: - if (copy_from_sockptr(&opt, optval, sizeof(int))) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), + optval, optlen); + if (err) break; - } if (opt) _pms(sk)->cmask |= MISDN_TIME_STAMP; -- GitLab From 7a87441c9651ba37842f4809224aca13a554a26f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Apr 2024 08:28:45 +0000 Subject: [PATCH 646/989] nfc: llcp: fix nfc_llcp_setsockopt() unsafe copies syzbot reported unsafe calls to copy_from_sockptr() [1] Use copy_safe_from_sockptr() instead. [1] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in nfc_llcp_setsockopt+0x6c2/0x850 net/nfc/llcp_sock.c:255 Read of size 4 at addr ffff88801caa1ec3 by task syz-executor459/5078 CPU: 0 PID: 5078 Comm: syz-executor459 Not tainted 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] copy_from_sockptr include/linux/sockptr.h:55 [inline] nfc_llcp_setsockopt+0x6c2/0x850 net/nfc/llcp_sock.c:255 do_sock_setsockopt+0x3b1/0x720 net/socket.c:2311 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfd/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 RIP: 0033:0x7f7fac07fd89 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 91 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff660eb788 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f7fac07fd89 RDX: 0000000000000000 RSI: 0000000000000118 RDI: 0000000000000004 RBP: 0000000000000000 R08: 0000000000000002 R09: 0000000000000000 R10: 0000000020000a80 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240408082845.3957374-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/nfc/llcp_sock.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 819157bbb5a2c..d5344563e525c 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -252,10 +252,10 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), + optval, optlen); + if (err) break; - } if (opt > LLCP_MAX_RW) { err = -EINVAL; @@ -274,10 +274,10 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), + optval, optlen); + if (err) break; - } if (opt > LLCP_MAX_MIUX) { err = -EINVAL; -- GitLab From 7633c4da919ad51164acbf1aa322cc1a3ead6129 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Mon, 8 Apr 2024 16:18:21 +0200 Subject: [PATCH 647/989] ipv6: fix race condition between ipv6_get_ifaddr and ipv6_del_addr Although ipv6_get_ifaddr walks inet6_addr_lst under the RCU lock, it still means hlist_for_each_entry_rcu can return an item that got removed from the list. The memory itself of such item is not freed thanks to RCU but nothing guarantees the actual content of the memory is sane. In particular, the reference count can be zero. This can happen if ipv6_del_addr is called in parallel. ipv6_del_addr removes the entry from inet6_addr_lst (hlist_del_init_rcu(&ifp->addr_lst)) and drops all references (__in6_ifa_put(ifp) + in6_ifa_put(ifp)). With bad enough timing, this can happen: 1. In ipv6_get_ifaddr, hlist_for_each_entry_rcu returns an entry. 2. Then, the whole ipv6_del_addr is executed for the given entry. The reference count drops to zero and kfree_rcu is scheduled. 3. ipv6_get_ifaddr continues and tries to increments the reference count (in6_ifa_hold). 4. The rcu is unlocked and the entry is freed. 5. The freed entry is returned. Prevent increasing of the reference count in such case. The name in6_ifa_hold_safe is chosen to mimic the existing fib6_info_hold_safe. [ 41.506330] refcount_t: addition on 0; use-after-free. [ 41.506760] WARNING: CPU: 0 PID: 595 at lib/refcount.c:25 refcount_warn_saturate+0xa5/0x130 [ 41.507413] Modules linked in: veth bridge stp llc [ 41.507821] CPU: 0 PID: 595 Comm: python3 Not tainted 6.9.0-rc2.main-00208-g49563be82afa #14 [ 41.508479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) [ 41.509163] RIP: 0010:refcount_warn_saturate+0xa5/0x130 [ 41.509586] Code: ad ff 90 0f 0b 90 90 c3 cc cc cc cc 80 3d c0 30 ad 01 00 75 a0 c6 05 b7 30 ad 01 01 90 48 c7 c7 38 cc 7a 8c e8 cc 18 ad ff 90 <0f> 0b 90 90 c3 cc cc cc cc 80 3d 98 30 ad 01 00 0f 85 75 ff ff ff [ 41.510956] RSP: 0018:ffffbda3c026baf0 EFLAGS: 00010282 [ 41.511368] RAX: 0000000000000000 RBX: ffff9e9c46914800 RCX: 0000000000000000 [ 41.511910] RDX: ffff9e9c7ec29c00 RSI: ffff9e9c7ec1c900 RDI: ffff9e9c7ec1c900 [ 41.512445] RBP: ffff9e9c43660c9c R08: 0000000000009ffb R09: 00000000ffffdfff [ 41.512998] R10: 00000000ffffdfff R11: ffffffff8ca58a40 R12: ffff9e9c4339a000 [ 41.513534] R13: 0000000000000001 R14: ffff9e9c438a0000 R15: ffffbda3c026bb48 [ 41.514086] FS: 00007fbc4cda1740(0000) GS:ffff9e9c7ec00000(0000) knlGS:0000000000000000 [ 41.514726] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 41.515176] CR2: 000056233b337d88 CR3: 000000000376e006 CR4: 0000000000370ef0 [ 41.515713] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 41.516252] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 41.516799] Call Trace: [ 41.517037] [ 41.517249] ? __warn+0x7b/0x120 [ 41.517535] ? refcount_warn_saturate+0xa5/0x130 [ 41.517923] ? report_bug+0x164/0x190 [ 41.518240] ? handle_bug+0x3d/0x70 [ 41.518541] ? exc_invalid_op+0x17/0x70 [ 41.520972] ? asm_exc_invalid_op+0x1a/0x20 [ 41.521325] ? refcount_warn_saturate+0xa5/0x130 [ 41.521708] ipv6_get_ifaddr+0xda/0xe0 [ 41.522035] inet6_rtm_getaddr+0x342/0x3f0 [ 41.522376] ? __pfx_inet6_rtm_getaddr+0x10/0x10 [ 41.522758] rtnetlink_rcv_msg+0x334/0x3d0 [ 41.523102] ? netlink_unicast+0x30f/0x390 [ 41.523445] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 41.523832] netlink_rcv_skb+0x53/0x100 [ 41.524157] netlink_unicast+0x23b/0x390 [ 41.524484] netlink_sendmsg+0x1f2/0x440 [ 41.524826] __sys_sendto+0x1d8/0x1f0 [ 41.525145] __x64_sys_sendto+0x1f/0x30 [ 41.525467] do_syscall_64+0xa5/0x1b0 [ 41.525794] entry_SYSCALL_64_after_hwframe+0x72/0x7a [ 41.526213] RIP: 0033:0x7fbc4cfcea9a [ 41.526528] Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 15 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 7e c3 0f 1f 44 00 00 41 54 48 83 ec 30 44 89 [ 41.527942] RSP: 002b:00007ffcf54012a8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 41.528593] RAX: ffffffffffffffda RBX: 00007ffcf5401368 RCX: 00007fbc4cfcea9a [ 41.529173] RDX: 000000000000002c RSI: 00007fbc4b9d9bd0 RDI: 0000000000000005 [ 41.529786] RBP: 00007fbc4bafb040 R08: 00007ffcf54013e0 R09: 000000000000000c [ 41.530375] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [ 41.530977] R13: ffffffffc4653600 R14: 0000000000000001 R15: 00007fbc4ca85d1b [ 41.531573] Fixes: 5c578aedcb21d ("IPv6: convert addrconf hash list to RCU") Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jiri Benc Link: https://lore.kernel.org/r/8ab821e36073a4a406c50ec83c9e8dc586c539e4.1712585809.git.jbenc@redhat.com Signed-off-by: Jakub Kicinski --- include/net/addrconf.h | 4 ++++ net/ipv6/addrconf.c | 7 ++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 9d06eb945509e..62a407db1bf5f 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -438,6 +438,10 @@ static inline void in6_ifa_hold(struct inet6_ifaddr *ifp) refcount_inc(&ifp->refcnt); } +static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp) +{ + return refcount_inc_not_zero(&ifp->refcnt); +} /* * compute link-local solicited-node multicast address diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 92db9b474f2bd..779aa6ecdd499 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2091,9 +2091,10 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add if (ipv6_addr_equal(&ifp->addr, addr)) { if (!dev || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { - result = ifp; - in6_ifa_hold(ifp); - break; + if (in6_ifa_hold_safe(ifp)) { + result = ifp; + break; + } } } } -- GitLab From 8bdfb4ea95ca738d33ef71376c21eba20130f2eb Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Tue, 26 Mar 2024 15:32:46 -0400 Subject: [PATCH 648/989] drm/amdkfd: Reset GPU on queue preemption failure Currently, with F32 HWS GPU reset is only when unmap queue fails. However, if compute queue doesn't repond to preemption request in time unmap will return without any error. In this case, only preemption error is logged and Reset is not triggered. Call GPU reset in this case also. Reviewed-by: Alex Deucher Signed-off-by: Harish Kasiviswanathan Reviewed-by: Mukul Joshi Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index f4d395e38683d..0b655555e1678 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2001,6 +2001,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n"); while (halt_if_hws_hang) schedule(); + kfd_hws_hang(dqm); return -ETIME; } -- GitLab From 65ff8092e4802f96d87d3d7cde146961f5228265 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Sat, 23 Mar 2024 20:46:53 -0400 Subject: [PATCH 649/989] drm/amdgpu: always force full reset for SOC21 There are cases where soft reset seems to succeed, but does not, so always use mode1/2 for now. Reviewed-by: Harish Kasiviswanathan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/soc21.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 581a3bd11481c..8526282f4da12 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -457,10 +457,8 @@ static bool soc21_need_full_reset(struct amdgpu_device *adev) { switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(11, 0, 0): - return amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC); case IP_VERSION(11, 0, 2): case IP_VERSION(11, 0, 3): - return false; default: return true; } -- GitLab From 4b18a91faf1752f9bd69a4ed3aed2c8f6e5b0528 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Thu, 21 Mar 2024 17:46:36 +0530 Subject: [PATCH 650/989] drm/amdgpu: Refine IB schedule error logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Downgrade to debug information when IBs are skipped. Also, use dev_* to identify the device. Signed-off-by: Lijo Lazar Reviewed-by: Christian König Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 4b3000c21ef2c..e4742b65032d1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -304,12 +304,15 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) dma_fence_set_error(finished, -ECANCELED); if (finished->error < 0) { - DRM_INFO("Skip scheduling IBs!\n"); + dev_dbg(adev->dev, "Skip scheduling IBs in ring(%s)", + ring->name); } else { r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, &fence); if (r) - DRM_ERROR("Error scheduling IBs (%d)\n", r); + dev_err(adev->dev, + "Error scheduling IBs (%d) in ring(%s)", r, + ring->name); } job->job_run_counter++; -- GitLab From 0f1bbcc2bab25d5fb2dfb1ee3e08131437690d3d Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Mon, 25 Mar 2024 13:24:31 +0800 Subject: [PATCH 651/989] drm/amdgpu/umsch: reinitialize write pointer in hw init Otherwise the old one will be used during GPU reset. That's not expected. Signed-off-by: Lang Yu Reviewed-by: Feifei Xu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c b/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c index 84368cf1e1753..bd57896ab85d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c @@ -225,6 +225,8 @@ static int umsch_mm_v4_0_ring_start(struct amdgpu_umsch_mm *umsch) WREG32_SOC15(VCN, 0, regVCN_UMSCH_RB_SIZE, ring->ring_size); + ring->wptr = 0; + data = RREG32_SOC15(VCN, 0, regVCN_RB_ENABLE); data &= ~(VCN_RB_ENABLE__AUDIO_RB_EN_MASK); WREG32_SOC15(VCN, 0, regVCN_RB_ENABLE, data); -- GitLab From 8b2be55f4d6c1099d7f629b0ed7535a5be788c83 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Wed, 14 Feb 2024 17:55:54 +0530 Subject: [PATCH 652/989] drm/amdgpu: Reset dGPU if suspend got aborted For SOC21 ASICs, there is an issue in re-enabling PM features if a suspend got aborted. In such cases, reset the device during resume phase. This is a workaround till a proper solution is finalized. Signed-off-by: Lijo Lazar Reviewed-by: Alex Deucher Reviewed-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/soc21.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 8526282f4da12..abe319b0f0639 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -867,10 +867,35 @@ static int soc21_common_suspend(void *handle) return soc21_common_hw_fini(adev); } +static bool soc21_need_reset_on_resume(struct amdgpu_device *adev) +{ + u32 sol_reg1, sol_reg2; + + /* Will reset for the following suspend abort cases. + * 1) Only reset dGPU side. + * 2) S3 suspend got aborted and TOS is active. + */ + if (!(adev->flags & AMD_IS_APU) && adev->in_s3 && + !adev->suspend_complete) { + sol_reg1 = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81); + msleep(100); + sol_reg2 = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81); + + return (sol_reg1 != sol_reg2); + } + + return false; +} + static int soc21_common_resume(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + if (soc21_need_reset_on_resume(adev)) { + dev_info(adev->dev, "S3 suspend aborted, resetting..."); + soc21_asic_reset(adev); + } + return soc21_common_hw_init(adev); } -- GitLab From d4396924c3d44f34d0643f650e70892e07f3677f Mon Sep 17 00:00:00 2001 From: Li Ma Date: Thu, 28 Mar 2024 10:55:10 +0800 Subject: [PATCH 653/989] drm/amd/display: add DCN 351 version for microcode load There is a new DCN veriosn 3.5.1 need to load Signed-off-by: Li Ma Reviewed-by: Yifan Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 71d2d44681b21..173438fc75375 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -148,6 +148,9 @@ MODULE_FIRMWARE(FIRMWARE_NAVI12_DMCU); #define FIRMWARE_DCN_35_DMUB "amdgpu/dcn_3_5_dmcub.bin" MODULE_FIRMWARE(FIRMWARE_DCN_35_DMUB); +#define FIRMWARE_DCN_351_DMUB "amdgpu/dcn_3_5_1_dmcub.bin" +MODULE_FIRMWARE(FIRMWARE_DCN_351_DMUB); + /* Number of bytes in PSP header for firmware. */ #define PSP_HEADER_BYTES 0x100 @@ -4820,9 +4823,11 @@ static int dm_init_microcode(struct amdgpu_device *adev) fw_name_dmub = FIRMWARE_DCN_V3_2_1_DMCUB; break; case IP_VERSION(3, 5, 0): - case IP_VERSION(3, 5, 1): fw_name_dmub = FIRMWARE_DCN_35_DMUB; break; + case IP_VERSION(3, 5, 1): + fw_name_dmub = FIRMWARE_DCN_351_DMUB; + break; default: /* ASIC doesn't support DMUB. */ return 0; -- GitLab From 31729e8c21ecfd671458e02b6511eb68c2225113 Mon Sep 17 00:00:00 2001 From: Tim Huang Date: Wed, 27 Mar 2024 13:10:37 +0800 Subject: [PATCH 654/989] drm/amd/pm: fixes a random hang in S4 for SMU v13.0.4/11 While doing multiple S4 stress tests, GC/RLC/PMFW get into an invalid state resulting into hard hangs. Adding a GFX reset as workaround just before sending the MP1_UNLOAD message avoids this failure. Signed-off-by: Tim Huang Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c index bb98156b2fa1d..949131bd1ecb2 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c @@ -226,8 +226,18 @@ static int smu_v13_0_4_system_features_control(struct smu_context *smu, bool en) struct amdgpu_device *adev = smu->adev; int ret = 0; - if (!en && !adev->in_s0ix) + if (!en && !adev->in_s0ix) { + /* Adds a GFX reset as workaround just before sending the + * MP1_UNLOAD message to prevent GC/RLC/PMFW from entering + * an invalid state. + */ + ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset, + SMU_RESET_MODE_2, NULL); + if (ret) + return ret; + ret = smu_cmn_send_smc_msg(smu, SMU_MSG_PrepareMp1ForUnload, NULL); + } return ret; } -- GitLab From a3a4c0b12346a2493b41c8790d85141844a04e28 Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Fri, 22 Mar 2024 12:25:16 -0400 Subject: [PATCH 655/989] drm/amdgpu : Add mes_log_enable to control mes log feature The MES log might slow down the performance for extra step of log the data, disable it by default and introduce a parameter can enable it when necessary Signed-off-by: shaoyunl Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 5 ++++- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 7 +++++-- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 9c62552bec344..b3b84647207ed 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -210,6 +210,7 @@ extern int amdgpu_async_gfx_ring; extern int amdgpu_mcbp; extern int amdgpu_discovery; extern int amdgpu_mes; +extern int amdgpu_mes_log_enable; extern int amdgpu_mes_kiq; extern int amdgpu_noretry; extern int amdgpu_force_asic_type; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 80b9642f2bc4f..e4277298cf1aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -195,6 +195,7 @@ int amdgpu_async_gfx_ring = 1; int amdgpu_mcbp = -1; int amdgpu_discovery = -1; int amdgpu_mes; +int amdgpu_mes_log_enable = 0; int amdgpu_mes_kiq; int amdgpu_noretry = -1; int amdgpu_force_asic_type = -1; @@ -667,6 +668,15 @@ MODULE_PARM_DESC(mes, "Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)"); module_param_named(mes, amdgpu_mes, int, 0444); +/** + * DOC: mes_log_enable (int) + * Enable Micro Engine Scheduler log. This is used to enable/disable MES internal log. + * (0 = disabled (default), 1 = enabled) + */ +MODULE_PARM_DESC(mes_log_enable, + "Enable Micro Engine Scheduler log (0 = disabled (default), 1 = enabled)"); +module_param_named(mes_log_enable, amdgpu_mes_log_enable, int, 0444); + /** * DOC: mes_kiq (int) * Enable Micro Engine Scheduler KIQ. This is a new engine pipe for kiq. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index a98e03e0a51f1..cfd613ea39b39 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -102,6 +102,9 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev) { int r; + if (!amdgpu_mes_log_enable) + return 0; + r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, &adev->mes.event_log_gpu_obj, @@ -1565,7 +1568,7 @@ void amdgpu_debugfs_mes_event_log_init(struct amdgpu_device *adev) #if defined(CONFIG_DEBUG_FS) struct drm_minor *minor = adev_to_drm(adev)->primary; struct dentry *root = minor->debugfs_root; - if (adev->enable_mes) + if (adev->enable_mes && amdgpu_mes_log_enable) debugfs_create_file("amdgpu_mes_event_log", 0444, root, adev, &amdgpu_debugfs_mes_event_log_fops); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 072c478665ade..63f281a9984d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -411,8 +411,11 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes) mes_set_hw_res_pkt.enable_reg_active_poll = 1; mes_set_hw_res_pkt.enable_level_process_quantum_check = 1; mes_set_hw_res_pkt.oversubscription_timer = 50; - mes_set_hw_res_pkt.enable_mes_event_int_logging = 1; - mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr; + if (amdgpu_mes_log_enable) { + mes_set_hw_res_pkt.enable_mes_event_int_logging = 1; + mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = + mes->event_log_gpu_addr; + } return mes_v11_0_submit_pkt_and_poll_completion(mes, &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt), -- GitLab From 5b0cd091d905ee9da0a3ecdf06b9cbdd17ba711d Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Fri, 22 Mar 2024 12:44:55 -0400 Subject: [PATCH 656/989] drm/amdgpu : Increase the mes log buffer size as per new MES FW version From MES version 0x54, the log entry increased and require the log buffer size to be increased. The 16k is maximum size agreed Signed-off-by: shaoyunl Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 5 ++--- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index cfd613ea39b39..a00cf4756ad0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -105,7 +105,7 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev) if (!amdgpu_mes_log_enable) return 0; - r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE, + r = amdgpu_bo_create_kernel(adev, AMDGPU_MES_LOG_BUFFER_SIZE, PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, &adev->mes.event_log_gpu_obj, &adev->mes.event_log_gpu_addr, @@ -1552,12 +1552,11 @@ static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused) uint32_t *mem = (uint32_t *)(adev->mes.event_log_cpu_addr); seq_hex_dump(m, "", DUMP_PREFIX_OFFSET, 32, 4, - mem, PAGE_SIZE, false); + mem, AMDGPU_MES_LOG_BUFFER_SIZE, false); return 0; } - DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_mes_event_log); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 7d4f93fea937a..4c8fc3117ef89 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -52,6 +52,7 @@ enum amdgpu_mes_priority_level { #define AMDGPU_MES_PROC_CTX_SIZE 0x1000 /* one page area */ #define AMDGPU_MES_GANG_CTX_SIZE 0x1000 /* one page area */ +#define AMDGPU_MES_LOG_BUFFER_SIZE 0x4000 /* Maximu log buffer size for MES */ struct amdgpu_mes_funcs; -- GitLab From c5b1ccff26950d50bf2043cb2af9bafb1f08bbaf Mon Sep 17 00:00:00 2001 From: lima1002 Date: Mon, 29 Jan 2024 20:17:54 +0800 Subject: [PATCH 657/989] drm/amd/swsmu: Update smu v14.0.0 headers to be 14.0.1 compatible update ppsmc.h pmfw.h and driver_if.h for smu v14_0_1 Reviewed-by: Alex Deucher Signed-off-by: lima1002 Signed-off-by: Alex Deucher --- .../inc/pmfw_if/smu14_driver_if_v14_0_0.h | 33 +- .../pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h | 55 ++- .../pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h | 18 +- drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h | 1 + .../gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c | 2 +- .../drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c | 347 ++++++++++++++++-- 6 files changed, 413 insertions(+), 43 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0_0.h index 5bb7a63c0602b..97522c0852589 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0_0.h @@ -144,6 +144,37 @@ typedef struct { uint32_t MaxGfxClk; } DpmClocks_t; +//Freq in MHz +//Voltage in milli volts with 2 fractional bits +typedef struct { + uint32_t DcfClocks[NUM_DCFCLK_DPM_LEVELS]; + uint32_t DispClocks[NUM_DISPCLK_DPM_LEVELS]; + uint32_t DppClocks[NUM_DPPCLK_DPM_LEVELS]; + uint32_t SocClocks[NUM_SOCCLK_DPM_LEVELS]; + uint32_t VClocks0[NUM_VCN_DPM_LEVELS]; + uint32_t VClocks1[NUM_VCN_DPM_LEVELS]; + uint32_t DClocks0[NUM_VCN_DPM_LEVELS]; + uint32_t DClocks1[NUM_VCN_DPM_LEVELS]; + uint32_t VPEClocks[NUM_VPE_DPM_LEVELS]; + uint32_t FclkClocks_Freq[NUM_FCLK_DPM_LEVELS]; + uint32_t FclkClocks_Voltage[NUM_FCLK_DPM_LEVELS]; + uint32_t SocVoltage[NUM_SOC_VOLTAGE_LEVELS]; + MemPstateTable_t MemPstateTable[NUM_MEM_PSTATE_LEVELS]; + + uint8_t NumDcfClkLevelsEnabled; + uint8_t NumDispClkLevelsEnabled; //Applies to both Dispclk and Dppclk + uint8_t NumSocClkLevelsEnabled; + uint8_t Vcn0ClkLevelsEnabled; //Applies to both Vclk0 and Dclk0 + uint8_t Vcn1ClkLevelsEnabled; //Applies to both Vclk1 and Dclk1 + uint8_t VpeClkLevelsEnabled; + uint8_t NumMemPstatesEnabled; + uint8_t NumFclkLevelsEnabled; + uint8_t spare; + + uint32_t MinGfxClk; + uint32_t MaxGfxClk; +} DpmClocks_t_v14_0_1; + typedef struct { uint16_t CoreFrequency[16]; //Target core frequency [MHz] uint16_t CorePower[16]; //CAC calculated core power [mW] @@ -224,7 +255,7 @@ typedef enum { #define TABLE_CUSTOM_DPM 2 // Called by Driver #define TABLE_BIOS_GPIO_CONFIG 3 // Called by BIOS #define TABLE_DPMCLOCKS 4 // Called by Driver and VBIOS -#define TABLE_SPARE0 5 // Unused +#define TABLE_MOMENTARY_PM 5 // Called by Tools #define TABLE_MODERN_STDBY 6 // Called by Tools for Modern Standby Log #define TABLE_SMU_METRICS 7 // Called by Driver and SMF/PMF #define TABLE_COUNT 8 diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h index 356e0f57a426f..ddb6258600831 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h @@ -42,7 +42,7 @@ #define FEATURE_EDC_BIT 7 #define FEATURE_PLL_POWER_DOWN_BIT 8 #define FEATURE_VDDOFF_BIT 9 -#define FEATURE_VCN_DPM_BIT 10 +#define FEATURE_VCN_DPM_BIT 10 /* this is for both VCN0 and VCN1 */ #define FEATURE_DS_MPM_BIT 11 #define FEATURE_FCLK_DPM_BIT 12 #define FEATURE_SOCCLK_DPM_BIT 13 @@ -56,9 +56,9 @@ #define FEATURE_DS_GFXCLK_BIT 21 #define FEATURE_DS_SOCCLK_BIT 22 #define FEATURE_DS_LCLK_BIT 23 -#define FEATURE_LOW_POWER_DCNCLKS_BIT 24 // for all DISP clks +#define FEATURE_LOW_POWER_DCNCLKS_BIT 24 #define FEATURE_DS_SHUBCLK_BIT 25 -#define FEATURE_SPARE0_BIT 26 //SPARE +#define FEATURE_RESERVED0_BIT 26 #define FEATURE_ZSTATES_BIT 27 #define FEATURE_IOMMUL2_PG_BIT 28 #define FEATURE_DS_FCLK_BIT 29 @@ -66,8 +66,8 @@ #define FEATURE_DS_MP1CLK_BIT 31 #define FEATURE_WHISPER_MODE_BIT 32 #define FEATURE_SMU_LOW_POWER_BIT 33 -#define FEATURE_SMART_L3_RINSER_BIT 34 -#define FEATURE_SPARE1_BIT 35 //SPARE +#define FEATURE_RESERVED1_BIT 34 /* v14_0_0 SMART_L3_RINSER; v14_0_1 RESERVED1 */ +#define FEATURE_GFX_DEM_BIT 35 /* v14_0_0 SPARE; v14_0_1 GFX_DEM */ #define FEATURE_PSI_BIT 36 #define FEATURE_PROCHOT_BIT 37 #define FEATURE_CPUOFF_BIT 38 @@ -77,11 +77,11 @@ #define FEATURE_PERF_LIMIT_BIT 42 #define FEATURE_CORE_DLDO_BIT 43 #define FEATURE_DVO_BIT 44 -#define FEATURE_DS_VCN_BIT 45 +#define FEATURE_DS_VCN_BIT 45 /* v14_0_1 this is for both VCN0 and VCN1 */ #define FEATURE_CPPC_BIT 46 #define FEATURE_CPPC_PREFERRED_CORES 47 #define FEATURE_DF_CSTATES_BIT 48 -#define FEATURE_SPARE2_BIT 49 //SPARE +#define FEATURE_FAST_PSTATE_CLDO_BIT 49 /* v14_0_0 SPARE */ #define FEATURE_ATHUB_PG_BIT 50 #define FEATURE_VDDOFF_ECO_BIT 51 #define FEATURE_ZSTATES_ECO_BIT 52 @@ -93,8 +93,8 @@ #define FEATURE_DS_IPUCLK_BIT 58 #define FEATURE_DS_VPECLK_BIT 59 #define FEATURE_VPE_DPM_BIT 60 -#define FEATURE_SPARE_61 61 -#define FEATURE_FP_DIDT 62 +#define FEATURE_SMART_L3_RINSER_BIT 61 /* v14_0_0 SPARE*/ +#define FEATURE_PCC_BIT 62 /* v14_0_0 FP_DIDT v14_0_1 PCC_BIT */ #define NUM_FEATURES 63 // Firmware Header/Footer @@ -151,6 +151,43 @@ typedef struct { // MP1_EXT_SCRATCH7 = RTOS Current Job } FwStatus_t; +typedef struct { + // MP1_EXT_SCRATCH0 + uint32_t DpmHandlerID : 8; + uint32_t ActivityMonitorID : 8; + uint32_t DpmTimerID : 8; + uint32_t DpmHubID : 4; + uint32_t DpmHubTask : 4; + // MP1_EXT_SCRATCH1 + uint32_t CclkSyncStatus : 8; + uint32_t ZstateStatus : 4; + uint32_t Cpu1VddOff : 4; + uint32_t DstateFun : 4; + uint32_t DstateDev : 4; + uint32_t GfxOffStatus : 2; + uint32_t Cpu0Off : 2; + uint32_t Cpu1Off : 2; + uint32_t Cpu0VddOff : 2; + // MP1_EXT_SCRATCH2 + uint32_t P2JobHandler :32; + // MP1_EXT_SCRATCH3 + uint32_t PostCode :32; + // MP1_EXT_SCRATCH4 + uint32_t MsgPortBusy :15; + uint32_t RsmuPmiP1Pending : 1; + uint32_t RsmuPmiP2PendingCnt : 8; + uint32_t DfCstateExitPending : 1; + uint32_t Pc6EntryPending : 1; + uint32_t Pc6ExitPending : 1; + uint32_t WarmResetPending : 1; + uint32_t Mp0ClkPending : 1; + uint32_t InWhisperMode : 1; + uint32_t spare2 : 2; + // MP1_EXT_SCRATCH5 + uint32_t IdleMask :32; + // MP1_EXT_SCRATCH6 = RTOS threads' status + // MP1_EXT_SCRATCH7 = RTOS Current Job +} FwStatus_t_v14_0_1; #pragma pack(pop) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h index ca7ce4251482d..c4dc5881d8df0 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h @@ -72,23 +72,19 @@ #define PPSMC_MSG_SetHardMinSocclkByFreq 0x13 ///< Set hard min for SOC CLK #define PPSMC_MSG_SetSoftMinFclk 0x14 ///< Set hard min for FCLK #define PPSMC_MSG_SetSoftMinVcn0 0x15 ///< Set soft min for VCN0 clocks (VCLK0 and DCLK0) - #define PPSMC_MSG_EnableGfxImu 0x16 ///< Enable GFX IMU - -#define PPSMC_MSG_spare_0x17 0x17 -#define PPSMC_MSG_spare_0x18 0x18 +#define PPSMC_MSG_spare_0x17 0x17 ///< Get GFX clock frequency +#define PPSMC_MSG_spare_0x18 0x18 ///< Get FCLK frequency #define PPSMC_MSG_AllowGfxOff 0x19 ///< Inform PMFW of allowing GFXOFF entry #define PPSMC_MSG_DisallowGfxOff 0x1A ///< Inform PMFW of disallowing GFXOFF entry #define PPSMC_MSG_SetSoftMaxGfxClk 0x1B ///< Set soft max for GFX CLK #define PPSMC_MSG_SetHardMinGfxClk 0x1C ///< Set hard min for GFX CLK - #define PPSMC_MSG_SetSoftMaxSocclkByFreq 0x1D ///< Set soft max for SOC CLK #define PPSMC_MSG_SetSoftMaxFclkByFreq 0x1E ///< Set soft max for FCLK #define PPSMC_MSG_SetSoftMaxVcn0 0x1F ///< Set soft max for VCN0 clocks (VCLK0 and DCLK0) -#define PPSMC_MSG_spare_0x20 0x20 +#define PPSMC_MSG_spare_0x20 0x20 ///< Set power limit percentage #define PPSMC_MSG_PowerDownJpeg0 0x21 ///< Power down Jpeg of VCN0 #define PPSMC_MSG_PowerUpJpeg0 0x22 ///< Power up Jpeg of VCN0; VCN0 is power gated by default - #define PPSMC_MSG_SetHardMinFclkByFreq 0x23 ///< Set hard min for FCLK #define PPSMC_MSG_SetSoftMinSocclkByFreq 0x24 ///< Set soft min for SOC CLK #define PPSMC_MSG_AllowZstates 0x25 ///< Inform PMFM of allowing Zstate entry, i.e. no Miracast activity @@ -99,8 +95,8 @@ #define PPSMC_MSG_PowerUpIspByTile 0x2A ///< This message is used to power up ISP tiles and enable the ISP DPM #define PPSMC_MSG_SetHardMinIspiclkByFreq 0x2B ///< Set HardMin by frequency for ISPICLK #define PPSMC_MSG_SetHardMinIspxclkByFreq 0x2C ///< Set HardMin by frequency for ISPXCLK -#define PPSMC_MSG_PowerDownUmsch 0x2D ///< Power down VCN.UMSCH (aka VSCH) scheduler -#define PPSMC_MSG_PowerUpUmsch 0x2E ///< Power up VCN.UMSCH (aka VSCH) scheduler +#define PPSMC_MSG_PowerDownUmsch 0x2D ///< Power down VCN0.UMSCH (aka VSCH) scheduler +#define PPSMC_MSG_PowerUpUmsch 0x2E ///< Power up VCN0.UMSCH (aka VSCH) scheduler #define PPSMC_Message_IspStutterOn_MmhubPgDis 0x2F ///< ISP StutterOn mmHub PgDis #define PPSMC_Message_IspStutterOff_MmhubPgEn 0x30 ///< ISP StufferOff mmHub PgEn #define PPSMC_MSG_PowerUpVpe 0x31 ///< Power up VPE @@ -110,7 +106,9 @@ #define PPSMC_MSG_DisableLSdma 0x35 ///< Disable LSDMA #define PPSMC_MSG_SetSoftMaxVpe 0x36 ///< #define PPSMC_MSG_SetSoftMinVpe 0x37 ///< -#define PPSMC_Message_Count 0x38 ///< Total number of PPSMC messages +#define PPSMC_MSG_AllocMALLCache 0x38 ///< Allocating MALL Cache +#define PPSMC_MSG_ReleaseMALLCache 0x39 ///< Releasing MALL Cache +#define PPSMC_Message_Count 0x3A ///< Total number of PPSMC messages /** @}*/ /** diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h index 3f7463c1c1a91..4af1985ae4466 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h @@ -27,6 +27,7 @@ #define SMU14_DRIVER_IF_VERSION_INV 0xFFFFFFFF #define SMU14_DRIVER_IF_VERSION_SMU_V14_0_0 0x7 +#define SMU14_DRIVER_IF_VERSION_SMU_V14_0_1 0x6 #define SMU14_DRIVER_IF_VERSION_SMU_V14_0_2 0x1 #define FEATURE_MASK(feature) (1ULL << feature) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c index 9e39f99154f94..07a65e005785d 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c @@ -234,7 +234,7 @@ int smu_v14_0_check_fw_version(struct smu_context *smu) smu->smc_driver_if_version = SMU14_DRIVER_IF_VERSION_SMU_V14_0_0; break; case IP_VERSION(14, 0, 1): - smu->smc_driver_if_version = SMU14_DRIVER_IF_VERSION_SMU_V14_0_0; + smu->smc_driver_if_version = SMU14_DRIVER_IF_VERSION_SMU_V14_0_1; break; default: diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c index d6de6d97286c6..63399c00cc28f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c @@ -161,7 +161,7 @@ static int smu_v14_0_0_init_smc_tables(struct smu_context *smu) SMU_TABLE_INIT(tables, SMU_TABLE_WATERMARKS, sizeof(Watermarks_t), PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); - SMU_TABLE_INIT(tables, SMU_TABLE_DPMCLOCKS, sizeof(DpmClocks_t), + SMU_TABLE_INIT(tables, SMU_TABLE_DPMCLOCKS, max(sizeof(DpmClocks_t), sizeof(DpmClocks_t_v14_0_1)), PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); SMU_TABLE_INIT(tables, SMU_TABLE_SMU_METRICS, sizeof(SmuMetrics_t), PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); @@ -171,7 +171,7 @@ static int smu_v14_0_0_init_smc_tables(struct smu_context *smu) goto err0_out; smu_table->metrics_time = 0; - smu_table->clocks_table = kzalloc(sizeof(DpmClocks_t), GFP_KERNEL); + smu_table->clocks_table = kzalloc(max(sizeof(DpmClocks_t), sizeof(DpmClocks_t_v14_0_1)), GFP_KERNEL); if (!smu_table->clocks_table) goto err1_out; @@ -593,6 +593,60 @@ static int smu_v14_0_0_mode2_reset(struct smu_context *smu) return ret; } +static int smu_v14_0_1_get_dpm_freq_by_index(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t dpm_level, + uint32_t *freq) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + + if (!clk_table || clk_type >= SMU_CLK_COUNT) + return -EINVAL; + + switch (clk_type) { + case SMU_SOCCLK: + if (dpm_level >= clk_table->NumSocClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->SocClocks[dpm_level]; + break; + case SMU_VCLK: + if (dpm_level >= clk_table->Vcn0ClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->VClocks0[dpm_level]; + break; + case SMU_DCLK: + if (dpm_level >= clk_table->Vcn0ClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->DClocks0[dpm_level]; + break; + case SMU_VCLK1: + if (dpm_level >= clk_table->Vcn1ClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->VClocks1[dpm_level]; + break; + case SMU_DCLK1: + if (dpm_level >= clk_table->Vcn1ClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->DClocks1[dpm_level]; + break; + case SMU_UCLK: + case SMU_MCLK: + if (dpm_level >= clk_table->NumMemPstatesEnabled) + return -EINVAL; + *freq = clk_table->MemPstateTable[dpm_level].MemClk; + break; + case SMU_FCLK: + if (dpm_level >= clk_table->NumFclkLevelsEnabled) + return -EINVAL; + *freq = clk_table->FclkClocks_Freq[dpm_level]; + break; + default: + return -EINVAL; + } + + return 0; +} + static int smu_v14_0_0_get_dpm_freq_by_index(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t dpm_level, @@ -637,6 +691,19 @@ static int smu_v14_0_0_get_dpm_freq_by_index(struct smu_context *smu, return 0; } +static int smu_v14_0_common_get_dpm_freq_by_index(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t dpm_level, + uint32_t *freq) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, dpm_level, freq); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_v14_0_1_get_dpm_freq_by_index(smu, clk_type, dpm_level, freq); + + return 0; +} + static bool smu_v14_0_0_clk_dpm_is_enabled(struct smu_context *smu, enum smu_clk_type clk_type) { @@ -657,6 +724,8 @@ static bool smu_v14_0_0_clk_dpm_is_enabled(struct smu_context *smu, break; case SMU_VCLK: case SMU_DCLK: + case SMU_VCLK1: + case SMU_DCLK1: feature_id = SMU_FEATURE_VCN_DPM_BIT; break; default: @@ -666,6 +735,126 @@ static bool smu_v14_0_0_clk_dpm_is_enabled(struct smu_context *smu, return smu_cmn_feature_is_enabled(smu, feature_id); } +static int smu_v14_0_1_get_dpm_ultimate_freq(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t *min, + uint32_t *max) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + uint32_t clock_limit; + uint32_t max_dpm_level, min_dpm_level; + int ret = 0; + + if (!smu_v14_0_0_clk_dpm_is_enabled(smu, clk_type)) { + switch (clk_type) { + case SMU_MCLK: + case SMU_UCLK: + clock_limit = smu->smu_table.boot_values.uclk; + break; + case SMU_FCLK: + clock_limit = smu->smu_table.boot_values.fclk; + break; + case SMU_GFXCLK: + case SMU_SCLK: + clock_limit = smu->smu_table.boot_values.gfxclk; + break; + case SMU_SOCCLK: + clock_limit = smu->smu_table.boot_values.socclk; + break; + case SMU_VCLK: + case SMU_VCLK1: + clock_limit = smu->smu_table.boot_values.vclk; + break; + case SMU_DCLK: + case SMU_DCLK1: + clock_limit = smu->smu_table.boot_values.dclk; + break; + default: + clock_limit = 0; + break; + } + + /* clock in Mhz unit */ + if (min) + *min = clock_limit / 100; + if (max) + *max = clock_limit / 100; + + return 0; + } + + if (max) { + switch (clk_type) { + case SMU_GFXCLK: + case SMU_SCLK: + *max = clk_table->MaxGfxClk; + break; + case SMU_MCLK: + case SMU_UCLK: + case SMU_FCLK: + max_dpm_level = 0; + break; + case SMU_SOCCLK: + max_dpm_level = clk_table->NumSocClkLevelsEnabled - 1; + break; + case SMU_VCLK: + case SMU_DCLK: + max_dpm_level = clk_table->Vcn0ClkLevelsEnabled - 1; + break; + case SMU_VCLK1: + case SMU_DCLK1: + max_dpm_level = clk_table->Vcn1ClkLevelsEnabled - 1; + break; + default: + ret = -EINVAL; + goto failed; + } + + if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK) { + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, max_dpm_level, max); + if (ret) + goto failed; + } + } + + if (min) { + switch (clk_type) { + case SMU_GFXCLK: + case SMU_SCLK: + *min = clk_table->MinGfxClk; + break; + case SMU_MCLK: + case SMU_UCLK: + min_dpm_level = clk_table->NumMemPstatesEnabled - 1; + break; + case SMU_FCLK: + min_dpm_level = clk_table->NumFclkLevelsEnabled - 1; + break; + case SMU_SOCCLK: + min_dpm_level = 0; + break; + case SMU_VCLK: + case SMU_DCLK: + case SMU_VCLK1: + case SMU_DCLK1: + min_dpm_level = 0; + break; + default: + ret = -EINVAL; + goto failed; + } + + if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK) { + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, min_dpm_level, min); + if (ret) + goto failed; + } + } + +failed: + return ret; +} + static int smu_v14_0_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t *min, @@ -736,7 +925,7 @@ static int smu_v14_0_0_get_dpm_ultimate_freq(struct smu_context *smu, } if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK) { - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, max_dpm_level, max); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, max_dpm_level, max); if (ret) goto failed; } @@ -768,7 +957,7 @@ static int smu_v14_0_0_get_dpm_ultimate_freq(struct smu_context *smu, } if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK) { - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, min_dpm_level, min); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, min_dpm_level, min); if (ret) goto failed; } @@ -778,6 +967,19 @@ failed: return ret; } +static int smu_v14_0_common_get_dpm_ultimate_freq(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t *min, + uint32_t *max) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_v14_0_0_get_dpm_ultimate_freq(smu, clk_type, min, max); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_v14_0_1_get_dpm_ultimate_freq(smu, clk_type, min, max); + + return 0; +} + static int smu_v14_0_0_get_current_clk_freq(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t *value) @@ -811,6 +1013,37 @@ static int smu_v14_0_0_get_current_clk_freq(struct smu_context *smu, return smu_v14_0_0_get_smu_metrics_data(smu, member_type, value); } +static int smu_v14_0_1_get_dpm_level_count(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t *count) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + + switch (clk_type) { + case SMU_SOCCLK: + *count = clk_table->NumSocClkLevelsEnabled; + break; + case SMU_VCLK: + case SMU_DCLK: + *count = clk_table->Vcn0ClkLevelsEnabled; + break; + case SMU_VCLK1: + case SMU_DCLK1: + *count = clk_table->Vcn1ClkLevelsEnabled; + break; + case SMU_MCLK: + *count = clk_table->NumMemPstatesEnabled; + break; + case SMU_FCLK: + *count = clk_table->NumFclkLevelsEnabled; + break; + default: + break; + } + + return 0; +} + static int smu_v14_0_0_get_dpm_level_count(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t *count) @@ -840,6 +1073,18 @@ static int smu_v14_0_0_get_dpm_level_count(struct smu_context *smu, return 0; } +static int smu_v14_0_common_get_dpm_level_count(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t *count) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_v14_0_0_get_dpm_level_count(smu, clk_type, count); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_v14_0_1_get_dpm_level_count(smu, clk_type, count); + + return 0; +} + static int smu_v14_0_0_print_clk_levels(struct smu_context *smu, enum smu_clk_type clk_type, char *buf) { @@ -866,18 +1111,20 @@ static int smu_v14_0_0_print_clk_levels(struct smu_context *smu, case SMU_SOCCLK: case SMU_VCLK: case SMU_DCLK: + case SMU_VCLK1: + case SMU_DCLK1: case SMU_MCLK: case SMU_FCLK: ret = smu_v14_0_0_get_current_clk_freq(smu, clk_type, &cur_value); if (ret) break; - ret = smu_v14_0_0_get_dpm_level_count(smu, clk_type, &count); + ret = smu_v14_0_common_get_dpm_level_count(smu, clk_type, &count); if (ret) break; for (i = 0; i < count; i++) { - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, i, &value); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, i, &value); if (ret) break; @@ -940,8 +1187,13 @@ static int smu_v14_0_0_set_soft_freq_limited_range(struct smu_context *smu, break; case SMU_VCLK: case SMU_DCLK: - msg_set_min = SMU_MSG_SetHardMinVcn; - msg_set_max = SMU_MSG_SetSoftMaxVcn; + msg_set_min = SMU_MSG_SetHardMinVcn0; + msg_set_max = SMU_MSG_SetSoftMaxVcn0; + break; + case SMU_VCLK1: + case SMU_DCLK1: + msg_set_min = SMU_MSG_SetHardMinVcn1; + msg_set_max = SMU_MSG_SetSoftMaxVcn1; break; default: return -EINVAL; @@ -971,11 +1223,11 @@ static int smu_v14_0_0_force_clk_levels(struct smu_context *smu, case SMU_FCLK: case SMU_VCLK: case SMU_DCLK: - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, soft_min_level, &min_freq); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, soft_min_level, &min_freq); if (ret) break; - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, soft_max_level, &max_freq); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, soft_max_level, &max_freq); if (ret) break; @@ -1000,25 +1252,25 @@ static int smu_v14_0_0_set_performance_level(struct smu_context *smu, switch (level) { case AMD_DPM_FORCED_LEVEL_HIGH: - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SCLK, NULL, &sclk_max); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_FCLK, NULL, &fclk_max); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SOCCLK, NULL, &socclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SCLK, NULL, &sclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_FCLK, NULL, &fclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SOCCLK, NULL, &socclk_max); sclk_min = sclk_max; fclk_min = fclk_max; socclk_min = socclk_max; break; case AMD_DPM_FORCED_LEVEL_LOW: - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SCLK, &sclk_min, NULL); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_FCLK, &fclk_min, NULL); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SOCCLK, &socclk_min, NULL); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SCLK, &sclk_min, NULL); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_FCLK, &fclk_min, NULL); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SOCCLK, &socclk_min, NULL); sclk_max = sclk_min; fclk_max = fclk_min; socclk_max = socclk_min; break; case AMD_DPM_FORCED_LEVEL_AUTO: - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SCLK, &sclk_min, &sclk_max); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_FCLK, &fclk_min, &fclk_max); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SOCCLK, &socclk_min, &socclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SCLK, &sclk_min, &sclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_FCLK, &fclk_min, &fclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SOCCLK, &socclk_min, &socclk_max); break; case AMD_DPM_FORCED_LEVEL_PROFILE_STANDARD: case AMD_DPM_FORCED_LEVEL_PROFILE_MIN_SCLK: @@ -1067,6 +1319,18 @@ static int smu_v14_0_0_set_performance_level(struct smu_context *smu, return ret; } +static int smu_v14_0_1_set_fine_grain_gfx_freq_parameters(struct smu_context *smu) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + + smu->gfx_default_hard_min_freq = clk_table->MinGfxClk; + smu->gfx_default_soft_max_freq = clk_table->MaxGfxClk; + smu->gfx_actual_hard_min_freq = 0; + smu->gfx_actual_soft_max_freq = 0; + + return 0; +} + static int smu_v14_0_0_set_fine_grain_gfx_freq_parameters(struct smu_context *smu) { DpmClocks_t *clk_table = smu->smu_table.clocks_table; @@ -1079,6 +1343,16 @@ static int smu_v14_0_0_set_fine_grain_gfx_freq_parameters(struct smu_context *sm return 0; } +static int smu_v14_0_common_set_fine_grain_gfx_freq_parameters(struct smu_context *smu) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_v14_0_0_set_fine_grain_gfx_freq_parameters(smu); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_v14_0_1_set_fine_grain_gfx_freq_parameters(smu); + + return 0; +} + static int smu_v14_0_0_set_vpe_enable(struct smu_context *smu, bool enable) { @@ -1095,6 +1369,25 @@ static int smu_v14_0_0_set_umsch_mm_enable(struct smu_context *smu, 0, NULL); } +static int smu_14_0_1_get_dpm_table(struct smu_context *smu, struct dpm_clocks *clock_table) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + uint8_t idx; + + /* Only the Clock information of SOC and VPE is copied to provide VPE DPM settings for use. */ + for (idx = 0; idx < NUM_SOCCLK_DPM_LEVELS; idx++) { + clock_table->SocClocks[idx].Freq = (idx < clk_table->NumSocClkLevelsEnabled) ? clk_table->SocClocks[idx]:0; + clock_table->SocClocks[idx].Vol = 0; + } + + for (idx = 0; idx < NUM_VPE_DPM_LEVELS; idx++) { + clock_table->VPEClocks[idx].Freq = (idx < clk_table->VpeClkLevelsEnabled) ? clk_table->VPEClocks[idx]:0; + clock_table->VPEClocks[idx].Vol = 0; + } + + return 0; +} + static int smu_14_0_0_get_dpm_table(struct smu_context *smu, struct dpm_clocks *clock_table) { DpmClocks_t *clk_table = smu->smu_table.clocks_table; @@ -1114,6 +1407,16 @@ static int smu_14_0_0_get_dpm_table(struct smu_context *smu, struct dpm_clocks * return 0; } +static int smu_v14_0_common_get_dpm_table(struct smu_context *smu, struct dpm_clocks *clock_table) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_14_0_0_get_dpm_table(smu, clock_table); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_14_0_1_get_dpm_table(smu, clock_table); + + return 0; +} + static const struct pptable_funcs smu_v14_0_0_ppt_funcs = { .check_fw_status = smu_v14_0_check_fw_status, .check_fw_version = smu_v14_0_check_fw_version, @@ -1135,16 +1438,16 @@ static const struct pptable_funcs smu_v14_0_0_ppt_funcs = { .set_driver_table_location = smu_v14_0_set_driver_table_location, .gfx_off_control = smu_v14_0_gfx_off_control, .mode2_reset = smu_v14_0_0_mode2_reset, - .get_dpm_ultimate_freq = smu_v14_0_0_get_dpm_ultimate_freq, + .get_dpm_ultimate_freq = smu_v14_0_common_get_dpm_ultimate_freq, .od_edit_dpm_table = smu_v14_0_od_edit_dpm_table, .print_clk_levels = smu_v14_0_0_print_clk_levels, .force_clk_levels = smu_v14_0_0_force_clk_levels, .set_performance_level = smu_v14_0_0_set_performance_level, - .set_fine_grain_gfx_freq_parameters = smu_v14_0_0_set_fine_grain_gfx_freq_parameters, + .set_fine_grain_gfx_freq_parameters = smu_v14_0_common_set_fine_grain_gfx_freq_parameters, .set_gfx_power_up_by_imu = smu_v14_0_set_gfx_power_up_by_imu, .dpm_set_vpe_enable = smu_v14_0_0_set_vpe_enable, .dpm_set_umsch_mm_enable = smu_v14_0_0_set_umsch_mm_enable, - .get_dpm_clock_table = smu_14_0_0_get_dpm_table, + .get_dpm_clock_table = smu_v14_0_common_get_dpm_table, }; static void smu_v14_0_0_set_smu_mailbox_registers(struct smu_context *smu) -- GitLab From 533eefb9be76c3b23d220ee18edfda8eb56cefff Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Tue, 12 Dec 2023 17:17:05 +0800 Subject: [PATCH 658/989] drm/amdgpu: add smu 14.0.1 discovery support This patch to add smu 14.0.1 support Reviewed-by: Alex Deucher Signed-off-by: Yifan Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index fdd36fb027ab6..ac5bf01fe8d2a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -1896,6 +1896,7 @@ static int amdgpu_discovery_set_smu_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &smu_v13_0_ip_block); break; case IP_VERSION(14, 0, 0): + case IP_VERSION(14, 0, 1): amdgpu_device_ip_block_add(adev, &smu_v14_0_ip_block); break; default: -- GitLab From f886b49feaae30acd599e37d4284836024b0f3ed Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 28 Mar 2024 18:22:10 +0800 Subject: [PATCH 659/989] drm/amdgpu: implement IRQ_STATE_ENABLE for SDMA v4.4.2 SDMA_CNTL is not set in some cases, driver configures it by itself. v2: simplify code Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 34237a1b1f2e4..82eab49be82bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1602,19 +1602,9 @@ static int sdma_v4_4_2_set_ecc_irq_state(struct amdgpu_device *adev, u32 sdma_cntl; sdma_cntl = RREG32_SDMA(type, regSDMA_CNTL); - switch (state) { - case AMDGPU_IRQ_STATE_DISABLE: - sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, - DRAM_ECC_INT_ENABLE, 0); - WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl); - break; - /* sdma ecc interrupt is enabled by default - * driver doesn't need to do anything to - * enable the interrupt */ - case AMDGPU_IRQ_STATE_ENABLE: - default: - break; - } + sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, DRAM_ECC_INT_ENABLE, + state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0); + WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl); return 0; } -- GitLab From ecedd99a9369fb5cde601ae9abd58bca2739f1ae Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Fri, 15 Mar 2024 21:25:25 -0600 Subject: [PATCH 660/989] drm/amd/display: Skip on writeback when it's not applicable [WHY] dynamic memory safety error detector (KASAN) catches and generates error messages "BUG: KASAN: slab-out-of-bounds" as writeback connector does not support certain features which are not initialized. [HOW] Skip them when connector type is DRM_MODE_CONNECTOR_WRITEBACK. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3199 Reviewed-by: Harry Wentland Reviewed-by: Rodrigo Siqueira Acked-by: Roman Li Signed-off-by: Alex Hung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 173438fc75375..6cf0d5f276feb 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -3047,6 +3047,10 @@ static int dm_resume(void *handle) /* Do mst topology probing after resuming cached state*/ drm_connector_list_iter_begin(ddev, &iter); drm_for_each_connector_iter(connector, &iter) { + + if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK) + continue; + aconnector = to_amdgpu_dm_connector(connector); if (aconnector->dc_link->type != dc_connection_mst_branch || aconnector->mst_root) @@ -5926,6 +5930,9 @@ get_highest_refresh_rate_mode(struct amdgpu_dm_connector *aconnector, &aconnector->base.probed_modes : &aconnector->base.modes; + if (aconnector->base.connector_type == DRM_MODE_CONNECTOR_WRITEBACK) + return NULL; + if (aconnector->freesync_vid_base.clock != 0) return &aconnector->freesync_vid_base; @@ -8767,10 +8774,10 @@ static void amdgpu_dm_commit_audio(struct drm_device *dev, if (!drm_atomic_crtc_needs_modeset(new_crtc_state)) continue; +notify: if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK) continue; -notify: aconnector = to_amdgpu_dm_connector(connector); mutex_lock(&adev->dm.audio_lock); -- GitLab From 3818708e9c9712e2ba4006bc23502ee7b031bd3f Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Thu, 28 Mar 2024 11:00:50 +0800 Subject: [PATCH 661/989] drm/amd/pm: fix the high voltage issue after unload fix the high voltage issue after unload on smu 13.0.10 Signed-off-by: Kenneth Feng Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 26 ++++++++++-------- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 27 +++++++++++++++++-- drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 1 + .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 8 +++++- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index aa16d51dd8421..7753a2e64d411 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4135,18 +4135,22 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->ip_blocks[i].status.hw = true; } } + } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && + !amdgpu_device_has_display_hardware(adev)) { + r = psp_gpu_reset(adev); } else { - tmp = amdgpu_reset_method; - /* It should do a default reset when loading or reloading the driver, - * regardless of the module parameter reset_method. - */ - amdgpu_reset_method = AMD_RESET_METHOD_NONE; - r = amdgpu_asic_reset(adev); - amdgpu_reset_method = tmp; - if (r) { - dev_err(adev->dev, "asic reset on init failed\n"); - goto failed; - } + tmp = amdgpu_reset_method; + /* It should do a default reset when loading or reloading the driver, + * regardless of the module parameter reset_method. + */ + amdgpu_reset_method = AMD_RESET_METHOD_NONE; + r = amdgpu_asic_reset(adev); + amdgpu_reset_method = tmp; + } + + if (r) { + dev_err(adev->dev, "asic reset on init failed\n"); + goto failed; } } diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 246b211b1e85f..65333141b1c1b 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -735,7 +735,7 @@ static int smu_early_init(void *handle) smu->adev = adev; smu->pm_enabled = !!amdgpu_dpm; smu->is_apu = false; - smu->smu_baco.state = SMU_BACO_STATE_EXIT; + smu->smu_baco.state = SMU_BACO_STATE_NONE; smu->smu_baco.platform_support = false; smu->user_dpm_profile.fan_mode = -1; @@ -1966,10 +1966,25 @@ static int smu_smc_hw_cleanup(struct smu_context *smu) return 0; } +static int smu_reset_mp1_state(struct smu_context *smu) +{ + struct amdgpu_device *adev = smu->adev; + int ret = 0; + + if ((!adev->in_runpm) && (!adev->in_suspend) && + (!amdgpu_in_reset(adev)) && amdgpu_ip_version(adev, MP1_HWIP, 0) == + IP_VERSION(13, 0, 10) && + !amdgpu_device_has_display_hardware(adev)) + ret = smu_set_mp1_state(smu, PP_MP1_STATE_UNLOAD); + + return ret; +} + static int smu_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct smu_context *smu = adev->powerplay.pp_handle; + int ret; if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) return 0; @@ -1987,7 +2002,15 @@ static int smu_hw_fini(void *handle) adev->pm.dpm_enabled = false; - return smu_smc_hw_cleanup(smu); + ret = smu_smc_hw_cleanup(smu); + if (ret) + return ret; + + ret = smu_reset_mp1_state(smu); + if (ret) + return ret; + + return 0; } static void smu_late_fini(void *handle) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h index a870bdd49a4e3..1fa81575788c5 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h @@ -424,6 +424,7 @@ enum smu_reset_mode { enum smu_baco_state { SMU_BACO_STATE_ENTER = 0, SMU_BACO_STATE_EXIT, + SMU_BACO_STATE_NONE, }; struct smu_baco_context { diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 9c03296f92cdd..67117ced7c6ae 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -2751,7 +2751,13 @@ static int smu_v13_0_0_set_mp1_state(struct smu_context *smu, switch (mp1_state) { case PP_MP1_STATE_UNLOAD: - ret = smu_cmn_set_mp1_state(smu, mp1_state); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_PrepareMp1ForUnload, + 0x55, NULL); + + if (!ret && smu->smu_baco.state == SMU_BACO_STATE_EXIT) + ret = smu_v13_0_disable_pmfw_state(smu); + break; default: /* Ignore others */ -- GitLab From f7e232de51bb1b45646e5b7dc4ebcf13510f2630 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Wed, 6 Mar 2024 17:05:07 +0530 Subject: [PATCH 662/989] drm/amdgpu: Fix VCN allocation in CPX partition VCN need not be shared in CPX mode always for all GFX 9.4.3 SOC SKUs. In certain configs, VCN instance can be exclusively allocated to a partition even under CPX mode. Signed-off-by: Lijo Lazar Reviewed-by: James Zhu Reviewed-by: Asad Kamal Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c index d6f808acfb17b..fbb43ae7624f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c @@ -62,6 +62,11 @@ void aqua_vanjaram_doorbell_index_init(struct amdgpu_device *adev) adev->doorbell_index.max_assignment = AMDGPU_DOORBELL_LAYOUT1_MAX_ASSIGNMENT << 1; } +static bool aqua_vanjaram_xcp_vcn_shared(struct amdgpu_device *adev) +{ + return (adev->xcp_mgr->num_xcps > adev->vcn.num_vcn_inst); +} + static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev, uint32_t inst_idx, struct amdgpu_ring *ring) { @@ -87,7 +92,7 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev, case AMDGPU_RING_TYPE_VCN_ENC: case AMDGPU_RING_TYPE_VCN_JPEG: ip_blk = AMDGPU_XCP_VCN; - if (adev->xcp_mgr->mode == AMDGPU_CPX_PARTITION_MODE) + if (aqua_vanjaram_xcp_vcn_shared(adev)) inst_mask = 1 << (inst_idx * 2); break; default: @@ -140,10 +145,12 @@ static int aqua_vanjaram_xcp_sched_list_update( aqua_vanjaram_xcp_gpu_sched_update(adev, ring, ring->xcp_id); - /* VCN is shared by two partitions under CPX MODE */ + /* VCN may be shared by two partitions under CPX MODE in certain + * configs. + */ if ((ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC || - ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG) && - adev->xcp_mgr->mode == AMDGPU_CPX_PARTITION_MODE) + ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG) && + aqua_vanjaram_xcp_vcn_shared(adev)) aqua_vanjaram_xcp_gpu_sched_update(adev, ring, ring->xcp_id + 1); } -- GitLab From e33997e18d0fddd217a0fce988abbfd015338631 Mon Sep 17 00:00:00 2001 From: ZhenGuo Yin Date: Tue, 2 Apr 2024 11:41:05 +0800 Subject: [PATCH 663/989] drm/amdgpu: clear set_q_mode_offs when VM changed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Why] set_q_mode_offs don't get cleared after GPU reset, nexting SET_Q_MODE packet to init shadow memory will be skiped, hence there has a page fault. [How] VM flush is needed after GPU reset, clear set_q_mode_offs when emitting VM flush. Fixes: 8bc75586ea01 ("drm/amdgpu: workaround to avoid SET_Q_MODE packets v2") Reviewed-by: Christian König Signed-off-by: ZhenGuo Yin Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 1770e496c1b7c..2c82e27416190 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -5465,6 +5465,7 @@ static void gfx_v11_0_ring_emit_vm_flush(struct amdgpu_ring *ring, /* Make sure that we can't skip the SET_Q_MODE packets when the VM * changed in any way. */ + ring->set_q_mode_offs = 0; ring->set_q_mode_ptr = NULL; } -- GitLab From d06af584be5a769d124b7302b32a033e9559761d Mon Sep 17 00:00:00 2001 From: Zhigang Luo Date: Mon, 18 Mar 2024 14:13:10 -0400 Subject: [PATCH 664/989] amd/amdkfd: sync all devices to wait all processes being evicted If there are more than one device doing reset in parallel, the first device will call kfd_suspend_all_processes() to evict all processes on all devices, this call takes time to finish. other device will start reset and recover without waiting. if the process has not been evicted before doing recover, it will be restored, then caused page fault. Signed-off-by: Zhigang Luo Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 041ec3de55e72..719d6d365e150 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -960,7 +960,6 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) { struct kfd_node *node; int i; - int count; if (!kfd->init_complete) return; @@ -968,12 +967,10 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) /* for runtime suspend, skip locking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = ++kfd_locked; - mutex_unlock(&kfd_processes_mutex); - /* For first KFD device suspend all the KFD processes */ - if (count == 1) + if (++kfd_locked == 1) kfd_suspend_all_processes(); + mutex_unlock(&kfd_processes_mutex); } for (i = 0; i < kfd->num_nodes; i++) { @@ -984,7 +981,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) { - int ret, count, i; + int ret, i; if (!kfd->init_complete) return 0; @@ -998,12 +995,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) /* for runtime resume, skip unlocking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = --kfd_locked; - mutex_unlock(&kfd_processes_mutex); - - WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); - if (count == 0) + if (--kfd_locked == 0) ret = kfd_resume_all_processes(); + WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error"); + mutex_unlock(&kfd_processes_mutex); } return ret; -- GitLab From 2cc69a10d83180f3de9f5afe3a98e972b1453d4c Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Sat, 23 Mar 2024 12:02:54 -0600 Subject: [PATCH 665/989] drm/amd/display: Return max resolution supported by DWB mode_config's max width x height is 4096x2160 and is higher than DWB's max resolution 3840x2160 which is returned instead. Cc: stable@vger.kernel.org Reviewed-by: Harry Wentland Acked-by: Hamza Mahfooz Signed-off-by: Alex Hung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c index 16e72d623630c..08c494a7a21ba 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c @@ -76,10 +76,8 @@ static int amdgpu_dm_wb_encoder_atomic_check(struct drm_encoder *encoder, static int amdgpu_dm_wb_connector_get_modes(struct drm_connector *connector) { - struct drm_device *dev = connector->dev; - - return drm_add_modes_noedid(connector, dev->mode_config.max_width, - dev->mode_config.max_height); + /* Maximum resolution supported by DWB */ + return drm_add_modes_noedid(connector, 3840, 2160); } static int amdgpu_dm_wb_prepare_job(struct drm_writeback_connector *wb_connector, -- GitLab From bbca7f414ae9a12ea231cdbafd79c607e3337ea8 Mon Sep 17 00:00:00 2001 From: Tim Huang Date: Wed, 3 Apr 2024 17:28:44 +0800 Subject: [PATCH 666/989] drm/amdgpu: fix incorrect number of active RBs for gfx11 The RB bitmap should be global active RB bitmap & active RB bitmap based on active SA. Signed-off-by: Tim Huang Reviewed-by: Yifan Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 2c82e27416190..f7325b02a191f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1635,7 +1635,7 @@ static void gfx_v11_0_setup_rb(struct amdgpu_device *adev) active_rb_bitmap |= (0x3 << (i * rb_bitmap_width_per_sa)); } - active_rb_bitmap |= global_active_rb_bitmap; + active_rb_bitmap &= global_active_rb_bitmap; adev->gfx.config.backend_enable_mask = active_rb_bitmap; adev->gfx.config.num_rbs = hweight32(active_rb_bitmap); } -- GitLab From 81901d8d0472e9a19d294ae1dea76b950548195d Mon Sep 17 00:00:00 2001 From: Wenjing Liu Date: Fri, 22 Mar 2024 15:02:45 -0400 Subject: [PATCH 667/989] drm/amd/display: always reset ODM mode in context when adding first plane [why] In current implemenation ODM mode is only reset when the last plane is removed from dc state. For any dc validate we will always remove all current planes and add new planes. However when switching from no planes to 1 plane, ODM mode is not reset because no planes get removed. This has caused an issue where we kept ODM combine when it should have been remove when a plane is added. The change is to reset ODM mode when adding the first plane. Cc: stable@vger.kernel.org Reviewed-by: Alvin Lee Acked-by: Hamza Mahfooz Signed-off-by: Wenjing Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_state.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c b/drivers/gpu/drm/amd/display/dc/core/dc_state.c index 5cc7f8da209c5..61986e5cb4919 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c @@ -436,6 +436,15 @@ bool dc_state_add_plane( goto out; } + if (stream_status->plane_count == 0 && dc->config.enable_windowed_mpo_odm) + /* ODM combine could prevent us from supporting more planes + * we will reset ODM slice count back to 1 when all planes have + * been removed to maximize the amount of planes supported when + * new planes are added. + */ + resource_update_pipes_for_stream_with_slice_count( + state, dc->current_state, dc->res_pool, stream, 1); + otg_master_pipe = resource_get_otg_master_for_stream( &state->res_ctx, stream); if (otg_master_pipe) -- GitLab From 953927587f37b731abdeabe46ad44a3b3ec67a52 Mon Sep 17 00:00:00 2001 From: Dillon Varone Date: Thu, 21 Mar 2024 13:49:43 -0400 Subject: [PATCH 668/989] drm/amd/display: Do not recursively call manual trigger programming [WHY&HOW] We should not be recursively calling the manual trigger programming function when FAMS is not in use. Cc: stable@vger.kernel.org Reviewed-by: Alvin Lee Acked-by: Hamza Mahfooz Signed-off-by: Dillon Varone Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c index f07a4c7e48bc2..52eab8fccb7f1 100644 --- a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c +++ b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c @@ -267,9 +267,6 @@ static void optc32_setup_manual_trigger(struct timing_generator *optc) OTG_V_TOTAL_MAX_SEL, 1, OTG_FORCE_LOCK_ON_EVENT, 0, OTG_SET_V_TOTAL_MIN_MASK, (1 << 1)); /* TRIGA */ - - // Setup manual flow control for EOF via TRIG_A - optc->funcs->setup_manual_trigger(optc); } } -- GitLab From cf79814cb0bf5749b9f0db53ca231aa540c02768 Mon Sep 17 00:00:00 2001 From: Fudongwang Date: Tue, 26 Mar 2024 16:03:16 +0800 Subject: [PATCH 669/989] drm/amd/display: fix disable otg wa logic in DCN316 [Why] Wrong logic cause screen corruption. [How] Port logic from DCN35/314. Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Acked-by: Hamza Mahfooz Signed-off-by: Fudongwang Signed-off-by: Alex Deucher --- .../dc/clk_mgr/dcn316/dcn316_clk_mgr.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c index 12f3e8aa46d8d..6ad4f4efec5dd 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c @@ -99,20 +99,25 @@ static int dcn316_get_active_display_cnt_wa( return display_count; } -static void dcn316_disable_otg_wa(struct clk_mgr *clk_mgr_base, struct dc_state *context, bool disable) +static void dcn316_disable_otg_wa(struct clk_mgr *clk_mgr_base, struct dc_state *context, + bool safe_to_lower, bool disable) { struct dc *dc = clk_mgr_base->ctx->dc; int i; for (i = 0; i < dc->res_pool->pipe_count; ++i) { - struct pipe_ctx *pipe = &dc->current_state->res_ctx.pipe_ctx[i]; + struct pipe_ctx *pipe = safe_to_lower + ? &context->res_ctx.pipe_ctx[i] + : &dc->current_state->res_ctx.pipe_ctx[i]; if (pipe->top_pipe || pipe->prev_odm_pipe) continue; - if (pipe->stream && (pipe->stream->dpms_off || pipe->plane_state == NULL || - dc_is_virtual_signal(pipe->stream->signal))) { + if (pipe->stream && (pipe->stream->dpms_off || dc_is_virtual_signal(pipe->stream->signal) || + !pipe->stream->link_enc)) { if (disable) { - pipe->stream_res.tg->funcs->immediate_disable_crtc(pipe->stream_res.tg); + if (pipe->stream_res.tg && pipe->stream_res.tg->funcs->immediate_disable_crtc) + pipe->stream_res.tg->funcs->immediate_disable_crtc(pipe->stream_res.tg); + reset_sync_context_for_pipe(dc, context, i); } else pipe->stream_res.tg->funcs->enable_crtc(pipe->stream_res.tg); @@ -207,11 +212,11 @@ static void dcn316_update_clocks(struct clk_mgr *clk_mgr_base, } if (should_set_clock(safe_to_lower, new_clocks->dispclk_khz, clk_mgr_base->clks.dispclk_khz)) { - dcn316_disable_otg_wa(clk_mgr_base, context, true); + dcn316_disable_otg_wa(clk_mgr_base, context, safe_to_lower, true); clk_mgr_base->clks.dispclk_khz = new_clocks->dispclk_khz; dcn316_smu_set_dispclk(clk_mgr, clk_mgr_base->clks.dispclk_khz); - dcn316_disable_otg_wa(clk_mgr_base, context, false); + dcn316_disable_otg_wa(clk_mgr_base, context, safe_to_lower, false); update_dispclk = true; } -- GitLab From 9e61ef8d219877202d4ee51d0d2ad9072c99a262 Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Tue, 12 Mar 2024 11:55:52 -0400 Subject: [PATCH 670/989] drm/amd/display: Program VSC SDP colorimetry for all DP sinks >= 1.4 In order for display colorimetry to work correctly on DP displays we need to send the VSC SDP packet. We should only do so for panels with DPCD revision greater or equal to 1.4 as older receivers might have problems with it. Cc: stable@vger.kernel.org Cc: Joshua Ashton Cc: Xaver Hugl Cc: Melissa Wen Cc: Agustin Gutierrez Reviewed-by: Agustin Gutierrez Acked-by: Hamza Mahfooz Signed-off-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 6cf0d5f276feb..d112ce6c812ce 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6318,7 +6318,9 @@ create_stream_for_sink(struct drm_connector *connector, if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A) mod_build_hf_vsif_infopacket(stream, &stream->vsp_infopacket); - if (stream->link->psr_settings.psr_feature_enabled || stream->link->replay_settings.replay_feature_enabled) { + if (stream->signal == SIGNAL_TYPE_DISPLAY_PORT || + stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST || + stream->signal == SIGNAL_TYPE_EDP) { // // should decide stream support vsc sdp colorimetry capability // before building vsc info packet @@ -6328,7 +6330,8 @@ create_stream_for_sink(struct drm_connector *connector, stream->use_vsc_sdp_for_colorimetry = aconnector->dc_sink->is_vsc_sdp_colorimetry_supported; } else { - if (stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED) + if (stream->link->dpcd_caps.dpcd_rev.raw >= 0x14 && + stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED) stream->use_vsc_sdp_for_colorimetry = true; } if (stream->out_transfer_func->tf == TRANSFER_FUNCTION_GAMMA22) -- GitLab From c3e2a5f2da904a18661335e8be2b961738574998 Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Thu, 21 Mar 2024 11:13:38 -0400 Subject: [PATCH 671/989] drm/amd/display: Set VSC SDP Colorimetry same way for MST and SST The previous check for the is_vsc_sdp_colorimetry_supported flag for MST sink signals did nothing. Simplify the code and use the same check for MST and SST. Cc: stable@vger.kernel.org Reviewed-by: Agustin Gutierrez Acked-by: Hamza Mahfooz Signed-off-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index d112ce6c812ce..6d2f60c61decc 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6325,15 +6325,9 @@ create_stream_for_sink(struct drm_connector *connector, // should decide stream support vsc sdp colorimetry capability // before building vsc info packet // - stream->use_vsc_sdp_for_colorimetry = false; - if (aconnector->dc_sink->sink_signal == SIGNAL_TYPE_DISPLAY_PORT_MST) { - stream->use_vsc_sdp_for_colorimetry = - aconnector->dc_sink->is_vsc_sdp_colorimetry_supported; - } else { - if (stream->link->dpcd_caps.dpcd_rev.raw >= 0x14 && - stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED) - stream->use_vsc_sdp_for_colorimetry = true; - } + stream->use_vsc_sdp_for_colorimetry = stream->link->dpcd_caps.dpcd_rev.raw >= 0x14 && + stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED; + if (stream->out_transfer_func->tf == TRANSFER_FUNCTION_GAMMA22) tf = TRANSFER_FUNC_GAMMA_22; mod_build_vsc_infopacket(stream, &stream->vsc_infopacket, stream->output_color_space, tf); -- GitLab From e047dd448d2bc12b8c30d7e3e6e98cea1fc28a17 Mon Sep 17 00:00:00 2001 From: Zhongwei Date: Wed, 27 Mar 2024 13:49:40 +0800 Subject: [PATCH 672/989] drm/amd/display: Adjust dprefclk by down spread percentage. [Why] OLED panels show no display for large vtotal timings. [How] Check if ss is enabled and read from lut for spread spectrum percentage. Adjust dprefclk as required. DP_DTO adjustment is for edp only. Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Acked-by: Hamza Mahfooz Signed-off-by: Zhongwei Signed-off-by: Alex Deucher --- .../display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c | 50 +++++++++++++++++++ .../drm/amd/display/dc/dce/dce_clock_source.c | 8 +-- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c index 101fe96287cb4..d9c5692c86c21 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c @@ -73,6 +73,12 @@ #define CLK1_CLK2_BYPASS_CNTL__CLK2_BYPASS_SEL_MASK 0x00000007L #define CLK1_CLK2_BYPASS_CNTL__CLK2_BYPASS_DIV_MASK 0x000F0000L +#define regCLK5_0_CLK5_spll_field_8 0x464b +#define regCLK5_0_CLK5_spll_field_8_BASE_IDX 0 + +#define CLK5_0_CLK5_spll_field_8__spll_ssc_en__SHIFT 0xd +#define CLK5_0_CLK5_spll_field_8__spll_ssc_en_MASK 0x00002000L + #define SMU_VER_THRESHOLD 0x5D4A00 //93.74.0 #define REG(reg_name) \ @@ -411,6 +417,17 @@ static void dcn35_dump_clk_registers(struct clk_state_registers_and_bypass *regs { } +static bool dcn35_is_spll_ssc_enabled(struct clk_mgr *clk_mgr_base) +{ + struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base); + struct dc_context *ctx = clk_mgr->base.ctx; + uint32_t ssc_enable; + + REG_GET(CLK5_0_CLK5_spll_field_8, spll_ssc_en, &ssc_enable); + + return ssc_enable == 1; +} + static void init_clk_states(struct clk_mgr *clk_mgr) { struct clk_mgr_internal *clk_mgr_int = TO_CLK_MGR_INTERNAL(clk_mgr); @@ -428,7 +445,16 @@ static void init_clk_states(struct clk_mgr *clk_mgr) void dcn35_init_clocks(struct clk_mgr *clk_mgr) { + struct clk_mgr_internal *clk_mgr_int = TO_CLK_MGR_INTERNAL(clk_mgr); init_clk_states(clk_mgr); + + // to adjust dp_dto reference clock if ssc is enable otherwise to apply dprefclk + if (dcn35_is_spll_ssc_enabled(clk_mgr)) + clk_mgr->dp_dto_source_clock_in_khz = + dce_adjust_dp_ref_freq_for_ss(clk_mgr_int, clk_mgr->dprefclk_khz); + else + clk_mgr->dp_dto_source_clock_in_khz = clk_mgr->dprefclk_khz; + } static struct clk_bw_params dcn35_bw_params = { .vram_type = Ddr4MemType, @@ -517,6 +543,28 @@ static DpmClocks_t_dcn35 dummy_clocks; static struct dcn35_watermarks dummy_wms = { 0 }; +static struct dcn35_ss_info_table ss_info_table = { + .ss_divider = 1000, + .ss_percentage = {0, 0, 375, 375, 375} +}; + +static void dcn35_read_ss_info_from_lut(struct clk_mgr_internal *clk_mgr) +{ + struct dc_context *ctx = clk_mgr->base.ctx; + uint32_t clock_source; + + REG_GET(CLK1_CLK2_BYPASS_CNTL, CLK2_BYPASS_SEL, &clock_source); + // If it's DFS mode, clock_source is 0. + if (dcn35_is_spll_ssc_enabled(&clk_mgr->base) && (clock_source < ARRAY_SIZE(ss_info_table.ss_percentage))) { + clk_mgr->dprefclk_ss_percentage = ss_info_table.ss_percentage[clock_source]; + + if (clk_mgr->dprefclk_ss_percentage != 0) { + clk_mgr->ss_on_dprefclk = true; + clk_mgr->dprefclk_ss_divider = ss_info_table.ss_divider; + } + } +} + static void dcn35_build_watermark_ranges(struct clk_bw_params *bw_params, struct dcn35_watermarks *table) { int i, num_valid_sets; @@ -1061,6 +1109,8 @@ void dcn35_clk_mgr_construct( dce_clock_read_ss_info(&clk_mgr->base); /*when clk src is from FCH, it could have ss, same clock src as DPREF clk*/ + dcn35_read_ss_info_from_lut(&clk_mgr->base); + clk_mgr->base.base.bw_params = &dcn35_bw_params; if (clk_mgr->base.base.ctx->dc->debug.pstate_enabled) { diff --git a/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c b/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c index 970644b695cd4..b5e0289d2fe82 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c @@ -976,7 +976,10 @@ static bool dcn31_program_pix_clk( struct bp_pixel_clock_parameters bp_pc_params = {0}; enum transmitter_color_depth bp_pc_colour_depth = TRANSMITTER_COLOR_DEPTH_24; - if (clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz != 0) + // Apply ssed(spread spectrum) dpref clock for edp only. + if (clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz != 0 + && pix_clk_params->signal_type == SIGNAL_TYPE_EDP + && encoding == DP_8b_10b_ENCODING) dp_dto_ref_khz = clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz; // For these signal types Driver to program DP_DTO without calling VBIOS Command table if (dc_is_dp_signal(pix_clk_params->signal_type) || dc_is_virtual_signal(pix_clk_params->signal_type)) { @@ -1093,9 +1096,6 @@ static bool get_pixel_clk_frequency_100hz( unsigned int modulo_hz = 0; unsigned int dp_dto_ref_khz = clock_source->ctx->dc->clk_mgr->dprefclk_khz; - if (clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz != 0) - dp_dto_ref_khz = clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz; - if (clock_source->id == CLOCK_SOURCE_ID_DP_DTO) { clock_hz = REG_READ(PHASE[inst]); -- GitLab From 6dba20d23e85034901ccb765a7ca71199bcca4df Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Sun, 7 Apr 2024 22:01:35 +0800 Subject: [PATCH 673/989] drm/amdgpu: differentiate external rev id for gfx 11.5.0 This patch to differentiate external rev id for gfx 11.5.0. Signed-off-by: Yifan Zhang Reviewed-by: Tim Huang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/soc21.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index abe319b0f0639..43ca63fe85ac3 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -720,7 +720,10 @@ static int soc21_common_early_init(void *handle) AMD_PG_SUPPORT_VCN | AMD_PG_SUPPORT_JPEG | AMD_PG_SUPPORT_GFX_PG; - adev->external_rev_id = adev->rev_id + 0x1; + if (adev->rev_id == 0) + adev->external_rev_id = 0x1; + else + adev->external_rev_id = adev->rev_id + 0x10; break; case IP_VERSION(11, 5, 1): adev->cg_flags = -- GitLab From dec8ced871e17eea46f097542dd074d022be4bd1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 5 Mar 2024 22:10:03 -0800 Subject: [PATCH 674/989] perf/x86: Fix out of range data On x86 each struct cpu_hw_events maintains a table for counter assignment but it missed to update one for the deleted event in x86_pmu_del(). This can make perf_clear_dirty_counters() reset used counter if it's called before event scheduling or enabling. Then it would return out of range data which doesn't make sense. The following code can reproduce the problem. $ cat repro.c #include #include #include #include #include #include #include #include struct perf_event_attr attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, .disabled = 1, }; void *worker(void *arg) { int cpu = (long)arg; int fd1 = syscall(SYS_perf_event_open, &attr, -1, cpu, -1, 0); int fd2 = syscall(SYS_perf_event_open, &attr, -1, cpu, -1, 0); void *p; do { ioctl(fd1, PERF_EVENT_IOC_ENABLE, 0); p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd1, 0); ioctl(fd2, PERF_EVENT_IOC_ENABLE, 0); ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0); munmap(p, 4096); ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0); } while (1); return NULL; } int main(void) { int i; int n = sysconf(_SC_NPROCESSORS_ONLN); pthread_t *th = calloc(n, sizeof(*th)); for (i = 0; i < n; i++) pthread_create(&th[i], NULL, worker, (void *)(long)i); for (i = 0; i < n; i++) pthread_join(th[i], NULL); free(th); return 0; } And you can see the out of range data using perf stat like this. Probably it'd be easier to see on a large machine. $ gcc -o repro repro.c -pthread $ ./repro & $ sudo perf stat -A -I 1000 2>&1 | awk '{ if (length($3) > 15) print }' 1.001028462 CPU6 196,719,295,683,763 cycles # 194290.996 GHz (71.54%) 1.001028462 CPU3 396,077,485,787,730 branch-misses # 15804359784.80% of all branches (71.07%) 1.001028462 CPU17 197,608,350,727,877 branch-misses # 14594186554.56% of all branches (71.22%) 2.020064073 CPU4 198,372,472,612,140 cycles # 194681.113 GHz (70.95%) 2.020064073 CPU6 199,419,277,896,696 cycles # 195720.007 GHz (70.57%) 2.020064073 CPU20 198,147,174,025,639 cycles # 194474.654 GHz (71.03%) 2.020064073 CPU20 198,421,240,580,145 stalled-cycles-frontend # 100.14% frontend cycles idle (70.93%) 3.037443155 CPU4 197,382,689,923,416 cycles # 194043.065 GHz (71.30%) 3.037443155 CPU20 196,324,797,879,414 cycles # 193003.773 GHz (71.69%) 3.037443155 CPU5 197,679,956,608,205 stalled-cycles-backend # 1315606428.66% backend cycles idle (71.19%) 3.037443155 CPU5 198,571,860,474,851 instructions # 13215422.58 insn per cycle It should move the contents in the cpuc->assign as well. Fixes: 5471eea5d3bf ("perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task") Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240306061003.1894224-1-namhyung@kernel.org --- arch/x86/events/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 09050641ce5d3..5b0dd07b1ef19 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1644,6 +1644,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; + cpuc->assign[i-1] = cpuc->assign[i]; } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; -- GitLab From 04f4230e2f86a4e961ea5466eda3db8c1762004d Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Tue, 9 Apr 2024 16:08:05 -0700 Subject: [PATCH 675/989] x86/bugs: Fix return type of spectre_bhi_state() The definition of spectre_bhi_state() incorrectly returns a const char * const. This causes the a compiler warning when building with W=1: warning: type qualifiers ignored on function return type [-Wignored-qualifiers] 2812 | static const char * const spectre_bhi_state(void) Remove the const qualifier from the pointer. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Reported-by: Sean Christopherson Signed-off-by: Daniel Sneddon Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240409230806.1545822-1-daniel.sneddon@linux.intel.com --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 295463707e681..27f5004a8e24f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2809,7 +2809,7 @@ static char *pbrsb_eibrs_state(void) } } -static const char * const spectre_bhi_state(void) +static const char *spectre_bhi_state(void) { if (!boot_cpu_has_bug(X86_BUG_BHI)) return "; BHI: Not affected"; -- GitLab From 81665adf25d28a00a986533f1d3a5df76b79cad9 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Mon, 8 Apr 2024 09:35:40 -0700 Subject: [PATCH 676/989] pds_core: Fix pdsc_check_pci_health function to use work thread When the driver notices fw_status == 0xff it tries to perform a PCI reset on itself via pci_reset_function() in the context of the driver's health thread. However, pdsc_reset_prepare calls pdsc_stop_health_thread(), which attempts to stop/flush the health thread. This results in a deadlock because the stop/flush will never complete since the driver called pci_reset_function() from the health thread context. Fix by changing the pdsc_check_pci_health_function() to queue a newly introduced pdsc_pci_reset_thread() on the pdsc's work queue. Unloading the driver in the fw_down/dead state uncovered another issue, which can be seen in the following trace: WARNING: CPU: 51 PID: 6914 at kernel/workqueue.c:1450 __queue_work+0x358/0x440 [...] RIP: 0010:__queue_work+0x358/0x440 [...] Call Trace: ? __warn+0x85/0x140 ? __queue_work+0x358/0x440 ? report_bug+0xfc/0x1e0 ? handle_bug+0x3f/0x70 ? exc_invalid_op+0x17/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? __queue_work+0x358/0x440 queue_work_on+0x28/0x30 pdsc_devcmd_locked+0x96/0xe0 [pds_core] pdsc_devcmd_reset+0x71/0xb0 [pds_core] pdsc_teardown+0x51/0xe0 [pds_core] pdsc_remove+0x106/0x200 [pds_core] pci_device_remove+0x37/0xc0 device_release_driver_internal+0xae/0x140 driver_detach+0x48/0x90 bus_remove_driver+0x6d/0xf0 pci_unregister_driver+0x2e/0xa0 pdsc_cleanup_module+0x10/0x780 [pds_core] __x64_sys_delete_module+0x142/0x2b0 ? syscall_trace_enter.isra.18+0x126/0x1a0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7fbd9d03a14b [...] Fix this by preventing the devcmd reset if the FW is not running. Fixes: d9407ff11809 ("pds_core: Prevent health thread from running during reset/remove") Reviewed-by: Shannon Nelson Signed-off-by: Brett Creeley Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/pds_core/core.c | 13 ++++++++++++- drivers/net/ethernet/amd/pds_core/core.h | 2 ++ drivers/net/ethernet/amd/pds_core/dev.c | 3 +++ drivers/net/ethernet/amd/pds_core/main.c | 1 + 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c index 9662ee72814c0..536635e577279 100644 --- a/drivers/net/ethernet/amd/pds_core/core.c +++ b/drivers/net/ethernet/amd/pds_core/core.c @@ -593,6 +593,16 @@ err_out: pdsc_teardown(pdsc, PDSC_TEARDOWN_RECOVERY); } +void pdsc_pci_reset_thread(struct work_struct *work) +{ + struct pdsc *pdsc = container_of(work, struct pdsc, pci_reset_work); + struct pci_dev *pdev = pdsc->pdev; + + pci_dev_get(pdev); + pci_reset_function(pdev); + pci_dev_put(pdev); +} + static void pdsc_check_pci_health(struct pdsc *pdsc) { u8 fw_status; @@ -607,7 +617,8 @@ static void pdsc_check_pci_health(struct pdsc *pdsc) if (fw_status != PDS_RC_BAD_PCI) return; - pci_reset_function(pdsc->pdev); + /* prevent deadlock between pdsc_reset_prepare and pdsc_health_thread */ + queue_work(pdsc->wq, &pdsc->pci_reset_work); } void pdsc_health_thread(struct work_struct *work) diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h index 92d7657dd6147..a3e17a0c187a6 100644 --- a/drivers/net/ethernet/amd/pds_core/core.h +++ b/drivers/net/ethernet/amd/pds_core/core.h @@ -197,6 +197,7 @@ struct pdsc { struct pdsc_qcq notifyqcq; u64 last_eid; struct pdsc_viftype *viftype_status; + struct work_struct pci_reset_work; }; /** enum pds_core_dbell_bits - bitwise composition of dbell values. @@ -313,5 +314,6 @@ int pdsc_firmware_update(struct pdsc *pdsc, const struct firmware *fw, void pdsc_fw_down(struct pdsc *pdsc); void pdsc_fw_up(struct pdsc *pdsc); +void pdsc_pci_reset_thread(struct work_struct *work); #endif /* _PDSC_H_ */ diff --git a/drivers/net/ethernet/amd/pds_core/dev.c b/drivers/net/ethernet/amd/pds_core/dev.c index e494e1298dc9a..495ef4ef8c103 100644 --- a/drivers/net/ethernet/amd/pds_core/dev.c +++ b/drivers/net/ethernet/amd/pds_core/dev.c @@ -229,6 +229,9 @@ int pdsc_devcmd_reset(struct pdsc *pdsc) .reset.opcode = PDS_CORE_CMD_RESET, }; + if (!pdsc_is_fw_running(pdsc)) + return 0; + return pdsc_devcmd(pdsc, &cmd, &comp, pdsc->devcmd_timeout); } diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c index ab6133e7db422..660268ff95623 100644 --- a/drivers/net/ethernet/amd/pds_core/main.c +++ b/drivers/net/ethernet/amd/pds_core/main.c @@ -239,6 +239,7 @@ static int pdsc_init_pf(struct pdsc *pdsc) snprintf(wq_name, sizeof(wq_name), "%s.%d", PDS_CORE_DRV_NAME, pdsc->uid); pdsc->wq = create_singlethread_workqueue(wq_name); INIT_WORK(&pdsc->health_work, pdsc_health_thread); + INIT_WORK(&pdsc->pci_reset_work, pdsc_pci_reset_thread); timer_setup(&pdsc->wdtimer, pdsc_wdtimer_cb, 0); pdsc->wdtimer_period = PDSC_WATCHDOG_SECS * HZ; -- GitLab From c38fa07dc69f0b9e6f43ecab96dc7861a70c827c Mon Sep 17 00:00:00 2001 From: Gil Fine Date: Fri, 1 Mar 2024 15:22:53 +0200 Subject: [PATCH 677/989] thunderbolt: Fix wake configurations after device unplug Currently we don't configure correctly the wake events after unplug of device router. What can happen is that the downstream ports of host router will be configured to wake on: USB4-wake and wake-on-disconnect, but not on wake-on-connect. This may cause the later plugged device not to wake the domain and fail in enumeration. Fix this by clearing downstream port's "USB4 Port is Configured" bit, after unplug of a device router. Signed-off-by: Gil Fine Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/switch.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index 6ffc4e81ffed7..4edfd6e34e319 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -3180,22 +3180,29 @@ void tb_switch_unconfigure_link(struct tb_switch *sw) { struct tb_port *up, *down; - if (sw->is_unplugged) - return; if (!tb_route(sw) || tb_switch_is_icm(sw)) return; + /* + * Unconfigure downstream port so that wake-on-connect can be + * configured after router unplug. No need to unconfigure upstream port + * since its router is unplugged. + */ up = tb_upstream_port(sw); - if (tb_switch_is_usb4(up->sw)) - usb4_port_unconfigure(up); - else - tb_lc_unconfigure_port(up); - down = up->remote; if (tb_switch_is_usb4(down->sw)) usb4_port_unconfigure(down); else tb_lc_unconfigure_port(down); + + if (sw->is_unplugged) + return; + + up = tb_upstream_port(sw); + if (tb_switch_is_usb4(up->sw)) + usb4_port_unconfigure(up); + else + tb_lc_unconfigure_port(up); } static void tb_switch_credits_init(struct tb_switch *sw) -- GitLab From dcd12acaf384c30437fa5a9a1f71df06fc9835fd Mon Sep 17 00:00:00 2001 From: Gil Fine Date: Fri, 1 Mar 2024 15:11:18 +0200 Subject: [PATCH 678/989] thunderbolt: Avoid notify PM core about runtime PM resume Currently we notify PM core about occurred wakes after any resume. This is not actually needed after resume from runtime suspend. Hence, notify PM core about occurred wakes only after resume from system sleep. Also, if the wake occurred in USB4 router upstream port, we don't notify the PM core about it since it is not actually needed and can cause unexpected autowake (e.g. if /sys/power/wakeup_count is used). While there add the missing kernel-doc for tb_switch_resume(). Signed-off-by: Gil Fine Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/switch.c | 27 +++++++++++++++++++++++++-- drivers/thunderbolt/tb.c | 4 ++-- drivers/thunderbolt/tb.h | 3 ++- drivers/thunderbolt/usb4.c | 13 +++++++------ 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index 4edfd6e34e319..326433df5880e 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -3448,7 +3448,26 @@ static int tb_switch_set_wake(struct tb_switch *sw, unsigned int flags) return tb_lc_set_wake(sw, flags); } -int tb_switch_resume(struct tb_switch *sw) +static void tb_switch_check_wakes(struct tb_switch *sw) +{ + if (device_may_wakeup(&sw->dev)) { + if (tb_switch_is_usb4(sw)) + usb4_switch_check_wakes(sw); + } +} + +/** + * tb_switch_resume() - Resume a switch after sleep + * @sw: Switch to resume + * @runtime: Is this resume from runtime suspend or system sleep + * + * Resumes and re-enumerates router (and all its children), if still plugged + * after suspend. Don't enumerate device router whose UID was changed during + * suspend. If this is resume from system sleep, notifies PM core about the + * wakes occurred during suspend. Disables all wakes, except USB4 wake of + * upstream port for USB4 routers that shall be always enabled. + */ +int tb_switch_resume(struct tb_switch *sw, bool runtime) { struct tb_port *port; int err; @@ -3497,6 +3516,9 @@ int tb_switch_resume(struct tb_switch *sw) if (err) return err; + if (!runtime) + tb_switch_check_wakes(sw); + /* Disable wakes */ tb_switch_set_wake(sw, 0); @@ -3526,7 +3548,8 @@ int tb_switch_resume(struct tb_switch *sw) */ if (tb_port_unlock(port)) tb_port_warn(port, "failed to unlock port\n"); - if (port->remote && tb_switch_resume(port->remote->sw)) { + if (port->remote && + tb_switch_resume(port->remote->sw, runtime)) { tb_port_warn(port, "lost during suspend, disconnecting\n"); tb_sw_set_unplugged(port->remote->sw); diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c index 360cb95f39aab..3e44c78ac4092 100644 --- a/drivers/thunderbolt/tb.c +++ b/drivers/thunderbolt/tb.c @@ -2942,7 +2942,7 @@ static int tb_resume_noirq(struct tb *tb) if (!tb_switch_is_usb4(tb->root_switch)) tb_switch_reset(tb->root_switch); - tb_switch_resume(tb->root_switch); + tb_switch_resume(tb->root_switch, false); tb_free_invalid_tunnels(tb); tb_free_unplugged_children(tb->root_switch); tb_restore_children(tb->root_switch); @@ -3068,7 +3068,7 @@ static int tb_runtime_resume(struct tb *tb) struct tb_tunnel *tunnel, *n; mutex_lock(&tb->lock); - tb_switch_resume(tb->root_switch); + tb_switch_resume(tb->root_switch, true); tb_free_invalid_tunnels(tb); tb_restore_children(tb->root_switch); list_for_each_entry_safe(tunnel, n, &tcm->tunnel_list, list) diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index feed8ecaf712e..18aae4ccaed59 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -827,7 +827,7 @@ int tb_switch_configuration_valid(struct tb_switch *sw); int tb_switch_add(struct tb_switch *sw); void tb_switch_remove(struct tb_switch *sw); void tb_switch_suspend(struct tb_switch *sw, bool runtime); -int tb_switch_resume(struct tb_switch *sw); +int tb_switch_resume(struct tb_switch *sw, bool runtime); int tb_switch_reset(struct tb_switch *sw); int tb_switch_wait_for_bit(struct tb_switch *sw, u32 offset, u32 bit, u32 value, int timeout_msec); @@ -1288,6 +1288,7 @@ static inline bool tb_switch_is_usb4(const struct tb_switch *sw) return usb4_switch_version(sw) > 0; } +void usb4_switch_check_wakes(struct tb_switch *sw); int usb4_switch_setup(struct tb_switch *sw); int usb4_switch_configuration_valid(struct tb_switch *sw); int usb4_switch_read_uid(struct tb_switch *sw, u64 *uid); diff --git a/drivers/thunderbolt/usb4.c b/drivers/thunderbolt/usb4.c index 9860b49d7a2b2..78b06e922fdac 100644 --- a/drivers/thunderbolt/usb4.c +++ b/drivers/thunderbolt/usb4.c @@ -155,7 +155,13 @@ static inline int usb4_switch_op_data(struct tb_switch *sw, u16 opcode, tx_dwords, rx_data, rx_dwords); } -static void usb4_switch_check_wakes(struct tb_switch *sw) +/** + * usb4_switch_check_wakes() - Check for wakes and notify PM core about them + * @sw: Router whose wakes to check + * + * Checks wakes occurred during suspend and notify the PM core about them. + */ +void usb4_switch_check_wakes(struct tb_switch *sw) { bool wakeup_usb4 = false; struct usb4_port *usb4; @@ -163,9 +169,6 @@ static void usb4_switch_check_wakes(struct tb_switch *sw) bool wakeup = false; u32 val; - if (!device_may_wakeup(&sw->dev)) - return; - if (tb_route(sw)) { if (tb_sw_read(sw, &val, TB_CFG_SWITCH, ROUTER_CS_6, 1)) return; @@ -244,8 +247,6 @@ int usb4_switch_setup(struct tb_switch *sw) u32 val = 0; int ret; - usb4_switch_check_wakes(sw); - if (!tb_route(sw)) return 0; -- GitLab From f87cbcb345d059f0377b4fa0ba1b766a17fc3710 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Apr 2024 12:29:12 +0200 Subject: [PATCH 679/989] timekeeping: Use READ/WRITE_ONCE() for tick_do_timer_cpu tick_do_timer_cpu is used lockless to check which CPU needs to take care of the per tick timekeeping duty. This is done to avoid a thundering herd problem on jiffies_lock. The read and writes are not annotated so KCSAN complains about data races: BUG: KCSAN: data-race in tick_nohz_idle_stop_tick / tick_nohz_next_event write to 0xffffffff8a2bda30 of 4 bytes by task 0 on cpu 26: tick_nohz_idle_stop_tick+0x3b1/0x4a0 do_idle+0x1e3/0x250 read to 0xffffffff8a2bda30 of 4 bytes by task 0 on cpu 16: tick_nohz_next_event+0xe7/0x1e0 tick_nohz_get_sleep_length+0xa7/0xe0 menu_select+0x82/0xb90 cpuidle_select+0x44/0x60 do_idle+0x1c2/0x250 value changed: 0x0000001a -> 0xffffffff Annotate them with READ/WRITE_ONCE() to document the intentional data race. Reported-by: Mirsad Todorovac Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Tested-by: Sean Anderson Link: https://lore.kernel.org/r/87cyqy7rt3.ffs@tglx --- kernel/time/tick-common.c | 17 +++++++++-------- kernel/time/tick-sched.c | 36 ++++++++++++++++++++++-------------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index fb0fdec8719a1..d88b13076b794 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -7,6 +7,7 @@ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ +#include #include #include #include @@ -84,7 +85,7 @@ int tick_is_oneshot_available(void) */ static void tick_periodic(int cpu) { - if (tick_do_timer_cpu == cpu) { + if (READ_ONCE(tick_do_timer_cpu) == cpu) { raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); @@ -215,8 +216,8 @@ static void tick_setup_device(struct tick_device *td, * If no cpu took the do_timer update, assign it to * this cpu: */ - if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - tick_do_timer_cpu = cpu; + if (READ_ONCE(tick_do_timer_cpu) == TICK_DO_TIMER_BOOT) { + WRITE_ONCE(tick_do_timer_cpu, cpu); tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* @@ -232,7 +233,7 @@ static void tick_setup_device(struct tick_device *td, !tick_nohz_full_cpu(cpu)) { tick_take_do_timer_from_boot(); tick_do_timer_boot_cpu = -1; - WARN_ON(tick_do_timer_cpu != cpu); + WARN_ON(READ_ONCE(tick_do_timer_cpu) != cpu); #endif } @@ -406,10 +407,10 @@ void tick_assert_timekeeping_handover(void) int tick_cpu_dying(unsigned int dying_cpu) { /* - * If the current CPU is the timekeeper, it's the only one that - * can safely hand over its duty. Also all online CPUs are in - * stop machine, guaranteed not to be idle, therefore it's safe - * to pick any online successor. + * If the current CPU is the timekeeper, it's the only one that can + * safely hand over its duty. Also all online CPUs are in stop + * machine, guaranteed not to be idle, therefore there is no + * concurrency and it's safe to pick any online successor. */ if (tick_do_timer_cpu == dying_cpu) tick_do_timer_cpu = cpumask_first(cpu_online_mask); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1331216a9cae7..71a792cd89362 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -8,6 +8,7 @@ * * Started by: Thomas Gleixner and Ingo Molnar */ +#include #include #include #include @@ -204,7 +205,7 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts, static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { - int cpu = smp_processor_id(); + int tick_cpu, cpu = smp_processor_id(); /* * Check if the do_timer duty was dropped. We don't care about @@ -216,16 +217,18 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) * If nohz_full is enabled, this should not happen because the * 'tick_do_timer_cpu' CPU never relinquishes. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && - unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { + tick_cpu = READ_ONCE(tick_do_timer_cpu); + + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif - tick_do_timer_cpu = cpu; + WRITE_ONCE(tick_do_timer_cpu, cpu); + tick_cpu = cpu; } /* Check if jiffies need an update */ - if (tick_do_timer_cpu == cpu) + if (tick_cpu == cpu) tick_do_update_jiffies64(now); /* @@ -610,7 +613,7 @@ bool tick_nohz_cpu_hotpluggable(unsigned int cpu) * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ - if (tick_nohz_full_running && tick_do_timer_cpu == cpu) + if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu) return false; return true; } @@ -891,6 +894,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, delta, expires; unsigned long basejiff; + int tick_cpu; basemono = get_jiffies_update(&basejiff); ts->last_jiffies = basejiff; @@ -947,9 +951,9 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); - if (cpu != tick_do_timer_cpu && - (tick_do_timer_cpu != TICK_DO_TIMER_NONE || - !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) + tick_cpu = READ_ONCE(tick_do_timer_cpu); + if (tick_cpu != cpu && + (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) delta = KTIME_MAX; /* Calculate the next expiry time */ @@ -970,6 +974,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) unsigned long basejiff = ts->last_jiffies; u64 basemono = ts->timer_expires_base; bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED); + int tick_cpu; u64 expires; /* Make sure we won't be trying to stop it twice in a row. */ @@ -1007,10 +1012,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) * do_timer() never gets invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. */ - if (cpu == tick_do_timer_cpu) { - tick_do_timer_cpu = TICK_DO_TIMER_NONE; + tick_cpu = READ_ONCE(tick_do_timer_cpu); + if (tick_cpu == cpu) { + WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE); tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST); - } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + } else if (tick_cpu != TICK_DO_TIMER_NONE) { tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST); } @@ -1173,15 +1179,17 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; if (tick_nohz_full_enabled()) { + int tick_cpu = READ_ONCE(tick_do_timer_cpu); + /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ - if (tick_do_timer_cpu == cpu) + if (tick_cpu == cpu) return false; /* Should not happen for nohz-full */ - if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE)) return false; } -- GitLab From 19fa4f2a85d777a8052e869c1b892a2f7556569d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 8 Apr 2024 20:47:40 +0200 Subject: [PATCH 680/989] r8169: fix LED-related deadlock on module removal Binding devm_led_classdev_register() to the netdev is problematic because on module removal we get a RTNL-related deadlock. Fix this by avoiding the device-managed LED functions. Note: We can safely call led_classdev_unregister() for a LED even if registering it failed, because led_classdev_unregister() detects this and is a no-op in this case. Fixes: 18764b883e15 ("r8169: add support for LED's on RTL8168/RTL8101") Cc: stable@vger.kernel.org Reported-by: Lukas Wunner Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.h | 6 ++-- drivers/net/ethernet/realtek/r8169_leds.c | 35 +++++++++++++++-------- drivers/net/ethernet/realtek/r8169_main.c | 8 ++++-- 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h index 4c043052198d4..00882ffc7a029 100644 --- a/drivers/net/ethernet/realtek/r8169.h +++ b/drivers/net/ethernet/realtek/r8169.h @@ -73,6 +73,7 @@ enum mac_version { }; struct rtl8169_private; +struct r8169_led_classdev; void r8169_apply_firmware(struct rtl8169_private *tp); u16 rtl8168h_2_get_adc_bias_ioffset(struct rtl8169_private *tp); @@ -84,7 +85,8 @@ void r8169_get_led_name(struct rtl8169_private *tp, int idx, char *buf, int buf_len); int rtl8168_get_led_mode(struct rtl8169_private *tp); int rtl8168_led_mod_ctrl(struct rtl8169_private *tp, u16 mask, u16 val); -void rtl8168_init_leds(struct net_device *ndev); +struct r8169_led_classdev *rtl8168_init_leds(struct net_device *ndev); int rtl8125_get_led_mode(struct rtl8169_private *tp, int index); int rtl8125_set_led_mode(struct rtl8169_private *tp, int index, u16 mode); -void rtl8125_init_leds(struct net_device *ndev); +struct r8169_led_classdev *rtl8125_init_leds(struct net_device *ndev); +void r8169_remove_leds(struct r8169_led_classdev *leds); diff --git a/drivers/net/ethernet/realtek/r8169_leds.c b/drivers/net/ethernet/realtek/r8169_leds.c index 7c5dc9d0df855..e10bee706bc69 100644 --- a/drivers/net/ethernet/realtek/r8169_leds.c +++ b/drivers/net/ethernet/realtek/r8169_leds.c @@ -146,22 +146,22 @@ static void rtl8168_setup_ldev(struct r8169_led_classdev *ldev, led_cdev->hw_control_get_device = r8169_led_hw_control_get_device; /* ignore errors */ - devm_led_classdev_register(&ndev->dev, led_cdev); + led_classdev_register(&ndev->dev, led_cdev); } -void rtl8168_init_leds(struct net_device *ndev) +struct r8169_led_classdev *rtl8168_init_leds(struct net_device *ndev) { - /* bind resource mgmt to netdev */ - struct device *dev = &ndev->dev; struct r8169_led_classdev *leds; int i; - leds = devm_kcalloc(dev, RTL8168_NUM_LEDS, sizeof(*leds), GFP_KERNEL); + leds = kcalloc(RTL8168_NUM_LEDS + 1, sizeof(*leds), GFP_KERNEL); if (!leds) - return; + return NULL; for (i = 0; i < RTL8168_NUM_LEDS; i++) rtl8168_setup_ldev(leds + i, ndev, i); + + return leds; } static int rtl8125_led_hw_control_is_supported(struct led_classdev *led_cdev, @@ -245,20 +245,31 @@ static void rtl8125_setup_led_ldev(struct r8169_led_classdev *ldev, led_cdev->hw_control_get_device = r8169_led_hw_control_get_device; /* ignore errors */ - devm_led_classdev_register(&ndev->dev, led_cdev); + led_classdev_register(&ndev->dev, led_cdev); } -void rtl8125_init_leds(struct net_device *ndev) +struct r8169_led_classdev *rtl8125_init_leds(struct net_device *ndev) { - /* bind resource mgmt to netdev */ - struct device *dev = &ndev->dev; struct r8169_led_classdev *leds; int i; - leds = devm_kcalloc(dev, RTL8125_NUM_LEDS, sizeof(*leds), GFP_KERNEL); + leds = kcalloc(RTL8125_NUM_LEDS + 1, sizeof(*leds), GFP_KERNEL); if (!leds) - return; + return NULL; for (i = 0; i < RTL8125_NUM_LEDS; i++) rtl8125_setup_led_ldev(leds + i, ndev, i); + + return leds; +} + +void r8169_remove_leds(struct r8169_led_classdev *leds) +{ + if (!leds) + return; + + for (struct r8169_led_classdev *l = leds; l->ndev; l++) + led_classdev_unregister(&l->led); + + kfree(leds); } diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 6f1e6f386b7ba..8a27328eae34b 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -647,6 +647,8 @@ struct rtl8169_private { const char *fw_name; struct rtl_fw *rtl_fw; + struct r8169_led_classdev *leds; + u32 ocp_base; }; @@ -5044,6 +5046,8 @@ static void rtl_remove_one(struct pci_dev *pdev) cancel_work_sync(&tp->wk.work); + r8169_remove_leds(tp->leds); + unregister_netdev(tp->dev); if (tp->dash_type != RTL_DASH_NONE) @@ -5501,9 +5505,9 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (IS_ENABLED(CONFIG_R8169_LEDS)) { if (rtl_is_8125(tp)) - rtl8125_init_leds(dev); + tp->leds = rtl8125_init_leds(dev); else if (tp->mac_version > RTL_GIGA_MAC_VER_06) - rtl8168_init_leds(dev); + tp->leds = rtl8168_init_leds(dev); } netdev_info(dev, "%s, %pM, XID %03x, IRQ %d\n", -- GitLab From 3bbb331c1d34fdd5520a050fce35f71579430485 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 13 Mar 2024 10:30:04 +0800 Subject: [PATCH 681/989] tools/power/turbostat: Introduce BIC_SAM_mc6/BIC_SAMMHz/BIC_SAMACTMHz Graphics driver (i915/Xe) on mordern platforms splits GFX and SA Media information via different sysfs knobs. Existing BIC_GFX_rc6/BIC_GFXMHz/BIC_GFXACTMHz columns can be reused for GFX. Introduce BIC_SAM_mc6/BIC_SAMMHz/BIC_SAMACTMHz columns for SA Media. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.8 | 12 +++- tools/power/x86/turbostat/turbostat.c | 91 +++++++++++++++++++++++++-- 2 files changed, 96 insertions(+), 7 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 567327b004e68..0d3672e5d9ed1 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -129,9 +129,17 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBPkgTmp\fP Degrees Celsius reported by the per-package Package Thermal Monitor. .PP -\fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms. +\fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms or /sys/class/drm/card0/gt/gt0/rc6_residency_ms or /sys/class/drm/card0/device/tile0/gtN/gtidle/idle_residency_ms depending on the graphics driver being used. .PP -\fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz. +\fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/cur_freq depending on the graphics driver being used. +.PP +\fBGFXAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz or /sys/class/drm/card0/gt_act_freq_mhz or /sys/class/drm/card0/gt/gt0/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used. +.PP +\fBSAM%mc6\fP The percentage of time the SA Media is in the "module C6" state, mc6, during the measurement interval. From /sys/class/drm/card0/gt/gt1/rc6_residency_ms or /sys/class/drm/card0/device/tile0/gtN/gtidle/idle_residency_ms depending on the graphics driver being used. +.PP +\fBSAMMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/cur_freq depending on the graphics driver being used. +.PP +\fBSAMAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used. .PP \fBPkg%pc2, Pkg%pc3, Pkg%pc6, Pkg%pc7\fP percentage residency in hardware package idle states. These numbers are from hardware residency counters. .PP diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 016a5c7dc9bf3..4fa2810da1a3d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -133,6 +133,9 @@ struct msr_counter bic[] = { { 0x0, "IPC", "", 0, 0, 0, NULL, 0 }, { 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 }, { 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAM%mc6", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAMMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAMAMHz", "", 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -191,11 +194,14 @@ struct msr_counter bic[] = { #define BIC_IPC (1ULL << 52) #define BIC_CORE_THROT_CNT (1ULL << 53) #define BIC_UNCORE_MHZ (1ULL << 54) +#define BIC_SAM_mc6 (1ULL << 55) +#define BIC_SAMMHz (1ULL << 56) +#define BIC_SAMACTMHz (1ULL << 57) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) -#define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_UNCORE_MHZ) -#define BIC_IDLE ( BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX) +#define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) +#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6) #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -277,6 +283,9 @@ enum gfx_sysfs_idx { GFX_rc6, GFX_MHz, GFX_ACTMHz, + SAM_mc6, + SAM_MHz, + SAM_ACTMHz, GFX_MAX }; @@ -1193,6 +1202,9 @@ struct pkg_data { long long gfx_rc6_ms; unsigned int gfx_mhz; unsigned int gfx_act_mhz; + long long sam_mc6_ms; + unsigned int sam_mhz; + unsigned int sam_act_mhz; unsigned int package_id; struct rapl_counter energy_pkg; /* MSR_PKG_ENERGY_STATUS */ struct rapl_counter energy_dram; /* MSR_DRAM_ENERGY_STATUS */ @@ -1844,6 +1856,15 @@ void print_header(char *delim) if (DO_BIC(BIC_GFXACTMHz)) outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_SAM_mc6)) + outp += sprintf(outp, "%sSAM%%mc6", (printed++ ? delim : "")); + + if (DO_BIC(BIC_SAMMHz)) + outp += sprintf(outp, "%sSAMMHz", (printed++ ? delim : "")); + + if (DO_BIC(BIC_SAMACTMHz)) + outp += sprintf(outp, "%sSAMAMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : "")); if (DO_BIC(BIC_Any_c0)) @@ -2251,6 +2272,24 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_GFXACTMHz)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz); + /* SAMmc6 */ + if (DO_BIC(BIC_SAM_mc6)) { + if (p->sam_mc6_ms == -1) { /* detect GFX counter reset */ + outp += sprintf(outp, "%s**.**", (printed++ ? delim : "")); + } else { + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + p->sam_mc6_ms / 10.0 / interval_float); + } + } + + /* SAMMHz */ + if (DO_BIC(BIC_SAMMHz)) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_mhz); + + /* SAMACTMHz */ + if (DO_BIC(BIC_SAMACTMHz)) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_act_mhz); + /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */ if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc); @@ -2437,6 +2476,15 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->gfx_mhz = new->gfx_mhz; old->gfx_act_mhz = new->gfx_act_mhz; + /* flag an error when mc6 counter resets/wraps */ + if (old->sam_mc6_ms > new->sam_mc6_ms) + old->sam_mc6_ms = -1; + else + old->sam_mc6_ms = new->sam_mc6_ms - old->sam_mc6_ms; + + old->sam_mhz = new->sam_mhz; + old->sam_act_mhz = new->sam_act_mhz; + old->energy_pkg.raw_value = new->energy_pkg.raw_value - old->energy_pkg.raw_value; old->energy_cores.raw_value = new->energy_cores.raw_value - old->energy_cores.raw_value; old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value; @@ -2661,6 +2709,9 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->uncore_mhz = 0; p->gfx_mhz = 0; p->gfx_act_mhz = 0; + p->sam_mc6_ms = 0; + p->sam_mhz = 0; + p->sam_act_mhz = 0; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) t->counter[i] = 0; @@ -2775,6 +2826,9 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.uncore_mhz = p->uncore_mhz; average.packages.gfx_mhz = p->gfx_mhz; average.packages.gfx_act_mhz = p->gfx_act_mhz; + average.packages.sam_mc6_ms = p->sam_mc6_ms; + average.packages.sam_mhz = p->sam_mhz; + average.packages.sam_act_mhz = p->sam_act_mhz; average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); @@ -3572,19 +3626,28 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F); } - if (DO_BIC(BIC_GFX_rc6)) - p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; - /* n.b. assume die0 uncore frequency applies to whole package */ if (DO_BIC(BIC_UNCORE_MHZ)) p->uncore_mhz = get_uncore_mhz(p->package_id, 0); + if (DO_BIC(BIC_GFX_rc6)) + p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; + if (DO_BIC(BIC_GFXMHz)) p->gfx_mhz = gfx_info[GFX_MHz].val; if (DO_BIC(BIC_GFXACTMHz)) p->gfx_act_mhz = gfx_info[GFX_ACTMHz].val; + if (DO_BIC(BIC_SAM_mc6)) + p->sam_mc6_ms = gfx_info[SAM_mc6].val_ull; + + if (DO_BIC(BIC_SAMMHz)) + p->sam_mhz = gfx_info[SAM_MHz].val; + + if (DO_BIC(BIC_SAMACTMHz)) + p->sam_act_mhz = gfx_info[SAM_ACTMHz].val; + for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &p->counter[i])) return -10; @@ -4634,6 +4697,7 @@ int snapshot_graphics(int idx) switch (idx) { case GFX_rc6: + case SAM_mc6: fp = fopen_or_die(gfx_info[idx].path, "r"); retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull); if (retval != 1) @@ -4642,6 +4706,8 @@ int snapshot_graphics(int idx) return 0; case GFX_MHz: case GFX_ACTMHz: + case SAM_MHz: + case SAM_ACTMHz: if (gfx_info[idx].fp == NULL) { gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); } else { @@ -4727,6 +4793,15 @@ int snapshot_proc_sysfs_files(void) if (DO_BIC(BIC_GFXACTMHz)) snapshot_graphics(GFX_ACTMHz); + if (DO_BIC(BIC_SAM_mc6)) + snapshot_graphics(SAM_mc6); + + if (DO_BIC(BIC_SAMMHz)) + snapshot_graphics(SAM_MHz); + + if (DO_BIC(BIC_SAMACTMHz)) + snapshot_graphics(SAM_ACTMHz); + if (DO_BIC(BIC_CPU_LPI)) snapshot_cpu_lpi_us(); @@ -5325,6 +5400,12 @@ static void probe_graphics(void) BIC_PRESENT(BIC_GFXMHz); if (gfx_info[GFX_ACTMHz].path) BIC_PRESENT(BIC_GFXACTMHz); + if (gfx_info[SAM_mc6].path) + BIC_PRESENT(BIC_SAM_mc6); + if (gfx_info[SAM_MHz].path) + BIC_PRESENT(BIC_SAMMHz); + if (gfx_info[SAM_ACTMHz].path) + BIC_PRESENT(BIC_SAMACTMHz); } static void dump_sysfs_cstate_config(void) -- GitLab From dc02dc937a3ef819c5da10e97084af6977be26bf Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 22 Mar 2024 09:52:24 +0800 Subject: [PATCH 682/989] tools/power/turbostat: Add support for new i915 sysfs knobs On Meteorlake platform, i915 driver supports the traditional graphics sysfs knobs including /sys/class/drm/card0/power/rc6_residency_ms /sys/class/drm/card0/gt_cur_freq_mhz /sys/class/drm/card0/gt_act_freq_mhz At the same time, it also supports /sys/class/drm/card0/gt/gt0/rc6_residency_ms /sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz /sys/class/drm/card0/gt/gt0/rps_act_freq_mhz /sys/class/drm/card0/gt/gt1/rc6_residency_ms /sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz /sys/class/drm/card0/gt/gt1/rps_act_freq_mhz gt0 is for GFX and gt1 is for SA Media. Enhance turbostat to prefer the i915 new sysfs knobs. Export gt0 via BIC_GFX_rc6/BIC_GFXMHz/BIC_GFXACTMHz. Export gt1 via BIC_SMA_mc6/BIC_SMAMHz/BIC_SMAACTMHz. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4fa2810da1a3d..feca7f4cb5cda 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5380,6 +5380,29 @@ probe_cluster: static void probe_graphics(void) { + /* New i915 graphics sysfs knobs */ + if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) { + gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms"; + + if (!access("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", R_OK)) + gfx_info[SAM_mc6].path = "/sys/class/drm/card0/gt/gt1/rc6_residency_ms"; + + if (!access("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", R_OK)) + gfx_info[SAM_MHz].path = "/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", R_OK)) + gfx_info[SAM_ACTMHz].path = "/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz"; + + goto end; + } + + /* Fall back to traditional i915 graphics sysfs knobs */ if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms"; @@ -5394,6 +5417,7 @@ static void probe_graphics(void) else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz"; +end: if (gfx_info[GFX_rc6].path) BIC_PRESENT(BIC_GFX_rc6); if (gfx_info[GFX_MHz].path) -- GitLab From 91a91d389543a86963beec148d98d37875154bd4 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 12 Mar 2024 23:56:02 +0800 Subject: [PATCH 683/989] tools/power/turbostat: Add support for Xe sysfs knobs Xe graphics driver uses different graphics sysfs knobs including /sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms /sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq /sys/class/drm/card0/device/tile0/gt0/freq0/act_freq /sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms /sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq /sys/class/drm/card0/device/tile0/gt1/freq0/act_freq Plus that, /sys/class/drm/card0/device/tile0/gt/gtidle/name returns either gt-rc or gt-mc. rc is for GFX and mc is SA Media. Enhance turbostat to prefer the Xe sysfs knobs when they are available. Export gt-rc via BIC_GFX_rc6/BIC_GFXMHz/BIC_GFXACTMHz. Export gt-mc via BIC_SMA_mc6/BIC_SMAMHz/BIC_SMAACTMHz. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 51 +++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index feca7f4cb5cda..bc103851df70b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5380,6 +5380,57 @@ probe_cluster: static void probe_graphics(void) { + /* Xe graphics sysfs knobs */ + if (!access("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", R_OK)) { + FILE *fp; + char buf[8]; + bool gt0_is_gt; + int idx; + + fp = fopen("/sys/class/drm/card0/device/tile0/gt0/gtidle/name", "r"); + if (!fp) + goto next; + + if (!fread(buf, sizeof(char), 7, fp)) { + fclose(fp); + goto next; + } + fclose(fp); + + if (!strncmp(buf, "gt0-rc", strlen("gt0-rc"))) + gt0_is_gt = true; + else if (!strncmp(buf, "gt0-mc", strlen("gt0-mc"))) + gt0_is_gt = false; + else + goto next; + + idx = gt0_is_gt ? GFX_rc6 : SAM_mc6; + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms"; + + idx = gt0_is_gt ? GFX_MHz : SAM_MHz; + if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq"; + + idx = gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz; + if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq"; + + idx = gt0_is_gt ? SAM_mc6 : GFX_rc6; + if (!access("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms"; + + idx = gt0_is_gt ? SAM_MHz : GFX_MHz; + if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq"; + + idx = gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz; + if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq"; + + goto end; + } + +next: /* New i915 graphics sysfs knobs */ if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) { gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms"; -- GitLab From 3ab7296a7e6aa34634dcc2926af933107a117996 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 8 Apr 2024 19:32:58 -0400 Subject: [PATCH 684/989] tools/power turbostat: v2024.04.10 Much of turbostat can now run with perf, rather than using the MSR driver Some of turbostat can now run as a regular non-root user. Add some new output columns for some new GFX hardware. [This patch updates the version, but otherwise changes no function; it touches up some checkpatch issues from previous patches] Signed-off-by: Len Brown --- MAINTAINERS | 1 + tools/power/x86/turbostat/turbostat.c | 41 ++++++++++++------- .../testing/selftests/turbostat/defcolumns.py | 1 + 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index a7c4cf8201e01..b8582a466128f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22116,6 +22116,7 @@ Q: https://patchwork.kernel.org/project/linux-pm/list/ B: https://bugzilla.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git turbostat F: tools/power/x86/turbostat/ +F: tools/testing/selftests/turbostat/ TW5864 VIDEO4LINUX DRIVER M: Bluecherry Maintainers diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bc103851df70b..98256468e2480 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3,7 +3,7 @@ * turbostat -- show CPU frequency and C-state residency * on modern Intel and AMD processors. * - * Copyright (c) 2023 Intel Corporation. + * Copyright (c) 2024 Intel Corporation. * Len Brown */ @@ -1360,6 +1360,7 @@ struct sys_counters { void free_sys_counters(void) { struct msr_counter *p = sys.tp, *pnext = NULL; + while (p) { pnext = p->next; free(p); @@ -1979,6 +1980,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p const unsigned long long energy_value = c->core_energy.raw_value * c->core_energy.scale; const double energy_scale = c->core_energy.scale; + if (c->core_energy.unit == RAPL_UNIT_JOULES) outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale); @@ -3153,7 +3155,7 @@ static unsigned int read_perf_counter_info_n(const char *const path, const char return v; } -static unsigned read_msr_type(void) +static unsigned int read_msr_type(void) { const char *const path = "/sys/bus/event_source/devices/msr/type"; const char *const format = "%u"; @@ -3161,7 +3163,7 @@ static unsigned read_msr_type(void) return read_perf_counter_info_n(path, format); } -static unsigned read_aperf_config(void) +static unsigned int read_aperf_config(void) { const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; const char *const format = "event=%x"; @@ -3169,7 +3171,7 @@ static unsigned read_aperf_config(void) return read_perf_counter_info_n(path, format); } -static unsigned read_mperf_config(void) +static unsigned int read_mperf_config(void) { const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; const char *const format = "event=%x"; @@ -3177,7 +3179,7 @@ static unsigned read_mperf_config(void) return read_perf_counter_info_n(path, format); } -static unsigned read_perf_type(const char *subsys) +static unsigned int read_perf_type(const char *subsys) { const char *const path_format = "/sys/bus/event_source/devices/%s/type"; const char *const format = "%u"; @@ -3188,7 +3190,7 @@ static unsigned read_perf_type(const char *subsys) return read_perf_counter_info_n(path, format); } -static unsigned read_rapl_config(const char *subsys, const char *event_name) +static unsigned int read_rapl_config(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s"; const char *const format = "event=%x"; @@ -3199,7 +3201,7 @@ static unsigned read_rapl_config(const char *subsys, const char *event_name) return read_perf_counter_info_n(path, format); } -static unsigned read_perf_rapl_unit(const char *subsys, const char *event_name) +static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.unit"; const char *const format = "%s"; @@ -3235,7 +3237,7 @@ static struct amperf_group_fd open_amperf_fd(int cpu) const unsigned int msr_type = read_msr_type(); const unsigned int aperf_config = read_aperf_config(); const unsigned int mperf_config = read_mperf_config(); - struct amperf_group_fd fds = {.aperf = -1,.mperf = -1 }; + struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 }; fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); @@ -3277,6 +3279,7 @@ static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu) t->tsc = rdtsc(); const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array)); + if (n != sizeof(cnt.as_array)) return -2; @@ -3371,7 +3374,7 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain]; if (debug) - fprintf(stderr, "get_rapl_counters: cpu%d domain%d\n", cpu, domain); + fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); assert(rapl_counter_info_perdomain); @@ -3382,8 +3385,9 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data size_t num_perf_counters = rapl_counter_info_count_perf(rci); const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long); const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data)); + if (actual_read_size != expected_read_size) - err(-1, "get_rapl_counters: failed to read perf_data (%zu %zu)", expected_read_size, + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size); } @@ -3454,7 +3458,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) int status; if (cpu_migrate(cpu)) { - fprintf(outf, "get_counters: Could not migrate to CPU %d\n", cpu); + fprintf(outf, "%s: Could not migrate to CPU %d\n", __func__, cpu); return -1; } @@ -6411,15 +6415,17 @@ int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struc return -1; const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name); + if (scale == 0.0) return -1; const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name); + if (unit == RAPL_UNIT_INVALID) return -1; - const unsigned rapl_type = read_perf_type(cai->perf_subsys); - const unsigned rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); + const unsigned int rapl_type = read_perf_type(cai->perf_subsys); + const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); const int fd_counter = open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP); @@ -6441,7 +6447,7 @@ int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit); if (debug) - fprintf(stderr, "add_rapl_perf_counter: %d (cpu: %d)\n", ret, cpu); + fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); return ret; } @@ -6462,6 +6468,7 @@ void linux_perf_init(void) } const bool aperf_required = is_aperf_access_required(); + if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); if (fd_amperf_percpu == NULL) @@ -6483,6 +6490,7 @@ void rapl_perf_init(void) */ for (int domain_id = 0; domain_id < num_domains; ++domain_id) { struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id]; + rci->fd_perf = -1; for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) { rci->data[i] = 0; @@ -7296,6 +7304,7 @@ static void set_amperf_source(void) amperf_source = AMPERF_SOURCE_PERF; const bool aperf_required = is_aperf_access_required(); + if (no_perf || !aperf_required || !has_amperf_access_via_perf()) amperf_source = AMPERF_SOURCE_MSR; @@ -7373,10 +7382,12 @@ void check_msr_access(void) void check_perf_access(void) { const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC); + if (no_perf || !intrcount_required || !has_instr_count_access()) bic_enabled &= ~BIC_IPC; const bool aperf_required = is_aperf_access_required(); + if (!aperf_required || !has_amperf_access()) { bic_enabled &= ~BIC_Avg_MHz; bic_enabled &= ~BIC_Busy; @@ -7486,7 +7497,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2023.11.07 - Len Brown \n"); + fprintf(outf, "turbostat version 2024.04.08 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 diff --git a/tools/testing/selftests/turbostat/defcolumns.py b/tools/testing/selftests/turbostat/defcolumns.py index 70d3b7780311e..d9b042097da7a 100755 --- a/tools/testing/selftests/turbostat/defcolumns.py +++ b/tools/testing/selftests/turbostat/defcolumns.py @@ -1,4 +1,5 @@ #!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 import subprocess from shutil import which -- GitLab From 0871bc0129d403747ea0272a4384895d7ad37a6c Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: [PATCH 685/989] mm: Move lowmem_page_address() a little later LoongArch will override page_to_virt() which use page_address() in the KFENCE case (by defining WANT_PAGE_VIRTUAL/HASHED_PAGE_VIRTUAL). So move lowmem_page_address() a little later to avoid such build errors: error: implicit declaration of function 'page_address'. Acked-by: Andrew Morton Signed-off-by: Huacai Chen --- include/linux/mm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0436b919f1c7f..7b0ee64225de9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2207,11 +2207,6 @@ static inline int arch_make_folio_accessible(struct folio *folio) */ #include -static __always_inline void *lowmem_page_address(const struct page *page) -{ - return page_to_virt(page); -} - #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif @@ -2234,6 +2229,11 @@ void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif +static __always_inline void *lowmem_page_address(const struct page *page) +{ + return page_to_virt(page); +} + #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) -- GitLab From 0ca84aeaee150796d4b5577b1b0ae52a947e7813 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: [PATCH 686/989] LoongArch: Make {virt, phys, page, pfn} translation work with KFENCE KFENCE changes virt_to_page() to be able to translate tlb mapped virtual addresses, but forget to change virt_to_phys()/phys_to_virt() and other translation functions as well. This patch fix it, otherwise some drivers (such as nvme and virtio-blk) cannot work with KFENCE. All {virt, phys, page, pfn} translation functions are updated: 1, virt_to_pfn()/pfn_to_virt(); 2, virt_to_page()/page_to_virt(); 3, virt_to_phys()/phys_to_virt(). DMW/TLB mapped addresses are distinguished by comparing the vaddress with vm_map_base in virt_to_xyz(), and we define WANT_PAGE_VIRTUAL in the KFENCE case for the reverse translations, xyz_to_virt(). Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/io.h | 20 +++++++++++++++----- arch/loongarch/include/asm/kfence.h | 9 +++++++++ arch/loongarch/include/asm/page.h | 26 +++++++++++++++++++++++++- arch/loongarch/mm/pgtable.c | 4 ++-- 4 files changed, 51 insertions(+), 8 deletions(-) diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index 4a8adcca329b8..c2f9979b2979e 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -14,11 +14,6 @@ #include #include -/* - * Change "struct page" to physical address. - */ -#define page_to_phys(page) ((phys_addr_t)page_to_pfn(page) << PAGE_SHIFT) - extern void __init __iomem *early_ioremap(u64 phys_addr, unsigned long size); extern void __init early_iounmap(void __iomem *addr, unsigned long size); @@ -73,6 +68,21 @@ extern void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t #define __io_aw() mmiowb() +#ifdef CONFIG_KFENCE +#define virt_to_phys(kaddr) \ +({ \ + (likely((unsigned long)kaddr < vm_map_base)) ? __pa((unsigned long)kaddr) : \ + page_to_phys(tlb_virt_to_page((unsigned long)kaddr)) + offset_in_page((unsigned long)kaddr);\ +}) + +#define phys_to_virt(paddr) \ +({ \ + extern char *__kfence_pool; \ + (unlikely(__kfence_pool == NULL)) ? __va((unsigned long)paddr) : \ + page_address(phys_to_page((unsigned long)paddr)) + offset_in_page((unsigned long)paddr);\ +}) +#endif + #include #define ARCH_HAS_VALID_PHYS_ADDR_RANGE diff --git a/arch/loongarch/include/asm/kfence.h b/arch/loongarch/include/asm/kfence.h index 6c82aea1c9939..a6a5760da3a33 100644 --- a/arch/loongarch/include/asm/kfence.h +++ b/arch/loongarch/include/asm/kfence.h @@ -16,6 +16,7 @@ static inline bool arch_kfence_init_pool(void) { int err; + char *kaddr, *vaddr; char *kfence_pool = __kfence_pool; struct vm_struct *area; @@ -35,6 +36,14 @@ static inline bool arch_kfence_init_pool(void) return false; } + kaddr = kfence_pool; + vaddr = __kfence_pool; + while (kaddr < kfence_pool + KFENCE_POOL_SIZE) { + set_page_address(virt_to_page(kaddr), vaddr); + kaddr += PAGE_SIZE; + vaddr += PAGE_SIZE; + } + return true; } diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 44027060c54a2..e85df33f11c77 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -78,7 +78,26 @@ typedef struct { unsigned long pgprot; } pgprot_t; struct page *dmw_virt_to_page(unsigned long kaddr); struct page *tlb_virt_to_page(unsigned long kaddr); -#define virt_to_pfn(kaddr) PFN_DOWN(PHYSADDR(kaddr)) +#define pfn_to_phys(pfn) __pfn_to_phys(pfn) +#define phys_to_pfn(paddr) __phys_to_pfn(paddr) + +#define page_to_phys(page) pfn_to_phys(page_to_pfn(page)) +#define phys_to_page(paddr) pfn_to_page(phys_to_pfn(paddr)) + +#ifndef CONFIG_KFENCE + +#define page_to_virt(page) __va(page_to_phys(page)) +#define virt_to_page(kaddr) phys_to_page(__pa(kaddr)) + +#else + +#define WANT_PAGE_VIRTUAL + +#define page_to_virt(page) \ +({ \ + extern char *__kfence_pool; \ + (__kfence_pool == NULL) ? __va(page_to_phys(page)) : page_address(page); \ +}) #define virt_to_page(kaddr) \ ({ \ @@ -86,6 +105,11 @@ struct page *tlb_virt_to_page(unsigned long kaddr); dmw_virt_to_page((unsigned long)kaddr) : tlb_virt_to_page((unsigned long)kaddr);\ }) +#endif + +#define pfn_to_virt(pfn) page_to_virt(pfn_to_page(pfn)) +#define virt_to_pfn(kaddr) page_to_pfn(virt_to_page(kaddr)) + extern int __virt_addr_valid(volatile void *kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((volatile void *)(kaddr)) diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index 2aae72e638713..bda018150000e 100644 --- a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -11,13 +11,13 @@ struct page *dmw_virt_to_page(unsigned long kaddr) { - return pfn_to_page(virt_to_pfn(kaddr)); + return phys_to_page(__pa(kaddr)); } EXPORT_SYMBOL(dmw_virt_to_page); struct page *tlb_virt_to_page(unsigned long kaddr) { - return pfn_to_page(pte_pfn(*virt_to_kpte(kaddr))); + return phys_to_page(pfn_to_phys(pte_pfn(*virt_to_kpte(kaddr)))); } EXPORT_SYMBOL(tlb_virt_to_page); -- GitLab From 1a629fe4cca0fc4cf546b4ca2e9d4b75bbfde9ff Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: [PATCH 687/989] LoongArch: Make virt_addr_valid()/__virt_addr_valid() work with KFENCE When enabling both CONFIG_KFENCE and CONFIG_DEBUG_SG, I get the following backtraces when running LongArch kernels. [ 2.496257] kernel BUG at include/linux/scatterlist.h:187! ... [ 2.501925] Call Trace: [ 2.501950] [<9000000004ad59c4>] sg_init_one+0xac/0xc0 [ 2.502204] [<9000000004a438f8>] do_test_kpp+0x278/0x6e4 [ 2.502353] [<9000000004a43dd4>] alg_test_kpp+0x70/0xf4 [ 2.502494] [<9000000004a41b48>] alg_test+0x128/0x690 [ 2.502631] [<9000000004a3d898>] cryptomgr_test+0x20/0x40 [ 2.502775] [<90000000041b4508>] kthread+0x138/0x158 [ 2.502912] [<9000000004161c48>] ret_from_kernel_thread+0xc/0xa4 The backtrace is always similar but not exactly the same. It is always triggered from cryptomgr_test, but not always from the same test. Analysis shows that with CONFIG_KFENCE active, the address returned from kmalloc() and friends is not always below vm_map_base. It is allocated by kfence_alloc() which at least sometimes seems to get its memory from an address space above vm_map_base. This causes __virt_addr_valid() to return false for the affected objects. Let __virt_addr_valid() return 1 for kfence pool addresses, this make virt_addr_valid()/__virt_addr_valid() work with KFENCE. Reported-by: Guenter Roeck Suggested-by: Guenter Roeck Signed-off-by: Huacai Chen --- arch/loongarch/mm/mmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/loongarch/mm/mmap.c b/arch/loongarch/mm/mmap.c index a9630a81b38ab..89af7c12e8c08 100644 --- a/arch/loongarch/mm/mmap.c +++ b/arch/loongarch/mm/mmap.c @@ -4,6 +4,7 @@ */ #include #include +#include #include #include #include @@ -111,6 +112,9 @@ int __virt_addr_valid(volatile void *kaddr) { unsigned long vaddr = (unsigned long)kaddr; + if (is_kfence_address((void *)kaddr)) + return 1; + if ((vaddr < PAGE_OFFSET) || (vaddr >= vm_map_base)) return 0; -- GitLab From ec2bbc575e44909fd333328537519145852927a6 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: [PATCH 688/989] LoongArch: Update dts for Loongson-2K1000 to support ISA/LPC Some Loongson-2K1000 platforms have ISA/LPC devices such as Super-IO, define an ISA node in the dts file to avoid access error. Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/loongson-2k1000.dtsi | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/loongarch/boot/dts/loongson-2k1000.dtsi b/arch/loongarch/boot/dts/loongson-2k1000.dtsi index 49a70f8c3cab2..b6aeb1f70e2a0 100644 --- a/arch/loongarch/boot/dts/loongson-2k1000.dtsi +++ b/arch/loongarch/boot/dts/loongson-2k1000.dtsi @@ -100,6 +100,13 @@ #size-cells = <2>; dma-coherent; + isa@18000000 { + compatible = "isa"; + #size-cells = <1>; + #address-cells = <2>; + ranges = <1 0x0 0x0 0x18000000 0x4000>; + }; + liointc0: interrupt-controller@1fe01400 { compatible = "loongson,liointc-2.0"; reg = <0x0 0x1fe01400 0x0 0x40>, -- GitLab From b07b9f353d750ae503fc9fcbdc5f29dc38553605 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: [PATCH 689/989] LoongArch: Update dts for Loongson-2K2000 to support ISA/LPC Some Loongson-2K2000 platforms have ISA/LPC devices such as Super-IO, define an ISA node in the dts file to avoid access error. Also adjust the PCI io resource range to avoid confliction. Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/loongson-2k2000.dtsi | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/boot/dts/loongson-2k2000.dtsi b/arch/loongarch/boot/dts/loongson-2k2000.dtsi index a231949b5f553..dcee9f3783fdd 100644 --- a/arch/loongarch/boot/dts/loongson-2k2000.dtsi +++ b/arch/loongarch/boot/dts/loongson-2k2000.dtsi @@ -51,6 +51,13 @@ #address-cells = <2>; #size-cells = <2>; + isa@18400000 { + compatible = "isa"; + #size-cells = <1>; + #address-cells = <2>; + ranges = <1 0x0 0x0 0x18400000 0x4000>; + }; + pmc: power-management@100d0000 { compatible = "loongson,ls2k2000-pmc", "loongson,ls2k0500-pmc", "syscon"; reg = <0x0 0x100d0000 0x0 0x58>; @@ -141,7 +148,7 @@ #size-cells = <2>; device_type = "pci"; bus-range = <0x0 0xff>; - ranges = <0x01000000 0x0 0x00008000 0x0 0x18400000 0x0 0x00008000>, + ranges = <0x01000000 0x0 0x00008000 0x0 0x18408000 0x0 0x00008000>, <0x02000000 0x0 0x60000000 0x0 0x60000000 0x0 0x20000000>; gmac0: ethernet@3,0 { -- GitLab From 84892cebdc7fd5d6d70ba5b667a7440dfed2d032 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: [PATCH 690/989] LoongArch: Update dts for Loongson-2K2000 to support PCI-MSI Current dts file for Loongson-2K2000 misses the interrupt-controller & interrupt-cells descriptions in the msi-controller node, and misses the msi-parent link in the pci root node. Add them to support PCI-MSI. Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/loongson-2k2000.dtsi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/loongarch/boot/dts/loongson-2k2000.dtsi b/arch/loongarch/boot/dts/loongson-2k2000.dtsi index dcee9f3783fdd..481c6c869a8b2 100644 --- a/arch/loongarch/boot/dts/loongson-2k2000.dtsi +++ b/arch/loongarch/boot/dts/loongson-2k2000.dtsi @@ -116,6 +116,8 @@ msi: msi-controller@1fe01140 { compatible = "loongson,pch-msi-1.0"; reg = <0x0 0x1fe01140 0x0 0x8>; + interrupt-controller; + #interrupt-cells = <1>; msi-controller; loongson,msi-base-vec = <64>; loongson,msi-num-vecs = <192>; @@ -147,6 +149,7 @@ #address-cells = <3>; #size-cells = <2>; device_type = "pci"; + msi-parent = <&msi>; bus-range = <0x0 0xff>; ranges = <0x01000000 0x0 0x00008000 0x0 0x18408000 0x0 0x00008000>, <0x02000000 0x0 0x60000000 0x0 0x60000000 0x0 0x20000000>; -- GitLab From 3744e0ee80251149135aac59870147e9ed6faae7 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: [PATCH 691/989] LoongArch: Update dts for Loongson-2K2000 to support GMAC/GNET Current dts file for Loongson-2K2000's GMAC/GNET is incomplete, both irq and phy descriptions are missing. Add them to make GMAC/GNET work. Signed-off-by: Huacai Chen --- .../boot/dts/loongson-2k2000-ref.dts | 33 +++++++++++++++++++ arch/loongarch/boot/dts/loongson-2k2000.dtsi | 12 +++++-- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/boot/dts/loongson-2k2000-ref.dts b/arch/loongarch/boot/dts/loongson-2k2000-ref.dts index dca91caf895e3..74b99bd234cc3 100644 --- a/arch/loongarch/boot/dts/loongson-2k2000-ref.dts +++ b/arch/loongarch/boot/dts/loongson-2k2000-ref.dts @@ -61,12 +61,45 @@ &gmac0 { status = "okay"; + + phy-mode = "gmii"; + phy-handle = <&phy0>; + mdio { + compatible = "snps,dwmac-mdio"; + #address-cells = <1>; + #size-cells = <0>; + phy0: ethernet-phy@0 { + reg = <2>; + }; + }; }; &gmac1 { status = "okay"; + + phy-mode = "gmii"; + phy-handle = <&phy1>; + mdio { + compatible = "snps,dwmac-mdio"; + #address-cells = <1>; + #size-cells = <0>; + phy1: ethernet-phy@1 { + reg = <2>; + }; + }; }; &gmac2 { status = "okay"; + + phy-mode = "rgmii"; + phy-handle = <&phy2>; + mdio { + compatible = "snps,dwmac-mdio"; + #address-cells = <1>; + #size-cells = <0>; + phy2: ethernet-phy@2 { + reg = <0>; + }; + }; }; diff --git a/arch/loongarch/boot/dts/loongson-2k2000.dtsi b/arch/loongarch/boot/dts/loongson-2k2000.dtsi index 481c6c869a8b2..9eab2d02cbe8b 100644 --- a/arch/loongarch/boot/dts/loongson-2k2000.dtsi +++ b/arch/loongarch/boot/dts/loongson-2k2000.dtsi @@ -156,21 +156,27 @@ gmac0: ethernet@3,0 { reg = <0x1800 0x0 0x0 0x0 0x0>; - interrupts = <12 IRQ_TYPE_LEVEL_HIGH>; + interrupts = <12 IRQ_TYPE_LEVEL_HIGH>, + <13 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq", "eth_lpi"; interrupt-parent = <&pic>; status = "disabled"; }; gmac1: ethernet@3,1 { reg = <0x1900 0x0 0x0 0x0 0x0>; - interrupts = <14 IRQ_TYPE_LEVEL_HIGH>; + interrupts = <14 IRQ_TYPE_LEVEL_HIGH>, + <15 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq", "eth_lpi"; interrupt-parent = <&pic>; status = "disabled"; }; gmac2: ethernet@3,2 { reg = <0x1a00 0x0 0x0 0x0 0x0>; - interrupts = <17 IRQ_TYPE_LEVEL_HIGH>; + interrupts = <17 IRQ_TYPE_LEVEL_HIGH>, + <18 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq", "eth_lpi"; interrupt-parent = <&pic>; status = "disabled"; }; -- GitLab From a07c772fa658645887119184de48b255bf19a46e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: [PATCH 692/989] LoongArch: Include linux/sizes.h in addrspace.h to prevent build errors LoongArch's include/asm/addrspace.h uses SZ_32M and SZ_16K, so add to provide those macros to prevent build errors: In file included from ../arch/loongarch/include/asm/io.h:11, from ../include/linux/io.h:13, from ../include/linux/io-64-nonatomic-lo-hi.h:5, from ../drivers/cxl/pci.c:4: ../include/asm-generic/io.h: In function 'ioport_map': ../arch/loongarch/include/asm/addrspace.h:124:25: error: 'SZ_32M' undeclared (first use in this function); did you mean 'PS_32M'? 124 | #define PCI_IOSIZE SZ_32M Signed-off-by: Randy Dunlap Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/addrspace.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/loongarch/include/asm/addrspace.h b/arch/loongarch/include/asm/addrspace.h index b24437e28c6ed..7bd47d65bf7a0 100644 --- a/arch/loongarch/include/asm/addrspace.h +++ b/arch/loongarch/include/asm/addrspace.h @@ -11,6 +11,7 @@ #define _ASM_ADDRSPACE_H #include +#include #include -- GitLab From a9025cd1c673a8d6eefc79d911075b8b452eba8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Apr 2024 15:22:01 +0200 Subject: [PATCH 693/989] x86/topology: Don't update cpu_possible_map in topo_set_cpuids() topo_set_cpuids() updates cpu_present_map and cpu_possible map. It is invoked during enumeration and "physical hotplug" operations. In the latter case this results in a kernel crash because cpu_possible_map is marked read only after init completes. There is no reason to update cpu_possible_map in that function. During enumeration cpu_possible_map is not relevant and gets fully initialized after enumeration completed. On "physical hotplug" the bit is already set because the kernel allows only CPUs to be plugged which have been enumerated and associated to a CPU number during early boot. Remove the bogus update of cpu_possible_map. Fixes: 0e53e7b656cf ("x86/cpu/topology: Sanitize the APIC admission logic") Reported-by: Jonathan Cameron Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87ttkc6kwx.ffs@tglx --- arch/x86/kernel/cpu/topology.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index aaca8d235dc2b..d17c9b71eb4a2 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -123,7 +123,6 @@ static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id) early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id; early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id; #endif - set_cpu_possible(cpu, true); set_cpu_present(cpu, true); } @@ -210,7 +209,11 @@ static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present) topo_info.nr_disabled_cpus++; } - /* Register present and possible CPUs in the domain maps */ + /* + * Register present and possible CPUs in the domain + * maps. cpu_possible_map will be updated in + * topology_init_possible_cpus() after enumeration is done. + */ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map); } -- GitLab From f337a6a21e2fd67eadea471e93d05dd37baaa9be Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 9 Apr 2024 10:51:05 -0700 Subject: [PATCH 694/989] x86/cpu: Actually turn off mitigations by default for SPECULATION_MITIGATIONS=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initialize cpu_mitigations to CPU_MITIGATIONS_OFF if the kernel is built with CONFIG_SPECULATION_MITIGATIONS=n, as the help text quite clearly states that disabling SPECULATION_MITIGATIONS is supposed to turn off all mitigations by default. │ If you say N, all mitigations will be disabled. You really │ should know what you are doing to say so. As is, the kernel still defaults to CPU_MITIGATIONS_AUTO, which results in some mitigations being enabled in spite of SPECULATION_MITIGATIONS=n. Fixes: f43b9876e857 ("x86/retbleed: Add fine grained Kconfig knobs") Signed-off-by: Sean Christopherson Signed-off-by: Ingo Molnar Reviewed-by: Daniel Sneddon Cc: stable@vger.kernel.org Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240409175108.1512861-2-seanjc@google.com --- kernel/cpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 8f6affd051f77..07ad53b7f1195 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3207,7 +3207,8 @@ enum cpu_mitigations { }; static enum cpu_mitigations cpu_mitigations __ro_after_init = - CPU_MITIGATIONS_AUTO; + IS_ENABLED(CONFIG_SPECULATION_MITIGATIONS) ? CPU_MITIGATIONS_AUTO : + CPU_MITIGATIONS_OFF; static int __init mitigations_parse_cmdline(char *arg) { -- GitLab From 325f3fb551f8cd672dbbfc4cf58b14f9ee3fc9e8 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 10 Apr 2024 09:58:02 +0800 Subject: [PATCH 695/989] kprobes: Fix possible use-after-free issue on kprobe registration When unloading a module, its state is changing MODULE_STATE_LIVE -> MODULE_STATE_GOING -> MODULE_STATE_UNFORMED. Each change will take a time. `is_module_text_address()` and `__module_text_address()` works with MODULE_STATE_LIVE and MODULE_STATE_GOING. If we use `is_module_text_address()` and `__module_text_address()` separately, there is a chance that the first one is succeeded but the next one is failed because module->state becomes MODULE_STATE_UNFORMED between those operations. In `check_kprobe_address_safe()`, if the second `__module_text_address()` is failed, that is ignored because it expected a kernel_text address. But it may have failed simply because module->state has been changed to MODULE_STATE_UNFORMED. In this case, arm_kprobe() will try to modify non-exist module text address (use-after-free). To fix this problem, we should not use separated `is_module_text_address()` and `__module_text_address()`, but use only `__module_text_address()` once and do `try_module_get(module)` which is only available with MODULE_STATE_LIVE. Link: https://lore.kernel.org/all/20240410015802.265220-1-zhengyejian1@huawei.com/ Fixes: 28f6c37a2910 ("kprobes: Forbid probing on trampoline and BPF code areas") Cc: stable@vger.kernel.org Signed-off-by: Zheng Yejian Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9d9095e817928..65adc815fc6e6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1567,10 +1567,17 @@ static int check_kprobe_address_safe(struct kprobe *p, jump_label_lock(); preempt_disable(); - /* Ensure it is not in reserved area nor out of text */ - if (!(core_kernel_text((unsigned long) p->addr) || - is_module_text_address((unsigned long) p->addr)) || - in_gate_area_no_mm((unsigned long) p->addr) || + /* Ensure the address is in a text area, and find a module if exists. */ + *probed_mod = NULL; + if (!core_kernel_text((unsigned long) p->addr)) { + *probed_mod = __module_text_address((unsigned long) p->addr); + if (!(*probed_mod)) { + ret = -EINVAL; + goto out; + } + } + /* Ensure it is not in reserved area. */ + if (in_gate_area_no_mm((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || static_call_text_reserved(p->addr, p->addr) || @@ -1580,8 +1587,7 @@ static int check_kprobe_address_safe(struct kprobe *p, goto out; } - /* Check if 'p' is probing a module. */ - *probed_mod = __module_text_address((unsigned long) p->addr); + /* Get module refcount and reject __init functions for loaded modules. */ if (*probed_mod) { /* * We must hold a refcount of the probed module while updating -- GitLab From e3ba51ab24fddef79fc212f9840de54db8fd1685 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 5 Apr 2024 13:58:50 +1000 Subject: [PATCH 696/989] arm64: tlb: Fix TLBI RANGE operand KVM/arm64 relies on TLBI RANGE feature to flush TLBs when the dirty pages are collected by VMM and the page table entries become write protected during live migration. Unfortunately, the operand passed to the TLBI RANGE instruction isn't correctly sorted out due to the commit 117940aa6e5f ("KVM: arm64: Define kvm_tlb_flush_vmid_range()"). It leads to crash on the destination VM after live migration because TLBs aren't flushed completely and some of the dirty pages are missed. For example, I have a VM where 8GB memory is assigned, starting from 0x40000000 (1GB). Note that the host has 4KB as the base page size. In the middile of migration, kvm_tlb_flush_vmid_range() is executed to flush TLBs. It passes MAX_TLBI_RANGE_PAGES as the argument to __kvm_tlb_flush_vmid_range() and __flush_s2_tlb_range_op(). SCALE#3 and NUM#31, corresponding to MAX_TLBI_RANGE_PAGES, isn't supported by __TLBI_RANGE_NUM(). In this specific case, -1 has been returned from __TLBI_RANGE_NUM() for SCALE#3/2/1/0 and rejected by the loop in the __flush_tlb_range_op() until the variable @scale underflows and becomes -9, 0xffff708000040000 is set as the operand. The operand is wrong since it's sorted out by __TLBI_VADDR_RANGE() according to invalid @scale and @num. Fix it by extending __TLBI_RANGE_NUM() to support the combination of SCALE#3 and NUM#31. With the changes, [-1 31] instead of [-1 30] can be returned from the macro, meaning the TLBs for 0x200000 pages in the above example can be flushed in one shoot with SCALE#3 and NUM#31. The macro TLBI_RANGE_MASK is dropped since no one uses it any more. The comments are also adjusted accordingly. Fixes: 117940aa6e5f ("KVM: arm64: Define kvm_tlb_flush_vmid_range()") Cc: stable@kernel.org # v6.6+ Reported-by: Yihuang Yu Suggested-by: Marc Zyngier Signed-off-by: Gavin Shan Reviewed-by: Catalin Marinas Reviewed-by: Ryan Roberts Reviewed-by: Anshuman Khandual Reviewed-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240405035852.1532010-2-gshan@redhat.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 3b0e8248e1a41..a75de2665d844 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -161,12 +161,18 @@ static inline unsigned long get_trans_granule(void) #define MAX_TLBI_RANGE_PAGES __TLBI_RANGE_PAGES(31, 3) /* - * Generate 'num' values from -1 to 30 with -1 rejected by the - * __flush_tlb_range() loop below. + * Generate 'num' values from -1 to 31 with -1 rejected by the + * __flush_tlb_range() loop below. Its return value is only + * significant for a maximum of MAX_TLBI_RANGE_PAGES pages. If + * 'pages' is more than that, you must iterate over the overall + * range. */ -#define TLBI_RANGE_MASK GENMASK_ULL(4, 0) -#define __TLBI_RANGE_NUM(pages, scale) \ - ((((pages) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK) - 1) +#define __TLBI_RANGE_NUM(pages, scale) \ + ({ \ + int __pages = min((pages), \ + __TLBI_RANGE_PAGES(31, (scale))); \ + (__pages >> (5 * (scale) + 1)) - 1; \ + }) /* * TLB Invalidation @@ -379,10 +385,6 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) * 3. If there is 1 page remaining, flush it through non-range operations. Range * operations can only span an even number of pages. We save this for last to * ensure 64KB start alignment is maintained for the LPA2 case. - * - * Note that certain ranges can be represented by either num = 31 and - * scale or num = 0 and scale + 1. The loop below favours the latter - * since num is limited to 30 by the __TLBI_RANGE_NUM() macro. */ #define __flush_tlb_range_op(op, start, pages, stride, \ asid, tlb_level, tlbi_user, lpa2) \ -- GitLab From b37cab587aa3c9ab29c6b10aa55627dad713011f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 13 Mar 2024 15:43:18 -0400 Subject: [PATCH 697/989] Bluetooth: ISO: Don't reject BT_ISO_QOS if parameters are unset Consider certain values (0x00) as unset and load proper default if an application has not set them properly. Fixes: 0fe8c8d07134 ("Bluetooth: Split bt_iso_qos into dedicated structures") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index c8793e57f4b54..d24148ea883c4 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1451,8 +1451,8 @@ static bool check_ucast_qos(struct bt_iso_qos *qos) static bool check_bcast_qos(struct bt_iso_qos *qos) { - if (qos->bcast.sync_factor == 0x00) - return false; + if (!qos->bcast.sync_factor) + qos->bcast.sync_factor = 0x01; if (qos->bcast.packing > 0x01) return false; @@ -1475,6 +1475,9 @@ static bool check_bcast_qos(struct bt_iso_qos *qos) if (qos->bcast.skip > 0x01f3) return false; + if (!qos->bcast.sync_timeout) + qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT; + if (qos->bcast.sync_timeout < 0x000a || qos->bcast.sync_timeout > 0x4000) return false; @@ -1484,6 +1487,9 @@ static bool check_bcast_qos(struct bt_iso_qos *qos) if (qos->bcast.mse > 0x1f) return false; + if (!qos->bcast.timeout) + qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT; + if (qos->bcast.timeout < 0x000a || qos->bcast.timeout > 0x4000) return false; -- GitLab From 53cb4197e63ab2363aa28c3029061e4d516e7626 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 28 Mar 2024 15:58:10 -0400 Subject: [PATCH 698/989] Bluetooth: hci_sync: Fix using the same interval and window for Coded PHY Coded PHY recommended intervals are 3 time bigger than the 1M PHY so this aligns with that by multiplying by 3 the values given to 1M PHY since the code already used recommended values for that. Fixes: 288c90224eec ("Bluetooth: Enable all supported LE PHY by default") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 8fe02921adf15..c5d8799046ccf 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2814,8 +2814,8 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, if (qos->bcast.in.phy & BT_ISO_PHY_CODED) { cp->scanning_phys |= LE_SCAN_PHY_CODED; hci_le_scan_phy_params(phy, type, - interval, - window); + interval * 3, + window * 3); num_phy++; phy++; } @@ -2835,7 +2835,7 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, if (scan_coded(hdev)) { cp->scanning_phys |= LE_SCAN_PHY_CODED; - hci_le_scan_phy_params(phy, type, interval, window); + hci_le_scan_phy_params(phy, type, interval * 3, window * 3); num_phy++; phy++; } -- GitLab From 45d355a926ab40f3ae7bc0b0a00cb0e3e8a5a810 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Tue, 2 Apr 2024 14:32:05 +0300 Subject: [PATCH 699/989] Bluetooth: Fix memory leak in hci_req_sync_complete() In 'hci_req_sync_complete()', always free the previous sync request state before assigning reference to a new one. Reported-by: syzbot+39ec16ff6cc18b1d066d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=39ec16ff6cc18b1d066d Cc: stable@vger.kernel.org Fixes: f60cb30579d3 ("Bluetooth: Convert hci_req_sync family of function to new request API") Signed-off-by: Dmitry Antipov Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_request.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 00e02138003ec..efea25eb56ce0 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -105,8 +105,10 @@ void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, if (hdev->req_status == HCI_REQ_PEND) { hdev->req_result = result; hdev->req_status = HCI_REQ_DONE; - if (skb) + if (skb) { + kfree_skb(hdev->req_skb); hdev->req_skb = skb_get(skb); + } wake_up_interruptible(&hdev->req_wait_q); } } -- GitLab From 51eda36d33e43201e7a4fd35232e069b2c850b01 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 15:41:52 -0400 Subject: [PATCH 700/989] Bluetooth: SCO: Fix not validating setsockopt user input syzbot reported sco_sock_setsockopt() is copying data without checking user input length. BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in sco_sock_setsockopt+0xc0b/0xf90 net/bluetooth/sco.c:893 Read of size 4 at addr ffff88805f7b15a3 by task syz-executor.5/12578 Fixes: ad10b1a48754 ("Bluetooth: Add Bluetooth socket voice option") Fixes: b96e9c671b05 ("Bluetooth: Add BT_DEFER_SETUP option to sco socket") Fixes: 00398e1d5183 ("Bluetooth: Add support for BT_PKT_STATUS CMSG data for SCO connections") Fixes: f6873401a608 ("Bluetooth: Allow setting of codec for HFP offload use case") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 9 +++++++++ net/bluetooth/sco.c | 23 ++++++++++------------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 9fe95a22abeb7..eaec5d6caa29d 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -585,6 +585,15 @@ static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk, return skb; } +static inline int bt_copy_from_sockptr(void *dst, size_t dst_size, + sockptr_t src, size_t src_size) +{ + if (dst_size > src_size) + return -EINVAL; + + return copy_from_sockptr(dst, src, dst_size); +} + int bt_to_errno(u16 code); __u8 bt_status(int err); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 43daf965a01e4..368e026f4d15c 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -824,7 +824,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; - int len, err = 0; + int err = 0; struct bt_voice voice; u32 opt; struct bt_codecs *codecs; @@ -843,10 +843,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); @@ -863,11 +862,10 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, voice.setting = sco_pi(sk)->setting; - len = min_t(unsigned int, sizeof(voice), optlen); - if (copy_from_sockptr(&voice, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&voice, sizeof(voice), optval, + optlen); + if (err) break; - } /* Explicitly check for these values */ if (voice.setting != BT_VOICE_TRANSPARENT && @@ -890,10 +888,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_PKT_STATUS: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); @@ -934,9 +931,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(buffer, optval, optlen)) { + err = bt_copy_from_sockptr(buffer, optlen, optval, optlen); + if (err) { hci_dev_put(hdev); - err = -EFAULT; break; } -- GitLab From a97de7bff13b1cc825c1b1344eaed8d6c2d3e695 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 15:43:45 -0400 Subject: [PATCH 701/989] Bluetooth: RFCOMM: Fix not validating setsockopt user input syzbot reported rfcomm_sock_setsockopt_old() is copying data without checking user input length. BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in rfcomm_sock_setsockopt_old net/bluetooth/rfcomm/sock.c:632 [inline] BUG: KASAN: slab-out-of-bounds in rfcomm_sock_setsockopt+0x893/0xa70 net/bluetooth/rfcomm/sock.c:673 Read of size 4 at addr ffff8880209a8bc3 by task syz-executor632/5064 Fixes: 9f2c8a03fbb3 ("Bluetooth: Replace RFCOMM link mode with security level") Fixes: bb23c0ab8246 ("Bluetooth: Add support for deferring RFCOMM connection setup") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/rfcomm/sock.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index b54e8a530f55a..29aa07e9db9d7 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -629,7 +629,7 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, switch (optname) { case RFCOMM_LM: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { + if (bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen)) { err = -EFAULT; break; } @@ -664,7 +664,6 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; struct bt_security sec; int err = 0; - size_t len; u32 opt; BT_DBG("sk %p", sk); @@ -686,11 +685,9 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, sec.level = BT_SECURITY_LOW; - len = min_t(unsigned int, sizeof(sec), optlen); - if (copy_from_sockptr(&sec, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&sec, sizeof(sec), optval, optlen); + if (err) break; - } if (sec.level > BT_SECURITY_HIGH) { err = -EINVAL; @@ -706,10 +703,9 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); -- GitLab From 4f3951242ace5efc7131932e2e01e6ac6baed846 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 15:50:47 -0400 Subject: [PATCH 702/989] Bluetooth: L2CAP: Fix not validating setsockopt user input Check user input length before copying data. Fixes: 33575df7be67 ("Bluetooth: move l2cap_sock_setsockopt() to l2cap_sock.c") Fixes: 3ee7b7cd8390 ("Bluetooth: Add BT_MODE socket option") Signed-off-by: Eric Dumazet Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_sock.c | 52 +++++++++++++++----------------------- 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 4287aa6cc988e..e7d810b23082f 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -727,7 +727,7 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct l2cap_options opts; - int len, err = 0; + int err = 0; u32 opt; BT_DBG("sk %p", sk); @@ -754,11 +754,9 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, opts.max_tx = chan->max_tx; opts.txwin_size = chan->tx_win; - len = min_t(unsigned int, sizeof(opts), optlen); - if (copy_from_sockptr(&opts, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opts, sizeof(opts), optval, optlen); + if (err) break; - } if (opts.txwin_size > L2CAP_DEFAULT_EXT_WINDOW) { err = -EINVAL; @@ -801,10 +799,9 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; case L2CAP_LM: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt & L2CAP_LM_FIPS) { err = -EINVAL; @@ -885,7 +882,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, struct bt_security sec; struct bt_power pwr; struct l2cap_conn *conn; - int len, err = 0; + int err = 0; u32 opt; u16 mtu; u8 mode; @@ -911,11 +908,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, sec.level = BT_SECURITY_LOW; - len = min_t(unsigned int, sizeof(sec), optlen); - if (copy_from_sockptr(&sec, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&sec, sizeof(sec), optval, optlen); + if (err) break; - } if (sec.level < BT_SECURITY_LOW || sec.level > BT_SECURITY_FIPS) { @@ -960,10 +955,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) { set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); @@ -975,10 +969,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_FLUSHABLE: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt > BT_FLUSHABLE_ON) { err = -EINVAL; @@ -1010,11 +1003,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, pwr.force_active = BT_POWER_FORCE_ACTIVE_ON; - len = min_t(unsigned int, sizeof(pwr), optlen); - if (copy_from_sockptr(&pwr, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&pwr, sizeof(pwr), optval, optlen); + if (err) break; - } if (pwr.force_active) set_bit(FLAG_FORCE_ACTIVE, &chan->flags); @@ -1023,10 +1014,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_CHANNEL_POLICY: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } err = -EOPNOTSUPP; break; @@ -1055,10 +1045,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&mtu, optval, sizeof(u16))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&mtu, sizeof(mtu), optval, optlen); + if (err) break; - } if (chan->mode == L2CAP_MODE_EXT_FLOWCTL && sk->sk_state == BT_CONNECTED) @@ -1086,10 +1075,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&mode, optval, sizeof(u8))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&mode, sizeof(mode), optval, optlen); + if (err) break; - } BT_DBG("mode %u", mode); -- GitLab From 9e8742cdfc4b0e65266bb4a901a19462bda9285e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 15:56:50 -0400 Subject: [PATCH 703/989] Bluetooth: ISO: Fix not validating setsockopt user input Check user input length before copying data. Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Fixes: 0731c5ab4d51 ("Bluetooth: ISO: Add support for BT_PKT_STATUS") Fixes: f764a6c2c1e4 ("Bluetooth: ISO: Add broadcast support") Signed-off-by: Eric Dumazet Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index d24148ea883c4..ef0cc80b4c0cc 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1500,7 +1500,7 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; - int len, err = 0; + int err = 0; struct bt_iso_qos qos = default_qos; u32 opt; @@ -1515,10 +1515,9 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); @@ -1527,10 +1526,9 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_PKT_STATUS: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); @@ -1545,17 +1543,9 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; } - len = min_t(unsigned int, sizeof(qos), optlen); - - if (copy_from_sockptr(&qos, optval, len)) { - err = -EFAULT; - break; - } - - if (len == sizeof(qos.ucast) && !check_ucast_qos(&qos)) { - err = -EINVAL; + err = bt_copy_from_sockptr(&qos, sizeof(qos), optval, optlen); + if (err) break; - } iso_pi(sk)->qos = qos; iso_pi(sk)->qos_user_set = true; @@ -1570,18 +1560,16 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, } if (optlen > sizeof(iso_pi(sk)->base)) { - err = -EOVERFLOW; + err = -EINVAL; break; } - len = min_t(unsigned int, sizeof(iso_pi(sk)->base), optlen); - - if (copy_from_sockptr(iso_pi(sk)->base, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(iso_pi(sk)->base, optlen, optval, + optlen); + if (err) break; - } - iso_pi(sk)->base_len = len; + iso_pi(sk)->base_len = optlen; break; -- GitLab From b2186061d6043d6345a97100460363e990af0d46 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 16:46:50 -0400 Subject: [PATCH 704/989] Bluetooth: hci_sock: Fix not validating setsockopt user input Check user input length before copying data. Fixes: 09572fca7223 ("Bluetooth: hci_sock: Add support for BT_{SND,RCV}BUF") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sock.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 4ee1b976678b2..703b84bd48d5b 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1946,10 +1946,9 @@ static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, switch (optname) { case HCI_DATA_DIR: - if (copy_from_sockptr(&opt, optval, sizeof(opt))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, len); + if (err) break; - } if (opt) hci_pi(sk)->cmsg_mask |= HCI_CMSG_DIR; @@ -1958,10 +1957,9 @@ static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, break; case HCI_TIME_STAMP: - if (copy_from_sockptr(&opt, optval, sizeof(opt))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, len); + if (err) break; - } if (opt) hci_pi(sk)->cmsg_mask |= HCI_CMSG_TSTAMP; @@ -1979,11 +1977,9 @@ static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, uf.event_mask[1] = *((u32 *) f->event_mask + 1); } - len = min_t(unsigned int, len, sizeof(uf)); - if (copy_from_sockptr(&uf, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&uf, sizeof(uf), optval, len); + if (err) break; - } if (!capable(CAP_NET_RAW)) { uf.type_mask &= hci_sec_filter.type_mask; @@ -2042,10 +2038,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, goto done; } - if (copy_from_sockptr(&opt, optval, sizeof(opt))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, len); + if (err) break; - } hci_pi(sk)->mtu = opt; break; -- GitLab From 600b0bbe73d3a9a264694da0e4c2c0800309141e Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Thu, 4 Apr 2024 18:50:23 +0800 Subject: [PATCH 705/989] Bluetooth: l2cap: Don't double set the HCI_CONN_MGMT_CONNECTED bit The bit is set and tested inside mgmt_device_connected(), therefore we must not set it just outside the function. Fixes: eeda1bf97bb5 ("Bluetooth: hci_event: Fix not indicating new connection for BIG Sync") Signed-off-by: Archie Pusaka Reviewed-by: Manish Mandlik Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 467b242d8be07..dc08974087936 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -4054,8 +4054,7 @@ static int l2cap_connect_req(struct l2cap_conn *conn, return -EPROTO; hci_dev_lock(hdev); - if (hci_dev_test_flag(hdev, HCI_MGMT) && - !test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &hcon->flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_device_connected(hdev, hcon, NULL, 0); hci_dev_unlock(hdev); -- GitLab From 5284984a4fbacb0883bfebe905902cdda2891a07 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Apr 2024 18:32:12 +0300 Subject: [PATCH 706/989] bug: Fix no-return-statement warning with !CONFIG_BUG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BUG() does not return, and arch implementations of BUG() use unreachable() or other non-returning code. However with !CONFIG_BUG, the default implementation is often used instead, and that does not do that. x86 always uses its own implementation, but powerpc with !CONFIG_BUG gives a build error: kernel/time/timekeeping.c: In function ‘timekeeping_debug_get_ns’: kernel/time/timekeeping.c:286:1: error: no return statement in function returning non-void [-Werror=return-type] Add unreachable() to default !CONFIG_BUG BUG() implementation. Fixes: e8e9d21a5df6 ("timekeeping: Refactor timekeeping helpers") Reported-by: Naresh Kamboju Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Tested-by: Linux Kernel Functional Testing Link: https://lore.kernel.org/r/20240410153212.127477-1-adrian.hunter@intel.com Closes: https://lore.kernel.org/all/CA+G9fYvjdZCW=7ZGxS6A_3bysjQ56YF7S-+PNLQ_8a4DKh1Bhg@mail.gmail.com/ --- include/asm-generic/bug.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 6e794420bd398..b7de3a4eade1c 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -156,7 +156,10 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #else /* !CONFIG_BUG */ #ifndef HAVE_ARCH_BUG -#define BUG() do {} while (1) +#define BUG() do { \ + do {} while (1); \ + unreachable(); \ +} while (0) #endif #ifndef HAVE_ARCH_BUG_ON -- GitLab From 076361362122a6d8a4c45f172ced5576b2d4a50d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 9 Apr 2024 13:22:12 -0700 Subject: [PATCH 707/989] selftests: timers: Fix valid-adjtimex signed left-shift undefined behavior The struct adjtimex freq field takes a signed value who's units are in shifted (<<16) parts-per-million. Unfortunately for negative adjustments, the straightforward use of: freq = ppm << 16 trips undefined behavior warnings with clang: valid-adjtimex.c:66:6: warning: shifting a negative signed value is undefined [-Wshift-negative-value] -499<<16, ~~~~^ valid-adjtimex.c:67:6: warning: shifting a negative signed value is undefined [-Wshift-negative-value] -450<<16, ~~~~^ .. Fix it by using a multiply by (1 << 16) instead of shifting negative values in the valid-adjtimex test case. Align the values for better readability. Reported-by: Lee Jones Reported-by: Muhammad Usama Anjum Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240409202222.2830476-1-jstultz@google.com Link: https://lore.kernel.org/lkml/0c6d4f0d-2064-4444-986b-1d1ed782135f@collabora.com/ --- .../testing/selftests/timers/valid-adjtimex.c | 73 +++++++++---------- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c index 48b9a803235a8..d13ebde203221 100644 --- a/tools/testing/selftests/timers/valid-adjtimex.c +++ b/tools/testing/selftests/timers/valid-adjtimex.c @@ -21,9 +21,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ - - - #include #include #include @@ -62,45 +59,47 @@ int clear_time_state(void) #define NUM_FREQ_OUTOFRANGE 4 #define NUM_FREQ_INVALID 2 +#define SHIFTED_PPM (1 << 16) + long valid_freq[NUM_FREQ_VALID] = { - -499<<16, - -450<<16, - -400<<16, - -350<<16, - -300<<16, - -250<<16, - -200<<16, - -150<<16, - -100<<16, - -75<<16, - -50<<16, - -25<<16, - -10<<16, - -5<<16, - -1<<16, + -499 * SHIFTED_PPM, + -450 * SHIFTED_PPM, + -400 * SHIFTED_PPM, + -350 * SHIFTED_PPM, + -300 * SHIFTED_PPM, + -250 * SHIFTED_PPM, + -200 * SHIFTED_PPM, + -150 * SHIFTED_PPM, + -100 * SHIFTED_PPM, + -75 * SHIFTED_PPM, + -50 * SHIFTED_PPM, + -25 * SHIFTED_PPM, + -10 * SHIFTED_PPM, + -5 * SHIFTED_PPM, + -1 * SHIFTED_PPM, -1000, - 1<<16, - 5<<16, - 10<<16, - 25<<16, - 50<<16, - 75<<16, - 100<<16, - 150<<16, - 200<<16, - 250<<16, - 300<<16, - 350<<16, - 400<<16, - 450<<16, - 499<<16, + 1 * SHIFTED_PPM, + 5 * SHIFTED_PPM, + 10 * SHIFTED_PPM, + 25 * SHIFTED_PPM, + 50 * SHIFTED_PPM, + 75 * SHIFTED_PPM, + 100 * SHIFTED_PPM, + 150 * SHIFTED_PPM, + 200 * SHIFTED_PPM, + 250 * SHIFTED_PPM, + 300 * SHIFTED_PPM, + 350 * SHIFTED_PPM, + 400 * SHIFTED_PPM, + 450 * SHIFTED_PPM, + 499 * SHIFTED_PPM, }; long outofrange_freq[NUM_FREQ_OUTOFRANGE] = { - -1000<<16, - -550<<16, - 550<<16, - 1000<<16, + -1000 * SHIFTED_PPM, + -550 * SHIFTED_PPM, + 550 * SHIFTED_PPM, + 1000 * SHIFTED_PPM, }; #define LONG_MAX (~0UL>>1) -- GitLab From d9ea7a3f66a5c7e1a2f73cf4b20f5eff3ced4ff8 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 19 Mar 2024 11:43:50 +0800 Subject: [PATCH 708/989] hv: vmbus: Convert sprintf() family to sysfs_emit() family Per filesystems/sysfs.rst, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Coccinelle complains that there are still a couple of functions that use snprintf(). Convert them to sysfs_emit(). sprintf() and scnprintf() will be converted as well if these files have such abused cases. This patch is generated by make coccicheck M= MODE=patch \ COCCI=scripts/coccinelle/api/device_attr_show.cocci No functional change intended. CC: "K. Y. Srinivasan" CC: Haiyang Zhang CC: Wei Liu CC: Dexuan Cui CC: linux-hyperv@vger.kernel.org Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20240319034350.1574454-1-lizhijian@fujitsu.com Signed-off-by: Wei Liu Message-ID: <20240319034350.1574454-1-lizhijian@fujitsu.com> --- drivers/hv/vmbus_drv.c | 94 +++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 52 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 7f7965f3d1878..121f1ab32b51c 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -131,7 +131,7 @@ static ssize_t id_show(struct device *dev, struct device_attribute *dev_attr, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->offermsg.child_relid); + return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.child_relid); } static DEVICE_ATTR_RO(id); @@ -142,7 +142,7 @@ static ssize_t state_show(struct device *dev, struct device_attribute *dev_attr, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->state); + return sysfs_emit(buf, "%d\n", hv_dev->channel->state); } static DEVICE_ATTR_RO(state); @@ -153,7 +153,7 @@ static ssize_t monitor_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->offermsg.monitorid); + return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.monitorid); } static DEVICE_ATTR_RO(monitor_id); @@ -164,8 +164,8 @@ static ssize_t class_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "{%pUl}\n", - &hv_dev->channel->offermsg.offer.if_type); + return sysfs_emit(buf, "{%pUl}\n", + &hv_dev->channel->offermsg.offer.if_type); } static DEVICE_ATTR_RO(class_id); @@ -176,8 +176,8 @@ static ssize_t device_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "{%pUl}\n", - &hv_dev->channel->offermsg.offer.if_instance); + return sysfs_emit(buf, "{%pUl}\n", + &hv_dev->channel->offermsg.offer.if_instance); } static DEVICE_ATTR_RO(device_id); @@ -186,7 +186,7 @@ static ssize_t modalias_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); + return sysfs_emit(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); } static DEVICE_ATTR_RO(modalias); @@ -199,7 +199,7 @@ static ssize_t numa_node_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu)); + return sysfs_emit(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu)); } static DEVICE_ATTR_RO(numa_node); #endif @@ -212,9 +212,8 @@ static ssize_t server_monitor_pending_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_pending(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_pending); @@ -226,9 +225,8 @@ static ssize_t client_monitor_pending_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_pending(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_pending); @@ -240,9 +238,8 @@ static ssize_t server_monitor_latency_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_latency(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_latency); @@ -254,9 +251,8 @@ static ssize_t client_monitor_latency_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_latency(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_latency); @@ -268,9 +264,8 @@ static ssize_t server_monitor_conn_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_conn_id(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_conn_id); @@ -282,9 +277,8 @@ static ssize_t client_monitor_conn_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_conn_id(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_conn_id); @@ -303,7 +297,7 @@ static ssize_t out_intr_mask_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_interrupt_mask); + return sysfs_emit(buf, "%d\n", outbound.current_interrupt_mask); } static DEVICE_ATTR_RO(out_intr_mask); @@ -321,7 +315,7 @@ static ssize_t out_read_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_read_index); + return sysfs_emit(buf, "%d\n", outbound.current_read_index); } static DEVICE_ATTR_RO(out_read_index); @@ -340,7 +334,7 @@ static ssize_t out_write_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_write_index); + return sysfs_emit(buf, "%d\n", outbound.current_write_index); } static DEVICE_ATTR_RO(out_write_index); @@ -359,7 +353,7 @@ static ssize_t out_read_bytes_avail_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.bytes_avail_toread); + return sysfs_emit(buf, "%d\n", outbound.bytes_avail_toread); } static DEVICE_ATTR_RO(out_read_bytes_avail); @@ -378,7 +372,7 @@ static ssize_t out_write_bytes_avail_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.bytes_avail_towrite); + return sysfs_emit(buf, "%d\n", outbound.bytes_avail_towrite); } static DEVICE_ATTR_RO(out_write_bytes_avail); @@ -396,7 +390,7 @@ static ssize_t in_intr_mask_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_interrupt_mask); + return sysfs_emit(buf, "%d\n", inbound.current_interrupt_mask); } static DEVICE_ATTR_RO(in_intr_mask); @@ -414,7 +408,7 @@ static ssize_t in_read_index_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_read_index); + return sysfs_emit(buf, "%d\n", inbound.current_read_index); } static DEVICE_ATTR_RO(in_read_index); @@ -432,7 +426,7 @@ static ssize_t in_write_index_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_write_index); + return sysfs_emit(buf, "%d\n", inbound.current_write_index); } static DEVICE_ATTR_RO(in_write_index); @@ -451,7 +445,7 @@ static ssize_t in_read_bytes_avail_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.bytes_avail_toread); + return sysfs_emit(buf, "%d\n", inbound.bytes_avail_toread); } static DEVICE_ATTR_RO(in_read_bytes_avail); @@ -470,7 +464,7 @@ static ssize_t in_write_bytes_avail_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.bytes_avail_towrite); + return sysfs_emit(buf, "%d\n", inbound.bytes_avail_towrite); } static DEVICE_ATTR_RO(in_write_bytes_avail); @@ -480,7 +474,7 @@ static ssize_t channel_vp_mapping_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); struct vmbus_channel *channel = hv_dev->channel, *cur_sc; - int buf_size = PAGE_SIZE, n_written, tot_written; + int n_written; struct list_head *cur; if (!channel) @@ -488,25 +482,21 @@ static ssize_t channel_vp_mapping_show(struct device *dev, mutex_lock(&vmbus_connection.channel_mutex); - tot_written = snprintf(buf, buf_size, "%u:%u\n", - channel->offermsg.child_relid, channel->target_cpu); + n_written = sysfs_emit(buf, "%u:%u\n", + channel->offermsg.child_relid, + channel->target_cpu); list_for_each(cur, &channel->sc_list) { - if (tot_written >= buf_size - 1) - break; cur_sc = list_entry(cur, struct vmbus_channel, sc_list); - n_written = scnprintf(buf + tot_written, - buf_size - tot_written, - "%u:%u\n", - cur_sc->offermsg.child_relid, - cur_sc->target_cpu); - tot_written += n_written; + n_written += sysfs_emit_at(buf, n_written, "%u:%u\n", + cur_sc->offermsg.child_relid, + cur_sc->target_cpu); } mutex_unlock(&vmbus_connection.channel_mutex); - return tot_written; + return n_written; } static DEVICE_ATTR_RO(channel_vp_mapping); @@ -516,7 +506,7 @@ static ssize_t vendor_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "0x%x\n", hv_dev->vendor_id); + return sysfs_emit(buf, "0x%x\n", hv_dev->vendor_id); } static DEVICE_ATTR_RO(vendor); @@ -526,7 +516,7 @@ static ssize_t device_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "0x%x\n", hv_dev->device_id); + return sysfs_emit(buf, "0x%x\n", hv_dev->device_id); } static DEVICE_ATTR_RO(device); @@ -551,7 +541,7 @@ static ssize_t driver_override_show(struct device *dev, ssize_t len; device_lock(dev); - len = snprintf(buf, PAGE_SIZE, "%s\n", hv_dev->driver_override); + len = sysfs_emit(buf, "%s\n", hv_dev->driver_override); device_unlock(dev); return len; -- GitLab From f971f6dd3742d22dd13710306fb4365ea7bcb536 Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Fri, 22 Mar 2024 06:46:02 -0700 Subject: [PATCH 709/989] hv/hv_kvp_daemon: Handle IPv4 and Ipv6 combination for keyfile format If the network configuration strings are passed as a combination of IPv4 and IPv6 addresses, the current KVP daemon does not handle processing for the keyfile configuration format. With these changes, the keyfile config generation logic scans through the list twice to generate IPv4 and IPv6 sections for the configuration files to handle this support. Testcases ran:Rhel 9, Hyper-V VMs (IPv4 only, IPv6 only, IPv4 and IPv6 combination) Co-developed-by: Ani Sinha Signed-off-by: Ani Sinha Signed-off-by: Shradha Gupta Reviewed-by: Easwar Hariharan Tested-by: Ani Sinha Reviewed-by: Ani Sinha Link: https://lore.kernel.org/r/1711115162-11629-1-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1711115162-11629-1-git-send-email-shradhagupta@linux.microsoft.com> --- tools/hv/hv_kvp_daemon.c | 213 +++++++++++++++++++++++++++++++-------- 1 file changed, 172 insertions(+), 41 deletions(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 318e2dad27e04..ae57bf69ad4af 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -76,6 +76,12 @@ enum { DNS }; +enum { + IPV4 = 1, + IPV6, + IP_TYPE_MAX +}; + static int in_hand_shake; static char *os_name = ""; @@ -102,6 +108,11 @@ static struct utsname uts_buf; #define MAX_FILE_NAME 100 #define ENTRIES_PER_BLOCK 50 +/* + * Change this entry if the number of addresses increases in future + */ +#define MAX_IP_ENTRIES 64 +#define OUTSTR_BUF_SIZE ((INET6_ADDRSTRLEN + 1) * MAX_IP_ENTRIES) struct kvp_record { char key[HV_KVP_EXCHANGE_MAX_KEY_SIZE]; @@ -1171,6 +1182,18 @@ static int process_ip_string(FILE *f, char *ip_string, int type) return 0; } +int ip_version_check(const char *input_addr) +{ + struct in6_addr addr; + + if (inet_pton(AF_INET, input_addr, &addr)) + return IPV4; + else if (inet_pton(AF_INET6, input_addr, &addr)) + return IPV6; + + return -EINVAL; +} + /* * Only IPv4 subnet strings needs to be converted to plen * For IPv6 the subnet is already privided in plen format @@ -1197,14 +1220,75 @@ static int kvp_subnet_to_plen(char *subnet_addr_str) return plen; } +static int process_dns_gateway_nm(FILE *f, char *ip_string, int type, + int ip_sec) +{ + char addr[INET6_ADDRSTRLEN], *output_str; + int ip_offset = 0, error = 0, ip_ver; + char *param_name; + + if (type == DNS) + param_name = "dns"; + else if (type == GATEWAY) + param_name = "gateway"; + else + return -EINVAL; + + output_str = (char *)calloc(OUTSTR_BUF_SIZE, sizeof(char)); + if (!output_str) + return -ENOMEM; + + while (1) { + memset(addr, 0, sizeof(addr)); + + if (!parse_ip_val_buffer(ip_string, &ip_offset, addr, + (MAX_IP_ADDR_SIZE * 2))) + break; + + ip_ver = ip_version_check(addr); + if (ip_ver < 0) + continue; + + if ((ip_ver == IPV4 && ip_sec == IPV4) || + (ip_ver == IPV6 && ip_sec == IPV6)) { + /* + * do a bound check to avoid out-of bound writes + */ + if ((OUTSTR_BUF_SIZE - strlen(output_str)) > + (strlen(addr) + 1)) { + strncat(output_str, addr, + OUTSTR_BUF_SIZE - + strlen(output_str) - 1); + strncat(output_str, ",", + OUTSTR_BUF_SIZE - + strlen(output_str) - 1); + } + } else { + continue; + } + } + + if (strlen(output_str)) { + /* + * This is to get rid of that extra comma character + * in the end of the string + */ + output_str[strlen(output_str) - 1] = '\0'; + error = fprintf(f, "%s=%s\n", param_name, output_str); + } + + free(output_str); + return error; +} + static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, - int is_ipv6) + int ip_sec) { char addr[INET6_ADDRSTRLEN]; char subnet_addr[INET6_ADDRSTRLEN]; - int error, i = 0; + int error = 0, i = 0; int ip_offset = 0, subnet_offset = 0; - int plen; + int plen, ip_ver; memset(addr, 0, sizeof(addr)); memset(subnet_addr, 0, sizeof(subnet_addr)); @@ -1216,10 +1300,16 @@ static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, subnet_addr, (MAX_IP_ADDR_SIZE * 2))) { - if (!is_ipv6) + ip_ver = ip_version_check(addr); + if (ip_ver < 0) + continue; + + if (ip_ver == IPV4 && ip_sec == IPV4) plen = kvp_subnet_to_plen((char *)subnet_addr); - else + else if (ip_ver == IPV6 && ip_sec == IPV6) plen = atoi(subnet_addr); + else + continue; if (plen < 0) return plen; @@ -1233,17 +1323,16 @@ static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, memset(subnet_addr, 0, sizeof(subnet_addr)); } - return 0; + return error; } static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) { - int error = 0; + int error = 0, ip_ver; char if_filename[PATH_MAX]; char nm_filename[PATH_MAX]; FILE *ifcfg_file, *nmfile; char cmd[PATH_MAX]; - int is_ipv6 = 0; char *mac_addr; int str_len; @@ -1421,52 +1510,94 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) if (error) goto setval_error; - if (new_val->addr_family & ADDR_FAMILY_IPV6) { - error = fprintf(nmfile, "\n[ipv6]\n"); - if (error < 0) - goto setval_error; - is_ipv6 = 1; - } else { - error = fprintf(nmfile, "\n[ipv4]\n"); - if (error < 0) - goto setval_error; - } - /* * Now we populate the keyfile format + * + * The keyfile format expects the IPv6 and IPv4 configuration in + * different sections. Therefore we iterate through the list twice, + * once to populate the IPv4 section and the next time for IPv6 */ + ip_ver = IPV4; + do { + if (ip_ver == IPV4) { + error = fprintf(nmfile, "\n[ipv4]\n"); + if (error < 0) + goto setval_error; + } else { + error = fprintf(nmfile, "\n[ipv6]\n"); + if (error < 0) + goto setval_error; + } - if (new_val->dhcp_enabled) { - error = kvp_write_file(nmfile, "method", "", "auto"); - if (error < 0) - goto setval_error; - } else { - error = kvp_write_file(nmfile, "method", "", "manual"); + /* + * Write the configuration for ipaddress, netmask, gateway and + * name services + */ + error = process_ip_string_nm(nmfile, (char *)new_val->ip_addr, + (char *)new_val->sub_net, + ip_ver); if (error < 0) goto setval_error; - } - /* - * Write the configuration for ipaddress, netmask, gateway and - * name services - */ - error = process_ip_string_nm(nmfile, (char *)new_val->ip_addr, - (char *)new_val->sub_net, is_ipv6); - if (error < 0) - goto setval_error; + /* + * As dhcp_enabled is only valid for ipv4, we do not set dhcp + * methods for ipv6 based on dhcp_enabled flag. + * + * For ipv4, set method to manual only when dhcp_enabled is + * false and specific ipv4 addresses are configured. If neither + * dhcp_enabled is true and no ipv4 addresses are configured, + * set method to 'disabled'. + * + * For ipv6, set method to manual when we configure ipv6 + * addresses. Otherwise set method to 'auto' so that SLAAC from + * RA may be used. + */ + if (ip_ver == IPV4) { + if (new_val->dhcp_enabled) { + error = kvp_write_file(nmfile, "method", "", + "auto"); + if (error < 0) + goto setval_error; + } else if (error) { + error = kvp_write_file(nmfile, "method", "", + "manual"); + if (error < 0) + goto setval_error; + } else { + error = kvp_write_file(nmfile, "method", "", + "disabled"); + if (error < 0) + goto setval_error; + } + } else if (ip_ver == IPV6) { + if (error) { + error = kvp_write_file(nmfile, "method", "", + "manual"); + if (error < 0) + goto setval_error; + } else { + error = kvp_write_file(nmfile, "method", "", + "auto"); + if (error < 0) + goto setval_error; + } + } - /* we do not want ipv4 addresses in ipv6 section and vice versa */ - if (is_ipv6 != is_ipv4((char *)new_val->gate_way)) { - error = fprintf(nmfile, "gateway=%s\n", (char *)new_val->gate_way); + error = process_dns_gateway_nm(nmfile, + (char *)new_val->gate_way, + GATEWAY, ip_ver); if (error < 0) goto setval_error; - } - if (is_ipv6 != is_ipv4((char *)new_val->dns_addr)) { - error = fprintf(nmfile, "dns=%s\n", (char *)new_val->dns_addr); + error = process_dns_gateway_nm(nmfile, + (char *)new_val->dns_addr, DNS, + ip_ver); if (error < 0) goto setval_error; - } + + ip_ver++; + } while (ip_ver < IP_TYPE_MAX); + fclose(nmfile); fclose(ifcfg_file); -- GitLab From 03f5a999adba062456c8c818a683beb1b498983a Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:54 -0700 Subject: [PATCH 710/989] Drivers: hv: vmbus: Leak pages if set_memory_encrypted() fails In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. VMBus code could free decrypted pages if set_memory_encrypted()/decrypted() fails. Leak the pages if this happens. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-2-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-2-mhklinux@outlook.com> --- drivers/hv/connection.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 3cabeeabb1cac..f001ae880e1db 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -237,8 +237,17 @@ int vmbus_connect(void) vmbus_connection.monitor_pages[0], 1); ret |= set_memory_decrypted((unsigned long) vmbus_connection.monitor_pages[1], 1); - if (ret) + if (ret) { + /* + * If set_memory_decrypted() fails, the encryption state + * of the memory is unknown. So leak the memory instead + * of risking returning decrypted memory to the free list. + * For simplicity, always handle both pages the same. + */ + vmbus_connection.monitor_pages[0] = NULL; + vmbus_connection.monitor_pages[1] = NULL; goto cleanup; + } /* * Set_memory_decrypted() will change the memory contents if @@ -337,13 +346,19 @@ void vmbus_disconnect(void) vmbus_connection.int_page = NULL; } - set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[0], 1); - set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[1], 1); + if (vmbus_connection.monitor_pages[0]) { + if (!set_memory_encrypted( + (unsigned long)vmbus_connection.monitor_pages[0], 1)) + hv_free_hyperv_page(vmbus_connection.monitor_pages[0]); + vmbus_connection.monitor_pages[0] = NULL; + } - hv_free_hyperv_page(vmbus_connection.monitor_pages[0]); - hv_free_hyperv_page(vmbus_connection.monitor_pages[1]); - vmbus_connection.monitor_pages[0] = NULL; - vmbus_connection.monitor_pages[1] = NULL; + if (vmbus_connection.monitor_pages[1]) { + if (!set_memory_encrypted( + (unsigned long)vmbus_connection.monitor_pages[1], 1)) + hv_free_hyperv_page(vmbus_connection.monitor_pages[1]); + vmbus_connection.monitor_pages[1] = NULL; + } } /* -- GitLab From 211f514ebf1ef5de37b1cf6df9d28a56cfd242ca Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:55 -0700 Subject: [PATCH 711/989] Drivers: hv: vmbus: Track decrypted status in vmbus_gpadl In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. In order to make sure callers of vmbus_establish_gpadl() and vmbus_teardown_gpadl() don't return decrypted/shared pages to allocators, add a field in struct vmbus_gpadl to keep track of the decryption status of the buffers. This will allow the callers to know if they should free or leak the pages. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-3-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-3-mhklinux@outlook.com> --- drivers/hv/channel.c | 25 +++++++++++++++++++++---- include/linux/hyperv.h | 1 + 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index adbf674355b2b..98259b4925029 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -436,9 +436,18 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, (atomic_inc_return(&vmbus_connection.next_gpadl_handle) - 1); ret = create_gpadl_header(type, kbuffer, size, send_offset, &msginfo); - if (ret) + if (ret) { + gpadl->decrypted = false; return ret; + } + /* + * Set the "decrypted" flag to true for the set_memory_decrypted() + * success case. In the failure case, the encryption state of the + * memory is unknown. Leave "decrypted" as true to ensure the + * memory will be leaked instead of going back on the free list. + */ + gpadl->decrypted = true; ret = set_memory_decrypted((unsigned long)kbuffer, PFN_UP(size)); if (ret) { @@ -527,9 +536,15 @@ cleanup: kfree(msginfo); - if (ret) - set_memory_encrypted((unsigned long)kbuffer, - PFN_UP(size)); + if (ret) { + /* + * If set_memory_encrypted() fails, the decrypted flag is + * left as true so the memory is leaked instead of being + * put back on the free list. + */ + if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size))) + gpadl->decrypted = false; + } return ret; } @@ -850,6 +865,8 @@ post_msg_err: if (ret) pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret); + gpadl->decrypted = ret; + return ret; } EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 6ef0557b4bff8..96ceb4095425e 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -832,6 +832,7 @@ struct vmbus_gpadl { u32 gpadl_handle; u32 size; void *buffer; + bool decrypted; }; struct vmbus_channel { -- GitLab From bbf9ac34677b57506a13682b31a2a718934c0e31 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:56 -0700 Subject: [PATCH 712/989] hv_netvsc: Don't free decrypted memory In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. The netvsc driver could free decrypted/shared pages if set_memory_decrypted() fails. Check the decrypted field in the gpadl to decide whether to free the memory. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-4-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-4-mhklinux@outlook.com> --- drivers/net/hyperv/netvsc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index a6fcbda64ecc6..2b6ec979a62f2 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -154,8 +154,11 @@ static void free_netvsc_device(struct rcu_head *head) int i; kfree(nvdev->extension); - vfree(nvdev->recv_buf); - vfree(nvdev->send_buf); + + if (!nvdev->recv_buf_gpadl_handle.decrypted) + vfree(nvdev->recv_buf); + if (!nvdev->send_buf_gpadl_handle.decrypted) + vfree(nvdev->send_buf); bitmap_free(nvdev->send_section_map); for (i = 0; i < VRSS_CHANNEL_MAX; i++) { -- GitLab From 3d788b2fbe6a1a1a9e3db09742b90809d51638b7 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:57 -0700 Subject: [PATCH 713/989] uio_hv_generic: Don't free decrypted memory In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. The VMBus device UIO driver could free decrypted/shared pages if set_memory_decrypted() fails. Check the decrypted field in the gpadl to decide whether to free the memory. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-5-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-5-mhklinux@outlook.com> --- drivers/uio/uio_hv_generic.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/uio/uio_hv_generic.c b/drivers/uio/uio_hv_generic.c index 20d9762331bd7..6be3462b109ff 100644 --- a/drivers/uio/uio_hv_generic.c +++ b/drivers/uio/uio_hv_generic.c @@ -181,12 +181,14 @@ hv_uio_cleanup(struct hv_device *dev, struct hv_uio_private_data *pdata) { if (pdata->send_gpadl.gpadl_handle) { vmbus_teardown_gpadl(dev->channel, &pdata->send_gpadl); - vfree(pdata->send_buf); + if (!pdata->send_gpadl.decrypted) + vfree(pdata->send_buf); } if (pdata->recv_gpadl.gpadl_handle) { vmbus_teardown_gpadl(dev->channel, &pdata->recv_gpadl); - vfree(pdata->recv_buf); + if (!pdata->recv_gpadl.decrypted) + vfree(pdata->recv_buf); } } @@ -295,7 +297,8 @@ hv_uio_probe(struct hv_device *dev, ret = vmbus_establish_gpadl(channel, pdata->recv_buf, RECV_BUFFER_SIZE, &pdata->recv_gpadl); if (ret) { - vfree(pdata->recv_buf); + if (!pdata->recv_gpadl.decrypted) + vfree(pdata->recv_buf); goto fail_close; } @@ -317,7 +320,8 @@ hv_uio_probe(struct hv_device *dev, ret = vmbus_establish_gpadl(channel, pdata->send_buf, SEND_BUFFER_SIZE, &pdata->send_gpadl); if (ret) { - vfree(pdata->send_buf); + if (!pdata->send_gpadl.decrypted) + vfree(pdata->send_buf); goto fail_close; } -- GitLab From 30d18df6567be09c1433e81993e35e3da573ac48 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 11 Mar 2024 09:15:58 -0700 Subject: [PATCH 714/989] Drivers: hv: vmbus: Don't free ring buffers that couldn't be re-encrypted In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. The VMBus ring buffer code could free decrypted/shared pages if set_memory_decrypted() fails. Check the decrypted field in the struct vmbus_gpadl for the ring buffers to decide whether to free the memory. Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-6-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-6-mhklinux@outlook.com> --- drivers/hv/channel.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 98259b4925029..fb8cd8469328e 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -153,7 +153,9 @@ void vmbus_free_ring(struct vmbus_channel *channel) hv_ringbuffer_cleanup(&channel->inbound); if (channel->ringbuffer_page) { - __free_pages(channel->ringbuffer_page, + /* In a CoCo VM leak the memory if it didn't get re-encrypted */ + if (!channel->ringbuffer_gpadlhandle.decrypted) + __free_pages(channel->ringbuffer_page, get_order(channel->ringbuffer_pagecount << PAGE_SHIFT)); channel->ringbuffer_page = NULL; -- GitLab From a4833e3abae132d613ce7da0e0c9a9465d1681fa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 10 Apr 2024 12:38:13 -0400 Subject: [PATCH 715/989] SUNRPC: Fix rpcgss_context trace event acceptor field The rpcgss_context trace event acceptor field is a dynamically sized string that records the "data" parameter. But this parameter is also dependent on the "len" field to determine the size of the data. It needs to use __string_len() helper macro where the length can be passed in. It also incorrectly uses strncpy() to save it instead of __assign_str(). As these macros can change, it is not wise to open code them in trace events. As of commit c759e609030c ("tracing: Remove __assign_str_len()"), __assign_str() can be used for both __string() and __string_len() fields. Before that commit, __assign_str_len() is required to be used. This needs to be noted for backporting. (In actuality, commit c1fa617caeb0 ("tracing: Rework __assign_str() and __string() to not duplicate getting the string") is the commit that makes __string_str_len() obsolete). Cc: stable@vger.kernel.org Fixes: 0c77668ddb4e ("SUNRPC: Introduce trace points in rpc_auth_gss.ko") Signed-off-by: Steven Rostedt (Google) Signed-off-by: Chuck Lever --- include/trace/events/rpcgss.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h index ba2d96a1bc2f9..f50fcafc69de2 100644 --- a/include/trace/events/rpcgss.h +++ b/include/trace/events/rpcgss.h @@ -609,7 +609,7 @@ TRACE_EVENT(rpcgss_context, __field(unsigned int, timeout) __field(u32, window_size) __field(int, len) - __string(acceptor, data) + __string_len(acceptor, data, len) ), TP_fast_assign( @@ -618,7 +618,7 @@ TRACE_EVENT(rpcgss_context, __entry->timeout = timeout; __entry->window_size = window_size; __entry->len = len; - strncpy(__get_str(acceptor), data, len); + __assign_str(acceptor, data); ), TP_printk("win_size=%u expiry=%lu now=%lu timeout=%u acceptor=%.*s", -- GitLab From ec4535b2a1d709d3a1fbec26739c672f13c98a7b Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 8 Apr 2024 18:32:17 -0300 Subject: [PATCH 716/989] smb: client: fix NULL ptr deref in cifs_mark_open_handles_for_deleted_file() cifs_get_fattr() may be called with a NULL inode, so check for a non-NULL inode before calling cifs_mark_open_handles_for_deleted_file(). This fixes the following oops: mount.cifs //srv/share /mnt -o ...,vers=3.1.1 cd /mnt touch foo; tail -f foo & rm foo cat foo BUG: kernel NULL pointer dereference, address: 00000000000005c0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 2 PID: 696 Comm: cat Not tainted 6.9.0-rc2 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-1.fc39 04/01/2014 RIP: 0010:__lock_acquire+0x5d/0x1c70 Code: 00 00 44 8b a4 24 a0 00 00 00 45 85 f6 0f 84 bb 06 00 00 8b 2d 48 e2 95 01 45 89 c3 41 89 d2 45 89 c8 85 ed 0 0 <48> 81 3f 40 7a 76 83 44 0f 44 d8 83 fe 01 0f 86 1b 03 00 00 31 d2 RSP: 0018:ffffc90000b37490 EFLAGS: 00010002 RAX: 0000000000000000 RBX: ffff888110021ec0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000000005c0 RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000200 FS: 00007f2a1fa08740(0000) GS:ffff888157a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000005c0 CR3: 000000011ac7c000 CR4: 0000000000750ef0 PKRU: 55555554 Call Trace: ? __die+0x23/0x70 ? page_fault_oops+0x180/0x490 ? srso_alias_return_thunk+0x5/0xfbef5 ? exc_page_fault+0x70/0x230 ? asm_exc_page_fault+0x26/0x30 ? __lock_acquire+0x5d/0x1c70 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 lock_acquire+0xc0/0x2d0 ? cifs_mark_open_handles_for_deleted_file+0x3a/0x100 [cifs] ? srso_alias_return_thunk+0x5/0xfbef5 ? kmem_cache_alloc+0x2d9/0x370 _raw_spin_lock+0x34/0x80 ? cifs_mark_open_handles_for_deleted_file+0x3a/0x100 [cifs] cifs_mark_open_handles_for_deleted_file+0x3a/0x100 [cifs] cifs_get_fattr+0x24c/0x940 [cifs] ? srso_alias_return_thunk+0x5/0xfbef5 cifs_get_inode_info+0x96/0x120 [cifs] cifs_lookup+0x16e/0x800 [cifs] cifs_atomic_open+0xc7/0x5d0 [cifs] ? lookup_open.isra.0+0x3ce/0x5f0 ? __pfx_cifs_atomic_open+0x10/0x10 [cifs] lookup_open.isra.0+0x3ce/0x5f0 path_openat+0x42b/0xc30 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 do_filp_open+0xc4/0x170 do_sys_openat2+0xab/0xe0 __x64_sys_openat+0x57/0xa0 do_syscall_64+0xc1/0x1e0 entry_SYSCALL_64_after_hwframe+0x72/0x7a Fixes: ffceb7640cbf ("smb: client: do not defer close open handles to deleted files") Reviewed-by: Meetakshi Setiya Reviewed-by: Bharath SM Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 91b07ef9e25ca..60afab5c83d41 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1105,7 +1105,8 @@ static int cifs_get_fattr(struct cifs_open_info_data *data, } else { cifs_open_info_to_fattr(fattr, data, sb); } - if (!rc && fattr->cf_flags & CIFS_FATTR_DELETE_PENDING) + if (!rc && *inode && + (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING)) cifs_mark_open_handles_for_deleted_file(*inode, full_path); break; case -EREMOTE: -- GitLab From 06dfcd4098cfdc4d4577d94793a4f9125386da8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Mon, 8 Apr 2024 10:08:53 +0300 Subject: [PATCH 717/989] net: dsa: mt7530: fix enabling EEE on MT7531 switch on all boards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 40b5d2f15c09 ("net: dsa: mt7530: Add support for EEE features") brought EEE support but did not enable EEE on MT7531 switch MACs. EEE is enabled on MT7531 switch MACs by pulling the LAN2LED0 pin low on the board (bootstrapping), unsetting the EEE_DIS bit on the trap register, or setting the internal EEE switch bit on the CORE_PLL_GROUP4 register. Thanks to SkyLake Huang (黃啟澤) from MediaTek for providing information on the internal EEE switch bit. There are existing boards that were not designed to pull the pin low. Because of that, the EEE status currently depends on the board design. The EEE_DIS bit on the trap pertains to the LAN2LED0 pin which is usually used to control an LED. Once the bit is unset, the pin will be low. That will make the active low LED turn on. The pin is controlled by the switch PHY. It seems that the PHY controls the pin in the way that it inverts the pin state. That means depending on the wiring of the LED connected to LAN2LED0 on the board, the LED may be on without an active link. To not cause this unwanted behaviour whilst enabling EEE on all boards, set the internal EEE switch bit on the CORE_PLL_GROUP4 register. My testing on MT7531 shows a certain amount of traffic loss when EEE is enabled. That said, I haven't come across a board that enables EEE. So enable EEE on the switch MACs but disable EEE advertisement on the switch PHYs. This way, we don't change the behaviour of the majority of the boards that have this switch. The mediatek-ge PHY driver already disables EEE advertisement on the switch PHYs but my testing shows that it is somehow enabled afterwards. Disabling EEE advertisement before the PHY driver initialises keeps it off. With this change, EEE can now be enabled using ethtool. Fixes: 40b5d2f15c09 ("net: dsa: mt7530: Add support for EEE features") Reviewed-by: Florian Fainelli Signed-off-by: Arınç ÜNAL Tested-by: Daniel Golle Reviewed-by: Daniel Golle Link: https://lore.kernel.org/r/20240408-for-net-mt7530-fix-eee-for-mt7531-mt7988-v3-1-84fdef1f008b@arinc9.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mt7530.c | 17 ++++++++++++----- drivers/net/dsa/mt7530.h | 1 + 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 1035820c2377a..451ff0620c2e1 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2505,18 +2505,25 @@ mt7531_setup(struct dsa_switch *ds) mt7530_rmw(priv, MT7531_GPIO_MODE0, MT7531_GPIO0_MASK, MT7531_GPIO0_INTERRUPT); - /* Enable PHY core PLL, since phy_device has not yet been created - * provided for phy_[read,write]_mmd_indirect is called, we provide - * our own mt7531_ind_mmd_phy_[read,write] to complete this - * function. + /* Enable Energy-Efficient Ethernet (EEE) and PHY core PLL, since + * phy_device has not yet been created provided for + * phy_[read,write]_mmd_indirect is called, we provide our own + * mt7531_ind_mmd_phy_[read,write] to complete this function. */ val = mt7531_ind_c45_phy_read(priv, MT753X_CTRL_PHY_ADDR, MDIO_MMD_VEND2, CORE_PLL_GROUP4); - val |= MT7531_PHY_PLL_BYPASS_MODE; + val |= MT7531_RG_SYSPLL_DMY2 | MT7531_PHY_PLL_BYPASS_MODE; val &= ~MT7531_PHY_PLL_OFF; mt7531_ind_c45_phy_write(priv, MT753X_CTRL_PHY_ADDR, MDIO_MMD_VEND2, CORE_PLL_GROUP4, val); + /* Disable EEE advertisement on the switch PHYs. */ + for (i = MT753X_CTRL_PHY_ADDR; + i < MT753X_CTRL_PHY_ADDR + MT7530_NUM_PHYS; i++) { + mt7531_ind_c45_phy_write(priv, i, MDIO_MMD_AN, MDIO_AN_EEE_ADV, + 0); + } + mt7531_setup_common(ds); /* Setup VLAN ID 0 for VLAN-unaware bridges */ diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index d17b318e6ee48..9439db4950015 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -616,6 +616,7 @@ enum mt7531_clk_skew { #define RG_SYSPLL_DDSFBK_EN BIT(12) #define RG_SYSPLL_BIAS_EN BIT(11) #define RG_SYSPLL_BIAS_LPF_EN BIT(10) +#define MT7531_RG_SYSPLL_DMY2 BIT(6) #define MT7531_PHY_PLL_OFF BIT(5) #define MT7531_PHY_PLL_BYPASS_MODE BIT(4) -- GitLab From 5e700b384ec13f5bcac9855cb28fcc674f1d3593 Mon Sep 17 00:00:00 2001 From: Noah Loomans Date: Wed, 10 Apr 2024 20:26:19 +0200 Subject: [PATCH 718/989] platform/chrome: cros_ec_uart: properly fix race condition The cros_ec_uart_probe() function calls devm_serdev_device_open() before it calls serdev_device_set_client_ops(). This can trigger a NULL pointer dereference: BUG: kernel NULL pointer dereference, address: 0000000000000000 ... Call Trace: ... ? ttyport_receive_buf A simplified version of crashing code is as follows: static inline size_t serdev_controller_receive_buf(struct serdev_controller *ctrl, const u8 *data, size_t count) { struct serdev_device *serdev = ctrl->serdev; if (!serdev || !serdev->ops->receive_buf) // CRASH! return 0; return serdev->ops->receive_buf(serdev, data, count); } It assumes that if SERPORT_ACTIVE is set and serdev exists, serdev->ops will also exist. This conflicts with the existing cros_ec_uart_probe() logic, as it first calls devm_serdev_device_open() (which sets SERPORT_ACTIVE), and only later sets serdev->ops via serdev_device_set_client_ops(). Commit 01f95d42b8f4 ("platform/chrome: cros_ec_uart: fix race condition") attempted to fix a similar race condition, but while doing so, made the window of error for this race condition to happen much wider. Attempt to fix the race condition again, making sure we fully setup before calling devm_serdev_device_open(). Fixes: 01f95d42b8f4 ("platform/chrome: cros_ec_uart: fix race condition") Cc: stable@vger.kernel.org Signed-off-by: Noah Loomans Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20240410182618.169042-2-noah@noahloomans.com Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec_uart.c | 28 +++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_uart.c b/drivers/platform/chrome/cros_ec_uart.c index 68d80559fddc2..eb5eddeb73f72 100644 --- a/drivers/platform/chrome/cros_ec_uart.c +++ b/drivers/platform/chrome/cros_ec_uart.c @@ -263,12 +263,6 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) if (!ec_dev) return -ENOMEM; - ret = devm_serdev_device_open(dev, serdev); - if (ret) { - dev_err(dev, "Unable to open UART device"); - return ret; - } - serdev_device_set_drvdata(serdev, ec_dev); init_waitqueue_head(&ec_uart->response.wait_queue); @@ -280,14 +274,6 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) return ret; } - ret = serdev_device_set_baudrate(serdev, ec_uart->baudrate); - if (ret < 0) { - dev_err(dev, "Failed to set up host baud rate (%d)", ret); - return ret; - } - - serdev_device_set_flow_control(serdev, ec_uart->flowcontrol); - /* Initialize ec_dev for cros_ec */ ec_dev->phys_name = dev_name(dev); ec_dev->dev = dev; @@ -301,6 +287,20 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) serdev_device_set_client_ops(serdev, &cros_ec_uart_client_ops); + ret = devm_serdev_device_open(dev, serdev); + if (ret) { + dev_err(dev, "Unable to open UART device"); + return ret; + } + + ret = serdev_device_set_baudrate(serdev, ec_uart->baudrate); + if (ret < 0) { + dev_err(dev, "Failed to set up host baud rate (%d)", ret); + return ret; + } + + serdev_device_set_flow_control(serdev, ec_uart->flowcontrol); + return cros_ec_register(ec_dev); } -- GitLab From 97e176fcbbf3c0f2bd410c9b241177c051f57176 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 10 Apr 2024 15:11:28 +0200 Subject: [PATCH 719/989] r8169: add missing conditional compiling for call to r8169_remove_leds Add missing dependency on CONFIG_R8169_LEDS. As-is a link error occurs if config option CONFIG_R8169_LEDS isn't enabled. Fixes: 19fa4f2a85d7 ("r8169: fix LED-related deadlock on module removal") Reported-by: Venkat Rao Bagalkote Signed-off-by: Heiner Kallweit Tested-By: Venkat Rao Bagalkote Link: https://lore.kernel.org/r/d080038c-eb6b-45ac-9237-b8c1cdd7870f@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 8a27328eae34b..0fc5fe564ae50 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5046,7 +5046,8 @@ static void rtl_remove_one(struct pci_dev *pdev) cancel_work_sync(&tp->wk.work); - r8169_remove_leds(tp->leds); + if (IS_ENABLED(CONFIG_R8169_LEDS)) + r8169_remove_leds(tp->leds); unregister_netdev(tp->dev); -- GitLab From beccf29114886f1604e26f739cd108f048878ca8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Apr 2024 12:53:28 -0400 Subject: [PATCH 720/989] bcachefs: Fix a race in btree_update_nodes_written() One btree update might have terminated in a node update, and then while it is in flight another btree update might free that original node. This race has to be handled in btree_update_nodes_written() - we were missing a READ_ONCE(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a4a63e363047d..c4a5e83a56a43 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -704,9 +704,13 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, "%s", bch2_err_str(ret)); err: - if (as->b) { - - b = as->b; + /* + * We have to be careful because another thread might be getting ready + * to free as->b and calling btree_update_reparent() on us - we'll + * recheck under btree_update_lock below: + */ + b = READ_ONCE(as->b); + if (b) { btree_path_idx_t path_idx = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p); struct btree_path *path = trans->paths + path_idx; -- GitLab From 517236cb3e2f77bc785f06802dfbcca19dffd9ad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Apr 2024 00:10:18 -0400 Subject: [PATCH 721/989] bcachefs: Kill read lock dropping in bch2_btree_node_lock_write_nofail() dropping read locks in bch2_btree_node_lock_write_nofail() dates from before we had the cycle detector; we can now tell the cycle detector directly when taking a lock may not fail because we can't handle transaction restarts. This is needed for adding should_be_locked asserts. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index b9b151e693ed6..f2caf491957ef 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -440,33 +440,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b) { - struct btree_path *linked; - unsigned i, iter; - int ret; - - /* - * XXX BIG FAT NOTICE - * - * Drop all read locks before taking a write lock: - * - * This is a hack, because bch2_btree_node_lock_write_nofail() is a - * hack - but by dropping read locks first, this should never fail, and - * we only use this in code paths where whatever read locks we've - * already taken are no longer needed: - */ - - trans_for_each_path(trans, linked, iter) { - if (!linked->nodes_locked) - continue; - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - if (btree_node_read_locked(linked, i)) { - btree_node_unlock(trans, linked, i); - btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK); - } - } - - ret = __btree_node_lock_write(trans, path, b, true); + int ret = __btree_node_lock_write(trans, path, b, true); BUG_ON(ret); } -- GitLab From 1189bdda6c991cbf9342d84410042dd5f3a792e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Apr 2024 01:30:22 -0400 Subject: [PATCH 722/989] bcachefs: Fix __bch2_btree_and_journal_iter_init_node_iter() We weren't respecting trans->journal_replay_not_finished - we shouldn't be searching the journal keys unless we have a ref on them. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index a6413a8747d3c..1e8cf49a69353 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -469,10 +469,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, iter->trans = trans; iter->b = b; iter->node_iter = node_iter; - bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); - INIT_LIST_HEAD(&iter->journal.list); iter->pos = b->data->min_key; iter->at_end = false; + INIT_LIST_HEAD(&iter->journal.list); + + if (trans->journal_replay_not_finished) { + bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); + if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags)) + list_add(&iter->journal.list, &trans->c->journal_iters); + } } /* @@ -487,9 +492,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, bch2_btree_node_iter_init_from_start(&node_iter, b); __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); - if (trans->journal_replay_not_finished && - !test_bit(BCH_FS_may_go_rw, &trans->c->flags)) - list_add(&iter->journal.list, &trans->c->journal_iters); } /* sort and dedup all keys in the journal: */ -- GitLab From 65acf6e0501ac8880a4f73980d01b5d27648b956 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Apr 2024 12:07:41 +0000 Subject: [PATCH 723/989] netfilter: complete validation of user input In my recent commit, I missed that do_replace() handlers use copy_from_sockptr() (which I fixed), followed by unsafe copy_from_sockptr_offset() calls. In all functions, we can perform the @optlen validation before even calling xt_alloc_table_info() with the following check: if ((u64)optlen < (u64)tmp.size + sizeof(tmp)) return -EINVAL; Fixes: 0c83842df40f ("netfilter: validate user input for expected length") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Pablo Neira Ayuso Link: https://lore.kernel.org/r/20240409120741.3538135-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/netfilter/arp_tables.c | 4 ++++ net/ipv4/netfilter/ip_tables.c | 4 ++++ net/ipv6/netfilter/ip6_tables.c | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b150c9929b12e..14365b20f1c5c 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -966,6 +966,8 @@ static int do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; @@ -1266,6 +1268,8 @@ static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4876707595781..fe89a056eb06c 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1118,6 +1118,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; @@ -1504,6 +1506,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 636b360311c53..131f7bb2110d3 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1135,6 +1135,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; @@ -1513,6 +1515,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; -- GitLab From 0553e753ea9ee724acaf6b3dfc7354702af83567 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 9 Apr 2024 22:08:09 +0300 Subject: [PATCH 724/989] net/mlx5: E-switch, store eswitch pointer before registering devlink_param Next patch will move devlink register to be first. Therefore, whenever mlx5 will register a param, the user will be notified. In order to notify the user, devlink is using the get() callback of the param. Hence, resources that are being used by the get() callback must be set before the devlink param is registered. Therefore, store eswitch pointer inside mdev before registering the param. Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 9 +++------ .../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 ++++ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 3047d7015c525..1789800faaeb6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1868,6 +1868,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) if (err) goto abort; + dev->priv.eswitch = esw; err = esw_offloads_init(esw); if (err) goto reps_err; @@ -1892,11 +1893,6 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_BASIC; else esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE; - if (MLX5_ESWITCH_MANAGER(dev) && - mlx5_esw_vport_match_metadata_supported(esw)) - esw->flags |= MLX5_ESWITCH_VPORT_MATCH_METADATA; - - dev->priv.eswitch = esw; BLOCKING_INIT_NOTIFIER_HEAD(&esw->n_head); esw_info(dev, @@ -1908,6 +1904,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) reps_err: mlx5_esw_vports_cleanup(esw); + dev->priv.eswitch = NULL; abort: if (esw->work_queue) destroy_workqueue(esw->work_queue); @@ -1926,7 +1923,6 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) esw_info(esw->dev, "cleanup\n"); - esw->dev->priv.eswitch = NULL; destroy_workqueue(esw->work_queue); WARN_ON(refcount_read(&esw->qos.refcnt)); mutex_destroy(&esw->state_lock); @@ -1937,6 +1933,7 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) mutex_destroy(&esw->offloads.encap_tbl_lock); mutex_destroy(&esw->offloads.decap_tbl_lock); esw_offloads_cleanup(esw); + esw->dev->priv.eswitch = NULL; mlx5_esw_vports_cleanup(esw); debugfs_remove_recursive(esw->debugfs_root); devl_params_unregister(priv_to_devlink(esw->dev), mlx5_eswitch_params, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index baaae628b0a0f..e3cce110e52fd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -2476,6 +2476,10 @@ int esw_offloads_init(struct mlx5_eswitch *esw) if (err) return err; + if (MLX5_ESWITCH_MANAGER(esw->dev) && + mlx5_esw_vport_match_metadata_supported(esw)) + esw->flags |= MLX5_ESWITCH_VPORT_MATCH_METADATA; + err = devl_params_register(priv_to_devlink(esw->dev), esw_devlink_params, ARRAY_SIZE(esw_devlink_params)); -- GitLab From c6e77aa9dd82bc18a89bf49418f8f7e961cfccc8 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 9 Apr 2024 22:08:10 +0300 Subject: [PATCH 725/989] net/mlx5: Register devlink first under devlink lock In case device is having a non fatal FW error during probe, the driver will report the error to user via devlink. This will trigger a WARN_ON, since mlx5 is calling devlink_register() last. In order to avoid the WARN_ON[1], change mlx5 to invoke devl_register() first under devlink lock. [1] WARNING: CPU: 5 PID: 227 at net/devlink/health.c:483 devlink_recover_notify.constprop.0+0xb8/0xc0 CPU: 5 PID: 227 Comm: kworker/u16:3 Not tainted 6.4.0-rc5_for_upstream_min_debug_2023_06_12_12_38 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: mlx5_health0000:08:00.0 mlx5_fw_reporter_err_work [mlx5_core] RIP: 0010:devlink_recover_notify.constprop.0+0xb8/0xc0 Call Trace: ? __warn+0x79/0x120 ? devlink_recover_notify.constprop.0+0xb8/0xc0 ? report_bug+0x17c/0x190 ? handle_bug+0x3c/0x60 ? exc_invalid_op+0x14/0x70 ? asm_exc_invalid_op+0x16/0x20 ? devlink_recover_notify.constprop.0+0xb8/0xc0 devlink_health_report+0x4a/0x1c0 mlx5_fw_reporter_err_work+0xa4/0xd0 [mlx5_core] process_one_work+0x1bb/0x3c0 ? process_one_work+0x3c0/0x3c0 worker_thread+0x4d/0x3c0 ? process_one_work+0x3c0/0x3c0 kthread+0xc6/0xf0 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 Fixes: cf530217408e ("devlink: Notify users when objects are accessible") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/main.c | 37 ++++++++++--------- .../mellanox/mlx5/core/sf/dev/driver.c | 1 - 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c2593625c09ad..59806553889e9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1480,6 +1480,14 @@ int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev) if (err) goto err_register; + err = mlx5_crdump_enable(dev); + if (err) + mlx5_core_err(dev, "mlx5_crdump_enable failed with error code %d\n", err); + + err = mlx5_hwmon_dev_register(dev); + if (err) + mlx5_core_err(dev, "mlx5_hwmon_dev_register failed with error code %d\n", err); + mutex_unlock(&dev->intf_state_mutex); return 0; @@ -1505,7 +1513,10 @@ int mlx5_init_one(struct mlx5_core_dev *dev) int err; devl_lock(devlink); + devl_register(devlink); err = mlx5_init_one_devl_locked(dev); + if (err) + devl_unregister(devlink); devl_unlock(devlink); return err; } @@ -1517,6 +1528,8 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev) devl_lock(devlink); mutex_lock(&dev->intf_state_mutex); + mlx5_hwmon_dev_unregister(dev); + mlx5_crdump_disable(dev); mlx5_unregister_device(dev); if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { @@ -1534,6 +1547,7 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev) mlx5_function_teardown(dev, true); out: mutex_unlock(&dev->intf_state_mutex); + devl_unregister(devlink); devl_unlock(devlink); } @@ -1680,16 +1694,20 @@ int mlx5_init_one_light(struct mlx5_core_dev *dev) } devl_lock(devlink); + devl_register(devlink); + err = mlx5_devlink_params_register(priv_to_devlink(dev)); - devl_unlock(devlink); if (err) { mlx5_core_warn(dev, "mlx5_devlink_param_reg err = %d\n", err); goto query_hca_caps_err; } + devl_unlock(devlink); return 0; query_hca_caps_err: + devl_unregister(devlink); + devl_unlock(devlink); mlx5_function_disable(dev, true); out: dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; @@ -1702,6 +1720,7 @@ void mlx5_uninit_one_light(struct mlx5_core_dev *dev) devl_lock(devlink); mlx5_devlink_params_unregister(priv_to_devlink(dev)); + devl_unregister(devlink); devl_unlock(devlink); if (dev->state != MLX5_DEVICE_STATE_UP) return; @@ -1943,16 +1962,7 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id) goto err_init_one; } - err = mlx5_crdump_enable(dev); - if (err) - dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err); - - err = mlx5_hwmon_dev_register(dev); - if (err) - mlx5_core_err(dev, "mlx5_hwmon_dev_register failed with error code %d\n", err); - pci_save_state(pdev); - devlink_register(devlink); return 0; err_init_one: @@ -1973,16 +1983,9 @@ static void remove_one(struct pci_dev *pdev) struct devlink *devlink = priv_to_devlink(dev); set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state); - /* mlx5_drain_fw_reset() and mlx5_drain_health_wq() are using - * devlink notify APIs. - * Hence, we must drain them before unregistering the devlink. - */ mlx5_drain_fw_reset(dev); mlx5_drain_health_wq(dev); - devlink_unregister(devlink); mlx5_sriov_disable(pdev, false); - mlx5_hwmon_dev_unregister(dev); - mlx5_crdump_disable(dev); mlx5_uninit_one(dev); mlx5_pci_close(dev); mlx5_mdev_uninit(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c index bc863e1f062e6..e3bf8c7e4baa6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -101,7 +101,6 @@ static void mlx5_sf_dev_remove(struct auxiliary_device *adev) devlink = priv_to_devlink(mdev); set_bit(MLX5_BREAK_FW_WAIT, &mdev->intf_state); mlx5_drain_health_wq(mdev); - devlink_unregister(devlink); if (mlx5_dev_is_lightweight(mdev)) mlx5_uninit_one_light(mdev); else -- GitLab From 9f7e8fbb91f8fa29548e2f6ab50c03b628c67ede Mon Sep 17 00:00:00 2001 From: Michael Liang Date: Tue, 9 Apr 2024 22:08:11 +0300 Subject: [PATCH 726/989] net/mlx5: offset comp irq index in name by one The mlx5 comp irq name scheme is changed a little bit between commit 3663ad34bc70 ("net/mlx5: Shift control IRQ to the last index") and commit 3354822cde5a ("net/mlx5: Use dynamic msix vectors allocation"). The index in the comp irq name used to start from 0 but now it starts from 1. There is nothing critical here, but it's harmless to change back to the old behavior, a.k.a starting from 0. Fixes: 3354822cde5a ("net/mlx5: Use dynamic msix vectors allocation") Reviewed-by: Mohamed Khalfella Reviewed-by: Yuanyuan Zhong Signed-off-by: Michael Liang Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 4dcf995cb1a20..6bac8ad70ba60 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -19,6 +19,7 @@ #define MLX5_IRQ_CTRL_SF_MAX 8 /* min num of vectors for SFs to be enabled */ #define MLX5_IRQ_VEC_COMP_BASE_SF 2 +#define MLX5_IRQ_VEC_COMP_BASE 1 #define MLX5_EQ_SHARE_IRQ_MAX_COMP (8) #define MLX5_EQ_SHARE_IRQ_MAX_CTRL (UINT_MAX) @@ -246,6 +247,7 @@ static void irq_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx) return; } + vecidx -= MLX5_IRQ_VEC_COMP_BASE; snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", vecidx); } @@ -585,7 +587,7 @@ struct mlx5_irq *mlx5_irq_request_vector(struct mlx5_core_dev *dev, u16 cpu, struct mlx5_irq_table *table = mlx5_irq_table_get(dev); struct mlx5_irq_pool *pool = table->pcif_pool; struct irq_affinity_desc af_desc; - int offset = 1; + int offset = MLX5_IRQ_VEC_COMP_BASE; if (!pool->xa_num_irqs.max) offset = 0; -- GitLab From 7c6782ad4911cbee874e85630226ed389ff2e453 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 9 Apr 2024 22:08:12 +0300 Subject: [PATCH 727/989] net/mlx5: Properly link new fs rules into the tree Previously, add_rule_fg would only add newly created rules from the handle into the tree when they had a refcount of 1. On the other hand, create_flow_handle tries hard to find and reference already existing identical rules instead of creating new ones. These two behaviors can result in a situation where create_flow_handle 1) creates a new rule and references it, then 2) in a subsequent step during the same handle creation references it again, resulting in a rule with a refcount of 2 that is not linked into the tree, will have a NULL parent and root and will result in a crash when the flow group is deleted because del_sw_hw_rule, invoked on rule deletion, assumes node->parent is != NULL. This happened in the wild, due to another bug related to incorrect handling of duplicate pkt_reformat ids, which lead to the code in create_flow_handle incorrectly referencing a just-added rule in the same flow handle, resulting in the problem described above. Full details are at [1]. This patch changes add_rule_fg to add new rules without parents into the tree, properly initializing them and avoiding the crash. This makes it more consistent with how rules are added to an FTE in create_flow_handle. Fixes: 74491de93712 ("net/mlx5: Add multi dest support") Link: https://lore.kernel.org/netdev/ea5264d6-6b55-4449-a602-214c6f509c1e@163.com/T/#u [1] Signed-off-by: Cosmin Ratiu Reviewed-by: Tariq Toukan Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index e6bfa7e4f146c..2a9421342a503 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1808,8 +1808,9 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg, } trace_mlx5_fs_set_fte(fte, false); + /* Link newly added rules into the tree. */ for (i = 0; i < handle->num_rules; i++) { - if (refcount_read(&handle->rule[i]->node.refcount) == 1) { + if (!handle->rule[i]->node.parent) { tree_add_node(&handle->rule[i]->node, &fte->node); trace_mlx5_fs_add_rule(handle->rule[i]); } -- GitLab From 9eca93f4d5ab03905516a68683674d9c50ff95bd Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 9 Apr 2024 22:08:13 +0300 Subject: [PATCH 728/989] net/mlx5: Correctly compare pkt reformat ids struct mlx5_pkt_reformat contains a naked union of a u32 id and a dr_action pointer which is used when the action is SW-managed (when pkt_reformat.owner is set to MLX5_FLOW_RESOURCE_OWNER_SW). Using id directly in that case is incorrect, as it maps to the least significant 32 bits of the 64-bit pointer in mlx5_fs_dr_action and not to the pkt reformat id allocated in firmware. For the purpose of comparing whether two rules are identical, interpreting the least significant 32 bits of the mlx5_fs_dr_action pointer as an id mostly works... until it breaks horribly and produces the outcome described in [1]. This patch fixes mlx5_flow_dests_cmp to correctly compare ids using mlx5_fs_dr_action_get_pkt_reformat_id for the SW-managed rules. Link: https://lore.kernel.org/netdev/ea5264d6-6b55-4449-a602-214c6f509c1e@163.com/T/#u [1] Fixes: 6a48faeeca10 ("net/mlx5: Add direct rule fs_cmd implementation") Signed-off-by: Cosmin Ratiu Reviewed-by: Mark Bloch Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 2a9421342a503..cf085a478e3e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1664,6 +1664,16 @@ static int create_auto_flow_group(struct mlx5_flow_table *ft, return err; } +static bool mlx5_pkt_reformat_cmp(struct mlx5_pkt_reformat *p1, + struct mlx5_pkt_reformat *p2) +{ + return p1->owner == p2->owner && + (p1->owner == MLX5_FLOW_RESOURCE_OWNER_FW ? + p1->id == p2->id : + mlx5_fs_dr_action_get_pkt_reformat_id(p1) == + mlx5_fs_dr_action_get_pkt_reformat_id(p2)); +} + static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, struct mlx5_flow_destination *d2) { @@ -1675,8 +1685,8 @@ static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, ((d1->vport.flags & MLX5_FLOW_DEST_VPORT_VHCA_ID) ? (d1->vport.vhca_id == d2->vport.vhca_id) : true) && ((d1->vport.flags & MLX5_FLOW_DEST_VPORT_REFORMAT_ID) ? - (d1->vport.pkt_reformat->id == - d2->vport.pkt_reformat->id) : true)) || + mlx5_pkt_reformat_cmp(d1->vport.pkt_reformat, + d2->vport.pkt_reformat) : true)) || (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE && d1->ft == d2->ft) || (d1->type == MLX5_FLOW_DESTINATION_TYPE_TIR && -- GitLab From ee3572409f74a838154af74ce1e56e62c17786a8 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 9 Apr 2024 22:08:14 +0300 Subject: [PATCH 729/989] net/mlx5e: RSS, Block changing channels number when RXFH is configured Changing the channels number after configuring the receive flow hash indirection table may affect the RSS table size. The previous configuration may no longer be compatible with the new receive flow hash indirection table. Block changing the channels number when RXFH is configured and changing the channels number requires resizing the RSS table size. Fixes: 74a8dadac17e ("net/mlx5e: Preparations for supporting larger number of channels") Signed-off-by: Carolina Jubran Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/en_ethtool.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index cc51ce16df14a..93461b0c5703b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -451,6 +451,23 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, mutex_lock(&priv->state_lock); + /* If RXFH is configured, changing the channels number is allowed only if + * it does not require resizing the RSS table. This is because the previous + * configuration may no longer be compatible with the new RSS table. + */ + if (netif_is_rxfh_configured(priv->netdev)) { + int cur_rqt_size = mlx5e_rqt_size(priv->mdev, cur_params->num_channels); + int new_rqt_size = mlx5e_rqt_size(priv->mdev, count); + + if (new_rqt_size != cur_rqt_size) { + err = -EINVAL; + netdev_err(priv->netdev, + "%s: RXFH is configured, block changing channels number that affects RSS table size (new: %d, current: %d)\n", + __func__, new_rqt_size, cur_rqt_size); + goto out; + } + } + /* Don't allow changing the number of channels if HTB offload is active, * because the numeration of the QoS SQs will change, while per-queue * qdiscs are attached. -- GitLab From ecb829459a841198e142f72fadab56424ae96519 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 9 Apr 2024 22:08:15 +0300 Subject: [PATCH 730/989] net/mlx5e: Fix mlx5e_priv_init() cleanup flow When mlx5e_priv_init() fails, the cleanup flow calls mlx5e_selq_cleanup which calls mlx5e_selq_apply() that assures that the `priv->state_lock` is held using lockdep_is_held(). Acquire the state_lock in mlx5e_selq_cleanup(). Kernel log: ============================= WARNING: suspicious RCU usage 6.8.0-rc3_net_next_841a9b5 #1 Not tainted ----------------------------- drivers/net/ethernet/mellanox/mlx5/core/en/selq.c:124 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 2 locks held by systemd-modules/293: #0: ffffffffa05067b0 (devices_rwsem){++++}-{3:3}, at: ib_register_client+0x109/0x1b0 [ib_core] #1: ffff8881096c65c0 (&device->client_data_rwsem){++++}-{3:3}, at: add_client_context+0x104/0x1c0 [ib_core] stack backtrace: CPU: 4 PID: 293 Comm: systemd-modules Not tainted 6.8.0-rc3_net_next_841a9b5 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x8a/0xa0 lockdep_rcu_suspicious+0x154/0x1a0 mlx5e_selq_apply+0x94/0xa0 [mlx5_core] mlx5e_selq_cleanup+0x3a/0x60 [mlx5_core] mlx5e_priv_init+0x2be/0x2f0 [mlx5_core] mlx5_rdma_setup_rn+0x7c/0x1a0 [mlx5_core] rdma_init_netdev+0x4e/0x80 [ib_core] ? mlx5_rdma_netdev_free+0x70/0x70 [mlx5_core] ipoib_intf_init+0x64/0x550 [ib_ipoib] ipoib_intf_alloc+0x4e/0xc0 [ib_ipoib] ipoib_add_one+0xb0/0x360 [ib_ipoib] add_client_context+0x112/0x1c0 [ib_core] ib_register_client+0x166/0x1b0 [ib_core] ? 0xffffffffa0573000 ipoib_init_module+0xeb/0x1a0 [ib_ipoib] do_one_initcall+0x61/0x250 do_init_module+0x8a/0x270 init_module_from_file+0x8b/0xd0 idempotent_init_module+0x17d/0x230 __x64_sys_finit_module+0x61/0xb0 do_syscall_64+0x71/0x140 entry_SYSCALL_64_after_hwframe+0x46/0x4e Fixes: 8bf30be75069 ("net/mlx5e: Introduce select queue parameters") Signed-off-by: Carolina Jubran Reviewed-by: Tariq Toukan Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-8-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/selq.c | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/selq.c b/drivers/net/ethernet/mellanox/mlx5/core/en/selq.c index f675b1926340f..f66bbc8464645 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/selq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/selq.c @@ -57,6 +57,7 @@ int mlx5e_selq_init(struct mlx5e_selq *selq, struct mutex *state_lock) void mlx5e_selq_cleanup(struct mlx5e_selq *selq) { + mutex_lock(selq->state_lock); WARN_ON_ONCE(selq->is_prepared); kvfree(selq->standby); @@ -67,6 +68,7 @@ void mlx5e_selq_cleanup(struct mlx5e_selq *selq) kvfree(selq->standby); selq->standby = NULL; + mutex_unlock(selq->state_lock); } void mlx5e_selq_prepare_params(struct mlx5e_selq *selq, struct mlx5e_params *params) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 91848eae45655..b375ef268671a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5726,9 +5726,7 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv) kfree(priv->tx_rates); kfree(priv->txq2sq); destroy_workqueue(priv->wq); - mutex_lock(&priv->state_lock); mlx5e_selq_cleanup(&priv->selq); - mutex_unlock(&priv->state_lock); free_cpumask_var(priv->scratchpad.cpumask); for (i = 0; i < priv->htb_max_qos_sqs; i++) -- GitLab From 2f436f1869771d46e1a9f85738d5a1a7c5653a4e Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 9 Apr 2024 22:08:16 +0300 Subject: [PATCH 731/989] net/mlx5e: HTB, Fix inconsistencies with QoS SQs number When creating a new HTB class while the interface is down, the variable that follows the number of QoS SQs (htb_max_qos_sqs) may not be consistent with the number of HTB classes. Previously, we compared these two values to ensure that the node_qid is lower than the number of QoS SQs, and we allocated stats for that SQ when they are equal. Change the check to compare the node_qid with the current number of leaf nodes and fix the checking conditions to ensure allocation of stats_list and stats for each node. Fixes: 214baf22870c ("net/mlx5e: Support HTB offload") Signed-off-by: Carolina Jubran Reviewed-by: Tariq Toukan Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-9-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en/qos.c | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c index e87e26f2c669c..6743806b84806 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c @@ -83,24 +83,25 @@ int mlx5e_open_qos_sq(struct mlx5e_priv *priv, struct mlx5e_channels *chs, txq_ix = mlx5e_qid_from_qos(chs, node_qid); - WARN_ON(node_qid > priv->htb_max_qos_sqs); - if (node_qid == priv->htb_max_qos_sqs) { - struct mlx5e_sq_stats *stats, **stats_list = NULL; - - if (priv->htb_max_qos_sqs == 0) { - stats_list = kvcalloc(mlx5e_qos_max_leaf_nodes(priv->mdev), - sizeof(*stats_list), - GFP_KERNEL); - if (!stats_list) - return -ENOMEM; - } + WARN_ON(node_qid >= mlx5e_htb_cur_leaf_nodes(priv->htb)); + if (!priv->htb_qos_sq_stats) { + struct mlx5e_sq_stats **stats_list; + + stats_list = kvcalloc(mlx5e_qos_max_leaf_nodes(priv->mdev), + sizeof(*stats_list), GFP_KERNEL); + if (!stats_list) + return -ENOMEM; + + WRITE_ONCE(priv->htb_qos_sq_stats, stats_list); + } + + if (!priv->htb_qos_sq_stats[node_qid]) { + struct mlx5e_sq_stats *stats; + stats = kzalloc(sizeof(*stats), GFP_KERNEL); - if (!stats) { - kvfree(stats_list); + if (!stats) return -ENOMEM; - } - if (stats_list) - WRITE_ONCE(priv->htb_qos_sq_stats, stats_list); + WRITE_ONCE(priv->htb_qos_sq_stats[node_qid], stats); /* Order htb_max_qos_sqs increment after writing the array pointer. * Pairs with smp_load_acquire in en_stats.c. -- GitLab From 86b0ca5b118d3a0bae5e5645a13e66f8a4f6c525 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 9 Apr 2024 22:08:17 +0300 Subject: [PATCH 732/989] net/mlx5e: Do not produce metadata freelist entries in Tx port ts WQE xmit Free Tx port timestamping metadata entries in the NAPI poll context and consume metadata enties in the WQE xmit path. Do not free a Tx port timestamping metadata entry in the WQE xmit path even in the error path to avoid a race between two metadata entry producers. Fixes: 3178308ad4ca ("net/mlx5e: Make tx_port_ts logic resilient to out-of-order CQEs") Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-10-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h | 8 +++++++- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 7 +++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h index 86f1854698b4e..883c044852f1d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h @@ -95,9 +95,15 @@ static inline void mlx5e_ptp_metadata_fifo_push(struct mlx5e_ptp_metadata_fifo * } static inline u8 +mlx5e_ptp_metadata_fifo_peek(struct mlx5e_ptp_metadata_fifo *fifo) +{ + return fifo->data[fifo->mask & fifo->cc]; +} + +static inline void mlx5e_ptp_metadata_fifo_pop(struct mlx5e_ptp_metadata_fifo *fifo) { - return fifo->data[fifo->mask & fifo->cc++]; + fifo->cc++; } static inline void diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 2fa076b23fbea..e21a3b4128ce8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -398,6 +398,8 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb, (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) { u8 metadata_index = be32_to_cpu(eseg->flow_table_metadata); + mlx5e_ptp_metadata_fifo_pop(&sq->ptpsq->metadata_freelist); + mlx5e_skb_cb_hwtstamp_init(skb); mlx5e_ptp_metadata_map_put(&sq->ptpsq->metadata_map, skb, metadata_index); @@ -496,9 +498,6 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, err_drop: stats->dropped++; - if (unlikely(sq->ptpsq && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) - mlx5e_ptp_metadata_fifo_push(&sq->ptpsq->metadata_freelist, - be32_to_cpu(eseg->flow_table_metadata)); dev_kfree_skb_any(skb); mlx5e_tx_flush(sq); } @@ -657,7 +656,7 @@ static void mlx5e_cqe_ts_id_eseg(struct mlx5e_ptpsq *ptpsq, struct sk_buff *skb, { if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) eseg->flow_table_metadata = - cpu_to_be32(mlx5e_ptp_metadata_fifo_pop(&ptpsq->metadata_freelist)); + cpu_to_be32(mlx5e_ptp_metadata_fifo_peek(&ptpsq->metadata_freelist)); } static void mlx5e_txwqe_build_eseg(struct mlx5e_priv *priv, struct mlx5e_txqsq *sq, -- GitLab From 49e6c9387051716169ff6a6c5ddd4d9f358db2e9 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 9 Apr 2024 22:08:18 +0300 Subject: [PATCH 733/989] net/mlx5e: RSS, Block XOR hash with over 128 channels When supporting more than 128 channels, the RQT size is calculated by multiplying the number of channels by 2 and rounding up to the nearest power of 2. The index of the RQT is derived from the RSS hash calculations. If XOR8 is used as the RSS hash function, there are only 256 possible hash results, and therefore, only 256 indexes can be reached in the RQT. Block setting the RSS hash function to XOR when the number of channels exceeds 128. Fixes: 74a8dadac17e ("net/mlx5e: Preparations for supporting larger number of channels") Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-11-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en/rqt.c | 7 +++++ .../net/ethernet/mellanox/mlx5/core/en/rqt.h | 1 + .../ethernet/mellanox/mlx5/core/en_ethtool.c | 28 +++++++++++++++++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c index bcafb4bf94154..8d9a3b5ec973b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c @@ -179,6 +179,13 @@ u32 mlx5e_rqt_size(struct mlx5_core_dev *mdev, unsigned int num_channels) return min_t(u32, rqt_size, max_cap_rqt_size); } +#define MLX5E_MAX_RQT_SIZE_ALLOWED_WITH_XOR8_HASH 256 + +unsigned int mlx5e_rqt_max_num_channels_allowed_for_xor8(void) +{ + return MLX5E_MAX_RQT_SIZE_ALLOWED_WITH_XOR8_HASH / MLX5E_UNIFORM_SPREAD_RQT_FACTOR; +} + void mlx5e_rqt_destroy(struct mlx5e_rqt *rqt) { mlx5_core_destroy_rqt(rqt->mdev, rqt->rqtn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h index e0bc30308c770..2f9e04a8418f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h @@ -38,6 +38,7 @@ static inline u32 mlx5e_rqt_get_rqtn(struct mlx5e_rqt *rqt) } u32 mlx5e_rqt_size(struct mlx5_core_dev *mdev, unsigned int num_channels); +unsigned int mlx5e_rqt_max_num_channels_allowed_for_xor8(void); int mlx5e_rqt_redirect_direct(struct mlx5e_rqt *rqt, u32 rqn, u32 *vhca_id); int mlx5e_rqt_redirect_indir(struct mlx5e_rqt *rqt, u32 *rqns, u32 *vhca_ids, unsigned int num_rqns, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 93461b0c5703b..8f101181648c6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -451,6 +451,17 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, mutex_lock(&priv->state_lock); + if (mlx5e_rx_res_get_current_hash(priv->rx_res).hfunc == ETH_RSS_HASH_XOR) { + unsigned int xor8_max_channels = mlx5e_rqt_max_num_channels_allowed_for_xor8(); + + if (count > xor8_max_channels) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: Requested number of channels (%d) exceeds the maximum allowed by the XOR8 RSS hfunc (%d)\n", + __func__, count, xor8_max_channels); + goto out; + } + } + /* If RXFH is configured, changing the channels number is allowed only if * it does not require resizing the RSS table. This is because the previous * configuration may no longer be compatible with the new RSS table. @@ -1298,17 +1309,30 @@ int mlx5e_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh, struct mlx5e_priv *priv = netdev_priv(dev); u32 *rss_context = &rxfh->rss_context; u8 hfunc = rxfh->hfunc; + unsigned int count; int err; mutex_lock(&priv->state_lock); + + count = priv->channels.params.num_channels; + + if (hfunc == ETH_RSS_HASH_XOR) { + unsigned int xor8_max_channels = mlx5e_rqt_max_num_channels_allowed_for_xor8(); + + if (count > xor8_max_channels) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: Cannot set RSS hash function to XOR, current number of channels (%d) exceeds the maximum allowed for XOR8 RSS hfunc (%d)\n", + __func__, count, xor8_max_channels); + goto unlock; + } + } + if (*rss_context && rxfh->rss_delete) { err = mlx5e_rx_res_rss_destroy(priv->rx_res, *rss_context); goto unlock; } if (*rss_context == ETH_RXFH_CONTEXT_ALLOC) { - unsigned int count = priv->channels.params.num_channels; - err = mlx5e_rx_res_rss_init(priv->rx_res, rss_context, count); if (err) goto unlock; -- GitLab From 7772dc7460e8ef359f3eee88c3b708cb403e19af Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 9 Apr 2024 22:08:19 +0300 Subject: [PATCH 734/989] net/mlx5: Disallow SRIOV switchdev mode when in multi-PF netdev Adaptations need to be made for the auxiliary device management in the core driver level. Block this combination for now. Fixes: 678eb448055a ("net/mlx5: SD, Implement basic query and instantiation") Signed-off-by: Tariq Toukan Reviewed-by: Dragos Tatulea Reviewed-by: Gal Pressman Link: https://lore.kernel.org/r/20240409190820.227554-12-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index e3cce110e52fd..1f60954c12f72 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -43,6 +43,7 @@ #include "rdma.h" #include "en.h" #include "fs_core.h" +#include "lib/mlx5.h" #include "lib/devcom.h" #include "lib/eq.h" #include "lib/fs_chains.h" @@ -3711,6 +3712,12 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, if (esw_mode_from_devlink(mode, &mlx5_mode)) return -EINVAL; + if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV && mlx5_get_sd(esw->dev)) { + NL_SET_ERR_MSG_MOD(extack, + "Can't change E-Switch mode to switchdev when multi-PF netdev (Socket Direct) is configured."); + return -EPERM; + } + mlx5_lag_disable_change(esw->dev); err = mlx5_esw_try_lock(esw); if (err < 0) { -- GitLab From fe87922cee6161f066f4b9dd542033e048eeedaf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2024 09:41:10 +0200 Subject: [PATCH 735/989] net/mlx5: fix possible stack overflows A couple of debug functions use a 512 byte temporary buffer and call another function that has another buffer of the same size, which in turn exceeds the usual warning limit for excessive stack usage: drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c:1073:1: error: stack frame size (1448) exceeds limit (1024) in 'dr_dump_start' [-Werror,-Wframe-larger-than] dr_dump_start(struct seq_file *file, loff_t *pos) drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c:1009:1: error: stack frame size (1120) exceeds limit (1024) in 'dr_dump_domain' [-Werror,-Wframe-larger-than] dr_dump_domain(struct seq_file *file, struct mlx5dr_domain *dmn) drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c:705:1: error: stack frame size (1104) exceeds limit (1024) in 'dr_dump_matcher_rx_tx' [-Werror,-Wframe-larger-than] dr_dump_matcher_rx_tx(struct seq_file *file, bool is_rx, Rework these so that each of the various code paths only ever has one of these buffers in it, and exactly the functions that declare one have the 'noinline_for_stack' annotation that prevents them from all being inlined into the same caller. Fixes: 917d1e799ddf ("net/mlx5: DR, Change SWS usage to debug fs seq_file interface") Reviewed-by: Simon Horman Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/all/20240219100506.648089-1-arnd@kernel.org/ Signed-off-by: Arnd Bergmann Acked-by: Tariq Toukan Link: https://lore.kernel.org/r/20240408074142.3007036-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/dr_dbg.c | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c index 64f4cc284aea4..030a5776c9374 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c @@ -205,12 +205,11 @@ dr_dump_hex_print(char hex[DR_HEX_SIZE], char *src, u32 size) } static int -dr_dump_rule_action_mem(struct seq_file *file, const u64 rule_id, +dr_dump_rule_action_mem(struct seq_file *file, char *buff, const u64 rule_id, struct mlx5dr_rule_action_member *action_mem) { struct mlx5dr_action *action = action_mem->action; const u64 action_id = DR_DBG_PTR_TO_ID(action); - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; u64 hit_tbl_ptr, miss_tbl_ptr; u32 hit_tbl_id, miss_tbl_id; int ret; @@ -488,10 +487,9 @@ dr_dump_rule_action_mem(struct seq_file *file, const u64 rule_id, } static int -dr_dump_rule_mem(struct seq_file *file, struct mlx5dr_ste *ste, +dr_dump_rule_mem(struct seq_file *file, char *buff, struct mlx5dr_ste *ste, bool is_rx, const u64 rule_id, u8 format_ver) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; char hw_ste_dump[DR_HEX_SIZE]; u32 mem_rec_type; int ret; @@ -522,7 +520,8 @@ dr_dump_rule_mem(struct seq_file *file, struct mlx5dr_ste *ste, } static int -dr_dump_rule_rx_tx(struct seq_file *file, struct mlx5dr_rule_rx_tx *rule_rx_tx, +dr_dump_rule_rx_tx(struct seq_file *file, char *buff, + struct mlx5dr_rule_rx_tx *rule_rx_tx, bool is_rx, const u64 rule_id, u8 format_ver) { struct mlx5dr_ste *ste_arr[DR_RULE_MAX_STES + DR_ACTION_MAX_STES]; @@ -533,7 +532,7 @@ dr_dump_rule_rx_tx(struct seq_file *file, struct mlx5dr_rule_rx_tx *rule_rx_tx, return 0; while (i--) { - ret = dr_dump_rule_mem(file, ste_arr[i], is_rx, rule_id, + ret = dr_dump_rule_mem(file, buff, ste_arr[i], is_rx, rule_id, format_ver); if (ret < 0) return ret; @@ -542,7 +541,8 @@ dr_dump_rule_rx_tx(struct seq_file *file, struct mlx5dr_rule_rx_tx *rule_rx_tx, return 0; } -static int dr_dump_rule(struct seq_file *file, struct mlx5dr_rule *rule) +static noinline_for_stack int +dr_dump_rule(struct seq_file *file, struct mlx5dr_rule *rule) { struct mlx5dr_rule_action_member *action_mem; const u64 rule_id = DR_DBG_PTR_TO_ID(rule); @@ -565,19 +565,19 @@ static int dr_dump_rule(struct seq_file *file, struct mlx5dr_rule *rule) return ret; if (rx->nic_matcher) { - ret = dr_dump_rule_rx_tx(file, rx, true, rule_id, format_ver); + ret = dr_dump_rule_rx_tx(file, buff, rx, true, rule_id, format_ver); if (ret < 0) return ret; } if (tx->nic_matcher) { - ret = dr_dump_rule_rx_tx(file, tx, false, rule_id, format_ver); + ret = dr_dump_rule_rx_tx(file, buff, tx, false, rule_id, format_ver); if (ret < 0) return ret; } list_for_each_entry(action_mem, &rule->rule_actions_list, list) { - ret = dr_dump_rule_action_mem(file, rule_id, action_mem); + ret = dr_dump_rule_action_mem(file, buff, rule_id, action_mem); if (ret < 0) return ret; } @@ -586,10 +586,10 @@ static int dr_dump_rule(struct seq_file *file, struct mlx5dr_rule *rule) } static int -dr_dump_matcher_mask(struct seq_file *file, struct mlx5dr_match_param *mask, +dr_dump_matcher_mask(struct seq_file *file, char *buff, + struct mlx5dr_match_param *mask, u8 criteria, const u64 matcher_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; char dump[DR_HEX_SIZE]; int ret; @@ -681,10 +681,10 @@ dr_dump_matcher_mask(struct seq_file *file, struct mlx5dr_match_param *mask, } static int -dr_dump_matcher_builder(struct seq_file *file, struct mlx5dr_ste_build *builder, +dr_dump_matcher_builder(struct seq_file *file, char *buff, + struct mlx5dr_ste_build *builder, u32 index, bool is_rx, const u64 matcher_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; int ret; ret = snprintf(buff, MLX5DR_DEBUG_DUMP_BUFF_LENGTH, @@ -702,11 +702,10 @@ dr_dump_matcher_builder(struct seq_file *file, struct mlx5dr_ste_build *builder, } static int -dr_dump_matcher_rx_tx(struct seq_file *file, bool is_rx, +dr_dump_matcher_rx_tx(struct seq_file *file, char *buff, bool is_rx, struct mlx5dr_matcher_rx_tx *matcher_rx_tx, const u64 matcher_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; enum dr_dump_rec_type rec_type; u64 s_icm_addr, e_icm_addr; int i, ret; @@ -731,7 +730,7 @@ dr_dump_matcher_rx_tx(struct seq_file *file, bool is_rx, return ret; for (i = 0; i < matcher_rx_tx->num_of_builders; i++) { - ret = dr_dump_matcher_builder(file, + ret = dr_dump_matcher_builder(file, buff, &matcher_rx_tx->ste_builder[i], i, is_rx, matcher_id); if (ret < 0) @@ -741,7 +740,7 @@ dr_dump_matcher_rx_tx(struct seq_file *file, bool is_rx, return 0; } -static int +static noinline_for_stack int dr_dump_matcher(struct seq_file *file, struct mlx5dr_matcher *matcher) { struct mlx5dr_matcher_rx_tx *rx = &matcher->rx; @@ -763,19 +762,19 @@ dr_dump_matcher(struct seq_file *file, struct mlx5dr_matcher *matcher) if (ret) return ret; - ret = dr_dump_matcher_mask(file, &matcher->mask, + ret = dr_dump_matcher_mask(file, buff, &matcher->mask, matcher->match_criteria, matcher_id); if (ret < 0) return ret; if (rx->nic_tbl) { - ret = dr_dump_matcher_rx_tx(file, true, rx, matcher_id); + ret = dr_dump_matcher_rx_tx(file, buff, true, rx, matcher_id); if (ret < 0) return ret; } if (tx->nic_tbl) { - ret = dr_dump_matcher_rx_tx(file, false, tx, matcher_id); + ret = dr_dump_matcher_rx_tx(file, buff, false, tx, matcher_id); if (ret < 0) return ret; } @@ -803,11 +802,10 @@ dr_dump_matcher_all(struct seq_file *file, struct mlx5dr_matcher *matcher) } static int -dr_dump_table_rx_tx(struct seq_file *file, bool is_rx, +dr_dump_table_rx_tx(struct seq_file *file, char *buff, bool is_rx, struct mlx5dr_table_rx_tx *table_rx_tx, const u64 table_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; enum dr_dump_rec_type rec_type; u64 s_icm_addr; int ret; @@ -829,7 +827,8 @@ dr_dump_table_rx_tx(struct seq_file *file, bool is_rx, return 0; } -static int dr_dump_table(struct seq_file *file, struct mlx5dr_table *table) +static noinline_for_stack int +dr_dump_table(struct seq_file *file, struct mlx5dr_table *table) { struct mlx5dr_table_rx_tx *rx = &table->rx; struct mlx5dr_table_rx_tx *tx = &table->tx; @@ -848,14 +847,14 @@ static int dr_dump_table(struct seq_file *file, struct mlx5dr_table *table) return ret; if (rx->nic_dmn) { - ret = dr_dump_table_rx_tx(file, true, rx, + ret = dr_dump_table_rx_tx(file, buff, true, rx, DR_DBG_PTR_TO_ID(table)); if (ret < 0) return ret; } if (tx->nic_dmn) { - ret = dr_dump_table_rx_tx(file, false, tx, + ret = dr_dump_table_rx_tx(file, buff, false, tx, DR_DBG_PTR_TO_ID(table)); if (ret < 0) return ret; @@ -881,10 +880,10 @@ static int dr_dump_table_all(struct seq_file *file, struct mlx5dr_table *tbl) } static int -dr_dump_send_ring(struct seq_file *file, struct mlx5dr_send_ring *ring, +dr_dump_send_ring(struct seq_file *file, char *buff, + struct mlx5dr_send_ring *ring, const u64 domain_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; int ret; ret = snprintf(buff, MLX5DR_DEBUG_DUMP_BUFF_LENGTH, @@ -902,13 +901,13 @@ dr_dump_send_ring(struct seq_file *file, struct mlx5dr_send_ring *ring, return 0; } -static noinline_for_stack int +static int dr_dump_domain_info_flex_parser(struct seq_file *file, + char *buff, const char *flex_parser_name, const u8 flex_parser_value, const u64 domain_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; int ret; ret = snprintf(buff, MLX5DR_DEBUG_DUMP_BUFF_LENGTH, @@ -925,11 +924,11 @@ dr_dump_domain_info_flex_parser(struct seq_file *file, return 0; } -static noinline_for_stack int -dr_dump_domain_info_caps(struct seq_file *file, struct mlx5dr_cmd_caps *caps, +static int +dr_dump_domain_info_caps(struct seq_file *file, char *buff, + struct mlx5dr_cmd_caps *caps, const u64 domain_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; struct mlx5dr_cmd_vport_cap *vport_caps; unsigned long i, vports_num; int ret; @@ -969,34 +968,35 @@ dr_dump_domain_info_caps(struct seq_file *file, struct mlx5dr_cmd_caps *caps, } static int -dr_dump_domain_info(struct seq_file *file, struct mlx5dr_domain_info *info, +dr_dump_domain_info(struct seq_file *file, char *buff, + struct mlx5dr_domain_info *info, const u64 domain_id) { int ret; - ret = dr_dump_domain_info_caps(file, &info->caps, domain_id); + ret = dr_dump_domain_info_caps(file, buff, &info->caps, domain_id); if (ret < 0) return ret; - ret = dr_dump_domain_info_flex_parser(file, "icmp_dw0", + ret = dr_dump_domain_info_flex_parser(file, buff, "icmp_dw0", info->caps.flex_parser_id_icmp_dw0, domain_id); if (ret < 0) return ret; - ret = dr_dump_domain_info_flex_parser(file, "icmp_dw1", + ret = dr_dump_domain_info_flex_parser(file, buff, "icmp_dw1", info->caps.flex_parser_id_icmp_dw1, domain_id); if (ret < 0) return ret; - ret = dr_dump_domain_info_flex_parser(file, "icmpv6_dw0", + ret = dr_dump_domain_info_flex_parser(file, buff, "icmpv6_dw0", info->caps.flex_parser_id_icmpv6_dw0, domain_id); if (ret < 0) return ret; - ret = dr_dump_domain_info_flex_parser(file, "icmpv6_dw1", + ret = dr_dump_domain_info_flex_parser(file, buff, "icmpv6_dw1", info->caps.flex_parser_id_icmpv6_dw1, domain_id); if (ret < 0) @@ -1032,12 +1032,12 @@ dr_dump_domain(struct seq_file *file, struct mlx5dr_domain *dmn) if (ret) return ret; - ret = dr_dump_domain_info(file, &dmn->info, domain_id); + ret = dr_dump_domain_info(file, buff, &dmn->info, domain_id); if (ret < 0) return ret; if (dmn->info.supp_sw_steering) { - ret = dr_dump_send_ring(file, dmn->send_ring, domain_id); + ret = dr_dump_send_ring(file, buff, dmn->send_ring, domain_id); if (ret < 0) return ret; } -- GitLab From 2f7b1d8b5505efb0057cd1ab85fca206063ea4c3 Mon Sep 17 00:00:00 2001 From: Pin-yen Lin Date: Tue, 12 Mar 2024 19:51:55 +0800 Subject: [PATCH 736/989] clk: mediatek: Do a runtime PM get on controllers during probe mt8183-mfgcfg has a mutual dependency with genpd during the probing stage, which leads to a deadlock in the following call stack: CPU0: genpd_lock --> clk_prepare_lock genpd_power_off_work_fn() genpd_lock() generic_pm_domain::power_off() clk_unprepare() clk_prepare_lock() CPU1: clk_prepare_lock --> genpd_lock clk_register() __clk_core_init() clk_prepare_lock() clk_pm_runtime_get() genpd_lock() Do a runtime PM get at the probe function to make sure clk_register() won't acquire the genpd lock. Instead of only modifying mt8183-mfgcfg, do this on all mediatek clock controller probings because we don't believe this would cause any regression. Verified on MT8183 and MT8192 Chromebooks. Fixes: acddfc2c261b ("clk: mediatek: Add MT8183 clock support") Signed-off-by: Pin-yen Lin Link: https://lore.kernel.org/r/20240312115249.3341654-1-treapking@chromium.org Reviewed-by: AngeloGioacchino Del Regno Tested-by: AngeloGioacchino Del Regno Signed-off-by: Stephen Boyd --- drivers/clk/mediatek/clk-mtk.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/clk/mediatek/clk-mtk.c b/drivers/clk/mediatek/clk-mtk.c index 2e55368dc4d82..bd37ab4d1a9bb 100644 --- a/drivers/clk/mediatek/clk-mtk.c +++ b/drivers/clk/mediatek/clk-mtk.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "clk-mtk.h" @@ -494,6 +495,16 @@ static int __mtk_clk_simple_probe(struct platform_device *pdev, return IS_ERR(base) ? PTR_ERR(base) : -ENOMEM; } + + devm_pm_runtime_enable(&pdev->dev); + /* + * Do a pm_runtime_resume_and_get() to workaround a possible + * deadlock between clk_register() and the genpd framework. + */ + r = pm_runtime_resume_and_get(&pdev->dev); + if (r) + return r; + /* Calculate how many clk_hw_onecell_data entries to allocate */ num_clks = mcd->num_clks + mcd->num_composite_clks; num_clks += mcd->num_fixed_clks + mcd->num_factor_clks; @@ -574,6 +585,8 @@ static int __mtk_clk_simple_probe(struct platform_device *pdev, goto unregister_clks; } + pm_runtime_put(&pdev->dev); + return r; unregister_clks: @@ -604,6 +617,8 @@ free_data: free_base: if (mcd->shared_io && base) iounmap(base); + + pm_runtime_put(&pdev->dev); return r; } -- GitLab From d3e8a91a848a5941e3c31ecebd6b2612b37e01a6 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Wed, 13 Mar 2024 22:05:37 +0000 Subject: [PATCH 737/989] clk: mediatek: mt7988-infracfg: fix clocks for 2nd PCIe port Due to what seems to be an undocumented oddity in MediaTek's MT7988 SoC design the CLK_INFRA_PCIE_PERI_26M_CK_P2 clock requires CLK_INFRA_PCIE_PERI_26M_CK_P3 to be enabled. This currently leads to PCIe port 2 not working in Linux. Reflect the apparent relationship in the clk driver to make sure PCIe port 2 of the MT7988 SoC works. Fixes: 4b4719437d85f ("clk: mediatek: add drivers for MT7988 SoC") Suggested-by: Sam Shih Signed-off-by: Daniel Golle Link: https://lore.kernel.org/r/1da2506a51f970706bf4ec9509dd04e0471065e5.1710367453.git.daniel@makrotopia.org Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Stephen Boyd --- drivers/clk/mediatek/clk-mt7988-infracfg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/mediatek/clk-mt7988-infracfg.c b/drivers/clk/mediatek/clk-mt7988-infracfg.c index 449041f8abbc9..c8c023afe3e5a 100644 --- a/drivers/clk/mediatek/clk-mt7988-infracfg.c +++ b/drivers/clk/mediatek/clk-mt7988-infracfg.c @@ -156,7 +156,7 @@ static const struct mtk_gate infra_clks[] = { GATE_INFRA0(CLK_INFRA_PCIE_PERI_26M_CK_P1, "infra_pcie_peri_ck_26m_ck_p1", "csw_infra_f26m_sel", 8), GATE_INFRA0(CLK_INFRA_PCIE_PERI_26M_CK_P2, "infra_pcie_peri_ck_26m_ck_p2", - "csw_infra_f26m_sel", 9), + "infra_pcie_peri_ck_26m_ck_p3", 9), GATE_INFRA0(CLK_INFRA_PCIE_PERI_26M_CK_P3, "infra_pcie_peri_ck_26m_ck_p3", "csw_infra_f26m_sel", 10), /* INFRA1 */ -- GitLab From 33623113a48ea906f1955cbf71094f6aa4462e8f Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Tue, 9 Apr 2024 12:41:59 +0200 Subject: [PATCH 738/989] net: sparx5: fix wrong config being used when reconfiguring PCS The wrong port config is being used if the PCS is reconfigured. Fix this by correctly using the new config instead of the old one. Fixes: 946e7fd5053a ("net: sparx5: add port module support") Signed-off-by: Daniel Machon Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20240409-link-mode-reconfiguration-fix-v2-1-db6a507f3627@microchip.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microchip/sparx5/sparx5_port.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c index 3a1b1a1f5a195..60dd2fd603a85 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c @@ -731,7 +731,7 @@ static int sparx5_port_pcs_low_set(struct sparx5 *sparx5, bool sgmii = false, inband_aneg = false; int err; - if (port->conf.inband) { + if (conf->inband) { if (conf->portmode == PHY_INTERFACE_MODE_SGMII || conf->portmode == PHY_INTERFACE_MODE_QSGMII) inband_aneg = true; /* Cisco-SGMII in-band-aneg */ @@ -948,7 +948,7 @@ int sparx5_port_pcs_set(struct sparx5 *sparx5, if (err) return -EINVAL; - if (port->conf.inband) { + if (conf->inband) { /* Enable/disable 1G counters in ASM */ spx5_rmw(ASM_PORT_CFG_CSC_STAT_DIS_SET(high_speed_dev), ASM_PORT_CFG_CSC_STAT_DIS, -- GitLab From d51dc8dd6ab6f93a894ff8b38d3b8d02c98eb9fb Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Tue, 9 Apr 2024 13:37:53 +0200 Subject: [PATCH 739/989] Revert "s390/ism: fix receive message buffer allocation" This reverts commit 58effa3476536215530c9ec4910ffc981613b413. Review was not finished on this patch. So it's not ready for upstreaming. Signed-off-by: Gerd Bayer Link: https://lore.kernel.org/r/20240409113753.2181368-1-gbayer@linux.ibm.com Fixes: 58effa347653 ("s390/ism: fix receive message buffer allocation") Signed-off-by: Paolo Abeni --- drivers/s390/net/ism_drv.c | 38 +++++++++----------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index affb05521e146..2c8e964425dc3 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -14,8 +14,6 @@ #include #include #include -#include -#include #include "ism.h" @@ -294,15 +292,13 @@ out: static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); - dma_unmap_page(&ism->pdev->dev, dmb->dma_addr, dmb->dmb_len, - DMA_FROM_DEVICE); - folio_put(virt_to_folio(dmb->cpu_addr)); + dma_free_coherent(&ism->pdev->dev, dmb->dmb_len, + dmb->cpu_addr, dmb->dma_addr); } static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { unsigned long bit; - int rc; if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev)) return -EINVAL; @@ -319,30 +315,14 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) test_and_set_bit(dmb->sba_idx, ism->sba_bitmap)) return -EINVAL; - dmb->cpu_addr = - folio_address(folio_alloc(GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_NORETRY, - get_order(dmb->dmb_len))); + dmb->cpu_addr = dma_alloc_coherent(&ism->pdev->dev, dmb->dmb_len, + &dmb->dma_addr, + GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | __GFP_NORETRY); + if (!dmb->cpu_addr) + clear_bit(dmb->sba_idx, ism->sba_bitmap); - if (!dmb->cpu_addr) { - rc = -ENOMEM; - goto out_bit; - } - dmb->dma_addr = dma_map_page(&ism->pdev->dev, - virt_to_page(dmb->cpu_addr), 0, - dmb->dmb_len, DMA_FROM_DEVICE); - if (dma_mapping_error(&ism->pdev->dev, dmb->dma_addr)) { - rc = -ENOMEM; - goto out_free; - } - - return 0; - -out_free: - kfree(dmb->cpu_addr); -out_bit: - clear_bit(dmb->sba_idx, ism->sba_bitmap); - return rc; + return dmb->cpu_addr ? 0 : -ENOMEM; } int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, -- GitLab From 17c560113231ddc20088553c7b499b289b664311 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Tue, 9 Apr 2024 18:01:14 +0300 Subject: [PATCH 740/989] net: dsa: mt7530: trap link-local frames regardless of ST Port State MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Clause 5 of IEEE Std 802-2014, two sublayers of the data link layer (DLL) of the Open Systems Interconnection basic reference model (OSI/RM) are described; the medium access control (MAC) and logical link control (LLC) sublayers. The MAC sublayer is the one facing the physical layer. In 8.2 of IEEE Std 802.1Q-2022, the Bridge architecture is described. A Bridge component comprises a MAC Relay Entity for interconnecting the Ports of the Bridge, at least two Ports, and higher layer entities with at least a Spanning Tree Protocol Entity included. Each Bridge Port also functions as an end station and shall provide the MAC Service to an LLC Entity. Each instance of the MAC Service is provided to a distinct LLC Entity that supports protocol identification, multiplexing, and demultiplexing, for protocol data unit (PDU) transmission and reception by one or more higher layer entities. It is described in 8.13.9 of IEEE Std 802.1Q-2022 that in a Bridge, the LLC Entity associated with each Bridge Port is modeled as being directly connected to the attached Local Area Network (LAN). On the switch with CPU port architecture, CPU port functions as Management Port, and the Management Port functionality is provided by software which functions as an end station. Software is connected to an IEEE 802 LAN that is wholly contained within the system that incorporates the Bridge. Software provides access to the LLC Entity associated with each Bridge Port by the value of the source port field on the special tag on the frame received by software. We call frames that carry control information to determine the active topology and current extent of each Virtual Local Area Network (VLAN), i.e., spanning tree or Shortest Path Bridging (SPB) and Multiple VLAN Registration Protocol Data Units (MVRPDUs), and frames from other link constrained protocols, such as Extensible Authentication Protocol over LAN (EAPOL) and Link Layer Discovery Protocol (LLDP), link-local frames. They are not forwarded by a Bridge. Permanently configured entries in the filtering database (FDB) ensure that such frames are discarded by the Forwarding Process. In 8.6.3 of IEEE Std 802.1Q-2022, this is described in detail: Each of the reserved MAC addresses specified in Table 8-1 (01-80-C2-00-00-[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F]) shall be permanently configured in the FDB in C-VLAN components and ERs. Each of the reserved MAC addresses specified in Table 8-2 (01-80-C2-00-00-[01,02,03,04,05,06,07,08,09,0A,0E]) shall be permanently configured in the FDB in S-VLAN components. Each of the reserved MAC addresses specified in Table 8-3 (01-80-C2-00-00-[01,02,04,0E]) shall be permanently configured in the FDB in TPMR components. The FDB entries for reserved MAC addresses shall specify filtering for all Bridge Ports and all VIDs. Management shall not provide the capability to modify or remove entries for reserved MAC addresses. The addresses in Table 8-1, Table 8-2, and Table 8-3 determine the scope of propagation of PDUs within a Bridged Network, as follows: The Nearest Bridge group address (01-80-C2-00-00-0E) is an address that no conformant Two-Port MAC Relay (TPMR) component, Service VLAN (S-VLAN) component, Customer VLAN (C-VLAN) component, or MAC Bridge can forward. PDUs transmitted using this destination address, or any other addresses that appear in Table 8-1, Table 8-2, and Table 8-3 (01-80-C2-00-00-[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F]), can therefore travel no further than those stations that can be reached via a single individual LAN from the originating station. The Nearest non-TPMR Bridge group address (01-80-C2-00-00-03), is an address that no conformant S-VLAN component, C-VLAN component, or MAC Bridge can forward; however, this address is relayed by a TPMR component. PDUs using this destination address, or any of the other addresses that appear in both Table 8-1 and Table 8-2 but not in Table 8-3 (01-80-C2-00-00-[00,03,05,06,07,08,09,0A,0B,0C,0D,0F]), will be relayed by any TPMRs but will propagate no further than the nearest S-VLAN component, C-VLAN component, or MAC Bridge. The Nearest Customer Bridge group address (01-80-C2-00-00-00) is an address that no conformant C-VLAN component, MAC Bridge can forward; however, it is relayed by TPMR components and S-VLAN components. PDUs using this destination address, or any of the other addresses that appear in Table 8-1 but not in either Table 8-2 or Table 8-3 (01-80-C2-00-00-[00,0B,0C,0D,0F]), will be relayed by TPMR components and S-VLAN components but will propagate no further than the nearest C-VLAN component or MAC Bridge. Because the LLC Entity associated with each Bridge Port is provided via CPU port, we must not filter these frames but forward them to CPU port. In a Bridge, the transmission Port is majorly decided by ingress and egress rules, FDB, and spanning tree Port State functions of the Forwarding Process. For link-local frames, only CPU port should be designated as destination port in the FDB, and the other functions of the Forwarding Process must not interfere with the decision of the transmission Port. We call this process trapping frames to CPU port. Therefore, on the switch with CPU port architecture, link-local frames must be trapped to CPU port, and certain link-local frames received by a Port of a Bridge comprising a TPMR component or an S-VLAN component must be excluded from it. A Bridge of the switch with CPU port architecture cannot comprise a Two-Port MAC Relay (TPMR) component as a TPMR component supports only a subset of the functionality of a MAC Bridge. A Bridge comprising two Ports (Management Port doesn't count) of this architecture will either function as a standard MAC Bridge or a standard VLAN Bridge. Therefore, a Bridge of this architecture can only comprise S-VLAN components, C-VLAN components, or MAC Bridge components. Since there's no TPMR component, we don't need to relay PDUs using the destination addresses specified on the Nearest non-TPMR section, and the proportion of the Nearest Customer Bridge section where they must be relayed by TPMR components. One option to trap link-local frames to CPU port is to add static FDB entries with CPU port designated as destination port. However, because that Independent VLAN Learning (IVL) is being used on every VID, each entry only applies to a single VLAN Identifier (VID). For a Bridge comprising a MAC Bridge component or a C-VLAN component, there would have to be 16 times 4096 entries. This switch intellectual property can only hold a maximum of 2048 entries. Using this option, there also isn't a mechanism to prevent link-local frames from being discarded when the spanning tree Port State of the reception Port is discarding. The remaining option is to utilise the BPC, RGAC1, RGAC2, RGAC3, and RGAC4 registers. Whilst this applies to every VID, it doesn't contain all of the reserved MAC addresses without affecting the remaining Standard Group MAC Addresses. The REV_UN frame tag utilised using the RGAC4 register covers the remaining 01-80-C2-00-00-[04,05,06,07,08,09,0A,0B,0C,0D,0F] destination addresses. It also includes the 01-80-C2-00-00-22 to 01-80-C2-00-00-FF destination addresses which may be relayed by MAC Bridges or VLAN Bridges. The latter option provides better but not complete conformance. This switch intellectual property also does not provide a mechanism to trap link-local frames with specific destination addresses to CPU port by Bridge, to conform to the filtering rules for the distinct Bridge components. Therefore, regardless of the type of the Bridge component, link-local frames with these destination addresses will be trapped to CPU port: 01-80-C2-00-00-[00,01,02,03,0E] In a Bridge comprising a MAC Bridge component or a C-VLAN component: Link-local frames with these destination addresses won't be trapped to CPU port which won't conform to IEEE Std 802.1Q-2022: 01-80-C2-00-00-[04,05,06,07,08,09,0A,0B,0C,0D,0F] In a Bridge comprising an S-VLAN component: Link-local frames with these destination addresses will be trapped to CPU port which won't conform to IEEE Std 802.1Q-2022: 01-80-C2-00-00-00 Link-local frames with these destination addresses won't be trapped to CPU port which won't conform to IEEE Std 802.1Q-2022: 01-80-C2-00-00-[04,05,06,07,08,09,0A] Currently on this switch intellectual property, if the spanning tree Port State of the reception Port is discarding, link-local frames will be discarded. To trap link-local frames regardless of the spanning tree Port State, make the switch regard them as Bridge Protocol Data Units (BPDUs). This switch intellectual property only lets the frames regarded as BPDUs bypass the spanning tree Port State function of the Forwarding Process. With this change, the only remaining interference is the ingress rules. When the reception Port has no PVID assigned on software, VLAN-untagged frames won't be allowed in. There doesn't seem to be a mechanism on the switch intellectual property to have link-local frames bypass this function of the Forwarding Process. Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch") Reviewed-by: Daniel Golle Signed-off-by: Arınç ÜNAL Link: https://lore.kernel.org/r/20240409-b4-for-net-mt7530-fix-link-local-when-stp-discarding-v2-1-07b1150164ac@arinc9.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/mt7530.c | 229 +++++++++++++++++++++++++++++++++------ drivers/net/dsa/mt7530.h | 5 + 2 files changed, 200 insertions(+), 34 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 451ff0620c2e1..c0d0bce0b5942 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -950,20 +950,173 @@ static void mt7530_setup_port5(struct dsa_switch *ds, phy_interface_t interface) mutex_unlock(&priv->reg_mutex); } -/* On page 205, section "8.6.3 Frame filtering" of the active standard, IEEE Std - * 802.1Q™-2022, it is stated that frames with 01:80:C2:00:00:00-0F as MAC DA - * must only be propagated to C-VLAN and MAC Bridge components. That means - * VLAN-aware and VLAN-unaware bridges. On the switch designs with CPU ports, - * these frames are supposed to be processed by the CPU (software). So we make - * the switch only forward them to the CPU port. And if received from a CPU - * port, forward to a single port. The software is responsible of making the - * switch conform to the latter by setting a single port as destination port on - * the special tag. +/* In Clause 5 of IEEE Std 802-2014, two sublayers of the data link layer (DLL) + * of the Open Systems Interconnection basic reference model (OSI/RM) are + * described; the medium access control (MAC) and logical link control (LLC) + * sublayers. The MAC sublayer is the one facing the physical layer. * - * This switch intellectual property cannot conform to this part of the standard - * fully. Whilst the REV_UN frame tag covers the remaining :04-0D and :0F MAC - * DAs, it also includes :22-FF which the scope of propagation is not supposed - * to be restricted for these MAC DAs. + * In 8.2 of IEEE Std 802.1Q-2022, the Bridge architecture is described. A + * Bridge component comprises a MAC Relay Entity for interconnecting the Ports + * of the Bridge, at least two Ports, and higher layer entities with at least a + * Spanning Tree Protocol Entity included. + * + * Each Bridge Port also functions as an end station and shall provide the MAC + * Service to an LLC Entity. Each instance of the MAC Service is provided to a + * distinct LLC Entity that supports protocol identification, multiplexing, and + * demultiplexing, for protocol data unit (PDU) transmission and reception by + * one or more higher layer entities. + * + * It is described in 8.13.9 of IEEE Std 802.1Q-2022 that in a Bridge, the LLC + * Entity associated with each Bridge Port is modeled as being directly + * connected to the attached Local Area Network (LAN). + * + * On the switch with CPU port architecture, CPU port functions as Management + * Port, and the Management Port functionality is provided by software which + * functions as an end station. Software is connected to an IEEE 802 LAN that is + * wholly contained within the system that incorporates the Bridge. Software + * provides access to the LLC Entity associated with each Bridge Port by the + * value of the source port field on the special tag on the frame received by + * software. + * + * We call frames that carry control information to determine the active + * topology and current extent of each Virtual Local Area Network (VLAN), i.e., + * spanning tree or Shortest Path Bridging (SPB) and Multiple VLAN Registration + * Protocol Data Units (MVRPDUs), and frames from other link constrained + * protocols, such as Extensible Authentication Protocol over LAN (EAPOL) and + * Link Layer Discovery Protocol (LLDP), link-local frames. They are not + * forwarded by a Bridge. Permanently configured entries in the filtering + * database (FDB) ensure that such frames are discarded by the Forwarding + * Process. In 8.6.3 of IEEE Std 802.1Q-2022, this is described in detail: + * + * Each of the reserved MAC addresses specified in Table 8-1 + * (01-80-C2-00-00-[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F]) shall be + * permanently configured in the FDB in C-VLAN components and ERs. + * + * Each of the reserved MAC addresses specified in Table 8-2 + * (01-80-C2-00-00-[01,02,03,04,05,06,07,08,09,0A,0E]) shall be permanently + * configured in the FDB in S-VLAN components. + * + * Each of the reserved MAC addresses specified in Table 8-3 + * (01-80-C2-00-00-[01,02,04,0E]) shall be permanently configured in the FDB in + * TPMR components. + * + * The FDB entries for reserved MAC addresses shall specify filtering for all + * Bridge Ports and all VIDs. Management shall not provide the capability to + * modify or remove entries for reserved MAC addresses. + * + * The addresses in Table 8-1, Table 8-2, and Table 8-3 determine the scope of + * propagation of PDUs within a Bridged Network, as follows: + * + * The Nearest Bridge group address (01-80-C2-00-00-0E) is an address that no + * conformant Two-Port MAC Relay (TPMR) component, Service VLAN (S-VLAN) + * component, Customer VLAN (C-VLAN) component, or MAC Bridge can forward. + * PDUs transmitted using this destination address, or any other addresses + * that appear in Table 8-1, Table 8-2, and Table 8-3 + * (01-80-C2-00-00-[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F]), can + * therefore travel no further than those stations that can be reached via a + * single individual LAN from the originating station. + * + * The Nearest non-TPMR Bridge group address (01-80-C2-00-00-03), is an + * address that no conformant S-VLAN component, C-VLAN component, or MAC + * Bridge can forward; however, this address is relayed by a TPMR component. + * PDUs using this destination address, or any of the other addresses that + * appear in both Table 8-1 and Table 8-2 but not in Table 8-3 + * (01-80-C2-00-00-[00,03,05,06,07,08,09,0A,0B,0C,0D,0F]), will be relayed by + * any TPMRs but will propagate no further than the nearest S-VLAN component, + * C-VLAN component, or MAC Bridge. + * + * The Nearest Customer Bridge group address (01-80-C2-00-00-00) is an address + * that no conformant C-VLAN component, MAC Bridge can forward; however, it is + * relayed by TPMR components and S-VLAN components. PDUs using this + * destination address, or any of the other addresses that appear in Table 8-1 + * but not in either Table 8-2 or Table 8-3 (01-80-C2-00-00-[00,0B,0C,0D,0F]), + * will be relayed by TPMR components and S-VLAN components but will propagate + * no further than the nearest C-VLAN component or MAC Bridge. + * + * Because the LLC Entity associated with each Bridge Port is provided via CPU + * port, we must not filter these frames but forward them to CPU port. + * + * In a Bridge, the transmission Port is majorly decided by ingress and egress + * rules, FDB, and spanning tree Port State functions of the Forwarding Process. + * For link-local frames, only CPU port should be designated as destination port + * in the FDB, and the other functions of the Forwarding Process must not + * interfere with the decision of the transmission Port. We call this process + * trapping frames to CPU port. + * + * Therefore, on the switch with CPU port architecture, link-local frames must + * be trapped to CPU port, and certain link-local frames received by a Port of a + * Bridge comprising a TPMR component or an S-VLAN component must be excluded + * from it. + * + * A Bridge of the switch with CPU port architecture cannot comprise a Two-Port + * MAC Relay (TPMR) component as a TPMR component supports only a subset of the + * functionality of a MAC Bridge. A Bridge comprising two Ports (Management Port + * doesn't count) of this architecture will either function as a standard MAC + * Bridge or a standard VLAN Bridge. + * + * Therefore, a Bridge of this architecture can only comprise S-VLAN components, + * C-VLAN components, or MAC Bridge components. Since there's no TPMR component, + * we don't need to relay PDUs using the destination addresses specified on the + * Nearest non-TPMR section, and the proportion of the Nearest Customer Bridge + * section where they must be relayed by TPMR components. + * + * One option to trap link-local frames to CPU port is to add static FDB entries + * with CPU port designated as destination port. However, because that + * Independent VLAN Learning (IVL) is being used on every VID, each entry only + * applies to a single VLAN Identifier (VID). For a Bridge comprising a MAC + * Bridge component or a C-VLAN component, there would have to be 16 times 4096 + * entries. This switch intellectual property can only hold a maximum of 2048 + * entries. Using this option, there also isn't a mechanism to prevent + * link-local frames from being discarded when the spanning tree Port State of + * the reception Port is discarding. + * + * The remaining option is to utilise the BPC, RGAC1, RGAC2, RGAC3, and RGAC4 + * registers. Whilst this applies to every VID, it doesn't contain all of the + * reserved MAC addresses without affecting the remaining Standard Group MAC + * Addresses. The REV_UN frame tag utilised using the RGAC4 register covers the + * remaining 01-80-C2-00-00-[04,05,06,07,08,09,0A,0B,0C,0D,0F] destination + * addresses. It also includes the 01-80-C2-00-00-22 to 01-80-C2-00-00-FF + * destination addresses which may be relayed by MAC Bridges or VLAN Bridges. + * The latter option provides better but not complete conformance. + * + * This switch intellectual property also does not provide a mechanism to trap + * link-local frames with specific destination addresses to CPU port by Bridge, + * to conform to the filtering rules for the distinct Bridge components. + * + * Therefore, regardless of the type of the Bridge component, link-local frames + * with these destination addresses will be trapped to CPU port: + * + * 01-80-C2-00-00-[00,01,02,03,0E] + * + * In a Bridge comprising a MAC Bridge component or a C-VLAN component: + * + * Link-local frames with these destination addresses won't be trapped to CPU + * port which won't conform to IEEE Std 802.1Q-2022: + * + * 01-80-C2-00-00-[04,05,06,07,08,09,0A,0B,0C,0D,0F] + * + * In a Bridge comprising an S-VLAN component: + * + * Link-local frames with these destination addresses will be trapped to CPU + * port which won't conform to IEEE Std 802.1Q-2022: + * + * 01-80-C2-00-00-00 + * + * Link-local frames with these destination addresses won't be trapped to CPU + * port which won't conform to IEEE Std 802.1Q-2022: + * + * 01-80-C2-00-00-[04,05,06,07,08,09,0A] + * + * To trap link-local frames to CPU port as conformant as this switch + * intellectual property can allow, link-local frames are made to be regarded as + * Bridge Protocol Data Units (BPDUs). This is because this switch intellectual + * property only lets the frames regarded as BPDUs bypass the spanning tree Port + * State function of the Forwarding Process. + * + * The only remaining interference is the ingress rules. When the reception Port + * has no PVID assigned on software, VLAN-untagged frames won't be allowed in. + * There doesn't seem to be a mechanism on the switch intellectual property to + * have link-local frames bypass this function of the Forwarding Process. */ static void mt753x_trap_frames(struct mt7530_priv *priv) @@ -971,35 +1124,43 @@ mt753x_trap_frames(struct mt7530_priv *priv) /* Trap 802.1X PAE frames and BPDUs to the CPU port(s) and egress them * VLAN-untagged. */ - mt7530_rmw(priv, MT753X_BPC, MT753X_PAE_EG_TAG_MASK | - MT753X_PAE_PORT_FW_MASK | MT753X_BPDU_EG_TAG_MASK | - MT753X_BPDU_PORT_FW_MASK, - MT753X_PAE_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | - MT753X_PAE_PORT_FW(MT753X_BPDU_CPU_ONLY) | - MT753X_BPDU_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | - MT753X_BPDU_CPU_ONLY); + mt7530_rmw(priv, MT753X_BPC, + MT753X_PAE_BPDU_FR | MT753X_PAE_EG_TAG_MASK | + MT753X_PAE_PORT_FW_MASK | MT753X_BPDU_EG_TAG_MASK | + MT753X_BPDU_PORT_FW_MASK, + MT753X_PAE_BPDU_FR | + MT753X_PAE_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | + MT753X_PAE_PORT_FW(MT753X_BPDU_CPU_ONLY) | + MT753X_BPDU_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | + MT753X_BPDU_CPU_ONLY); /* Trap frames with :01 and :02 MAC DAs to the CPU port(s) and egress * them VLAN-untagged. */ - mt7530_rmw(priv, MT753X_RGAC1, MT753X_R02_EG_TAG_MASK | - MT753X_R02_PORT_FW_MASK | MT753X_R01_EG_TAG_MASK | - MT753X_R01_PORT_FW_MASK, - MT753X_R02_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | - MT753X_R02_PORT_FW(MT753X_BPDU_CPU_ONLY) | - MT753X_R01_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | - MT753X_BPDU_CPU_ONLY); + mt7530_rmw(priv, MT753X_RGAC1, + MT753X_R02_BPDU_FR | MT753X_R02_EG_TAG_MASK | + MT753X_R02_PORT_FW_MASK | MT753X_R01_BPDU_FR | + MT753X_R01_EG_TAG_MASK | MT753X_R01_PORT_FW_MASK, + MT753X_R02_BPDU_FR | + MT753X_R02_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | + MT753X_R02_PORT_FW(MT753X_BPDU_CPU_ONLY) | + MT753X_R01_BPDU_FR | + MT753X_R01_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | + MT753X_BPDU_CPU_ONLY); /* Trap frames with :03 and :0E MAC DAs to the CPU port(s) and egress * them VLAN-untagged. */ - mt7530_rmw(priv, MT753X_RGAC2, MT753X_R0E_EG_TAG_MASK | - MT753X_R0E_PORT_FW_MASK | MT753X_R03_EG_TAG_MASK | - MT753X_R03_PORT_FW_MASK, - MT753X_R0E_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | - MT753X_R0E_PORT_FW(MT753X_BPDU_CPU_ONLY) | - MT753X_R03_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | - MT753X_BPDU_CPU_ONLY); + mt7530_rmw(priv, MT753X_RGAC2, + MT753X_R0E_BPDU_FR | MT753X_R0E_EG_TAG_MASK | + MT753X_R0E_PORT_FW_MASK | MT753X_R03_BPDU_FR | + MT753X_R03_EG_TAG_MASK | MT753X_R03_PORT_FW_MASK, + MT753X_R0E_BPDU_FR | + MT753X_R0E_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | + MT753X_R0E_PORT_FW(MT753X_BPDU_CPU_ONLY) | + MT753X_R03_BPDU_FR | + MT753X_R03_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | + MT753X_BPDU_CPU_ONLY); } static void diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index 9439db4950015..585db03c05487 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -65,6 +65,7 @@ enum mt753x_id { /* Registers for BPDU and PAE frame control*/ #define MT753X_BPC 0x24 +#define MT753X_PAE_BPDU_FR BIT(25) #define MT753X_PAE_EG_TAG_MASK GENMASK(24, 22) #define MT753X_PAE_EG_TAG(x) FIELD_PREP(MT753X_PAE_EG_TAG_MASK, x) #define MT753X_PAE_PORT_FW_MASK GENMASK(18, 16) @@ -75,20 +76,24 @@ enum mt753x_id { /* Register for :01 and :02 MAC DA frame control */ #define MT753X_RGAC1 0x28 +#define MT753X_R02_BPDU_FR BIT(25) #define MT753X_R02_EG_TAG_MASK GENMASK(24, 22) #define MT753X_R02_EG_TAG(x) FIELD_PREP(MT753X_R02_EG_TAG_MASK, x) #define MT753X_R02_PORT_FW_MASK GENMASK(18, 16) #define MT753X_R02_PORT_FW(x) FIELD_PREP(MT753X_R02_PORT_FW_MASK, x) +#define MT753X_R01_BPDU_FR BIT(9) #define MT753X_R01_EG_TAG_MASK GENMASK(8, 6) #define MT753X_R01_EG_TAG(x) FIELD_PREP(MT753X_R01_EG_TAG_MASK, x) #define MT753X_R01_PORT_FW_MASK GENMASK(2, 0) /* Register for :03 and :0E MAC DA frame control */ #define MT753X_RGAC2 0x2c +#define MT753X_R0E_BPDU_FR BIT(25) #define MT753X_R0E_EG_TAG_MASK GENMASK(24, 22) #define MT753X_R0E_EG_TAG(x) FIELD_PREP(MT753X_R0E_EG_TAG_MASK, x) #define MT753X_R0E_PORT_FW_MASK GENMASK(18, 16) #define MT753X_R0E_PORT_FW(x) FIELD_PREP(MT753X_R0E_PORT_FW_MASK, x) +#define MT753X_R03_BPDU_FR BIT(9) #define MT753X_R03_EG_TAG_MASK GENMASK(8, 6) #define MT753X_R03_EG_TAG(x) FIELD_PREP(MT753X_R03_EG_TAG_MASK, x) #define MT753X_R03_PORT_FW_MASK GENMASK(2, 0) -- GitLab From 47d8ac011fe1c9251070e1bd64cb10b48193ec51 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 9 Apr 2024 22:09:39 +0200 Subject: [PATCH 741/989] af_unix: Fix garbage collector racing against connect() Garbage collector does not take into account the risk of embryo getting enqueued during the garbage collection. If such embryo has a peer that carries SCM_RIGHTS, two consecutive passes of scan_children() may see a different set of children. Leading to an incorrectly elevated inflight count, and then a dangling pointer within the gc_inflight_list. sockets are AF_UNIX/SOCK_STREAM S is an unconnected socket L is a listening in-flight socket bound to addr, not in fdtable V's fd will be passed via sendmsg(), gets inflight count bumped connect(S, addr) sendmsg(S, [V]); close(V) __unix_gc() ---------------- ------------------------- ----------- NS = unix_create1() skb1 = sock_wmalloc(NS) L = unix_find_other(addr) unix_state_lock(L) unix_peer(S) = NS // V count=1 inflight=0 NS = unix_peer(S) skb2 = sock_alloc() skb_queue_tail(NS, skb2[V]) // V became in-flight // V count=2 inflight=1 close(V) // V count=1 inflight=1 // GC candidate condition met for u in gc_inflight_list: if (total_refs == inflight_refs) add u to gc_candidates // gc_candidates={L, V} for u in gc_candidates: scan_children(u, dec_inflight) // embryo (skb1) was not // reachable from L yet, so V's // inflight remains unchanged __skb_queue_tail(L, skb1) unix_state_unlock(L) for u in gc_candidates: if (u.inflight) scan_children(u, inc_inflight_move_tail) // V count=1 inflight=2 (!) If there is a GC-candidate listening socket, lock/unlock its state. This makes GC wait until the end of any ongoing connect() to that socket. After flipping the lock, a possibly SCM-laden embryo is already enqueued. And if there is another embryo coming, it can not possibly carry SCM_RIGHTS. At this point, unix_inflight() can not happen because unix_gc_lock is already taken. Inflight graph remains unaffected. Fixes: 1fd05ba5a2f2 ("[AF_UNIX]: Rewrite garbage collector, fixes race.") Signed-off-by: Michal Luczaj Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240409201047.1032217-1-mhal@rbox.co Signed-off-by: Paolo Abeni --- net/unix/garbage.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index fa39b62652385..6433a414acf86 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -274,11 +274,22 @@ static void __unix_gc(struct work_struct *work) * receive queues. Other, non candidate sockets _can_ be * added to queue, so we must make sure only to touch * candidates. + * + * Embryos, though never candidates themselves, affect which + * candidates are reachable by the garbage collector. Before + * being added to a listener's queue, an embryo may already + * receive data carrying SCM_RIGHTS, potentially making the + * passed socket a candidate that is not yet reachable by the + * collector. It becomes reachable once the embryo is + * enqueued. Therefore, we must ensure that no SCM-laden + * embryo appears in a (candidate) listener's queue between + * consecutive scan_children() calls. */ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { + struct sock *sk = &u->sk; long total_refs; - total_refs = file_count(u->sk.sk_socket->file); + total_refs = file_count(sk->sk_socket->file); WARN_ON_ONCE(!u->inflight); WARN_ON_ONCE(total_refs < u->inflight); @@ -286,6 +297,11 @@ static void __unix_gc(struct work_struct *work) list_move_tail(&u->link, &gc_candidates); __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); + + if (sk->sk_state == TCP_LISTEN) { + unix_state_lock(sk); + unix_state_unlock(sk); + } } } -- GitLab From dfe648903f42296866d79f10d03f8c85c9dfba30 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:45 -0700 Subject: [PATCH 742/989] x86/bugs: Fix BHI documentation Fix up some inaccuracies in the BHI documentation. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Nikolay Borisov Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/8c84f7451bfe0dd08543c6082a383f390d4aa7e2.1712813475.git.jpoimboe@kernel.org --- Documentation/admin-guide/hw-vuln/spectre.rst | 15 ++++++++------- Documentation/admin-guide/kernel-parameters.txt | 12 +++++++----- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index b70b1d8bd8e65..3cf18e4a1d9ab 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -439,11 +439,11 @@ The possible values in this file are: - System is protected by retpoline * - BHI: BHI_DIS_S - System is protected by BHI_DIS_S - * - BHI: SW loop; KVM SW loop + * - BHI: SW loop, KVM SW loop - System is protected by software clearing sequence * - BHI: Syscall hardening - Syscalls are hardened against BHI - * - BHI: Syscall hardening; KVM: SW loop + * - BHI: Syscall hardening, KVM: SW loop - System is protected from userspace attacks by syscall hardening; KVM is protected by software clearing sequence Full mitigation might require a microcode update from the CPU @@ -666,13 +666,14 @@ kernel command line. of the HW BHI control and the SW BHB clearing sequence. on - unconditionally enable. + (default) Enable the HW or SW mitigation as + needed. off - unconditionally disable. + Disable the mitigation. auto - enable if hardware mitigation - control(BHI_DIS_S) is available, otherwise - enable alternate mitigation in KVM. + Enable the HW mitigation if needed, but + *don't* enable the SW mitigation except for KVM. + The system may be vulnerable. For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 70046a019d42d..a029ad6c4963c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3444,6 +3444,7 @@ retbleed=off [X86] spec_rstack_overflow=off [X86] spec_store_bypass_disable=off [X86,PPC] + spectre_bhi=off [X86] spectre_v2_user=off [X86] srbds=off [X86,INTEL] ssbd=force-off [ARM64] @@ -6069,11 +6070,12 @@ deployment of the HW BHI control and the SW BHB clearing sequence. - on - unconditionally enable. - off - unconditionally disable. - auto - (default) enable hardware mitigation - (BHI_DIS_S) if available, otherwise enable - alternate mitigation in KVM. + on - (default) Enable the HW or SW mitigation + as needed. + off - Disable the mitigation. + auto - Enable the HW mitigation if needed, but + *don't* enable the SW mitigation except + for KVM. The system may be vulnerable. spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. -- GitLab From cb2db5bb04d7f778fbc1a1ea2507aab436f1bff3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:46 -0700 Subject: [PATCH 743/989] x86/bugs: Cache the value of MSR_IA32_ARCH_CAPABILITIES There's no need to keep reading MSR_IA32_ARCH_CAPABILITIES over and over. It's even read in the BHI sysfs function which is a big no-no. Just read it once and cache it. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Nikolay Borisov Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/9592a18a814368e75f8f4b9d74d3883aa4fd1eaf.1712813475.git.jpoimboe@kernel.org --- arch/x86/kernel/cpu/bugs.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 27f5004a8e24f..ff59fa8bb6109 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -61,6 +61,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; EXPORT_SYMBOL_GPL(x86_pred_cmd); +static u64 __ro_after_init ia32_cap; + static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; @@ -144,6 +146,8 @@ void __init cpu_select_mitigations(void) x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; } + ia32_cap = x86_read_arch_cap_msr(); + /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); @@ -301,8 +305,6 @@ static const char * const taa_strings[] = { static void __init taa_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_TAA)) { taa_mitigation = TAA_MITIGATION_OFF; return; @@ -341,7 +343,6 @@ static void __init taa_select_mitigation(void) * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode * update is required. */ - ia32_cap = x86_read_arch_cap_msr(); if ( (ia32_cap & ARCH_CAP_MDS_NO) && !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; @@ -401,8 +402,6 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || cpu_mitigations_off()) { @@ -413,8 +412,6 @@ static void __init mmio_select_mitigation(void) if (mmio_mitigation == MMIO_MITIGATION_OFF) return; - ia32_cap = x86_read_arch_cap_msr(); - /* * Enable CPU buffer clear mitigation for host and VMM, if also affected * by MDS or TAA. Otherwise, enable mitigation for VMM only. @@ -508,7 +505,7 @@ static void __init rfds_select_mitigation(void) if (rfds_mitigation == RFDS_MITIGATION_OFF) return; - if (x86_read_arch_cap_msr() & ARCH_CAP_RFDS_CLEAR) + if (ia32_cap & ARCH_CAP_RFDS_CLEAR) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; @@ -659,8 +656,6 @@ void update_srbds_msr(void) static void __init srbds_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_SRBDS)) return; @@ -669,7 +664,6 @@ static void __init srbds_select_mitigation(void) * are only exposed to SRBDS when TSX is enabled or when CPU is affected * by Processor MMIO Stale Data vulnerability. */ - ia32_cap = x86_read_arch_cap_msr(); if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; @@ -813,7 +807,7 @@ static void __init gds_select_mitigation(void) /* Will verify below that mitigation _can_ be disabled */ /* No microcode */ - if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { + if (!(ia32_cap & ARCH_CAP_GDS_CTRL)) { if (gds_mitigation == GDS_MITIGATION_FORCE) { /* * This only needs to be done on the boot CPU so do it @@ -1908,8 +1902,6 @@ static void update_indir_branch_cond(void) /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { - u64 ia32_cap = x86_read_arch_cap_msr(); - /* * Enable the idle clearing if SMT is active on CPUs which are * affected only by MSBDS and not any other MDS variant. @@ -2818,7 +2810,7 @@ static const char *spectre_bhi_state(void) else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) return "; BHI: SW loop, KVM: SW loop"; else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && - !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) + !(ia32_cap & ARCH_CAP_RRSBA)) return "; BHI: Retpoline"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) return "; BHI: Syscall hardening, KVM: SW loop"; -- GitLab From d0485730d2189ffe5d986d4e9e191f1e4d5ffd24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Apr 2024 09:25:36 +0200 Subject: [PATCH 744/989] x86/bugs: Rename various 'ia32_cap' variables to 'x86_arch_cap_msr' So we are using the 'ia32_cap' value in a number of places, which got its name from MSR_IA32_ARCH_CAPABILITIES MSR register. But there's very little 'IA32' about it - this isn't 32-bit only code, nor does it originate from there, it's just a historic quirk that many Intel MSR names are prefixed with IA32_. This is already clear from the helper method around the MSR: x86_read_arch_cap_msr(), which doesn't have the IA32 prefix. So rename 'ia32_cap' to 'x86_arch_cap_msr' to be consistent with its role and with the naming of the helper function. Signed-off-by: Ingo Molnar Cc: Josh Poimboeuf Cc: Nikolay Borisov Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/9592a18a814368e75f8f4b9d74d3883aa4fd1eaf.1712813475.git.jpoimboe@kernel.org --- arch/x86/kernel/apic/apic.c | 6 ++--- arch/x86/kernel/cpu/bugs.c | 30 +++++++++++----------- arch/x86/kernel/cpu/common.c | 48 ++++++++++++++++++------------------ 3 files changed, 42 insertions(+), 42 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a42d8a6f71495..c342c4aa9c684 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1687,11 +1687,11 @@ static int x2apic_state; static bool x2apic_hw_locked(void) { - u64 ia32_cap; + u64 x86_arch_cap_msr; u64 msr; - ia32_cap = x86_read_arch_cap_msr(); - if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) { + x86_arch_cap_msr = x86_read_arch_cap_msr(); + if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) { rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); return (msr & LEGACY_XAPIC_DISABLED); } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ff59fa8bb6109..1b0cfc1364324 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -61,7 +61,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; EXPORT_SYMBOL_GPL(x86_pred_cmd); -static u64 __ro_after_init ia32_cap; +static u64 __ro_after_init x86_arch_cap_msr; static DEFINE_MUTEX(spec_ctrl_mutex); @@ -146,7 +146,7 @@ void __init cpu_select_mitigations(void) x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; } - ia32_cap = x86_read_arch_cap_msr(); + x86_arch_cap_msr = x86_read_arch_cap_msr(); /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); @@ -343,8 +343,8 @@ static void __init taa_select_mitigation(void) * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode * update is required. */ - if ( (ia32_cap & ARCH_CAP_MDS_NO) && - !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) && + !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; /* @@ -434,7 +434,7 @@ static void __init mmio_select_mitigation(void) * be propagated to uncore buffers, clearing the Fill buffers on idle * is required irrespective of SMT state. */ - if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) + if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) static_branch_enable(&mds_idle_clear); /* @@ -444,10 +444,10 @@ static void __init mmio_select_mitigation(void) * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS * affected systems. */ - if ((ia32_cap & ARCH_CAP_FB_CLEAR) || + if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || (boot_cpu_has(X86_FEATURE_MD_CLEAR) && boot_cpu_has(X86_FEATURE_FLUSH_L1D) && - !(ia32_cap & ARCH_CAP_MDS_NO))) + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))) mmio_mitigation = MMIO_MITIGATION_VERW; else mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; @@ -505,7 +505,7 @@ static void __init rfds_select_mitigation(void) if (rfds_mitigation == RFDS_MITIGATION_OFF) return; - if (ia32_cap & ARCH_CAP_RFDS_CLEAR) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; @@ -664,7 +664,7 @@ static void __init srbds_select_mitigation(void) * are only exposed to SRBDS when TSX is enabled or when CPU is affected * by Processor MMIO Stale Data vulnerability. */ - if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && + if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) @@ -807,7 +807,7 @@ static void __init gds_select_mitigation(void) /* Will verify below that mitigation _can_ be disabled */ /* No microcode */ - if (!(ia32_cap & ARCH_CAP_GDS_CTRL)) { + if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) { if (gds_mitigation == GDS_MITIGATION_FORCE) { /* * This only needs to be done on the boot CPU so do it @@ -1541,14 +1541,14 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) /* Disable in-kernel use of non-RSB RET predictors */ static void __init spec_ctrl_disable_kernel_rrsba(void) { - u64 ia32_cap; + u64 x86_arch_cap_msr; if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) return; - ia32_cap = x86_read_arch_cap_msr(); + x86_arch_cap_msr = x86_read_arch_cap_msr(); - if (ia32_cap & ARCH_CAP_RRSBA) { + if (x86_arch_cap_msr & ARCH_CAP_RRSBA) { x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; update_spec_ctrl(x86_spec_ctrl_base); } @@ -1916,7 +1916,7 @@ static void update_mds_branch_idle(void) if (sched_smt_active()) { static_branch_enable(&mds_idle_clear); } else if (mmio_mitigation == MMIO_MITIGATION_OFF || - (ia32_cap & ARCH_CAP_FBSDP_NO)) { + (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) { static_branch_disable(&mds_idle_clear); } } @@ -2810,7 +2810,7 @@ static const char *spectre_bhi_state(void) else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) return "; BHI: SW loop, KVM: SW loop"; else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && - !(ia32_cap & ARCH_CAP_RRSBA)) + !(x86_arch_cap_msr & ARCH_CAP_RRSBA)) return "; BHI: Retpoline"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) return "; BHI: Syscall hardening, KVM: SW loop"; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 754d91857d634..605c26c009c8a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1284,25 +1284,25 @@ static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long whi u64 x86_read_arch_cap_msr(void) { - u64 ia32_cap = 0; + u64 x86_arch_cap_msr = 0; if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); - return ia32_cap; + return x86_arch_cap_msr; } -static bool arch_cap_mmio_immune(u64 ia32_cap) +static bool arch_cap_mmio_immune(u64 x86_arch_cap_msr) { - return (ia32_cap & ARCH_CAP_FBSDP_NO && - ia32_cap & ARCH_CAP_PSDP_NO && - ia32_cap & ARCH_CAP_SBDR_SSDP_NO); + return (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO && + x86_arch_cap_msr & ARCH_CAP_PSDP_NO && + x86_arch_cap_msr & ARCH_CAP_SBDR_SSDP_NO); } -static bool __init vulnerable_to_rfds(u64 ia32_cap) +static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr) { /* The "immunity" bit trumps everything else: */ - if (ia32_cap & ARCH_CAP_RFDS_NO) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_NO) return false; /* @@ -1310,7 +1310,7 @@ static bool __init vulnerable_to_rfds(u64 ia32_cap) * indicate that mitigation is needed because guest is running on a * vulnerable hardware or may migrate to such hardware: */ - if (ia32_cap & ARCH_CAP_RFDS_CLEAR) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) return true; /* Only consult the blacklist when there is no enumeration: */ @@ -1319,11 +1319,11 @@ static bool __init vulnerable_to_rfds(u64 ia32_cap) static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { - u64 ia32_cap = x86_read_arch_cap_msr(); + u64 x86_arch_cap_msr = x86_read_arch_cap_msr(); /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && - !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO)) setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) @@ -1335,7 +1335,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V2); if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && - !(ia32_cap & ARCH_CAP_SSB_NO) && + !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); @@ -1346,17 +1346,17 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Don't use AutoIBRS when SNP is enabled because it degrades host * userspace indirect branch performance. */ - if ((ia32_cap & ARCH_CAP_IBRS_ALL) || + if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) || (cpu_has(c, X86_FEATURE_AUTOIBRS) && !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && - !(ia32_cap & ARCH_CAP_PBRSB_NO)) + !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO)) setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); } if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && - !(ia32_cap & ARCH_CAP_MDS_NO)) { + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); @@ -1375,9 +1375,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * TSX_CTRL check alone is not sufficient for cases when the microcode * update is not present or running as guest that don't get TSX_CTRL. */ - if (!(ia32_cap & ARCH_CAP_TAA_NO) && + if (!(x86_arch_cap_msr & ARCH_CAP_TAA_NO) && (cpu_has(c, X86_FEATURE_RTM) || - (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + (x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); /* @@ -1403,7 +1403,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. */ - if (!arch_cap_mmio_immune(ia32_cap)) { + if (!arch_cap_mmio_immune(x86_arch_cap_msr)) { if (cpu_matches(cpu_vuln_blacklist, MMIO)) setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) @@ -1411,7 +1411,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) } if (!cpu_has(c, X86_FEATURE_BTC_NO)) { - if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (x86_arch_cap_msr & ARCH_CAP_RSBA)) setup_force_cpu_bug(X86_BUG_RETBLEED); } @@ -1429,15 +1429,15 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * disabling AVX2. The only way to do this in HW is to clear XCR0[2], * which means that AVX will be disabled. */ - if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) && + if (cpu_matches(cpu_vuln_blacklist, GDS) && !(x86_arch_cap_msr & ARCH_CAP_GDS_NO) && boot_cpu_has(X86_FEATURE_AVX)) setup_force_cpu_bug(X86_BUG_GDS); - if (vulnerable_to_rfds(ia32_cap)) + if (vulnerable_to_rfds(x86_arch_cap_msr)) setup_force_cpu_bug(X86_BUG_RFDS); /* When virtualized, eIBRS could be hidden, assume vulnerable */ - if (!(ia32_cap & ARCH_CAP_BHI_NO) && + if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) && !cpu_matches(cpu_vuln_whitelist, NO_BHI) && (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || boot_cpu_has(X86_FEATURE_HYPERVISOR))) @@ -1447,7 +1447,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) return; /* Rogue Data Cache Load? No! */ - if (ia32_cap & ARCH_CAP_RDCL_NO) + if (x86_arch_cap_msr & ARCH_CAP_RDCL_NO) return; setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); -- GitLab From 1cea8a280dfd1016148a3820676f2f03e3f5b898 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:47 -0700 Subject: [PATCH 745/989] x86/bugs: Fix BHI handling of RRSBA The ARCH_CAP_RRSBA check isn't correct: RRSBA may have already been disabled by the Spectre v2 mitigation (or can otherwise be disabled by the BHI mitigation itself if needed). In that case retpolines are fine. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/6f56f13da34a0834b69163467449be7f58f253dc.1712813475.git.jpoimboe@kernel.org --- arch/x86/kernel/cpu/bugs.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1b0cfc1364324..08dfb94fcb3a4 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1538,20 +1538,25 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) return SPECTRE_V2_RETPOLINE; } +static bool __ro_after_init rrsba_disabled; + /* Disable in-kernel use of non-RSB RET predictors */ static void __init spec_ctrl_disable_kernel_rrsba(void) { - u64 x86_arch_cap_msr; + if (rrsba_disabled) + return; - if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) { + rrsba_disabled = true; return; + } - x86_arch_cap_msr = x86_read_arch_cap_msr(); + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; - if (x86_arch_cap_msr & ARCH_CAP_RRSBA) { - x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; - update_spec_ctrl(x86_spec_ctrl_base); - } + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + rrsba_disabled = true; } static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) @@ -1652,9 +1657,11 @@ static void __init bhi_select_mitigation(void) return; /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */ - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && - !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) - return; + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { + spec_ctrl_disable_kernel_rrsba(); + if (rrsba_disabled) + return; + } if (spec_ctrl_bhi_dis()) return; @@ -2809,8 +2816,7 @@ static const char *spectre_bhi_state(void) return "; BHI: BHI_DIS_S"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) return "; BHI: SW loop, KVM: SW loop"; - else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && - !(x86_arch_cap_msr & ARCH_CAP_RRSBA)) + else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && rrsba_disabled) return "; BHI: Retpoline"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) return "; BHI: Syscall hardening, KVM: SW loop"; -- GitLab From 5f882f3b0a8bf0788d5a0ee44b1191de5319bb8a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:48 -0700 Subject: [PATCH 746/989] x86/bugs: Clarify that syscall hardening isn't a BHI mitigation While syscall hardening helps prevent some BHI attacks, there's still other low-hanging fruit remaining. Don't classify it as a mitigation and make it clear that the system may still be vulnerable if it doesn't have a HW or SW mitigation enabled. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/b5951dae3fdee7f1520d5136a27be3bdfe95f88b.1712813475.git.jpoimboe@kernel.org --- Documentation/admin-guide/hw-vuln/spectre.rst | 11 +++++------ Documentation/admin-guide/kernel-parameters.txt | 3 +-- arch/x86/kernel/cpu/bugs.c | 6 +++--- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 3cf18e4a1d9ab..5a39acf824835 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -441,10 +441,10 @@ The possible values in this file are: - System is protected by BHI_DIS_S * - BHI: SW loop, KVM SW loop - System is protected by software clearing sequence - * - BHI: Syscall hardening - - Syscalls are hardened against BHI - * - BHI: Syscall hardening, KVM: SW loop - - System is protected from userspace attacks by syscall hardening; KVM is protected by software clearing sequence + * - BHI: Vulnerable + - System is vulnerable to BHI + * - BHI: Vulnerable, KVM: SW loop + - System is vulnerable; KVM is protected by software clearing sequence Full mitigation might require a microcode update from the CPU vendor. When the necessary microcode is not available, the kernel will @@ -661,8 +661,7 @@ kernel command line. spectre_bhi= [X86] Control mitigation of Branch History Injection - (BHI) vulnerability. Syscalls are hardened against BHI - regardless of this setting. This setting affects the deployment + (BHI) vulnerability. This setting affects the deployment of the HW BHI control and the SW BHB clearing sequence. on diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a029ad6c4963c..a3874cc97892d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6065,8 +6065,7 @@ See Documentation/admin-guide/laptops/sonypi.rst spectre_bhi= [X86] Control mitigation of Branch History Injection - (BHI) vulnerability. Syscalls are hardened against BHI - reglardless of this setting. This setting affects the + (BHI) vulnerability. This setting affects the deployment of the HW BHI control and the SW BHB clearing sequence. diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 08dfb94fcb3a4..9eeb60f5fbb35 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2818,10 +2818,10 @@ static const char *spectre_bhi_state(void) return "; BHI: SW loop, KVM: SW loop"; else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && rrsba_disabled) return "; BHI: Retpoline"; - else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) - return "; BHI: Syscall hardening, KVM: SW loop"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) + return "; BHI: Vulnerable, KVM: SW loop"; - return "; BHI: Vulnerable (Syscall hardening enabled)"; + return "; BHI: Vulnerable"; } static ssize_t spectre_v2_show_state(char *buf) -- GitLab From 713a85195aad25d8a26786a37b674e3e5ec09e3c Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Wed, 10 Apr 2024 09:13:55 +0000 Subject: [PATCH 747/989] net: ena: Fix potential sign extension issue Small unsigned types are promoted to larger signed types in the case of multiplication, the result of which may overflow. In case the result of such a multiplication has its MSB turned on, it will be sign extended with '1's. This changes the multiplication result. Code example of the phenomenon: ------------------------------- u16 x, y; size_t z1, z2; x = y = 0xffff; printk("x=%x y=%x\n",x,y); z1 = x*y; z2 = (size_t)x*y; printk("z1=%lx z2=%lx\n", z1, z2); Output: ------- x=ffff y=ffff z1=fffffffffffe0001 z2=fffe0001 The expected result of ffff*ffff is fffe0001, and without the explicit casting to avoid the unwanted sign extension we got fffffffffffe0001. This commit adds an explicit casting to avoid the sign extension issue. Fixes: 689b2bdaaa14 ("net: ena: add functions for handling Low Latency Queues in ena_com") Signed-off-by: Arthur Kiyanovski Signed-off-by: David Arinzon Reviewed-by: Shannon Nelson Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amazon/ena/ena_com.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 9e9e4a03f1a8c..2d8a66ea82fab 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -351,7 +351,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, ENA_COM_BOUNCE_BUFFER_CNTRL_CNT; io_sq->bounce_buf_ctrl.next_to_use = 0; - size = io_sq->bounce_buf_ctrl.buffer_size * + size = (size_t)io_sq->bounce_buf_ctrl.buffer_size * io_sq->bounce_buf_ctrl.buffers_num; dev_node = dev_to_node(ena_dev->dmadev); -- GitLab From f7e417180665234fdb7af2ebe33d89aaa434d16f Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Wed, 10 Apr 2024 09:13:56 +0000 Subject: [PATCH 748/989] net: ena: Wrong missing IO completions check order Missing IO completions check is called every second (HZ jiffies). This commit fixes several issues with this check: 1. Duplicate queues check: Max of 4 queues are scanned on each check due to monitor budget. Once reaching the budget, this check exits under the assumption that the next check will continue to scan the remainder of the queues, but in practice, next check will first scan the last already scanned queue which is not necessary and may cause the full queue scan to last a couple of seconds longer. The fix is to start every check with the next queue to scan. For example, on 8 IO queues: Bug: [0,1,2,3], [3,4,5,6], [6,7] Fix: [0,1,2,3], [4,5,6,7] 2. Unbalanced queues check: In case the number of active IO queues is not a multiple of budget, there will be checks which don't utilize the full budget because the full scan exits when reaching the last queue id. The fix is to run every TX completion check with exact queue budget regardless of the queue id. For example, on 7 IO queues: Bug: [0,1,2,3], [4,5,6], [0,1,2,3] Fix: [0,1,2,3], [4,5,6,0], [1,2,3,4] The budget may be lowered in case the number of IO queues is less than the budget (4) to make sure there are no duplicate queues on the same check. For example, on 3 IO queues: Bug: [0,1,2,0], [1,2,0,1] Fix: [0,1,2], [0,1,2] Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Amit Bernstein Signed-off-by: David Arinzon Reviewed-by: Shannon Nelson Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 21 +++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 09e7da1a69c9f..59befc0f463cd 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -3481,10 +3481,11 @@ static void check_for_missing_completions(struct ena_adapter *adapter) { struct ena_ring *tx_ring; struct ena_ring *rx_ring; - int i, budget, rc; + int qid, budget, rc; int io_queue_count; io_queue_count = adapter->xdp_num_queues + adapter->num_io_queues; + /* Make sure the driver doesn't turn the device in other process */ smp_rmb(); @@ -3497,27 +3498,29 @@ static void check_for_missing_completions(struct ena_adapter *adapter) if (adapter->missing_tx_completion_to == ENA_HW_HINTS_NO_TIMEOUT) return; - budget = ENA_MONITORED_TX_QUEUES; + budget = min_t(u32, io_queue_count, ENA_MONITORED_TX_QUEUES); - for (i = adapter->last_monitored_tx_qid; i < io_queue_count; i++) { - tx_ring = &adapter->tx_ring[i]; - rx_ring = &adapter->rx_ring[i]; + qid = adapter->last_monitored_tx_qid; + + while (budget) { + qid = (qid + 1) % io_queue_count; + + tx_ring = &adapter->tx_ring[qid]; + rx_ring = &adapter->rx_ring[qid]; rc = check_missing_comp_in_tx_queue(adapter, tx_ring); if (unlikely(rc)) return; - rc = !ENA_IS_XDP_INDEX(adapter, i) ? + rc = !ENA_IS_XDP_INDEX(adapter, qid) ? check_for_rx_interrupt_queue(adapter, rx_ring) : 0; if (unlikely(rc)) return; budget--; - if (!budget) - break; } - adapter->last_monitored_tx_qid = i % io_queue_count; + adapter->last_monitored_tx_qid = qid; } /* trigger napi schedule after 2 consecutive detections */ -- GitLab From bf02d9fe00632d22fa91d34749c7aacf397b6cde Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Wed, 10 Apr 2024 09:13:57 +0000 Subject: [PATCH 749/989] net: ena: Fix incorrect descriptor free behavior ENA has two types of TX queues: - queues which only process TX packets arriving from the network stack - queues which only process TX packets forwarded to it by XDP_REDIRECT or XDP_TX instructions The ena_free_tx_bufs() cycles through all descriptors in a TX queue and unmaps + frees every descriptor that hasn't been acknowledged yet by the device (uncompleted TX transactions). The function assumes that the processed TX queue is necessarily from the first category listed above and ends up using napi_consume_skb() for descriptors belonging to an XDP specific queue. This patch solves a bug in which, in case of a VF reset, the descriptors aren't freed correctly, leading to crashes. Fixes: 548c4940b9f1 ("net: ena: Implement XDP_TX action") Signed-off-by: Shay Agroskin Signed-off-by: David Arinzon Reviewed-by: Shannon Nelson Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 59befc0f463cd..be5acfa41ee0c 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -718,8 +718,11 @@ void ena_unmap_tx_buff(struct ena_ring *tx_ring, static void ena_free_tx_bufs(struct ena_ring *tx_ring) { bool print_once = true; + bool is_xdp_ring; u32 i; + is_xdp_ring = ENA_IS_XDP_INDEX(tx_ring->adapter, tx_ring->qid); + for (i = 0; i < tx_ring->ring_size; i++) { struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i]; @@ -739,10 +742,15 @@ static void ena_free_tx_bufs(struct ena_ring *tx_ring) ena_unmap_tx_buff(tx_ring, tx_info); - dev_kfree_skb_any(tx_info->skb); + if (is_xdp_ring) + xdp_return_frame(tx_info->xdpf); + else + dev_kfree_skb_any(tx_info->skb); } - netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, - tx_ring->qid)); + + if (!is_xdp_ring) + netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, + tx_ring->qid)); } static void ena_free_all_tx_bufs(struct ena_adapter *adapter) -- GitLab From 36a1ca01f0452f2549420e7279c2588729bd94df Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Wed, 10 Apr 2024 09:13:58 +0000 Subject: [PATCH 750/989] net: ena: Set tx_info->xdpf value to NULL The patch mentioned in the `Fixes` tag removed the explicit assignment of tx_info->xdpf to NULL with the justification that there's no need to set tx_info->xdpf to NULL and tx_info->num_of_bufs to 0 in case of a mapping error. Both values won't be used once the mapping function returns an error, and their values would be overridden by the next transmitted packet. While both values do indeed get overridden in the next transmission call, the value of tx_info->xdpf is also used to check whether a TX descriptor's transmission has been completed (i.e. a completion for it was polled). An example scenario: 1. Mapping failed, tx_info->xdpf wasn't set to NULL 2. A VF reset occurred leading to IO resource destruction and a call to ena_free_tx_bufs() function 3. Although the descriptor whose mapping failed was freed by the transmission function, it still passes the check if (!tx_info->skb) (skb and xdp_frame are in a union) 4. The xdp_frame associated with the descriptor is freed twice This patch returns the assignment of NULL to tx_info->xdpf to make the cleaning function knows that the descriptor is already freed. Fixes: 504fd6a5390c ("net: ena: fix DMA mapping function issues in XDP") Signed-off-by: Shay Agroskin Signed-off-by: David Arinzon Reviewed-by: Shannon Nelson Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amazon/ena/ena_xdp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_xdp.c b/drivers/net/ethernet/amazon/ena/ena_xdp.c index 337c435d3ce99..5b175e7e92a10 100644 --- a/drivers/net/ethernet/amazon/ena/ena_xdp.c +++ b/drivers/net/ethernet/amazon/ena/ena_xdp.c @@ -89,7 +89,7 @@ int ena_xdp_xmit_frame(struct ena_ring *tx_ring, rc = ena_xdp_tx_map_frame(tx_ring, tx_info, xdpf, &ena_tx_ctx); if (unlikely(rc)) - return rc; + goto err; ena_tx_ctx.req_id = req_id; @@ -112,7 +112,9 @@ int ena_xdp_xmit_frame(struct ena_ring *tx_ring, error_unmap_dma: ena_unmap_tx_buff(tx_ring, tx_info); +err: tx_info->xdpf = NULL; + return rc; } -- GitLab From f969eb84ce482331a991079ab7a5c4dc3b7f89bf Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Sun, 7 Apr 2024 14:56:04 +0800 Subject: [PATCH 751/989] netfilter: nf_tables: Fix potential data-race in __nft_expr_type_get() nft_unregister_expr() can concurrent with __nft_expr_type_get(), and there is not any protection when iterate over nf_tables_expressions list in __nft_expr_type_get(). Therefore, there is potential data-race of nf_tables_expressions list entry. Use list_for_each_entry_rcu() to iterate over nf_tables_expressions list in __nft_expr_type_get(), and use rcu_read_lock() in the caller nft_expr_type_get() to protect the entire type query process. Fixes: ef1f7df9170d ("netfilter: nf_tables: expression ops overloading") Signed-off-by: Ziyang Xuan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d89d779467197..53b8c00863ad2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3060,7 +3060,7 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, { const struct nft_expr_type *type, *candidate = NULL; - list_for_each_entry(type, &nf_tables_expressions, list) { + list_for_each_entry_rcu(type, &nf_tables_expressions, list) { if (!nla_strcmp(nla, type->name)) { if (!type->family && !candidate) candidate = type; @@ -3092,9 +3092,13 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); + rcu_read_lock(); type = __nft_expr_type_get(family, nla); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES -- GitLab From d78d867dcea69c328db30df665be5be7d0148484 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Sun, 7 Apr 2024 14:56:05 +0800 Subject: [PATCH 752/989] netfilter: nf_tables: Fix potential data-race in __nft_obj_type_get() nft_unregister_obj() can concurrent with __nft_obj_type_get(), and there is not any protection when iterate over nf_tables_objects list in __nft_obj_type_get(). Therefore, there is potential data-race of nf_tables_objects list entry. Use list_for_each_entry_rcu() to iterate over nf_tables_objects list in __nft_obj_type_get(), and use rcu_read_lock() in the caller nft_obj_type_get() to protect the entire type query process. Fixes: e50092404c1b ("netfilter: nf_tables: add stateful objects") Signed-off-by: Ziyang Xuan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 53b8c00863ad2..f11d0c0a2c735 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7611,7 +7611,7 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family) { const struct nft_object_type *type; - list_for_each_entry(type, &nf_tables_objects, list) { + list_for_each_entry_rcu(type, &nf_tables_objects, list) { if (type->family != NFPROTO_UNSPEC && type->family != family) continue; @@ -7627,9 +7627,13 @@ nft_obj_type_get(struct net *net, u32 objtype, u8 family) { const struct nft_object_type *type; + rcu_read_lock(); type = __nft_obj_type_get(objtype, family); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES -- GitLab From 751de2012eafa4d46d8081056761fa0e9cc8a178 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Apr 2024 11:24:59 +0200 Subject: [PATCH 753/989] netfilter: br_netfilter: skip conntrack input hook for promisc packets For historical reasons, when bridge device is in promisc mode, packets that are directed to the taps follow bridge input hook path. This patch adds a workaround to reset conntrack for these packets. Jianbo Liu reports warning splats in their test infrastructure where cloned packets reach the br_netfilter input hook to confirm the conntrack object. Scratch one bit from BR_INPUT_SKB_CB to annotate that this packet has reached the input hook because it is passed up to the bridge device to reach the taps. [ 57.571874] WARNING: CPU: 1 PID: 0 at net/bridge/br_netfilter_hooks.c:616 br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.572749] Modules linked in: xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_isc si ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5ctl mlx5_core [ 57.575158] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 6.8.0+ #19 [ 57.575700] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 57.576662] RIP: 0010:br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.577195] Code: fe ff ff 41 bd 04 00 00 00 be 04 00 00 00 e9 4a ff ff ff be 04 00 00 00 48 89 ef e8 f3 a9 3c e1 66 83 ad b4 00 00 00 04 eb 91 <0f> 0b e9 f1 fe ff ff 0f 0b e9 df fe ff ff 48 89 df e8 b3 53 47 e1 [ 57.578722] RSP: 0018:ffff88885f845a08 EFLAGS: 00010202 [ 57.579207] RAX: 0000000000000002 RBX: ffff88812dfe8000 RCX: 0000000000000000 [ 57.579830] RDX: ffff88885f845a60 RSI: ffff8881022dc300 RDI: 0000000000000000 [ 57.580454] RBP: ffff88885f845a60 R08: 0000000000000001 R09: 0000000000000003 [ 57.581076] R10: 00000000ffff1300 R11: 0000000000000002 R12: 0000000000000000 [ 57.581695] R13: ffff8881047ffe00 R14: ffff888108dbee00 R15: ffff88814519b800 [ 57.582313] FS: 0000000000000000(0000) GS:ffff88885f840000(0000) knlGS:0000000000000000 [ 57.583040] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 57.583564] CR2: 000000c4206aa000 CR3: 0000000103847001 CR4: 0000000000370eb0 [ 57.584194] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 57.584820] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 57.585440] Call Trace: [ 57.585721] [ 57.585976] ? __warn+0x7d/0x130 [ 57.586323] ? br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.586811] ? report_bug+0xf1/0x1c0 [ 57.587177] ? handle_bug+0x3f/0x70 [ 57.587539] ? exc_invalid_op+0x13/0x60 [ 57.587929] ? asm_exc_invalid_op+0x16/0x20 [ 57.588336] ? br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.588825] nf_hook_slow+0x3d/0xd0 [ 57.589188] ? br_handle_vlan+0x4b/0x110 [ 57.589579] br_pass_frame_up+0xfc/0x150 [ 57.589970] ? br_port_flags_change+0x40/0x40 [ 57.590396] br_handle_frame_finish+0x346/0x5e0 [ 57.590837] ? ipt_do_table+0x32e/0x430 [ 57.591221] ? br_handle_local_finish+0x20/0x20 [ 57.591656] br_nf_hook_thresh+0x4b/0xf0 [br_netfilter] [ 57.592286] ? br_handle_local_finish+0x20/0x20 [ 57.592802] br_nf_pre_routing_finish+0x178/0x480 [br_netfilter] [ 57.593348] ? br_handle_local_finish+0x20/0x20 [ 57.593782] ? nf_nat_ipv4_pre_routing+0x25/0x60 [nf_nat] [ 57.594279] br_nf_pre_routing+0x24c/0x550 [br_netfilter] [ 57.594780] ? br_nf_hook_thresh+0xf0/0xf0 [br_netfilter] [ 57.595280] br_handle_frame+0x1f3/0x3d0 [ 57.595676] ? br_handle_local_finish+0x20/0x20 [ 57.596118] ? br_handle_frame_finish+0x5e0/0x5e0 [ 57.596566] __netif_receive_skb_core+0x25b/0xfc0 [ 57.597017] ? __napi_build_skb+0x37/0x40 [ 57.597418] __netif_receive_skb_list_core+0xfb/0x220 Fixes: 62e7151ae3eb ("netfilter: bridge: confirm multicast packets before passing them up the stack") Reported-by: Jianbo Liu Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_input.c | 15 +++++++++++---- net/bridge/br_netfilter_hooks.c | 6 ++++++ net/bridge/br_private.h | 1 + net/bridge/netfilter/nf_conntrack_bridge.c | 14 ++++++++++---- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index f21097e734827..ceaa5a89b947f 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -30,7 +30,7 @@ br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) return netif_receive_skb(skb); } -static int br_pass_frame_up(struct sk_buff *skb) +static int br_pass_frame_up(struct sk_buff *skb, bool promisc) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); @@ -65,6 +65,8 @@ static int br_pass_frame_up(struct sk_buff *skb) br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb), BR_MCAST_DIR_TX); + BR_INPUT_SKB_CB(skb)->promisc = promisc; + return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(indev), NULL, skb, indev, NULL, br_netif_receive_skb); @@ -82,6 +84,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_mcast *brmctx; struct net_bridge_vlan *vlan; struct net_bridge *br; + bool promisc; u16 vid = 0; u8 state; @@ -137,7 +140,9 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); - local_rcv = !!(br->dev->flags & IFF_PROMISC); + promisc = !!(br->dev->flags & IFF_PROMISC); + local_rcv = promisc; + if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { /* by definition the broadcast is also a multicast address */ if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) { @@ -200,7 +205,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb unsigned long now = jiffies; if (test_bit(BR_FDB_LOCAL, &dst->flags)) - return br_pass_frame_up(skb); + return br_pass_frame_up(skb, false); if (now != dst->used) dst->used = now; @@ -213,7 +218,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb } if (local_rcv) - return br_pass_frame_up(skb); + return br_pass_frame_up(skb, promisc); out: return 0; @@ -386,6 +391,8 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) goto forward; } + BR_INPUT_SKB_CB(skb)->promisc = false; + /* The else clause should be hit when nf_hook(): * - returns < 0 (drop/error) * - returns = 0 (stolen/nf_queue) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 35e10c5a766d5..22e35623c148a 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -600,11 +600,17 @@ static unsigned int br_nf_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + bool promisc = BR_INPUT_SKB_CB(skb)->promisc; struct nf_conntrack *nfct = skb_nfct(skb); const struct nf_ct_hook *ct_hook; struct nf_conn *ct; int ret; + if (promisc) { + nf_reset_ct(skb); + return NF_ACCEPT; + } + if (!nfct || skb->pkt_type == PACKET_HOST) return NF_ACCEPT; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 86ea5e6689b5c..d4bedc87b1d8f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -589,6 +589,7 @@ struct br_input_skb_cb { #endif u8 proxyarp_replied:1; u8 src_port_isolated:1; + u8 promisc:1; #ifdef CONFIG_BRIDGE_VLAN_FILTERING u8 vlan_filtered:1; #endif diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 6f877e31709ba..c3c51b9a68265 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -294,18 +294,24 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - enum ip_conntrack_info ctinfo; + bool promisc = BR_INPUT_SKB_CB(skb)->promisc; + struct nf_conntrack *nfct = skb_nfct(skb); struct nf_conn *ct; - if (skb->pkt_type == PACKET_HOST) + if (promisc) { + nf_reset_ct(skb); + return NF_ACCEPT; + } + + if (!nfct || skb->pkt_type == PACKET_HOST) return NF_ACCEPT; /* nf_conntrack_confirm() cannot handle concurrent clones, * this happens for broad/multicast frames with e.g. macvlan on top * of the bridge device. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) + ct = container_of(nfct, struct nf_conn, ct_general); + if (nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) return NF_ACCEPT; /* let inet prerouting call conntrack again */ -- GitLab From 29b359cf6d95fd60730533f7f10464e95bd17c73 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 10 Apr 2024 18:50:45 +0200 Subject: [PATCH 754/989] netfilter: nft_set_pipapo: walk over current view on netlink dump The generation mask can be updated while netlink dump is in progress. The pipapo set backend walk iterator cannot rely on it to infer what view of the datastructure is to be used. Add notation to specify if user wants to read/update the set. Based on patch from Florian Westphal. Fixes: 2b84e215f874 ("netfilter: nft_set_pipapo: .walk does not deal with generations") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 14 ++++++++++++++ net/netfilter/nf_tables_api.c | 6 ++++++ net/netfilter/nft_set_pipapo.c | 5 +++-- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e27c28b612e46..3f1ed467f951f 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -307,9 +307,23 @@ static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) return (void *)priv; } + +/** + * enum nft_iter_type - nftables set iterator type + * + * @NFT_ITER_READ: read-only iteration over set elements + * @NFT_ITER_UPDATE: iteration under mutex to update set element state + */ +enum nft_iter_type { + NFT_ITER_UNSPEC, + NFT_ITER_READ, + NFT_ITER_UPDATE, +}; + struct nft_set; struct nft_set_iter { u8 genmask; + enum nft_iter_type type:8; unsigned int count; unsigned int skip; int err; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f11d0c0a2c735..a7a34db62ea93 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -626,6 +626,7 @@ static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_deactivate, }; @@ -5445,6 +5446,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, } iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; @@ -5518,6 +5520,7 @@ static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_activate, }; @@ -5892,6 +5895,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.skb = skb; args.reset = dump_ctx->reset; args.iter.genmask = nft_genmask_cur(net); + args.iter.type = NFT_ITER_READ; args.iter.skip = cb->args[0]; args.iter.count = 0; args.iter.err = 0; @@ -7376,6 +7380,7 @@ static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { struct nft_set_iter iter = { .genmask = genmask, + .type = NFT_ITER_UPDATE, .fn = nft_setelem_flush, }; @@ -10879,6 +10884,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, continue; iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index df8de50902463..11e44e4dfb1f7 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2115,13 +2115,14 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_pipapo *priv = nft_set_priv(set); - struct net *net = read_pnet(&set->net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; unsigned int i, r; + WARN_ON_ONCE(iter->type == NFT_ITER_UNSPEC); + rcu_read_lock(); - if (iter->genmask == nft_genmask_cur(net)) + if (iter->type == NFT_ITER_READ) m = rcu_dereference(priv->match); else m = priv->clone; -- GitLab From 3cfc9ec039af60dbd8965ae085b2c2ccdcfbe1cc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Apr 2024 21:05:13 +0200 Subject: [PATCH 755/989] netfilter: nft_set_pipapo: do not free live element Pablo reports a crash with large batches of elements with a back-to-back add/remove pattern. Quoting Pablo: add_elem("00000000") timeout 100 ms ... add_elem("0000000X") timeout 100 ms del_elem("0000000X") <---------------- delete one that was just added ... add_elem("00005000") timeout 100 ms 1) nft_pipapo_remove() removes element 0000000X Then, KASAN shows a splat. Looking at the remove function there is a chance that we will drop a rule that maps to a non-deactivated element. Removal happens in two steps, first we do a lookup for key k and return the to-be-removed element and mark it as inactive in the next generation. Then, in a second step, the element gets removed from the set/map. The _remove function does not work correctly if we have more than one element that share the same key. This can happen if we insert an element into a set when the set already holds an element with same key, but the element mapping to the existing key has timed out or is not active in the next generation. In such case its possible that removal will unmap the wrong element. If this happens, we will leak the non-deactivated element, it becomes unreachable. The element that got deactivated (and will be freed later) will remain reachable in the set data structure, this can result in a crash when such an element is retrieved during lookup (stale pointer). Add a check that the fully matching key does in fact map to the element that we have marked as inactive in the deactivation step. If not, we need to continue searching. Add a bug/warn trap at the end of the function as well, the remove function must not ever be called with an invisible/unreachable/non-existent element. v2: avoid uneeded temporary variable (Stefano) Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Reported-by: Pablo Neira Ayuso Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 11e44e4dfb1f7..eeaf05ffba953 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2077,6 +2077,8 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, rules_fx = rules_f0; nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + if (!pipapo_match_field(f, start, rules_fx, match_start, match_end)) break; @@ -2089,16 +2091,18 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); - } - if (i == m->field_count) { - priv->dirty = true; - pipapo_drop(m, rulemap); - return; + if (last && f->mt[rulemap[i].to].e == e) { + priv->dirty = true; + pipapo_drop(m, rulemap); + return; + } } first_rule += rules_f0; } + + WARN_ON_ONCE(1); /* elem_priv not found */ } /** -- GitLab From 87b3593bed1868b2d9fe096c01bcdf0ea86cbebf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Apr 2024 13:47:33 +0200 Subject: [PATCH 756/989] netfilter: flowtable: validate pppoe header Ensure there is sufficient room to access the protocol field of the PPPoe header. Validate it once before the flowtable lookup, then use a helper function to access protocol field. Reported-by: syzbot+b6f07e1c07ef40199081@syzkaller.appspotmail.com Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 12 +++++++++++- net/netfilter/nf_flow_table_inet.c | 3 ++- net/netfilter/nf_flow_table_ip.c | 8 +++++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index a763dd327c6ea..9abb7ee40d72f 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -336,7 +336,7 @@ int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, int nf_flow_table_offload_init(void); void nf_flow_table_offload_exit(void); -static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) +static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb) { __be16 proto; @@ -352,6 +352,16 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) return 0; } +static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto) +{ + if (!pskb_may_pull(skb, PPPOE_SES_HLEN)) + return false; + + *inner_proto = __nf_flow_pppoe_proto(skb); + + return true; +} + #define NF_FLOW_TABLE_STAT_INC(net, count) __this_cpu_inc((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_DEC(net, count) __this_cpu_dec((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count) \ diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 9505f9d188ff2..6eef15648b7b0 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -21,7 +21,8 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, proto = veth->h_vlan_encapsulated_proto; break; case htons(ETH_P_PPP_SES): - proto = nf_flow_pppoe_proto(skb); + if (!nf_flow_pppoe_proto(skb, &proto)) + return NF_ACCEPT; break; default: proto = skb->protocol; diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index e45fade764096..9e9e105052dae 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -273,10 +273,11 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } -static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, +static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; + __be16 inner_proto; switch (skb->protocol) { case htons(ETH_P_8021Q): @@ -287,7 +288,8 @@ static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, } break; case htons(ETH_P_PPP_SES): - if (nf_flow_pppoe_proto(skb) == proto) { + if (nf_flow_pppoe_proto(skb, &inner_proto) && + inner_proto == proto) { *offset += PPPOE_SES_HLEN; return true; } @@ -316,7 +318,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb, skb_reset_network_header(skb); break; case htons(ETH_P_PPP_SES): - skb->protocol = nf_flow_pppoe_proto(skb); + skb->protocol = __nf_flow_pppoe_proto(skb); skb_pull(skb, PPPOE_SES_HLEN); skb_reset_network_header(skb); break; -- GitLab From 6db5dc7b351b9569940cd1cf445e237c42cd6d27 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Apr 2024 00:09:00 +0200 Subject: [PATCH 757/989] netfilter: flowtable: incorrect pppoe tuple pppoe traffic reaching ingress path does not match the flowtable entry because the pppoe header is expected to be at the network header offset. This bug causes a mismatch in the flow table lookup, so pppoe packets enter the classical forwarding path. Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 9e9e105052dae..5383bed3d3e00 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -157,7 +157,7 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, tuple->encap[i].proto = skb->protocol; break; case htons(ETH_P_PPP_SES): - phdr = (struct pppoe_hdr *)skb_mac_header(skb); + phdr = (struct pppoe_hdr *)skb_network_header(skb); tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; break; -- GitLab From 1aa4ad4eb695bac1b0a7ba542a16d6833c9c8dd8 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 11 Apr 2024 08:58:45 +0300 Subject: [PATCH 758/989] serial: core: Fix missing shutdown and startup for serial base port We are seeing start_tx being called after port shutdown as noted by Jiri. This happens because we are missing the startup and shutdown related functions for the serial base port. Let's fix the issue by adding startup and shutdown functions for the serial base port to block tx flushing for the serial base port when the port is not in use. Fixes: 84a9582fd203 ("serial: core: Start managing serial controllers to enable runtime PM") Cc: stable Reported-by: Jiri Slaby Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20240411055848.38190-1-tony@atomide.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_base.h | 4 ++++ drivers/tty/serial/serial_core.c | 20 ++++++++++++++++--- drivers/tty/serial/serial_port.c | 34 ++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/drivers/tty/serial/serial_base.h b/drivers/tty/serial/serial_base.h index c74c548f0db62..b6c38d2edfd40 100644 --- a/drivers/tty/serial/serial_base.h +++ b/drivers/tty/serial/serial_base.h @@ -22,6 +22,7 @@ struct serial_ctrl_device { struct serial_port_device { struct device dev; struct uart_port *port; + unsigned int tx_enabled:1; }; int serial_base_ctrl_init(void); @@ -30,6 +31,9 @@ void serial_base_ctrl_exit(void); int serial_base_port_init(void); void serial_base_port_exit(void); +void serial_base_port_startup(struct uart_port *port); +void serial_base_port_shutdown(struct uart_port *port); + int serial_base_driver_register(struct device_driver *driver); void serial_base_driver_unregister(struct device_driver *driver); diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 2247efe972503..c476d884356db 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -323,16 +323,26 @@ static int uart_startup(struct tty_struct *tty, struct uart_state *state, bool init_hw) { struct tty_port *port = &state->port; + struct uart_port *uport; int retval; if (tty_port_initialized(port)) - return 0; + goto out_base_port_startup; retval = uart_port_startup(tty, state, init_hw); - if (retval) + if (retval) { set_bit(TTY_IO_ERROR, &tty->flags); + return retval; + } - return retval; +out_base_port_startup: + uport = uart_port_check(state); + if (!uport) + return -EIO; + + serial_base_port_startup(uport); + + return 0; } /* @@ -355,6 +365,9 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state) if (tty) set_bit(TTY_IO_ERROR, &tty->flags); + if (uport) + serial_base_port_shutdown(uport); + if (tty_port_initialized(port)) { tty_port_set_initialized(port, false); @@ -1775,6 +1788,7 @@ static void uart_tty_port_shutdown(struct tty_port *port) uport->ops->stop_rx(uport); uart_port_unlock_irq(uport); + serial_base_port_shutdown(uport); uart_port_shutdown(port); /* diff --git a/drivers/tty/serial/serial_port.c b/drivers/tty/serial/serial_port.c index 22b9eeb23e68a..7e3a1c7b097c3 100644 --- a/drivers/tty/serial/serial_port.c +++ b/drivers/tty/serial/serial_port.c @@ -39,8 +39,12 @@ static int serial_port_runtime_resume(struct device *dev) /* Flush any pending TX for the port */ uart_port_lock_irqsave(port, &flags); + if (!port_dev->tx_enabled) + goto unlock; if (__serial_port_busy(port)) port->ops->start_tx(port); + +unlock: uart_port_unlock_irqrestore(port, flags); out: @@ -60,6 +64,11 @@ static int serial_port_runtime_suspend(struct device *dev) return 0; uart_port_lock_irqsave(port, &flags); + if (!port_dev->tx_enabled) { + uart_port_unlock_irqrestore(port, flags); + return 0; + } + busy = __serial_port_busy(port); if (busy) port->ops->start_tx(port); @@ -71,6 +80,31 @@ static int serial_port_runtime_suspend(struct device *dev) return busy ? -EBUSY : 0; } +static void serial_base_port_set_tx(struct uart_port *port, + struct serial_port_device *port_dev, + bool enabled) +{ + unsigned long flags; + + uart_port_lock_irqsave(port, &flags); + port_dev->tx_enabled = enabled; + uart_port_unlock_irqrestore(port, flags); +} + +void serial_base_port_startup(struct uart_port *port) +{ + struct serial_port_device *port_dev = port->port_dev; + + serial_base_port_set_tx(port, port_dev, true); +} + +void serial_base_port_shutdown(struct uart_port *port) +{ + struct serial_port_device *port_dev = port->port_dev; + + serial_base_port_set_tx(port, port_dev, false); +} + static DEFINE_RUNTIME_DEV_PM_OPS(serial_port_pm, serial_port_runtime_suspend, serial_port_runtime_resume, NULL); -- GitLab From 34b990e9bb54d20b9675ca9483be8668eed374d8 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Tue, 9 Apr 2024 13:29:10 -0300 Subject: [PATCH 759/989] usb: misc: onboard_usb_hub: Disable the USB hub clock on failure In case regulator_bulk_enable() fails, the previously enabled USB hub clock should be disabled. Fix it accordingly. Fixes: 65e62b8a955a ("usb: misc: onboard_usb_hub: Add support for clock input") Cc: stable Signed-off-by: Fabio Estevam Reviewed-by: Frieder Schrempf Acked-by: Matthias Kaehlcke Link: https://lore.kernel.org/r/20240409162910.2061640-1-festevam@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/onboard_usb_hub.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/misc/onboard_usb_hub.c b/drivers/usb/misc/onboard_usb_hub.c index c6101ed2d9d49..d8049275a023c 100644 --- a/drivers/usb/misc/onboard_usb_hub.c +++ b/drivers/usb/misc/onboard_usb_hub.c @@ -78,7 +78,7 @@ static int onboard_hub_power_on(struct onboard_hub *hub) err = regulator_bulk_enable(hub->pdata->num_supplies, hub->supplies); if (err) { dev_err(hub->dev, "failed to enable supplies: %pe\n", ERR_PTR(err)); - return err; + goto disable_clk; } fsleep(hub->pdata->reset_us); @@ -87,6 +87,10 @@ static int onboard_hub_power_on(struct onboard_hub *hub) hub->is_powered_on = true; return 0; + +disable_clk: + clk_disable_unprepare(hub->clk); + return err; } static int onboard_hub_power_off(struct onboard_hub *hub) -- GitLab From c8d2f34ea96ea3bce6ba2535f867f0d4ee3b22e1 Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Sat, 23 Mar 2024 17:48:43 +0100 Subject: [PATCH 760/989] speakup: Avoid crash on very long word In case a console is set up really large and contains a really long word (> 256 characters), we have to stop before the length of the word buffer. Signed-off-by: Samuel Thibault Fixes: c6e3fd22cd538 ("Staging: add speakup to the staging directory") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240323164843.1426997-1-samuel.thibault@ens-lyon.org Signed-off-by: Greg Kroah-Hartman --- drivers/accessibility/speakup/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accessibility/speakup/main.c b/drivers/accessibility/speakup/main.c index 1fbc9b921c4fc..736c2eb8c0f37 100644 --- a/drivers/accessibility/speakup/main.c +++ b/drivers/accessibility/speakup/main.c @@ -574,7 +574,7 @@ static u_long get_word(struct vc_data *vc) } attr_ch = get_char(vc, (u_short *)tmp_pos, &spk_attr); buf[cnt++] = attr_ch; - while (tmpx < vc->vc_cols - 1) { + while (tmpx < vc->vc_cols - 1 && cnt < sizeof(buf) - 1) { tmp_pos += 2; tmpx++; ch = get_char(vc, (u_short *)tmp_pos, &temp); -- GitLab From 0dc04112bee6fdd6eb847ccb32214703022c0269 Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Tue, 12 Mar 2024 07:19:58 +0200 Subject: [PATCH 761/989] mei: me: disable RPL-S on SPS and IGN firmwares Extend the quirk to disable MEI interface on Intel PCH Ignition (IGN) and SPS firmwares for RPL-S devices. These firmwares do not support the MEI protocol. Fixes: 3ed8c7d39cfe ("mei: me: add raptor lake point S DID") Cc: stable@vger.kernel.org Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Link: https://lore.kernel.org/r/20240312051958.118478-1-tomas.winkler@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/pci-me.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index b5757993c9b2a..c39718042e2e0 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -116,7 +116,7 @@ static const struct pci_device_id mei_me_pci_tbl[] = { {MEI_PCI_DEVICE(MEI_DEV_ID_ADP_P, MEI_ME_PCH15_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_ADP_N, MEI_ME_PCH15_CFG)}, - {MEI_PCI_DEVICE(MEI_DEV_ID_RPL_S, MEI_ME_PCH15_CFG)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_RPL_S, MEI_ME_PCH15_SPS_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_MTL_M, MEI_ME_PCH15_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_ARL_S, MEI_ME_PCH15_CFG)}, -- GitLab From 26ac2df47d4c58f17210b7a59037e40f7eca693e Mon Sep 17 00:00:00 2001 From: Ricky Wu Date: Thu, 14 Mar 2024 14:51:13 +0800 Subject: [PATCH 762/989] misc: rtsx: Fix rts5264 driver status incorrect when card removed rts5264 driver not clean express link error and set EXTRA_CAPS_SD_EXPRESS capability back when card removed Fixes: 6a511c9b3a0d ("misc: rtsx: add to support new card reader rts5264") Cc: stable Signed-off-by: Ricky Wu Link: https://lore.kernel.org/r/20240314065113.5962-1-ricky_wu@realtek.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cardreader/rtsx_pcr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/cardreader/rtsx_pcr.c b/drivers/misc/cardreader/rtsx_pcr.c index 1a64364700eb0..0ad2ff9065aad 100644 --- a/drivers/misc/cardreader/rtsx_pcr.c +++ b/drivers/misc/cardreader/rtsx_pcr.c @@ -1002,7 +1002,7 @@ static irqreturn_t rtsx_pci_isr(int irq, void *dev_id) } else { pcr->card_removed |= SD_EXIST; pcr->card_inserted &= ~SD_EXIST; - if (PCI_PID(pcr) == PID_5261) { + if ((PCI_PID(pcr) == PID_5261) || (PCI_PID(pcr) == PID_5264)) { rtsx_pci_write_register(pcr, RTS5261_FW_STATUS, RTS5261_EXPRESS_LINK_FAIL_MASK, 0); pcr->extra_caps |= EXTRA_CAPS_SD_EXPRESS; -- GitLab From e3dc66d998d2b0c2734db9ca1d6c94c97349529a Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 3 Apr 2024 13:13:40 +0800 Subject: [PATCH 763/989] Revert "mei: vsc: Call wake_up() in the threaded IRQ handler" This reverts commit 058a38acba15fd8e7b262ec6e17c4204cb15f984. It's not necessary to avoid a spinlock, a sleeping lock on PREEMPT_RT, in an interrupt handler as the interrupt handler itself would be called in a process context if PREEMPT_RT is enabled. So revert the patch. Cc: stable@vger.kernel.org # for 6.8 Signed-off-by: Sakari Ailus Acked-by: Tomas Winkler Link: https://lore.kernel.org/r/20240403051341.3534650-1-wentong.wu@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/vsc-tp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/mei/vsc-tp.c b/drivers/misc/mei/vsc-tp.c index ecfb70cd057ca..968a92a7425df 100644 --- a/drivers/misc/mei/vsc-tp.c +++ b/drivers/misc/mei/vsc-tp.c @@ -419,6 +419,8 @@ static irqreturn_t vsc_tp_isr(int irq, void *data) atomic_inc(&tp->assert_cnt); + wake_up(&tp->xfer_wait); + return IRQ_WAKE_THREAD; } @@ -426,8 +428,6 @@ static irqreturn_t vsc_tp_thread_isr(int irq, void *data) { struct vsc_tp *tp = data; - wake_up(&tp->xfer_wait); - if (tp->event_notify) tp->event_notify(tp->event_notify_context); -- GitLab From f6085a96c97387154be7eaebd1a5420eb3cd55dc Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 3 Apr 2024 13:13:41 +0800 Subject: [PATCH 764/989] mei: vsc: Unregister interrupt handler for system suspend Unregister the MEI VSC interrupt handler before system suspend and re-register it at system resume time. This mirrors implementation of other MEI devices. This patch fixes the bug that causes continuous stream of MEI VSC errors after system resume. Fixes: 386a766c4169 ("mei: Add MEI hardware support for IVSC device") Cc: stable@vger.kernel.org # for 6.8 Reported-by: Dominik Brodowski Signed-off-by: Wentong Wu Signed-off-by: Sakari Ailus Acked-by: Tomas Winkler Link: https://lore.kernel.org/r/20240403051341.3534650-2-wentong.wu@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/platform-vsc.c | 17 ++++++- drivers/misc/mei/vsc-tp.c | 84 +++++++++++++++++++++++---------- drivers/misc/mei/vsc-tp.h | 3 ++ 3 files changed, 78 insertions(+), 26 deletions(-) diff --git a/drivers/misc/mei/platform-vsc.c b/drivers/misc/mei/platform-vsc.c index 6c9f00bcb94b1..b543e6b9f3cfd 100644 --- a/drivers/misc/mei/platform-vsc.c +++ b/drivers/misc/mei/platform-vsc.c @@ -400,25 +400,40 @@ static void mei_vsc_remove(struct platform_device *pdev) static int mei_vsc_suspend(struct device *dev) { struct mei_device *mei_dev = dev_get_drvdata(dev); + struct mei_vsc_hw *hw = mei_dev_to_vsc_hw(mei_dev); mei_stop(mei_dev); + mei_disable_interrupts(mei_dev); + + vsc_tp_free_irq(hw->tp); + return 0; } static int mei_vsc_resume(struct device *dev) { struct mei_device *mei_dev = dev_get_drvdata(dev); + struct mei_vsc_hw *hw = mei_dev_to_vsc_hw(mei_dev); int ret; - ret = mei_restart(mei_dev); + ret = vsc_tp_request_irq(hw->tp); if (ret) return ret; + ret = mei_restart(mei_dev); + if (ret) + goto err_free; + /* start timer if stopped in suspend */ schedule_delayed_work(&mei_dev->timer_work, HZ); return 0; + +err_free: + vsc_tp_free_irq(hw->tp); + + return ret; } static DEFINE_SIMPLE_DEV_PM_OPS(mei_vsc_pm_ops, mei_vsc_suspend, mei_vsc_resume); diff --git a/drivers/misc/mei/vsc-tp.c b/drivers/misc/mei/vsc-tp.c index 968a92a7425df..e6a98dba8a735 100644 --- a/drivers/misc/mei/vsc-tp.c +++ b/drivers/misc/mei/vsc-tp.c @@ -94,6 +94,27 @@ static const struct acpi_gpio_mapping vsc_tp_acpi_gpios[] = { {} }; +static irqreturn_t vsc_tp_isr(int irq, void *data) +{ + struct vsc_tp *tp = data; + + atomic_inc(&tp->assert_cnt); + + wake_up(&tp->xfer_wait); + + return IRQ_WAKE_THREAD; +} + +static irqreturn_t vsc_tp_thread_isr(int irq, void *data) +{ + struct vsc_tp *tp = data; + + if (tp->event_notify) + tp->event_notify(tp->event_notify_context); + + return IRQ_HANDLED; +} + /* wakeup firmware and wait for response */ static int vsc_tp_wakeup_request(struct vsc_tp *tp) { @@ -383,6 +404,37 @@ int vsc_tp_register_event_cb(struct vsc_tp *tp, vsc_tp_event_cb_t event_cb, } EXPORT_SYMBOL_NS_GPL(vsc_tp_register_event_cb, VSC_TP); +/** + * vsc_tp_request_irq - request irq for vsc_tp device + * @tp: vsc_tp device handle + */ +int vsc_tp_request_irq(struct vsc_tp *tp) +{ + struct spi_device *spi = tp->spi; + struct device *dev = &spi->dev; + int ret; + + irq_set_status_flags(spi->irq, IRQ_DISABLE_UNLAZY); + ret = request_threaded_irq(spi->irq, vsc_tp_isr, vsc_tp_thread_isr, + IRQF_TRIGGER_FALLING | IRQF_ONESHOT, + dev_name(dev), tp); + if (ret) + return ret; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(vsc_tp_request_irq, VSC_TP); + +/** + * vsc_tp_free_irq - free irq for vsc_tp device + * @tp: vsc_tp device handle + */ +void vsc_tp_free_irq(struct vsc_tp *tp) +{ + free_irq(tp->spi->irq, tp); +} +EXPORT_SYMBOL_NS_GPL(vsc_tp_free_irq, VSC_TP); + /** * vsc_tp_intr_synchronize - synchronize vsc_tp interrupt * @tp: vsc_tp device handle @@ -413,27 +465,6 @@ void vsc_tp_intr_disable(struct vsc_tp *tp) } EXPORT_SYMBOL_NS_GPL(vsc_tp_intr_disable, VSC_TP); -static irqreturn_t vsc_tp_isr(int irq, void *data) -{ - struct vsc_tp *tp = data; - - atomic_inc(&tp->assert_cnt); - - wake_up(&tp->xfer_wait); - - return IRQ_WAKE_THREAD; -} - -static irqreturn_t vsc_tp_thread_isr(int irq, void *data) -{ - struct vsc_tp *tp = data; - - if (tp->event_notify) - tp->event_notify(tp->event_notify_context); - - return IRQ_HANDLED; -} - static int vsc_tp_match_any(struct acpi_device *adev, void *data) { struct acpi_device **__adev = data; @@ -490,10 +521,9 @@ static int vsc_tp_probe(struct spi_device *spi) tp->spi = spi; irq_set_status_flags(spi->irq, IRQ_DISABLE_UNLAZY); - ret = devm_request_threaded_irq(dev, spi->irq, vsc_tp_isr, - vsc_tp_thread_isr, - IRQF_TRIGGER_FALLING | IRQF_ONESHOT, - dev_name(dev), tp); + ret = request_threaded_irq(spi->irq, vsc_tp_isr, vsc_tp_thread_isr, + IRQF_TRIGGER_FALLING | IRQF_ONESHOT, + dev_name(dev), tp); if (ret) return ret; @@ -522,6 +552,8 @@ static int vsc_tp_probe(struct spi_device *spi) err_destroy_lock: mutex_destroy(&tp->mutex); + free_irq(spi->irq, tp); + return ret; } @@ -532,6 +564,8 @@ static void vsc_tp_remove(struct spi_device *spi) platform_device_unregister(tp->pdev); mutex_destroy(&tp->mutex); + + free_irq(spi->irq, tp); } static const struct acpi_device_id vsc_tp_acpi_ids[] = { diff --git a/drivers/misc/mei/vsc-tp.h b/drivers/misc/mei/vsc-tp.h index f9513ddc3e409..14ca195cbddcc 100644 --- a/drivers/misc/mei/vsc-tp.h +++ b/drivers/misc/mei/vsc-tp.h @@ -37,6 +37,9 @@ int vsc_tp_xfer(struct vsc_tp *tp, u8 cmd, const void *obuf, size_t olen, int vsc_tp_register_event_cb(struct vsc_tp *tp, vsc_tp_event_cb_t event_cb, void *context); +int vsc_tp_request_irq(struct vsc_tp *tp); +void vsc_tp_free_irq(struct vsc_tp *tp); + void vsc_tp_intr_enable(struct vsc_tp *tp); void vsc_tp_intr_disable(struct vsc_tp *tp); void vsc_tp_intr_synchronize(struct vsc_tp *tp); -- GitLab From d1718530e3f640b7d5f0050e725216eab57a85d8 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Mon, 8 Apr 2024 10:16:33 -0700 Subject: [PATCH 765/989] comedi: vmk80xx: fix incomplete endpoint checking While vmk80xx does have endpoint checking implemented, some things can fall through the cracks. Depending on the hardware model, URBs can have either bulk or interrupt type, and current version of vmk80xx_find_usb_endpoints() function does not take that fully into account. While this warning does not seem to be too harmful, at the very least it will crash systems with 'panic_on_warn' set on them. Fix the issue found by Syzkaller [1] by somewhat simplifying the endpoint checking process with usb_find_common_endpoints() and ensuring that only expected endpoint types are present. This patch has not been tested on real hardware. [1] Syzkaller report: usb 1-1: BOGUS urb xfer, pipe 1 != type 3 WARNING: CPU: 0 PID: 781 at drivers/usb/core/urb.c:504 usb_submit_urb+0xc4e/0x18c0 drivers/usb/core/urb.c:503 ... Call Trace: usb_start_wait_urb+0x113/0x520 drivers/usb/core/message.c:59 vmk80xx_reset_device drivers/comedi/drivers/vmk80xx.c:227 [inline] vmk80xx_auto_attach+0xa1c/0x1a40 drivers/comedi/drivers/vmk80xx.c:818 comedi_auto_config+0x238/0x380 drivers/comedi/drivers.c:1067 usb_probe_interface+0x5cd/0xb00 drivers/usb/core/driver.c:399 ... Similar issue also found by Syzkaller: Link: https://syzkaller.appspot.com/bug?extid=5205eb2f17de3e01946e Reported-and-tested-by: syzbot+5f29dc6a889fc42bd896@syzkaller.appspotmail.com Cc: stable Fixes: 49253d542cc0 ("staging: comedi: vmk80xx: factor out usb endpoint detection") Reviewed-by: Ian Abbott Signed-off-by: Nikita Zhandarovich Link: https://lore.kernel.org/r/20240408171633.31649-1-n.zhandarovich@fintech.ru Signed-off-by: Greg Kroah-Hartman --- drivers/comedi/drivers/vmk80xx.c | 35 +++++++++++--------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/drivers/comedi/drivers/vmk80xx.c b/drivers/comedi/drivers/vmk80xx.c index 4536ed43f65b2..84dce5184a77a 100644 --- a/drivers/comedi/drivers/vmk80xx.c +++ b/drivers/comedi/drivers/vmk80xx.c @@ -641,33 +641,22 @@ static int vmk80xx_find_usb_endpoints(struct comedi_device *dev) struct vmk80xx_private *devpriv = dev->private; struct usb_interface *intf = comedi_to_usb_interface(dev); struct usb_host_interface *iface_desc = intf->cur_altsetting; - struct usb_endpoint_descriptor *ep_desc; - int i; - - if (iface_desc->desc.bNumEndpoints != 2) - return -ENODEV; - - for (i = 0; i < iface_desc->desc.bNumEndpoints; i++) { - ep_desc = &iface_desc->endpoint[i].desc; - - if (usb_endpoint_is_int_in(ep_desc) || - usb_endpoint_is_bulk_in(ep_desc)) { - if (!devpriv->ep_rx) - devpriv->ep_rx = ep_desc; - continue; - } + struct usb_endpoint_descriptor *ep_rx_desc, *ep_tx_desc; + int ret; - if (usb_endpoint_is_int_out(ep_desc) || - usb_endpoint_is_bulk_out(ep_desc)) { - if (!devpriv->ep_tx) - devpriv->ep_tx = ep_desc; - continue; - } - } + if (devpriv->model == VMK8061_MODEL) + ret = usb_find_common_endpoints(iface_desc, &ep_rx_desc, + &ep_tx_desc, NULL, NULL); + else + ret = usb_find_common_endpoints(iface_desc, NULL, NULL, + &ep_rx_desc, &ep_tx_desc); - if (!devpriv->ep_rx || !devpriv->ep_tx) + if (ret) return -ENODEV; + devpriv->ep_rx = ep_rx_desc; + devpriv->ep_tx = ep_tx_desc; + if (!usb_endpoint_maxp(devpriv->ep_rx) || !usb_endpoint_maxp(devpriv->ep_tx)) return -EINVAL; -- GitLab From a90bca2228c0646fc29a72689d308e5fe03e6d78 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 13 Mar 2024 17:43:41 -0400 Subject: [PATCH 766/989] fs: sysfs: Fix reference leak in sysfs_break_active_protection() The sysfs_break_active_protection() routine has an obvious reference leak in its error path. If the call to kernfs_find_and_get() fails then kn will be NULL, so the companion sysfs_unbreak_active_protection() routine won't get called (and would only cause an access violation by trying to dereference kn->parent if it was called). As a result, the reference to kobj acquired at the start of the function will never be released. Fix the leak by adding an explicit kobject_put() call when kn is NULL. Signed-off-by: Alan Stern Fixes: 2afc9166f79b ("scsi: sysfs: Introduce sysfs_{un,}break_active_protection()") Cc: Bart Van Assche Cc: stable@vger.kernel.org Reviewed-by: Bart Van Assche Acked-by: Tejun Heo Link: https://lore.kernel.org/r/8a4d3f0f-c5e3-4b70-a188-0ca433f9e6f9@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 6b7652fb80505..7cd64021d453d 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -463,6 +463,8 @@ struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, kn = kernfs_find_and_get(kobj->sd, attr->name); if (kn) kernfs_break_active_protection(kn); + else + kobject_put(kobj); return kn; } EXPORT_SYMBOL_GPL(sysfs_break_active_protection); -- GitLab From aaef73821a3b0194a01bd23ca77774f704a04d40 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Sat, 30 Mar 2024 19:01:14 +0000 Subject: [PATCH 767/989] binder: check offset alignment in binder_get_object() Commit 6d98eb95b450 ("binder: avoid potential data leakage when copying txn") introduced changes to how binder objects are copied. In doing so, it unintentionally removed an offset alignment check done through calls to binder_alloc_copy_from_buffer() -> check_buffer(). These calls were replaced in binder_get_object() with copy_from_user(), so now an explicit offset alignment check is needed here. This avoids later complications when unwinding the objects gets harder. It is worth noting this check existed prior to commit 7a67a39320df ("binder: add function to copy binder object from buffer"), likely removed due to redundancy at the time. Fixes: 6d98eb95b450 ("binder: avoid potential data leakage when copying txn") Cc: stable@vger.kernel.org Signed-off-by: Carlos Llamas Acked-by: Todd Kjos Link: https://lore.kernel.org/r/20240330190115.1877819-1-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index bad28cf420104..dd6923d37931f 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1708,8 +1708,10 @@ static size_t binder_get_object(struct binder_proc *proc, size_t object_size = 0; read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset); - if (offset > buffer->data_size || read_size < sizeof(*hdr)) + if (offset > buffer->data_size || read_size < sizeof(*hdr) || + !IS_ALIGNED(offset, sizeof(u32))) return 0; + if (u) { if (copy_from_user(object, u + offset, read_size)) return 0; -- GitLab From f488138b526715c6d2568d7329c4477911be4210 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 11 Apr 2024 11:45:57 +0200 Subject: [PATCH 768/989] NFSD: fix endianness issue in nfsd4_encode_fattr4 The nfs4 mount fails with EIO on 64-bit big endian architectures since v6.7. The issue arises from employing a union in the nfsd4_encode_fattr4() function to overlay a 32-bit array with a 64-bit values based bitmap, which does not function as intended. Address the endianness issue by utilizing bitmap_from_arr32() to copy 32-bit attribute masks into a bitmap in an endianness-agnostic manner. Cc: stable@vger.kernel.org Fixes: fce7913b13d0 ("NFSD: Use a bitmask loop to encode FATTR4 results") Link: https://bugs.launchpad.net/ubuntu/+source/nfs-utils/+bug/2060217 Signed-off-by: Vasily Gorbik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index fac938f563ad0..1955481832e03 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3490,11 +3490,13 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct dentry *dentry, const u32 *bmval, int ignore_crossmnt) { + DECLARE_BITMAP(attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)); struct nfsd4_fattr_args args; struct svc_fh *tempfh = NULL; int starting_len = xdr->buf->len; __be32 *attrlen_p, status; int attrlen_offset; + u32 attrmask[3]; int err; struct nfsd4_compoundres *resp = rqstp->rq_resp; u32 minorversion = resp->cstate.minorversion; @@ -3502,10 +3504,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, .mnt = exp->ex_path.mnt, .dentry = dentry, }; - union { - u32 attrmask[3]; - unsigned long mask[2]; - } u; unsigned long bit; bool file_modified = false; u64 size = 0; @@ -3521,20 +3519,19 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, /* * Make a local copy of the attribute bitmap that can be modified. */ - memset(&u, 0, sizeof(u)); - u.attrmask[0] = bmval[0]; - u.attrmask[1] = bmval[1]; - u.attrmask[2] = bmval[2]; + attrmask[0] = bmval[0]; + attrmask[1] = bmval[1]; + attrmask[2] = bmval[2]; args.rdattr_err = 0; if (exp->ex_fslocs.migrated) { - status = fattr_handle_absent_fs(&u.attrmask[0], &u.attrmask[1], - &u.attrmask[2], &args.rdattr_err); + status = fattr_handle_absent_fs(&attrmask[0], &attrmask[1], + &attrmask[2], &args.rdattr_err); if (status) goto out; } args.size = 0; - if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { + if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry), &file_modified, &size); if (status) @@ -3553,16 +3550,16 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (!(args.stat.result_mask & STATX_BTIME)) /* underlying FS does not offer btime so we can't share it */ - u.attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE; - if ((u.attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | + attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE; + if ((attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) || - (u.attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | + (attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { err = vfs_statfs(&path, &args.statfs); if (err) goto out_nfserr; } - if ((u.attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && + if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); status = nfserr_jukebox; @@ -3577,10 +3574,10 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, args.fhp = fhp; args.acl = NULL; - if (u.attrmask[0] & FATTR4_WORD0_ACL) { + if (attrmask[0] & FATTR4_WORD0_ACL) { err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl); if (err == -EOPNOTSUPP) - u.attrmask[0] &= ~FATTR4_WORD0_ACL; + attrmask[0] &= ~FATTR4_WORD0_ACL; else if (err == -EINVAL) { status = nfserr_attrnotsupp; goto out; @@ -3592,17 +3589,17 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, #ifdef CONFIG_NFSD_V4_SECURITY_LABEL args.context = NULL; - if ((u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || - u.attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { + if ((attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || + attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { if (exp->ex_flags & NFSEXP_SECURITY_LABEL) err = security_inode_getsecctx(d_inode(dentry), &args.context, &args.contextlen); else err = -EOPNOTSUPP; args.contextsupport = (err == 0); - if (u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) { + if (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) { if (err == -EOPNOTSUPP) - u.attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; else if (err) goto out_nfserr; } @@ -3610,8 +3607,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ /* attrmask */ - status = nfsd4_encode_bitmap4(xdr, u.attrmask[0], - u.attrmask[1], u.attrmask[2]); + status = nfsd4_encode_bitmap4(xdr, attrmask[0], attrmask[1], + attrmask[2]); if (status) goto out; @@ -3620,7 +3617,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, attrlen_p = xdr_reserve_space(xdr, XDR_UNIT); if (!attrlen_p) goto out_resource; - for_each_set_bit(bit, (const unsigned long *)&u.mask, + bitmap_from_arr32(attr_bitmap, attrmask, + ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)); + for_each_set_bit(bit, attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) { status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args); if (status != nfs_ok) -- GitLab From 156539fd65019e8ed6b9fbac0583cf519cdbb227 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 22 Mar 2024 21:38:40 +1100 Subject: [PATCH 769/989] Documentation: embargoed-hardware-issues.rst: Add myself for Power Unfortunately Anton has left IBM. Add myself as the contact for Power, until someone else volunteers. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20240322103840.668746-1-mpe@ellerman.id.au Signed-off-by: Greg Kroah-Hartman --- Documentation/process/embargoed-hardware-issues.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index bb2100228cc7b..6e9a4597bf2cb 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -252,7 +252,7 @@ an involved disclosed party. The current ambassadors list: AMD Tom Lendacky Ampere Darren Hart ARM Catalin Marinas - IBM Power Anton Blanchard + IBM Power Michael Ellerman IBM Z Christian Borntraeger Intel Tony Luck Qualcomm Trilok Soni -- GitLab From 50a9b7fc151e67b9e642232d32e8c5a5ac13e64a Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 5 Apr 2024 13:07:11 -0700 Subject: [PATCH 770/989] drm/xe/display: Fix double mutex initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All of these mutexes are already initialized by the display side since commit 3fef3e6ff86a ("drm/i915: move display mutex inits to display code"), so the xe shouldn´t initialize them. Fixes: 44e694958b95 ("drm/xe/display: Implement display support") Cc: Jani Nikula Cc: Arun R Murthy Reviewed-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20240405200711.2041428-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 117de185edf2c5767f03575219bf7a43b161ff0d) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/display/xe_display.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c index e4db069f0db3f..6ec375c1c4b6c 100644 --- a/drivers/gpu/drm/xe/display/xe_display.c +++ b/drivers/gpu/drm/xe/display/xe_display.c @@ -108,11 +108,6 @@ int xe_display_create(struct xe_device *xe) xe->display.hotplug.dp_wq = alloc_ordered_workqueue("xe-dp", 0); drmm_mutex_init(&xe->drm, &xe->sb_lock); - drmm_mutex_init(&xe->drm, &xe->display.backlight.lock); - drmm_mutex_init(&xe->drm, &xe->display.audio.mutex); - drmm_mutex_init(&xe->drm, &xe->display.wm.wm_mutex); - drmm_mutex_init(&xe->drm, &xe->display.pps.mutex); - drmm_mutex_init(&xe->drm, &xe->display.hdcp.hdcp_mutex); xe->enabled_irq_mask = ~0; err = drmm_add_action_or_reset(&xe->drm, display_destroy, NULL); -- GitLab From a8ad8715472bb8f6a2ea8b4072a28151eb9f4f24 Mon Sep 17 00:00:00 2001 From: Karthik Poosa Date: Fri, 5 Apr 2024 18:31:27 +0530 Subject: [PATCH 771/989] drm/xe/hwmon: Cast result to output precision on left shift of operand Address potential overflow in result of left shift of a lower precision (u32) operand before assignment to higher precision (u64) variable. v2: - Update commit message. (Himal) Fixes: 4446fcf220ce ("drm/xe/hwmon: Expose power1_max_interval") Signed-off-by: Karthik Poosa Reviewed-by: Anshuman Gupta Cc: Badal Nilawar Link: https://patchwork.freedesktop.org/patch/msgid/20240405130127.1392426-5-karthik.poosa@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 883232b47b81108b0252197c747f396ecd51455a) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_hwmon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c index b82233a416062..9ac7fbe201b3c 100644 --- a/drivers/gpu/drm/xe/xe_hwmon.c +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -290,7 +290,7 @@ xe_hwmon_power1_max_interval_show(struct device *dev, struct device_attribute *a * As y can be < 2, we compute tau4 = (4 | x) << y * and then add 2 when doing the final right shift to account for units */ - tau4 = ((1 << x_w) | x) << y; + tau4 = (u64)((1 << x_w) | x) << y; /* val in hwmon interface units (millisec) */ out = mul_u64_u32_shr(tau4, SF_TIME, hwmon->scl_shift_time + x_w); @@ -330,7 +330,7 @@ xe_hwmon_power1_max_interval_store(struct device *dev, struct device_attribute * r = FIELD_PREP(PKG_MAX_WIN, PKG_MAX_WIN_DEFAULT); x = REG_FIELD_GET(PKG_MAX_WIN_X, r); y = REG_FIELD_GET(PKG_MAX_WIN_Y, r); - tau4 = ((1 << x_w) | x) << y; + tau4 = (u64)((1 << x_w) | x) << y; max_win = mul_u64_u32_shr(tau4, SF_TIME, hwmon->scl_shift_time + x_w); if (val > max_win) -- GitLab From 9cb46b31f3d08ed3fce86349e8c12f96d7c88717 Mon Sep 17 00:00:00 2001 From: Himal Prasad Ghimiray Date: Mon, 1 Apr 2024 23:23:00 +0530 Subject: [PATCH 772/989] drm/xe/xe_migrate: Cast to output precision before multiplying operands Addressing potential overflow in result of multiplication of two lower precision (u32) operands before widening it to higher precision (u64). -v2 Fix commit message and description. (Rodrigo) Cc: Rodrigo Vivi Signed-off-by: Himal Prasad Ghimiray Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240401175300.3823653-1-himal.prasad.ghimiray@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 34820967ae7b45411f8f4f737c2d63b0c608e0d7) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_migrate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index ee1bb938c4934..2ba4fb9511f63 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -227,7 +227,7 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, if (vm->flags & XE_VM_FLAG_64K && level == 1) flags = XE_PDE_64K; - entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (level - 1) * + entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (u64)(level - 1) * XE_PAGE_SIZE, pat_index); xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64, entry | flags); @@ -235,7 +235,7 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, /* Write PDE's that point to our BO. */ for (i = 0; i < num_entries - num_level; i++) { - entry = vm->pt_ops->pde_encode_bo(bo, i * XE_PAGE_SIZE, + entry = vm->pt_ops->pde_encode_bo(bo, (u64)i * XE_PAGE_SIZE, pat_index); xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE + @@ -291,7 +291,7 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, #define VM_SA_UPDATE_UNIT_SIZE (XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE) #define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE / sizeof(u64)) drm_suballoc_manager_init(&m->vm_update_sa, - (map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) * + (size_t)(map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) * NUM_VMUSA_UNIT_PER_PAGE, 0); m->pt_bo = bo; @@ -490,7 +490,7 @@ static void emit_pte(struct xe_migrate *m, struct xe_vm *vm = m->q->vm; u16 pat_index; u32 ptes; - u64 ofs = at_pt * XE_PAGE_SIZE; + u64 ofs = (u64)at_pt * XE_PAGE_SIZE; u64 cur_ofs; /* Indirect access needs compression enabled uncached PAT index */ -- GitLab From f76646c83f028c62853c23dac49204232e903597 Mon Sep 17 00:00:00 2001 From: Ashutosh Dixit Date: Thu, 4 Apr 2024 09:12:56 -0700 Subject: [PATCH 773/989] drm/xe: Label RING_CONTEXT_CONTROL as masked RING_CONTEXT_CONTROL is a masked register. v2: Also clean up setting register value (Lucas) Reviewed-by: Matt Roper Reviewed-by: Lucas De Marchi Signed-off-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20240404161256.3852502-1-ashutosh.dixit@intel.com (cherry picked from commit dc30c6e7149baaae4288c742de95212b31f07438) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/regs/xe_engine_regs.h | 2 +- drivers/gpu/drm/xe/xe_lrc.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h index 0b1266c88a6af..deddc8be48c0a 100644 --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h @@ -125,7 +125,7 @@ #define RING_EXECLIST_STATUS_LO(base) XE_REG((base) + 0x234) #define RING_EXECLIST_STATUS_HI(base) XE_REG((base) + 0x234 + 4) -#define RING_CONTEXT_CONTROL(base) XE_REG((base) + 0x244) +#define RING_CONTEXT_CONTROL(base) XE_REG((base) + 0x244, XE_REG_OPTION_MASKED) #define CTX_CTRL_INHIBIT_SYN_CTX_SWITCH REG_BIT(3) #define CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT REG_BIT(0) diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index 1426febe86eb6..57066faf575ee 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -525,9 +525,8 @@ static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class) static void set_context_control(u32 *regs, struct xe_hw_engine *hwe) { - regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH) | - _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) | - CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; + regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH | + CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); /* TODO: Timestamp */ } -- GitLab From ebaed6d4def877d2035786ff318379eb750044c8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Mar 2024 11:29:10 -0700 Subject: [PATCH 774/989] peci: linux/peci.h: fix Excess kernel-doc description warning Remove the @controller: line to prevent the kernel-doc warning: include/linux/peci.h:84: warning: Excess struct member 'controller' description in 'peci_device' Signed-off-by: Randy Dunlap Cc: Iwona Winiarska Cc: openbmc@lists.ozlabs.org Reviewed-by: Iwona Winiarska Fixes: 6523d3b2ffa2 ("peci: Add core infrastructure") Cc: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240329182910.29495-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/peci.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/peci.h b/include/linux/peci.h index 9b3d36aff431e..90e241458ef69 100644 --- a/include/linux/peci.h +++ b/include/linux/peci.h @@ -58,7 +58,6 @@ static inline struct peci_controller *to_peci_controller(void *d) /** * struct peci_device - PECI device * @dev: device object to register PECI device to the device model - * @controller: manages the bus segment hosting this PECI device * @info: PECI device characteristics * @info.family: device family * @info.model: device model -- GitLab From fd706c9b1674e2858766bfbf7430534c2b26fbef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Apr 2024 16:55:54 -0700 Subject: [PATCH 775/989] KVM: x86: Snapshot if a vCPU's vendor model is AMD vs. Intel compatible Add kvm_vcpu_arch.is_amd_compatible to cache if a vCPU's vendor model is compatible with AMD, i.e. if the vCPU vendor is AMD or Hygon, along with helpers to check if a vCPU is compatible AMD vs. Intel. To handle Intel vs. AMD behavior related to masking the LVTPC entry, KVM will need to check for vendor compatibility on every PMI injection, i.e. querying for AMD will soon be a moderately hot path. Note! This subtly (or maybe not-so-subtly) makes "Intel compatible" KVM's default behavior, both if userspace omits (or never sets) CPUID 0x0 and if userspace sets a completely unknown vendor. One could argue that KVM should treat such vCPUs as not being compatible with Intel *or* AMD, but that would add useless complexity to KVM. KVM needs to do *something* in the face of vendor specific behavior, and so unless KVM conjured up a magic third option, choosing to treat unknown vendors as neither Intel nor AMD means that checks on AMD compatibility would yield Intel behavior, and checks for Intel compatibility would yield AMD behavior. And that's far worse as it would effectively yield random behavior depending on whether KVM checked for AMD vs. Intel vs. !AMD vs. !Intel. And practically speaking, all x86 CPUs follow either Intel or AMD architecture, i.e. "supporting" an unknown third architecture adds no value. Deliberately don't convert any of the existing guest_cpuid_is_intel() checks, as the Intel side of things is messier due to some flows explicitly checking for exactly vendor==Intel, versus some flows assuming anything that isn't "AMD compatible" gets Intel behavior. The Intel code will be cleaned up in the future. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20240405235603.1173076-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 1 + arch/x86/kvm/cpuid.h | 10 ++++++++++ arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/x86.c | 2 +- 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 16e07a2eee195..6efd1497b0263 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -855,6 +855,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; struct kvm_hypervisor_cpuid kvm_cpuid; + bool is_amd_compatible; /* * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bfc0bfcb2bc60..77352a4abd87f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -376,6 +376,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_update_pv_runtime(vcpu); + vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 856e3037e74f3..23dbb9eb277c7 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -120,6 +120,16 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); } +static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.is_amd_compatible; +} + +static inline bool guest_cpuid_is_intel_compatible(struct kvm_vcpu *vcpu) +{ + return !guest_cpuid_is_amd_compatible(vcpu); +} + static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 992e651540e85..bf4de6d7e39c6 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4935,7 +4935,7 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, context->cpu_role.base.level, is_efer_nx(context), guest_can_use(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), - guest_cpuid_is_amd_or_hygon(vcpu)); + guest_cpuid_is_amd_compatible(vcpu)); } static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 47d9f03b77783..ebcc12d1e1de3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3470,7 +3470,7 @@ static bool is_mci_status_msr(u32 msr) static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? */ - if (guest_cpuid_is_amd_or_hygon(vcpu)) + if (guest_cpuid_is_amd_compatible(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; -- GitLab From 49ff3b4aec51e3abfc9369997cc603319b02af9a Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 5 Apr 2024 16:55:55 -0700 Subject: [PATCH 776/989] KVM: x86/pmu: Do not mask LVTPC when handling a PMI on AMD platforms On AMD and Hygon platforms, the local APIC does not automatically set the mask bit of the LVTPC register when handling a PMI and there is no need to clear it in the kernel's PMI handler. For guests, the mask bit is currently set by kvm_apic_local_deliver() and unless it is cleared by the guest kernel's PMI handler, PMIs stop arriving and break use-cases like sampling with perf record. This does not affect non-PerfMonV2 guests because PMIs are handled in the guest kernel by x86_pmu_handle_irq() which always clears the LVTPC mask bit irrespective of the vendor. Before: $ perf record -e cycles:u true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.001 MB perf.data (1 samples) ] After: $ perf record -e cycles:u true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.002 MB perf.data (19 samples) ] Fixes: a16eb25b09c0 ("KVM: x86: Mask LVTPC when handling a PMI") Cc: stable@vger.kernel.org Signed-off-by: Sandipan Das Reviewed-by: Jim Mattson [sean: use is_intel_compatible instead of !is_amd_or_hygon()] Signed-off-by: Sean Christopherson Message-ID: <20240405235603.1173076-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cf37586f04668..ebf41023be382 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2776,7 +2776,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); - if (r && lvt_type == APIC_LVTPC) + if (r && lvt_type == APIC_LVTPC && + guest_cpuid_is_intel_compatible(apic->vcpu)) kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); return r; } -- GitLab From 2b8dbf69ec60faf6c7db49e57d7f316409ccec92 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 5 Apr 2024 14:17:57 -0700 Subject: [PATCH 777/989] perf annotate: Make sure to call symbol__annotate2() in TUI The symbol__annotate2() initializes some data structures needed by TUI. It has a logic to prevent calling it multiple times by checking if it has the annotated source. But data type profiling uses a different code (symbol__annotate) to allocate the annotated lines in advance. So TUI missed to call symbol__annotate2() when it shows the annotation browser. Make symbol__annotate() reentrant and handle that situation properly. This fixes a crash in the annotation browser started by perf report in TUI like below. $ perf report -s type,sym --tui # and press 'a' key and then move down Fixes: 81e57deec325 ("perf report: Support data type profiling") Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240405211800.1412920-2-namhyung@kernel.org --- tools/perf/ui/browsers/annotate.c | 2 +- tools/perf/util/annotate.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index ec5e219328760..4790c735599bd 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -970,7 +970,7 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, if (dso->annotate_warned) return -1; - if (not_annotated) { + if (not_annotated || !sym->annotate2) { err = symbol__annotate2(ms, evsel, &browser.arch); if (err) { char msg[BUFSIZ]; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index ac002d907d818..50ca92255ff62 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2461,6 +2461,9 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, if (parch) *parch = arch; + if (!list_empty(¬es->src->source)) + return 0; + args.arch = arch; args.ms = *ms; if (annotate_opts.full_addr) -- GitLab From b372e96bd0a32729d55d27f613c8bc80708a82e1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 25 Mar 2024 09:21:20 +1100 Subject: [PATCH 778/989] ceph: redirty page before returning AOP_WRITEPAGE_ACTIVATE The page has been marked clean before writepage is called. If we don't redirty it before postponing the write, it might never get written. Cc: stable@vger.kernel.org Fixes: 503d4fa6ee28 ("ceph: remove reliance on bdi congestion") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1340d77124ae4..ee9caf7916fb9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -795,8 +795,10 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) ihold(inode); if (wbc->sync_mode == WB_SYNC_NONE && - ceph_inode_to_fs_client(inode)->write_congested) + ceph_inode_to_fs_client(inode)->write_congested) { + redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; + } wait_on_page_fscache(page); -- GitLab From f3408580bac8ce5cd76e7391e529c0a22e7c7eb2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 9 Apr 2024 15:55:42 -0700 Subject: [PATCH 779/989] perf lock contention: Add a missing NULL check I got a report for a failure in BPF verifier on a recent kernel with perf lock contention command. It checks task->sighand->siglock without checking if sighand is NULL or not. Let's add one. ; if (&curr->sighand->siglock == (void *)lock) 265: (79) r1 = *(u64 *)(r0 +2624) ; frame1: R0_w=trusted_ptr_task_struct(off=0,imm=0) ; R1_w=rcu_ptr_or_null_sighand_struct(off=0,imm=0) 266: (b7) r2 = 0 ; frame1: R2_w=0 267: (0f) r1 += r2 R1 pointer arithmetic on rcu_ptr_or_null_ prohibited, null-check it first processed 164 insns (limit 1000000) max_states_per_insn 1 total_states 15 peak_states 15 mark_read 5 -- END PROG LOAD LOG -- libbpf: prog 'contention_end': failed to load: -13 libbpf: failed to load object 'lock_contention_bpf' libbpf: failed to load BPF skeleton 'lock_contention_bpf': -13 Failed to load lock-contention BPF skeleton lock contention BPF setup failed lock contention did not detect any lock contention Fixes: 1811e82767dcc ("perf lock contention: Track and show siglock with address") Reviewed-by: Ian Rogers Acked-by: Arnaldo Carvalho de Melo Cc: Song Liu Cc: bpf@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240409225542.1870999-1-namhyung@kernel.org --- tools/perf/util/bpf_skel/lock_contention.bpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index fb54bd38e7d09..d931a898c434b 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -284,6 +284,7 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags) struct task_struct *curr; struct mm_struct___old *mm_old; struct mm_struct___new *mm_new; + struct sighand_struct *sighand; switch (flags) { case LCB_F_READ: /* rwsem */ @@ -305,7 +306,9 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags) break; case LCB_F_SPIN: /* spinlock */ curr = bpf_get_current_task_btf(); - if (&curr->sighand->siglock == (void *)lock) + sighand = curr->sighand; + + if (sighand && &sighand->siglock == (void *)lock) return LCD_F_SIGHAND_LOCK; break; default: -- GitLab From 3ef842a77e7cdf757fe3f1d2999aa2cc88eb53ba Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:12 -0700 Subject: [PATCH 780/989] tools/include: Sync uapi/drm/i915_drm.h with the kernel sources To pick up changes from: b112364867499 ("drm/i915: Add GuC submission interface version query") 5cf0fbf763741 ("drm/i915: Add some boring kerneldoc") This should be used to beautify DRM syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-2-namhyung@kernel.org --- tools/include/uapi/drm/i915_drm.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index fd4f9574d177a..2ee338860b7e0 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -3013,6 +3013,7 @@ struct drm_i915_query_item { * - %DRM_I915_QUERY_MEMORY_REGIONS (see struct drm_i915_query_memory_regions) * - %DRM_I915_QUERY_HWCONFIG_BLOB (see `GuC HWCONFIG blob uAPI`) * - %DRM_I915_QUERY_GEOMETRY_SUBSLICES (see struct drm_i915_query_topology_info) + * - %DRM_I915_QUERY_GUC_SUBMISSION_VERSION (see struct drm_i915_query_guc_submission_version) */ __u64 query_id; #define DRM_I915_QUERY_TOPOLOGY_INFO 1 @@ -3021,6 +3022,7 @@ struct drm_i915_query_item { #define DRM_I915_QUERY_MEMORY_REGIONS 4 #define DRM_I915_QUERY_HWCONFIG_BLOB 5 #define DRM_I915_QUERY_GEOMETRY_SUBSLICES 6 +#define DRM_I915_QUERY_GUC_SUBMISSION_VERSION 7 /* Must be kept compact -- no holes and well documented */ /** @@ -3566,6 +3568,20 @@ struct drm_i915_query_memory_regions { struct drm_i915_memory_region_info regions[]; }; +/** + * struct drm_i915_query_guc_submission_version - query GuC submission interface version + */ +struct drm_i915_query_guc_submission_version { + /** @branch: Firmware branch version. */ + __u32 branch; + /** @major: Firmware major version. */ + __u32 major; + /** @minor: Firmware minor version. */ + __u32 minor; + /** @patch: Firmware patch version. */ + __u32 patch; +}; + /** * DOC: GuC HWCONFIG blob uAPI * -- GitLab From 4cfa8a873d3e3a87894f8de056ee69a857b5adcd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:13 -0700 Subject: [PATCH 781/989] tools/include: Sync uapi/linux/fs.h with the kernel sources To pick up the changes from: 41bcbe59c3b3f ("fs: FS_IOC_GETUUID") ae8c511757304 ("fs: add FS_IOC_GETFSSYSFSPATH") 73fa7547c70b3 ("vfs: add RWF_NOAPPEND flag for pwritev2") This should be used to beautify fs syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/fs.h include/uapi/linux/fs.h Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-3-namhyung@kernel.org --- tools/include/uapi/linux/fs.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index 48ad69f7722e1..45e4e64fd6643 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -64,6 +64,24 @@ struct fstrim_range { __u64 minlen; }; +/* + * We include a length field because some filesystems (vfat) have an identifier + * that we do want to expose as a UUID, but doesn't have the standard length. + * + * We use a fixed size buffer beacuse this interface will, by fiat, never + * support "UUIDs" longer than 16 bytes; we don't want to force all downstream + * users to have to deal with that. + */ +struct fsuuid2 { + __u8 len; + __u8 uuid[16]; +}; + +struct fs_sysfs_path { + __u8 len; + __u8 name[128]; +}; + /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */ #define FILE_DEDUPE_RANGE_SAME 0 #define FILE_DEDUPE_RANGE_DIFFERS 1 @@ -215,6 +233,13 @@ struct fsxattr { #define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) #define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX]) #define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX]) +/* Returns the external filesystem UUID, the same one blkid returns */ +#define FS_IOC_GETFSUUID _IOR(0x15, 0, struct fsuuid2) +/* + * Returns the path component under /sys/fs/ that refers to this filesystem; + * also /sys/kernel/debug/ for filesystems with debugfs exports + */ +#define FS_IOC_GETFSSYSFSPATH _IOR(0x15, 1, struct fs_sysfs_path) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) @@ -301,9 +326,12 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* per-IO negation of O_APPEND */ +#define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_NOAPPEND) /* Pagemap ioctl */ #define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) -- GitLab From bee3b820c66a6aae0e16d0ac47f9744446f33bff Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:14 -0700 Subject: [PATCH 782/989] tools/include: Sync uapi/linux/kvm.h and asm/kvm.h with the kernel sources To pick up the changes from: 6bda055d6258 ("KVM: define __KVM_HAVE_GUEST_DEBUG unconditionally") 5d9cb71642db ("KVM: arm64: move ARM-specific defines to uapi/asm/kvm.h") 71cd774ad2f9 ("KVM: s390: move s390-specific structs to uapi/asm/kvm.h") d750951c9ed7 ("KVM: powerpc: move powerpc-specific structs to uapi/asm/kvm.h") bcac0477277e ("KVM: x86: move x86-specific structs to uapi/asm/kvm.h") c0a411904e15 ("KVM: remove more traces of device assignment UAPI") f3c80061c0d3 ("KVM: SEV: fix compat ABI for KVM_MEMORY_ENCRYPT_OP") That should be used to beautify the KVM arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h diff -u tools/arch/powerpc/include/uapi/asm/kvm.h arch/powerpc/include/uapi/asm/kvm.h diff -u tools/arch/s390/include/uapi/asm/kvm.h arch/s390/include/uapi/asm/kvm.h diff -u tools/arch/arm64/include/uapi/asm/kvm.h arch/arm64/include/uapi/asm/kvm.h Cc: Paolo Bonzini Cc: kvm@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-4-namhyung@kernel.org --- tools/arch/arm64/include/uapi/asm/kvm.h | 15 +- tools/arch/powerpc/include/uapi/asm/kvm.h | 45 +- tools/arch/s390/include/uapi/asm/kvm.h | 315 +++++++++- tools/arch/x86/include/uapi/asm/kvm.h | 308 +++++++++- tools/include/uapi/linux/kvm.h | 689 +--------------------- 5 files changed, 672 insertions(+), 700 deletions(-) diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index 89d2fc872d9f5..964df31da9751 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -37,9 +37,7 @@ #include #include -#define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_IRQ_LINE -#define __KVM_HAVE_READONLY_MEM #define __KVM_HAVE_VCPU_EVENTS #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -76,11 +74,11 @@ struct kvm_regs { /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */ #define KVM_ARM_DEVICE_TYPE_SHIFT 0 -#define KVM_ARM_DEVICE_TYPE_MASK GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \ - KVM_ARM_DEVICE_TYPE_SHIFT) +#define KVM_ARM_DEVICE_TYPE_MASK __GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \ + KVM_ARM_DEVICE_TYPE_SHIFT) #define KVM_ARM_DEVICE_ID_SHIFT 16 -#define KVM_ARM_DEVICE_ID_MASK GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \ - KVM_ARM_DEVICE_ID_SHIFT) +#define KVM_ARM_DEVICE_ID_MASK __GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \ + KVM_ARM_DEVICE_ID_SHIFT) /* Supported device IDs */ #define KVM_ARM_DEVICE_VGIC_V2 0 @@ -162,6 +160,11 @@ struct kvm_sync_regs { __u64 device_irq_level; }; +/* Bits for run->s.regs.device_irq_level */ +#define KVM_ARM_DEV_EL1_VTIMER (1 << 0) +#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) +#define KVM_ARM_DEV_PMU (1 << 2) + /* * PMU filter structure. Describe a range of events with a particular * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER. diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index 9f18fa090f1f1..1691297a766a9 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -28,7 +28,6 @@ #define __KVM_HAVE_PPC_SMT #define __KVM_HAVE_IRQCHIP #define __KVM_HAVE_IRQ_LINE -#define __KVM_HAVE_GUEST_DEBUG /* Not always available, but if it is, this is the correct offset. */ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -733,4 +732,48 @@ struct kvm_ppc_xive_eq { #define KVM_XIVE_TIMA_PAGE_OFFSET 0 #define KVM_XIVE_ESB_PAGE_OFFSET 4 +/* for KVM_PPC_GET_PVINFO */ + +#define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) + +struct kvm_ppc_pvinfo { + /* out */ + __u32 flags; + __u32 hcall[4]; + __u8 pad[108]; +}; + +/* for KVM_PPC_GET_SMMU_INFO */ +#define KVM_PPC_PAGE_SIZES_MAX_SZ 8 + +struct kvm_ppc_one_page_size { + __u32 page_shift; /* Page shift (or 0) */ + __u32 pte_enc; /* Encoding in the HPTE (>>12) */ +}; + +struct kvm_ppc_one_seg_page_size { + __u32 page_shift; /* Base page shift of segment (or 0) */ + __u32 slb_enc; /* SLB encoding for BookS */ + struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; +}; + +#define KVM_PPC_PAGE_SIZES_REAL 0x00000001 +#define KVM_PPC_1T_SEGMENTS 0x00000002 +#define KVM_PPC_NO_HASH 0x00000004 + +struct kvm_ppc_smmu_info { + __u64 flags; + __u32 slb_size; + __u16 data_keys; /* # storage keys supported for data */ + __u16 instr_keys; /* # storage keys supported for instructions */ + struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; +}; + +/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index abe926d43cbe0..05eaf6db3ad4c 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -12,7 +12,320 @@ #include #define __KVM_S390 -#define __KVM_HAVE_GUEST_DEBUG + +struct kvm_s390_skeys { + __u64 start_gfn; + __u64 count; + __u64 skeydata_addr; + __u32 flags; + __u32 reserved[9]; +}; + +#define KVM_S390_CMMA_PEEK (1 << 0) + +/** + * kvm_s390_cmma_log - Used for CMMA migration. + * + * Used both for input and output. + * + * @start_gfn: Guest page number to start from. + * @count: Size of the result buffer. + * @flags: Control operation mode via KVM_S390_CMMA_* flags + * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty + * pages are still remaining. + * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set + * in the PGSTE. + * @values: Pointer to the values buffer. + * + * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls. + */ +struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; +}; + +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + +/* for KVM_S390_MEM_OP */ +struct kvm_s390_mem_op { + /* in */ + __u64 gaddr; /* the guest address */ + __u64 flags; /* flags */ + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ + union { + struct { + __u8 ar; /* the access register number */ + __u8 key; /* access key, ignored if flag unset */ + __u8 pad1[6]; /* ignored */ + __u64 old_addr; /* ignored if cmpxchg flag unset */ + }; + __u32 sida_offset; /* offset into the sida */ + __u8 reserved[32]; /* ignored */ + }; +}; +/* types for kvm_s390_mem_op->op */ +#define KVM_S390_MEMOP_LOGICAL_READ 0 +#define KVM_S390_MEMOP_LOGICAL_WRITE 1 +#define KVM_S390_MEMOP_SIDA_READ 2 +#define KVM_S390_MEMOP_SIDA_WRITE 3 +#define KVM_S390_MEMOP_ABSOLUTE_READ 4 +#define KVM_S390_MEMOP_ABSOLUTE_WRITE 5 +#define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG 6 + +/* flags for kvm_s390_mem_op->flags */ +#define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) +#define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) +#define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2) + +/* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */ +#define KVM_S390_MEMOP_EXTENSION_CAP_BASE (1 << 0) +#define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG (1 << 1) + +struct kvm_s390_psw { + __u64 mask; + __u64 addr; +}; + +/* valid values for type in kvm_s390_interrupt */ +#define KVM_S390_SIGP_STOP 0xfffe0000u +#define KVM_S390_PROGRAM_INT 0xfffe0001u +#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u +#define KVM_S390_RESTART 0xfffe0003u +#define KVM_S390_INT_PFAULT_INIT 0xfffe0004u +#define KVM_S390_INT_PFAULT_DONE 0xfffe0005u +#define KVM_S390_MCHK 0xfffe1000u +#define KVM_S390_INT_CLOCK_COMP 0xffff1004u +#define KVM_S390_INT_CPU_TIMER 0xffff1005u +#define KVM_S390_INT_VIRTIO 0xffff2603u +#define KVM_S390_INT_SERVICE 0xffff2401u +#define KVM_S390_INT_EMERGENCY 0xffff1201u +#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u +/* Anything below 0xfffe0000u is taken by INT_IO */ +#define KVM_S390_INT_IO(ai,cssid,ssid,schid) \ + (((schid)) | \ + ((ssid) << 16) | \ + ((cssid) << 18) | \ + ((ai) << 26)) +#define KVM_S390_INT_IO_MIN 0x00000000u +#define KVM_S390_INT_IO_MAX 0xfffdffffu +#define KVM_S390_INT_IO_AI_MASK 0x04000000u + + +struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; +}; + +struct kvm_s390_io_info { + __u16 subchannel_id; + __u16 subchannel_nr; + __u32 io_int_parm; + __u32 io_int_word; +}; + +struct kvm_s390_ext_info { + __u32 ext_params; + __u32 pad; + __u64 ext_params2; +}; + +struct kvm_s390_pgm_info { + __u64 trans_exc_code; + __u64 mon_code; + __u64 per_address; + __u32 data_exc_code; + __u16 code; + __u16 mon_class_nr; + __u8 per_code; + __u8 per_atmid; + __u8 exc_access_id; + __u8 per_access_id; + __u8 op_access_id; +#define KVM_S390_PGM_FLAGS_ILC_VALID 0x01 +#define KVM_S390_PGM_FLAGS_ILC_0 0x02 +#define KVM_S390_PGM_FLAGS_ILC_1 0x04 +#define KVM_S390_PGM_FLAGS_ILC_MASK 0x06 +#define KVM_S390_PGM_FLAGS_NO_REWIND 0x08 + __u8 flags; + __u8 pad[2]; +}; + +struct kvm_s390_prefix_info { + __u32 address; +}; + +struct kvm_s390_extcall_info { + __u16 code; +}; + +struct kvm_s390_emerg_info { + __u16 code; +}; + +#define KVM_S390_STOP_FLAG_STORE_STATUS 0x01 +struct kvm_s390_stop_info { + __u32 flags; +}; + +struct kvm_s390_mchk_info { + __u64 cr14; + __u64 mcic; + __u64 failing_storage_address; + __u32 ext_damage_code; + __u32 pad; + __u8 fixed_logout[16]; +}; + +struct kvm_s390_irq { + __u64 type; + union { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; + struct kvm_s390_mchk_info mchk; + char reserved[64]; + } u; +}; + +struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; /* will stay unused for compatibility reasons */ + __u32 len; + __u32 reserved[4]; /* will stay unused for compatibility reasons */ +}; + +struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; +}; + +struct kvm_s390_pv_sec_parm { + __u64 origin; + __u64 length; +}; + +struct kvm_s390_pv_unp { + __u64 addr; + __u64 size; + __u64 tweak; +}; + +enum pv_cmd_dmp_id { + KVM_PV_DUMP_INIT, + KVM_PV_DUMP_CONFIG_STOR_STATE, + KVM_PV_DUMP_COMPLETE, + KVM_PV_DUMP_CPU, +}; + +struct kvm_s390_pv_dmp { + __u64 subcmd; + __u64 buff_addr; + __u64 buff_len; + __u64 gaddr; /* For dump storage state */ + __u64 reserved[4]; +}; + +enum pv_cmd_info_id { + KVM_PV_INFO_VM, + KVM_PV_INFO_DUMP, +}; + +struct kvm_s390_pv_info_dump { + __u64 dump_cpu_buffer_len; + __u64 dump_config_mem_buffer_per_1m; + __u64 dump_config_finalize_len; +}; + +struct kvm_s390_pv_info_vm { + __u64 inst_calls_list[4]; + __u64 max_cpus; + __u64 max_guests; + __u64 max_guest_addr; + __u64 feature_indication; +}; + +struct kvm_s390_pv_info_header { + __u32 id; + __u32 len_max; + __u32 len_written; + __u32 reserved; +}; + +struct kvm_s390_pv_info { + struct kvm_s390_pv_info_header header; + union { + struct kvm_s390_pv_info_dump dump; + struct kvm_s390_pv_info_vm vm; + }; +}; + +enum pv_cmd_id { + KVM_PV_ENABLE, + KVM_PV_DISABLE, + KVM_PV_SET_SEC_PARMS, + KVM_PV_UNPACK, + KVM_PV_VERIFY, + KVM_PV_PREP_RESET, + KVM_PV_UNSHARE_ALL, + KVM_PV_INFO, + KVM_PV_DUMP, + KVM_PV_ASYNC_CLEANUP_PREPARE, + KVM_PV_ASYNC_CLEANUP_PERFORM, +}; + +struct kvm_pv_cmd { + __u32 cmd; /* Command to be executed */ + __u16 rc; /* Ultravisor return code */ + __u16 rrc; /* Ultravisor return reason code */ + __u64 data; /* Data or address */ + __u32 flags; /* flags for future extensions. Must be 0 for now */ + __u32 reserved[3]; +}; + +struct kvm_s390_zpci_op { + /* in */ + __u32 fh; /* target device */ + __u8 op; /* operation to perform */ + __u8 pad[3]; + union { + /* for KVM_S390_ZPCIOP_REG_AEN */ + struct { + __u64 ibv; /* Guest addr of interrupt bit vector */ + __u64 sb; /* Guest addr of summary bit */ + __u32 flags; + __u32 noi; /* Number of interrupts */ + __u8 isc; /* Guest interrupt subclass */ + __u8 sbo; /* Offset of guest summary bit vector */ + __u16 pad; + } reg_aen; + __u64 reserved[8]; + } u; +}; + +/* types for kvm_s390_zpci_op->op */ +#define KVM_S390_ZPCIOP_REG_AEN 0 +#define KVM_S390_ZPCIOP_DEREG_AEN 1 + +/* flags for kvm_s390_zpci_op->u.reg_aen.flags */ +#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) /* Device control API: s390-specific devices */ #define KVM_DEV_FLIC_GET_ALL_IRQS 1 diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index a448d0964fc06..ef11aa4cab425 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -7,6 +7,8 @@ * */ +#include +#include #include #include #include @@ -40,7 +42,6 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI -#define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_MSIX #define __KVM_HAVE_MCE #define __KVM_HAVE_PIT_STATE2 @@ -49,7 +50,6 @@ #define __KVM_HAVE_DEBUGREGS #define __KVM_HAVE_XSAVE #define __KVM_HAVE_XCRS -#define __KVM_HAVE_READONLY_MEM /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -526,9 +526,301 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 -#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0) +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS _BITUL(0) #define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS) +/* for KVM_CAP_MCE */ +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; + +/* for KVM_CAP_XEN_HVM */ +#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) +#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) +#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) +#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) +#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) +#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) +#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8) + +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; + +struct kvm_xen_hvm_attr { + __u16 type; + __u16 pad[3]; + union { + __u8 long_mode; + __u8 vector; + __u8 runstate_update_flag; + union { + __u64 gfn; +#define KVM_XEN_INVALID_GFN ((__u64)-1) + __u64 hva; + } shared_info; + struct { + __u32 send_port; + __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ + __u32 flags; +#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) +#define KVM_XEN_EVTCHN_UPDATE (1 << 1) +#define KVM_XEN_EVTCHN_RESET (1 << 2) + /* + * Events sent by the guest are either looped back to + * the guest itself (potentially on a different port#) + * or signalled via an eventfd. + */ + union { + struct { + __u32 port; + __u32 vcpu; + __u32 priority; + } port; + struct { + __u32 port; /* Zero for eventfd */ + __s32 fd; + } eventfd; + __u32 padding[4]; + } deliver; + } evtchn; + __u32 xen_version; + __u64 pad[8]; + } u; +}; + + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 +#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 +#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 +#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ +#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6 + +struct kvm_xen_vcpu_attr { + __u16 type; + __u16 pad[3]; + union { + __u64 gpa; +#define KVM_XEN_INVALID_GPA ((__u64)-1) + __u64 hva; + __u64 pad[8]; + struct { + __u64 state; + __u64 state_entry_time; + __u64 time_running; + __u64 time_runnable; + __u64 time_blocked; + __u64 time_offline; + } runstate; + __u32 vcpu_id; + struct { + __u32 port; + __u32 priority; + __u64 expires_ns; + } timer; + __u8 vector; + } u; +}; + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 +#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 +#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9 + +/* Secure Encrypted Virtualization command */ +enum sev_cmd_id { + /* Guest initialization commands */ + KVM_SEV_INIT = 0, + KVM_SEV_ES_INIT, + /* Guest launch commands */ + KVM_SEV_LAUNCH_START, + KVM_SEV_LAUNCH_UPDATE_DATA, + KVM_SEV_LAUNCH_UPDATE_VMSA, + KVM_SEV_LAUNCH_SECRET, + KVM_SEV_LAUNCH_MEASURE, + KVM_SEV_LAUNCH_FINISH, + /* Guest migration commands (outgoing) */ + KVM_SEV_SEND_START, + KVM_SEV_SEND_UPDATE_DATA, + KVM_SEV_SEND_UPDATE_VMSA, + KVM_SEV_SEND_FINISH, + /* Guest migration commands (incoming) */ + KVM_SEV_RECEIVE_START, + KVM_SEV_RECEIVE_UPDATE_DATA, + KVM_SEV_RECEIVE_UPDATE_VMSA, + KVM_SEV_RECEIVE_FINISH, + /* Guest status and debug commands */ + KVM_SEV_GUEST_STATUS, + KVM_SEV_DBG_DECRYPT, + KVM_SEV_DBG_ENCRYPT, + /* Guest certificates commands */ + KVM_SEV_CERT_EXPORT, + /* Attestation report */ + KVM_SEV_GET_ATTESTATION_REPORT, + /* Guest Migration Extension */ + KVM_SEV_SEND_CANCEL, + + KVM_SEV_NR_MAX, +}; + +struct kvm_sev_cmd { + __u32 id; + __u32 pad0; + __u64 data; + __u32 error; + __u32 sev_fd; +}; + +struct kvm_sev_launch_start { + __u32 handle; + __u32 policy; + __u64 dh_uaddr; + __u32 dh_len; + __u32 pad0; + __u64 session_uaddr; + __u32 session_len; + __u32 pad1; +}; + +struct kvm_sev_launch_update_data { + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + + +struct kvm_sev_launch_secret { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_launch_measure { + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_guest_status { + __u32 handle; + __u32 policy; + __u32 state; +}; + +struct kvm_sev_dbg { + __u64 src_uaddr; + __u64 dst_uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_attestation_report { + __u8 mnonce[16]; + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_send_start { + __u32 policy; + __u32 pad0; + __u64 pdh_cert_uaddr; + __u32 pdh_cert_len; + __u32 pad1; + __u64 plat_certs_uaddr; + __u32 plat_certs_len; + __u32 pad2; + __u64 amd_certs_uaddr; + __u32 amd_certs_len; + __u32 pad3; + __u64 session_uaddr; + __u32 session_len; + __u32 pad4; +}; + +struct kvm_sev_send_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_receive_start { + __u32 handle; + __u32 policy; + __u64 pdh_uaddr; + __u32 pdh_len; + __u32 pad0; + __u64 session_uaddr; + __u32 session_len; + __u32 pad1; +}; + +struct kvm_sev_receive_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +struct kvm_hyperv_eventfd { + __u32 conn_id; + __s32 fd; + __u32 flags; + __u32 padding[3]; +}; + +#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff +#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) + /* * Masked event layout. * Bits Description @@ -549,10 +841,10 @@ struct kvm_pmu_event_filter { ((__u64)(!!(exclude)) << 55)) #define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \ - (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32)) -#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56)) -#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8)) -#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55)) + (__GENMASK_ULL(7, 0) | __GENMASK_ULL(35, 32)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (__GENMASK_ULL(63, 56)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (__GENMASK_ULL(15, 8)) +#define KVM_PMU_MASKED_ENTRY_EXCLUDE (_BITULL(55)) #define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56) /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ @@ -560,7 +852,7 @@ struct kvm_pmu_event_filter { #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ /* x86-specific KVM_EXIT_HYPERCALL flags. */ -#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) +#define KVM_EXIT_HYPERCALL_LONG_MODE _BITULL(0) #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index c3308536482bd..2190adbe30027 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -16,6 +16,11 @@ #define KVM_API_VERSION 12 +/* + * Backwards-compatible definitions. + */ +#define __KVM_HAVE_GUEST_DEBUG + /* for KVM_SET_USER_MEMORY_REGION */ struct kvm_userspace_memory_region { __u32 slot; @@ -85,43 +90,6 @@ struct kvm_pit_config { #define KVM_PIT_SPEAKER_DUMMY 1 -struct kvm_s390_skeys { - __u64 start_gfn; - __u64 count; - __u64 skeydata_addr; - __u32 flags; - __u32 reserved[9]; -}; - -#define KVM_S390_CMMA_PEEK (1 << 0) - -/** - * kvm_s390_cmma_log - Used for CMMA migration. - * - * Used both for input and output. - * - * @start_gfn: Guest page number to start from. - * @count: Size of the result buffer. - * @flags: Control operation mode via KVM_S390_CMMA_* flags - * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty - * pages are still remaining. - * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set - * in the PGSTE. - * @values: Pointer to the values buffer. - * - * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls. - */ -struct kvm_s390_cmma_log { - __u64 start_gfn; - __u32 count; - __u32 flags; - union { - __u64 remaining; - __u64 mask; - }; - __u64 values; -}; - struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 @@ -315,11 +283,6 @@ struct kvm_run { __u32 ipb; } s390_sieic; /* KVM_EXIT_S390_RESET */ -#define KVM_S390_RESET_POR 1 -#define KVM_S390_RESET_CLEAR 2 -#define KVM_S390_RESET_SUBSYSTEM 4 -#define KVM_S390_RESET_CPU_INIT 8 -#define KVM_S390_RESET_IPL 16 __u64 s390_reset_flags; /* KVM_EXIT_S390_UCONTROL */ struct { @@ -536,43 +499,6 @@ struct kvm_translation { __u8 pad[5]; }; -/* for KVM_S390_MEM_OP */ -struct kvm_s390_mem_op { - /* in */ - __u64 gaddr; /* the guest address */ - __u64 flags; /* flags */ - __u32 size; /* amount of bytes */ - __u32 op; /* type of operation */ - __u64 buf; /* buffer in userspace */ - union { - struct { - __u8 ar; /* the access register number */ - __u8 key; /* access key, ignored if flag unset */ - __u8 pad1[6]; /* ignored */ - __u64 old_addr; /* ignored if cmpxchg flag unset */ - }; - __u32 sida_offset; /* offset into the sida */ - __u8 reserved[32]; /* ignored */ - }; -}; -/* types for kvm_s390_mem_op->op */ -#define KVM_S390_MEMOP_LOGICAL_READ 0 -#define KVM_S390_MEMOP_LOGICAL_WRITE 1 -#define KVM_S390_MEMOP_SIDA_READ 2 -#define KVM_S390_MEMOP_SIDA_WRITE 3 -#define KVM_S390_MEMOP_ABSOLUTE_READ 4 -#define KVM_S390_MEMOP_ABSOLUTE_WRITE 5 -#define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG 6 - -/* flags for kvm_s390_mem_op->flags */ -#define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) -#define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) -#define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2) - -/* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */ -#define KVM_S390_MEMOP_EXTENSION_CAP_BASE (1 << 0) -#define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG (1 << 1) - /* for KVM_INTERRUPT */ struct kvm_interrupt { /* in */ @@ -637,124 +563,6 @@ struct kvm_mp_state { __u32 mp_state; }; -struct kvm_s390_psw { - __u64 mask; - __u64 addr; -}; - -/* valid values for type in kvm_s390_interrupt */ -#define KVM_S390_SIGP_STOP 0xfffe0000u -#define KVM_S390_PROGRAM_INT 0xfffe0001u -#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u -#define KVM_S390_RESTART 0xfffe0003u -#define KVM_S390_INT_PFAULT_INIT 0xfffe0004u -#define KVM_S390_INT_PFAULT_DONE 0xfffe0005u -#define KVM_S390_MCHK 0xfffe1000u -#define KVM_S390_INT_CLOCK_COMP 0xffff1004u -#define KVM_S390_INT_CPU_TIMER 0xffff1005u -#define KVM_S390_INT_VIRTIO 0xffff2603u -#define KVM_S390_INT_SERVICE 0xffff2401u -#define KVM_S390_INT_EMERGENCY 0xffff1201u -#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u -/* Anything below 0xfffe0000u is taken by INT_IO */ -#define KVM_S390_INT_IO(ai,cssid,ssid,schid) \ - (((schid)) | \ - ((ssid) << 16) | \ - ((cssid) << 18) | \ - ((ai) << 26)) -#define KVM_S390_INT_IO_MIN 0x00000000u -#define KVM_S390_INT_IO_MAX 0xfffdffffu -#define KVM_S390_INT_IO_AI_MASK 0x04000000u - - -struct kvm_s390_interrupt { - __u32 type; - __u32 parm; - __u64 parm64; -}; - -struct kvm_s390_io_info { - __u16 subchannel_id; - __u16 subchannel_nr; - __u32 io_int_parm; - __u32 io_int_word; -}; - -struct kvm_s390_ext_info { - __u32 ext_params; - __u32 pad; - __u64 ext_params2; -}; - -struct kvm_s390_pgm_info { - __u64 trans_exc_code; - __u64 mon_code; - __u64 per_address; - __u32 data_exc_code; - __u16 code; - __u16 mon_class_nr; - __u8 per_code; - __u8 per_atmid; - __u8 exc_access_id; - __u8 per_access_id; - __u8 op_access_id; -#define KVM_S390_PGM_FLAGS_ILC_VALID 0x01 -#define KVM_S390_PGM_FLAGS_ILC_0 0x02 -#define KVM_S390_PGM_FLAGS_ILC_1 0x04 -#define KVM_S390_PGM_FLAGS_ILC_MASK 0x06 -#define KVM_S390_PGM_FLAGS_NO_REWIND 0x08 - __u8 flags; - __u8 pad[2]; -}; - -struct kvm_s390_prefix_info { - __u32 address; -}; - -struct kvm_s390_extcall_info { - __u16 code; -}; - -struct kvm_s390_emerg_info { - __u16 code; -}; - -#define KVM_S390_STOP_FLAG_STORE_STATUS 0x01 -struct kvm_s390_stop_info { - __u32 flags; -}; - -struct kvm_s390_mchk_info { - __u64 cr14; - __u64 mcic; - __u64 failing_storage_address; - __u32 ext_damage_code; - __u32 pad; - __u8 fixed_logout[16]; -}; - -struct kvm_s390_irq { - __u64 type; - union { - struct kvm_s390_io_info io; - struct kvm_s390_ext_info ext; - struct kvm_s390_pgm_info pgm; - struct kvm_s390_emerg_info emerg; - struct kvm_s390_extcall_info extcall; - struct kvm_s390_prefix_info prefix; - struct kvm_s390_stop_info stop; - struct kvm_s390_mchk_info mchk; - char reserved[64]; - } u; -}; - -struct kvm_s390_irq_state { - __u64 buf; - __u32 flags; /* will stay unused for compatibility reasons */ - __u32 len; - __u32 reserved[4]; /* will stay unused for compatibility reasons */ -}; - /* for KVM_SET_GUEST_DEBUG */ #define KVM_GUESTDBG_ENABLE 0x00000001 @@ -810,50 +618,6 @@ struct kvm_enable_cap { __u8 pad[64]; }; -/* for KVM_PPC_GET_PVINFO */ - -#define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) - -struct kvm_ppc_pvinfo { - /* out */ - __u32 flags; - __u32 hcall[4]; - __u8 pad[108]; -}; - -/* for KVM_PPC_GET_SMMU_INFO */ -#define KVM_PPC_PAGE_SIZES_MAX_SZ 8 - -struct kvm_ppc_one_page_size { - __u32 page_shift; /* Page shift (or 0) */ - __u32 pte_enc; /* Encoding in the HPTE (>>12) */ -}; - -struct kvm_ppc_one_seg_page_size { - __u32 page_shift; /* Base page shift of segment (or 0) */ - __u32 slb_enc; /* SLB encoding for BookS */ - struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; -}; - -#define KVM_PPC_PAGE_SIZES_REAL 0x00000001 -#define KVM_PPC_1T_SEGMENTS 0x00000002 -#define KVM_PPC_NO_HASH 0x00000004 - -struct kvm_ppc_smmu_info { - __u64 flags; - __u32 slb_size; - __u16 data_keys; /* # storage keys supported for data */ - __u16 instr_keys; /* # storage keys supported for instructions */ - struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; -}; - -/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ -struct kvm_ppc_resize_hpt { - __u64 flags; - __u32 shift; - __u32 pad; -}; - #define KVMIO 0xAE /* machine type bits, to be used as argument to KVM_CREATE_VM */ @@ -923,9 +687,7 @@ struct kvm_ppc_resize_hpt { /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21 #define KVM_CAP_USER_NMI 22 -#ifdef __KVM_HAVE_GUEST_DEBUG #define KVM_CAP_SET_GUEST_DEBUG 23 -#endif #ifdef __KVM_HAVE_PIT #define KVM_CAP_REINJECT_CONTROL 24 #endif @@ -1156,8 +918,6 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 -#ifdef KVM_CAP_IRQ_ROUTING - struct kvm_irq_routing_irqchip { __u32 irqchip; __u32 pin; @@ -1222,42 +982,6 @@ struct kvm_irq_routing { struct kvm_irq_routing_entry entries[]; }; -#endif - -#ifdef KVM_CAP_MCE -/* x86 MCE */ -struct kvm_x86_mce { - __u64 status; - __u64 addr; - __u64 misc; - __u64 mcg_status; - __u8 bank; - __u8 pad1[7]; - __u64 pad2[3]; -}; -#endif - -#ifdef KVM_CAP_XEN_HVM -#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) -#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) -#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) -#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) -#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) -#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) -#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) -#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) - -struct kvm_xen_hvm_config { - __u32 flags; - __u32 msr; - __u64 blob_addr_32; - __u64 blob_addr_64; - __u8 blob_size_32; - __u8 blob_size_64; - __u8 pad2[30]; -}; -#endif - #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) /* * Available with KVM_CAP_IRQFD_RESAMPLE @@ -1442,11 +1166,6 @@ struct kvm_vfio_spapr_tce { struct kvm_userspace_memory_region2) /* enable ucontrol for s390 */ -struct kvm_s390_ucas_mapping { - __u64 user_addr; - __u64 vcpu_addr; - __u64 length; -}; #define KVM_S390_UCAS_MAP _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping) #define KVM_S390_UCAS_UNMAP _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping) #define KVM_S390_VCPU_FAULT _IOW(KVMIO, 0x52, unsigned long) @@ -1641,89 +1360,6 @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) -struct kvm_s390_pv_sec_parm { - __u64 origin; - __u64 length; -}; - -struct kvm_s390_pv_unp { - __u64 addr; - __u64 size; - __u64 tweak; -}; - -enum pv_cmd_dmp_id { - KVM_PV_DUMP_INIT, - KVM_PV_DUMP_CONFIG_STOR_STATE, - KVM_PV_DUMP_COMPLETE, - KVM_PV_DUMP_CPU, -}; - -struct kvm_s390_pv_dmp { - __u64 subcmd; - __u64 buff_addr; - __u64 buff_len; - __u64 gaddr; /* For dump storage state */ - __u64 reserved[4]; -}; - -enum pv_cmd_info_id { - KVM_PV_INFO_VM, - KVM_PV_INFO_DUMP, -}; - -struct kvm_s390_pv_info_dump { - __u64 dump_cpu_buffer_len; - __u64 dump_config_mem_buffer_per_1m; - __u64 dump_config_finalize_len; -}; - -struct kvm_s390_pv_info_vm { - __u64 inst_calls_list[4]; - __u64 max_cpus; - __u64 max_guests; - __u64 max_guest_addr; - __u64 feature_indication; -}; - -struct kvm_s390_pv_info_header { - __u32 id; - __u32 len_max; - __u32 len_written; - __u32 reserved; -}; - -struct kvm_s390_pv_info { - struct kvm_s390_pv_info_header header; - union { - struct kvm_s390_pv_info_dump dump; - struct kvm_s390_pv_info_vm vm; - }; -}; - -enum pv_cmd_id { - KVM_PV_ENABLE, - KVM_PV_DISABLE, - KVM_PV_SET_SEC_PARMS, - KVM_PV_UNPACK, - KVM_PV_VERIFY, - KVM_PV_PREP_RESET, - KVM_PV_UNSHARE_ALL, - KVM_PV_INFO, - KVM_PV_DUMP, - KVM_PV_ASYNC_CLEANUP_PREPARE, - KVM_PV_ASYNC_CLEANUP_PERFORM, -}; - -struct kvm_pv_cmd { - __u32 cmd; /* Command to be executed */ - __u16 rc; /* Ultravisor return code */ - __u16 rrc; /* Ultravisor return reason code */ - __u64 data; /* Data or address */ - __u32 flags; /* flags for future extensions. Must be 0 for now */ - __u32 reserved[3]; -}; - /* Available with KVM_CAP_S390_PROTECTED */ #define KVM_S390_PV_COMMAND _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd) @@ -1737,58 +1373,6 @@ struct kvm_pv_cmd { #define KVM_XEN_HVM_GET_ATTR _IOWR(KVMIO, 0xc8, struct kvm_xen_hvm_attr) #define KVM_XEN_HVM_SET_ATTR _IOW(KVMIO, 0xc9, struct kvm_xen_hvm_attr) -struct kvm_xen_hvm_attr { - __u16 type; - __u16 pad[3]; - union { - __u8 long_mode; - __u8 vector; - __u8 runstate_update_flag; - struct { - __u64 gfn; -#define KVM_XEN_INVALID_GFN ((__u64)-1) - } shared_info; - struct { - __u32 send_port; - __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ - __u32 flags; -#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) -#define KVM_XEN_EVTCHN_UPDATE (1 << 1) -#define KVM_XEN_EVTCHN_RESET (1 << 2) - /* - * Events sent by the guest are either looped back to - * the guest itself (potentially on a different port#) - * or signalled via an eventfd. - */ - union { - struct { - __u32 port; - __u32 vcpu; - __u32 priority; - } port; - struct { - __u32 port; /* Zero for eventfd */ - __s32 fd; - } eventfd; - __u32 padding[4]; - } deliver; - } evtchn; - __u32 xen_version; - __u64 pad[8]; - } u; -}; - - -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ -#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 -#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 -#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ -#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 -#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ -#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 - /* Per-vCPU Xen attributes */ #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) #define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr) @@ -1799,242 +1383,6 @@ struct kvm_xen_hvm_attr { #define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2) #define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, struct kvm_sregs2) -struct kvm_xen_vcpu_attr { - __u16 type; - __u16 pad[3]; - union { - __u64 gpa; -#define KVM_XEN_INVALID_GPA ((__u64)-1) - __u64 pad[8]; - struct { - __u64 state; - __u64 state_entry_time; - __u64 time_running; - __u64 time_runnable; - __u64 time_blocked; - __u64 time_offline; - } runstate; - __u32 vcpu_id; - struct { - __u32 port; - __u32 priority; - __u64 expires_ns; - } timer; - __u8 vector; - } u; -}; - -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ -#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 -#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ -#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 -#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 -#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 - -/* Secure Encrypted Virtualization command */ -enum sev_cmd_id { - /* Guest initialization commands */ - KVM_SEV_INIT = 0, - KVM_SEV_ES_INIT, - /* Guest launch commands */ - KVM_SEV_LAUNCH_START, - KVM_SEV_LAUNCH_UPDATE_DATA, - KVM_SEV_LAUNCH_UPDATE_VMSA, - KVM_SEV_LAUNCH_SECRET, - KVM_SEV_LAUNCH_MEASURE, - KVM_SEV_LAUNCH_FINISH, - /* Guest migration commands (outgoing) */ - KVM_SEV_SEND_START, - KVM_SEV_SEND_UPDATE_DATA, - KVM_SEV_SEND_UPDATE_VMSA, - KVM_SEV_SEND_FINISH, - /* Guest migration commands (incoming) */ - KVM_SEV_RECEIVE_START, - KVM_SEV_RECEIVE_UPDATE_DATA, - KVM_SEV_RECEIVE_UPDATE_VMSA, - KVM_SEV_RECEIVE_FINISH, - /* Guest status and debug commands */ - KVM_SEV_GUEST_STATUS, - KVM_SEV_DBG_DECRYPT, - KVM_SEV_DBG_ENCRYPT, - /* Guest certificates commands */ - KVM_SEV_CERT_EXPORT, - /* Attestation report */ - KVM_SEV_GET_ATTESTATION_REPORT, - /* Guest Migration Extension */ - KVM_SEV_SEND_CANCEL, - - KVM_SEV_NR_MAX, -}; - -struct kvm_sev_cmd { - __u32 id; - __u64 data; - __u32 error; - __u32 sev_fd; -}; - -struct kvm_sev_launch_start { - __u32 handle; - __u32 policy; - __u64 dh_uaddr; - __u32 dh_len; - __u64 session_uaddr; - __u32 session_len; -}; - -struct kvm_sev_launch_update_data { - __u64 uaddr; - __u32 len; -}; - - -struct kvm_sev_launch_secret { - __u64 hdr_uaddr; - __u32 hdr_len; - __u64 guest_uaddr; - __u32 guest_len; - __u64 trans_uaddr; - __u32 trans_len; -}; - -struct kvm_sev_launch_measure { - __u64 uaddr; - __u32 len; -}; - -struct kvm_sev_guest_status { - __u32 handle; - __u32 policy; - __u32 state; -}; - -struct kvm_sev_dbg { - __u64 src_uaddr; - __u64 dst_uaddr; - __u32 len; -}; - -struct kvm_sev_attestation_report { - __u8 mnonce[16]; - __u64 uaddr; - __u32 len; -}; - -struct kvm_sev_send_start { - __u32 policy; - __u64 pdh_cert_uaddr; - __u32 pdh_cert_len; - __u64 plat_certs_uaddr; - __u32 plat_certs_len; - __u64 amd_certs_uaddr; - __u32 amd_certs_len; - __u64 session_uaddr; - __u32 session_len; -}; - -struct kvm_sev_send_update_data { - __u64 hdr_uaddr; - __u32 hdr_len; - __u64 guest_uaddr; - __u32 guest_len; - __u64 trans_uaddr; - __u32 trans_len; -}; - -struct kvm_sev_receive_start { - __u32 handle; - __u32 policy; - __u64 pdh_uaddr; - __u32 pdh_len; - __u64 session_uaddr; - __u32 session_len; -}; - -struct kvm_sev_receive_update_data { - __u64 hdr_uaddr; - __u32 hdr_len; - __u64 guest_uaddr; - __u32 guest_len; - __u64 trans_uaddr; - __u32 trans_len; -}; - -#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) -#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) - -struct kvm_assigned_pci_dev { - __u32 assigned_dev_id; - __u32 busnr; - __u32 devfn; - __u32 flags; - __u32 segnr; - union { - __u32 reserved[11]; - }; -}; - -#define KVM_DEV_IRQ_HOST_INTX (1 << 0) -#define KVM_DEV_IRQ_HOST_MSI (1 << 1) -#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) - -#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) -#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) -#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) - -#define KVM_DEV_IRQ_HOST_MASK 0x00ff -#define KVM_DEV_IRQ_GUEST_MASK 0xff00 - -struct kvm_assigned_irq { - __u32 assigned_dev_id; - __u32 host_irq; /* ignored (legacy field) */ - __u32 guest_irq; - __u32 flags; - union { - __u32 reserved[12]; - }; -}; - -struct kvm_assigned_msix_nr { - __u32 assigned_dev_id; - __u16 entry_nr; - __u16 padding; -}; - -#define KVM_MAX_MSIX_PER_DEV 256 -struct kvm_assigned_msix_entry { - __u32 assigned_dev_id; - __u32 gsi; - __u16 entry; /* The index of entry in the MSI-X table */ - __u16 padding[3]; -}; - -#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) -#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) - -/* Available with KVM_CAP_ARM_USER_IRQ */ - -/* Bits for run->s.regs.device_irq_level */ -#define KVM_ARM_DEV_EL1_VTIMER (1 << 0) -#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) -#define KVM_ARM_DEV_PMU (1 << 2) - -struct kvm_hyperv_eventfd { - __u32 conn_id; - __s32 fd; - __u32 flags; - __u32 padding[3]; -}; - -#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff -#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) - #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) #define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) @@ -2180,33 +1528,6 @@ struct kvm_stats_desc { /* Available with KVM_CAP_S390_ZPCI_OP */ #define KVM_S390_ZPCI_OP _IOW(KVMIO, 0xd1, struct kvm_s390_zpci_op) -struct kvm_s390_zpci_op { - /* in */ - __u32 fh; /* target device */ - __u8 op; /* operation to perform */ - __u8 pad[3]; - union { - /* for KVM_S390_ZPCIOP_REG_AEN */ - struct { - __u64 ibv; /* Guest addr of interrupt bit vector */ - __u64 sb; /* Guest addr of summary bit */ - __u32 flags; - __u32 noi; /* Number of interrupts */ - __u8 isc; /* Guest interrupt subclass */ - __u8 sbo; /* Offset of guest summary bit vector */ - __u16 pad; - } reg_aen; - __u64 reserved[8]; - } u; -}; - -/* types for kvm_s390_zpci_op->op */ -#define KVM_S390_ZPCIOP_REG_AEN 0 -#define KVM_S390_ZPCIOP_DEREG_AEN 1 - -/* flags for kvm_s390_zpci_op->u.reg_aen.flags */ -#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) - /* Available with KVM_CAP_MEMORY_ATTRIBUTES */ #define KVM_SET_MEMORY_ATTRIBUTES _IOW(KVMIO, 0xd2, struct kvm_memory_attributes) -- GitLab From b7ce17f257da17a4163da82e0fb7726c2de85da7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:15 -0700 Subject: [PATCH 783/989] tools/include: Sync uapi/sound/asound.h with the kernel sources To pick up the changes from: 85df6b5a6658 ("ALSA: pcm: clarify and fix default msbits value for all formats") This should be used to beautify sound syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/sound/asound.h include/uapi/sound/asound.h Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: linux-sound@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-5-namhyung@kernel.org --- tools/include/uapi/sound/asound.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index d5b9cfbd9ceac..628d46a0da92e 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -142,7 +142,7 @@ struct snd_hwdep_dsp_image { * * *****************************************************************************/ -#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 16) +#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 17) typedef unsigned long snd_pcm_uframes_t; typedef signed long snd_pcm_sframes_t; @@ -416,7 +416,7 @@ struct snd_pcm_hw_params { unsigned int rmask; /* W: requested masks */ unsigned int cmask; /* R: changed masks */ unsigned int info; /* R: Info flags for returned setup */ - unsigned int msbits; /* R: used most significant bits */ + unsigned int msbits; /* R: used most significant bits (in sample bit-width) */ unsigned int rate_num; /* R: rate numerator */ unsigned int rate_den; /* R: rate denominator */ snd_pcm_uframes_t fifo_size; /* R: chip FIFO size in frames */ -- GitLab From 58e1b92df491c35abad7ddd2e393b89244e16bd5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:16 -0700 Subject: [PATCH 784/989] tools/include: Sync x86 CPU feature headers with the kernel sources To pick up the changes from: 598c2fafc06f ("perf/x86/amd/lbr: Use freeze based on availability") 7f274e609f3d ("x86/cpufeatures: Add new word for scattered features") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/disabled-features.h arch/x86/include/asm/disabled-features.h diff -u tools/arch/x86/include/asm/required-features.h arch/x86/include/asm/required-features.h diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-6-namhyung@kernel.org --- tools/arch/x86/include/asm/cpufeatures.h | 17 ++++++++++++----- tools/arch/x86/include/asm/disabled-features.h | 11 +++++++++-- tools/arch/x86/include/asm/required-features.h | 3 ++- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 25160d26764b5..a38f8f9ba6572 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 21 /* N 32-bit words worth of info */ +#define NCAPINTS 22 /* N 32-bit words worth of info */ #define NBUGINTS 2 /* N 32-bit bug flags */ /* @@ -81,10 +81,8 @@ #define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ #define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ - -/* CPU types for specific tunings: */ #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */ +#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* "" CPU based on Zen5 microarchitecture */ #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ @@ -97,7 +95,7 @@ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ #define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ -/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */ +#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ @@ -461,6 +459,14 @@ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */ #define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */ +/* + * Extended auxiliary flags: Linux defined - for features scattered in various + * CPUID levels like 0x80000022, etc. + * + * Reuse free bits when adding new feature flags! + */ +#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ + /* * BUG word(s) */ @@ -508,4 +514,5 @@ /* BUG word 2 */ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ +#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 1f23960d2b06e..c492bdc97b059 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -123,6 +123,12 @@ # define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) #endif +#ifdef CONFIG_KVM_AMD_SEV +#define DISABLE_SEV_SNP 0 +#else +#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -147,8 +153,9 @@ DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 (DISABLE_IBT) -#define DISABLED_MASK19 0 +#define DISABLED_MASK19 (DISABLE_SEV_SNP) #define DISABLED_MASK20 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define DISABLED_MASK21 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h index 7ba1726b71c7b..e9187ddd3d1fd 100644 --- a/tools/arch/x86/include/asm/required-features.h +++ b/tools/arch/x86/include/asm/required-features.h @@ -99,6 +99,7 @@ #define REQUIRED_MASK18 0 #define REQUIRED_MASK19 0 #define REQUIRED_MASK20 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define REQUIRED_MASK21 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ -- GitLab From 978f2a60dd5ca6c25dfd5e24e7191b16af0ec429 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:17 -0700 Subject: [PATCH 785/989] tools/include: Sync x86 asm/irq_vectors.h with the kernel sources To pick up the changes from: 0cbca1bf44a0 ("x86: irq: unconditionally define KVM interrupt vectors") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/irq_vectors.h arch/x86/include/asm/irq_vectors.h Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-7-namhyung@kernel.org --- tools/arch/x86/include/asm/irq_vectors.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/arch/x86/include/asm/irq_vectors.h b/tools/arch/x86/include/asm/irq_vectors.h index 3f73ac3ed3a07..d18bfb238f660 100644 --- a/tools/arch/x86/include/asm/irq_vectors.h +++ b/tools/arch/x86/include/asm/irq_vectors.h @@ -84,11 +84,9 @@ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 /* Vector for KVM to deliver posted interrupt IPI */ -#if IS_ENABLED(CONFIG_KVM) #define POSTED_INTR_VECTOR 0xf2 #define POSTED_INTR_WAKEUP_VECTOR 0xf1 #define POSTED_INTR_NESTED_VECTOR 0xf0 -#endif #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef -- GitLab From c781a72f9ddd09baddde9df1f59955c0d6ea4944 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:18 -0700 Subject: [PATCH 786/989] tools/include: Sync x86 asm/msr-index.h with the kernel sources To pick up the changes from: 8076fcde016c ("x86/rfds: Mitigate Register File Data Sampling (RFDS)") d7b69b590bc9 ("x86/sev: Dump SEV_STATUS") cd6df3f378f6 ("x86/cpu: Add MSR numbers for FRED configuration") 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-8-namhyung@kernel.org --- tools/arch/x86/include/asm/msr-index.h | 74 ++++++++++++++++++-------- 1 file changed, 51 insertions(+), 23 deletions(-) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 1f9dc9bd13eb7..05956bd8bacf5 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -176,6 +176,14 @@ * CPU is not vulnerable to Gather * Data Sampling (GDS). */ +#define ARCH_CAP_RFDS_NO BIT(27) /* + * Not susceptible to Register + * File Data Sampling. + */ +#define ARCH_CAP_RFDS_CLEAR BIT(28) /* + * VERW clears CPU Register + * File. + */ #define ARCH_CAP_XAPIC_DISABLE BIT(21) /* * IA32_XAPIC_DISABLE_STATUS MSR @@ -605,34 +613,47 @@ #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 -#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 -#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) +#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) - -/* SNP feature bits enabled by the hypervisor */ -#define MSR_AMD64_SNP_VTOM BIT_ULL(3) -#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(4) -#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(5) -#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(6) -#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(7) -#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(8) -#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(9) -#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(10) -#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(11) -#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(12) -#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(14) -#define MSR_AMD64_SNP_VMSA_REG_PROTECTION BIT_ULL(16) -#define MSR_AMD64_SNP_SMT_PROTECTION BIT_ULL(17) - -/* SNP feature bits reserved for future use. */ -#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) -#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) -#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, 18) +#define MSR_AMD64_SNP_VTOM_BIT 3 +#define MSR_AMD64_SNP_VTOM BIT_ULL(MSR_AMD64_SNP_VTOM_BIT) +#define MSR_AMD64_SNP_REFLECT_VC_BIT 4 +#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(MSR_AMD64_SNP_REFLECT_VC_BIT) +#define MSR_AMD64_SNP_RESTRICTED_INJ_BIT 5 +#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(MSR_AMD64_SNP_RESTRICTED_INJ_BIT) +#define MSR_AMD64_SNP_ALT_INJ_BIT 6 +#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(MSR_AMD64_SNP_ALT_INJ_BIT) +#define MSR_AMD64_SNP_DEBUG_SWAP_BIT 7 +#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(MSR_AMD64_SNP_DEBUG_SWAP_BIT) +#define MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT 8 +#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT) +#define MSR_AMD64_SNP_BTB_ISOLATION_BIT 9 +#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(MSR_AMD64_SNP_BTB_ISOLATION_BIT) +#define MSR_AMD64_SNP_VMPL_SSS_BIT 10 +#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(MSR_AMD64_SNP_VMPL_SSS_BIT) +#define MSR_AMD64_SNP_SECURE_TSC_BIT 11 +#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(MSR_AMD64_SNP_SECURE_TSC_BIT) +#define MSR_AMD64_SNP_VMGEXIT_PARAM_BIT 12 +#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(MSR_AMD64_SNP_VMGEXIT_PARAM_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) +#define MSR_AMD64_SNP_IBS_VIRT_BIT 14 +#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(MSR_AMD64_SNP_IBS_VIRT_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) +#define MSR_AMD64_SNP_VMSA_REG_PROT_BIT 16 +#define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) +#define MSR_AMD64_SNP_SMT_PROT_BIT 17 +#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) +#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f +#define MSR_AMD64_RMP_BASE 0xc0010132 +#define MSR_AMD64_RMP_END 0xc0010133 + /* AMD Collaborative Processor Performance Control MSRs */ #define MSR_AMD_CPPC_CAP1 0xc00102b0 #define MSR_AMD_CPPC_ENABLE 0xc00102b1 @@ -719,8 +740,15 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_AMD64_SYSCFG 0xc0010010 -#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 #define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG_SNP_EN_BIT 24 +#define MSR_AMD64_SYSCFG_SNP_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_EN_BIT) +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT 25 +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT) +#define MSR_AMD64_SYSCFG_MFDM_BIT 19 +#define MSR_AMD64_SYSCFG_MFDM BIT_ULL(MSR_AMD64_SYSCFG_MFDM_BIT) + #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 -- GitLab From 99e4e1174acd7f5a942d37e1ac6c115f870d5975 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:19 -0700 Subject: [PATCH 787/989] tools/include: Sync asm-generic/bitops/fls.h with the kernel sources To pick up the changes from: cb4ede926134 ("riscv: Avoid code duplication with generic bitops implementation") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/asm-generic/bitops/__fls.h include/asm-generic/bitops/__fls.h diff -u tools/include/asm-generic/bitops/fls.h include/asm-generic/bitops/fls.h Cc: Arnd Bergmann Cc: Geert Uytterhoeven Cc: Palmer Dabbelt Cc: linux-arch@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-9-namhyung@kernel.org --- tools/include/asm-generic/bitops/__fls.h | 8 ++++++-- tools/include/asm-generic/bitops/fls.h | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/include/asm-generic/bitops/__fls.h b/tools/include/asm-generic/bitops/__fls.h index 03f721a8a2b19..54ccccf96e21e 100644 --- a/tools/include/asm-generic/bitops/__fls.h +++ b/tools/include/asm-generic/bitops/__fls.h @@ -5,12 +5,12 @@ #include /** - * __fls - find last (most-significant) set bit in a long word + * generic___fls - find last (most-significant) set bit in a long word * @word: the word to search * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __fls(unsigned long word) +static __always_inline unsigned long generic___fls(unsigned long word) { int num = BITS_PER_LONG - 1; @@ -41,4 +41,8 @@ static __always_inline unsigned long __fls(unsigned long word) return num; } +#ifndef __HAVE_ARCH___FLS +#define __fls(word) generic___fls(word) +#endif + #endif /* _ASM_GENERIC_BITOPS___FLS_H_ */ diff --git a/tools/include/asm-generic/bitops/fls.h b/tools/include/asm-generic/bitops/fls.h index b168bb10e1be1..26f3ce1dd6e44 100644 --- a/tools/include/asm-generic/bitops/fls.h +++ b/tools/include/asm-generic/bitops/fls.h @@ -3,14 +3,14 @@ #define _ASM_GENERIC_BITOPS_FLS_H_ /** - * fls - find last (most-significant) bit set + * generic_fls - find last (most-significant) bit set * @x: the word to search * * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(unsigned int x) +static __always_inline int generic_fls(unsigned int x) { int r = 32; @@ -39,4 +39,8 @@ static __always_inline int fls(unsigned int x) return r; } +#ifndef __HAVE_ARCH_FLS +#define fls(x) generic_fls(x) +#endif + #endif /* _ASM_GENERIC_BITOPS_FLS_H_ */ -- GitLab From 1cebd7f74976455ccd89c1dfbcf00bca52d0a512 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:20 -0700 Subject: [PATCH 788/989] tools/include: Sync arm64 asm/cputype.h with the kernel sources To pick up the changes from: fb091ff39479 ("arm64: Subscribe Microsoft Azure Cobalt 100 to ARM Neoverse N2 errata") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/arm64/include/asm/cputype.h arch/arm64/include/asm/cputype.h Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-10-namhyung@kernel.org --- tools/arch/arm64/include/asm/cputype.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 7c7493cb571f9..52f076afeb960 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -61,6 +61,7 @@ #define ARM_CPU_IMP_HISI 0x48 #define ARM_CPU_IMP_APPLE 0x61 #define ARM_CPU_IMP_AMPERE 0xC0 +#define ARM_CPU_IMP_MICROSOFT 0x6D #define ARM_CPU_PART_AEM_V8 0xD0F #define ARM_CPU_PART_FOUNDATION 0xD00 @@ -135,6 +136,8 @@ #define AMPERE_CPU_PART_AMPERE1 0xAC3 +#define MICROSOFT_CPU_PART_AZURE_COBALT_100 0xD49 /* Based on r0p0 of ARM Neoverse N2 */ + #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) @@ -193,6 +196,7 @@ #define MIDR_APPLE_M2_BLIZZARD_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_MAX) #define MIDR_APPLE_M2_AVALANCHE_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_MAX) #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1) +#define MIDR_MICROSOFT_AZURE_COBALT_100 MIDR_CPU_MODEL(ARM_CPU_IMP_MICROSOFT, MICROSOFT_CPU_PART_AZURE_COBALT_100) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ #define MIDR_FUJITSU_ERRATUM_010001 MIDR_FUJITSU_A64FX -- GitLab From 447112d7edd77fa468938377418434233bcb3709 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Mar 2024 17:13:42 -0800 Subject: [PATCH 789/989] KVM: VMX: Snapshot LBR capabilities during module initialization Snapshot VMX's LBR capabilities once during module initialization instead of calling into perf every time a vCPU reconfigures its vPMU. This will allow massaging the LBR capabilities, e.g. if the CPU doesn't support callstacks, without having to remember to update multiple locations. Opportunistically tag vmx_get_perf_capabilities() with __init, as it's only called from vmx_set_cpu_caps(). Reviewed-by: Mingwei Zhang Link: https://lore.kernel.org/r/20240307011344.835640-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/pmu_intel.c | 2 +- arch/x86/kvm/vmx/vmx.c | 9 +++++---- arch/x86/kvm/vmx/vmx.h | 2 ++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 12ade343a17ed..be40474de6e4d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -535,7 +535,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) perf_capabilities = vcpu_get_perf_capabilities(vcpu); if (cpuid_model_is_consistent(vcpu) && (perf_capabilities & PMU_CAP_LBR_FMT)) - x86_perf_get_lbr(&lbr_desc->records); + memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); else lbr_desc->records.nr = 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e3315bda62372..aca41d3419f90 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -218,6 +218,8 @@ module_param(ple_window_max, uint, 0444); int __read_mostly pt_mode = PT_MODE_SYSTEM; module_param(pt_mode, int, S_IRUGO); +struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; + static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); static DEFINE_MUTEX(vmx_l1d_flush_mutex); @@ -7862,10 +7864,9 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_update_exception_bitmap(vcpu); } -static u64 vmx_get_perf_capabilities(void) +static __init u64 vmx_get_perf_capabilities(void) { u64 perf_cap = PMU_CAP_FW_WRITES; - struct x86_pmu_lbr lbr; u64 host_perf_cap = 0; if (!enable_pmu) @@ -7875,8 +7876,8 @@ static u64 vmx_get_perf_capabilities(void) rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { - x86_perf_get_lbr(&lbr); - if (lbr.nr) + x86_perf_get_lbr(&vmx_lbr_caps); + if (vmx_lbr_caps.nr) perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 1742c88ba9c80..90f9e44346464 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -110,6 +110,8 @@ struct lbr_desc { bool msr_passthrough; }; +extern struct x86_pmu_lbr vmx_lbr_caps; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. -- GitLab From 0d0b60865071115f6da76090f0dbc1f0e8b9c647 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Mar 2024 17:13:43 -0800 Subject: [PATCH 790/989] perf/x86/intel: Expose existence of callback support to KVM Add a "has_callstack" field to the x86_pmu_lbr structure used to pass information to KVM, and set it accordingly in x86_perf_get_lbr(). KVM will use has_callstack to avoid trying to create perf LBR events with PERF_SAMPLE_BRANCH_CALL_STACK on CPUs that don't support callstacks. Reviewed-by: Mingwei Zhang Link: https://lore.kernel.org/r/20240307011344.835640-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/events/intel/lbr.c | 1 + arch/x86/include/asm/perf_event.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 78cd5084104e9..4367aa77cb8d9 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1693,6 +1693,7 @@ void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->from = x86_pmu.lbr_from; lbr->to = x86_pmu.lbr_to; lbr->info = x86_pmu.lbr_info; + lbr->has_callstack = x86_pmu_has_lbr_callstack(); } EXPORT_SYMBOL_GPL(x86_perf_get_lbr); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 3736b8a46c04d..7f1e17250546b 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -555,6 +555,7 @@ struct x86_pmu_lbr { unsigned int from; unsigned int to; unsigned int info; + bool has_callstack; }; extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); -- GitLab From bb9dc859086df369f1fd34578dd5ca82d6321d21 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Mar 2024 17:13:44 -0800 Subject: [PATCH 791/989] KVM: VMX: Disable LBR virtualization if the CPU doesn't support LBR callstacks Disable LBR virtualization if the CPU doesn't support callstacks, which were introduced in HSW (see commit e9d7f7cd97c4 ("perf/x86/intel: Add basic Haswell LBR call stack support"), as KVM unconditionally configures the perf LBR event with PERF_SAMPLE_BRANCH_CALL_STACK, i.e. LBR virtualization always fails on pre-HSW CPUs. Simply disable LBR support on such CPUs, as it has never worked, i.e. there is no risk of breaking an existing setup, and figuring out a way to performantly context switch LBRs on old CPUs is not worth the effort. Fixes: be635e34c284 ("KVM: vmx/pmu: Expose LBR_FMT in the MSR_IA32_PERF_CAPABILITIES") Cc: Mingwei Zhang Cc: Jim Mattson Tested-by: Mingwei Zhang Link: https://lore.kernel.org/r/20240307011344.835640-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aca41d3419f90..22411f4aff530 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7877,7 +7877,15 @@ static __init u64 vmx_get_perf_capabilities(void) if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { x86_perf_get_lbr(&vmx_lbr_caps); - if (vmx_lbr_caps.nr) + + /* + * KVM requires LBR callstack support, as the overhead due to + * context switching LBRs without said support is too high. + * See intel_pmu_create_guest_lbr_event() for more info. + */ + if (!vmx_lbr_caps.has_callstack) + memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); + else if (vmx_lbr_caps.nr) perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; } -- GitLab From 1bc26cb9090246190e8c540f5aa201cea2f895a1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 8 Apr 2024 16:11:15 -0700 Subject: [PATCH 792/989] KVM: x86/mmu: Precisely invalidate MMU root_role during CPUID update Set kvm_mmu_page_role.invalid to mark the various MMU root_roles invalid during CPUID update in order to force a refresh, instead of zeroing out the entire role. This fixes a bug where kvm_mmu_free_roots() incorrectly thinks a root is indirect, i.e. not a TDP MMU, due to "direct" being zeroed, which in turn causes KVM to take mmu_lock for write instead of read. Note, paving over the entire role was largely unintentional, commit 7a458f0e1ba1 ("KVM: x86/mmu: remove extended bits from mmu_role, rename field") simply missed that "invalid" could be set. Fixes: 576a15de8d29 ("KVM: x86/mmu: Free TDP MMU roots while holding mmy_lock for read") Reported-by: syzbot+dc308fcfcd53f987de73@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/0000000000009b38080614c49bdb@google.com Cc: Phi Nguyen Link: https://lore.kernel.org/r/20240408231115.1387279-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 18a404b5923f1..59c051845992b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5576,9 +5576,9 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) * that problem is swept under the rug; KVM's CPUID API is horrific and * it's all but impossible to solve it without introducing a new API. */ - vcpu->arch.root_mmu.root_role.word = 0; - vcpu->arch.guest_mmu.root_role.word = 0; - vcpu->arch.nested_mmu.root_role.word = 0; + vcpu->arch.root_mmu.root_role.invalid = 1; + vcpu->arch.guest_mmu.root_role.invalid = 1; + vcpu->arch.nested_mmu.root_role.invalid = 1; vcpu->arch.root_mmu.cpu_role.ext.valid = 0; vcpu->arch.guest_mmu.cpu_role.ext.valid = 0; vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; -- GitLab From 2673dfb591a359c75080dd5af3da484b89320d22 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 15 Mar 2024 16:05:38 -0700 Subject: [PATCH 793/989] KVM: x86/mmu: Write-protect L2 SPTEs in TDP MMU when clearing dirty status Check kvm_mmu_page_ad_need_write_protect() when deciding whether to write-protect or clear D-bits on TDP MMU SPTEs, so that the TDP MMU accounts for any role-specific reasons for disabling D-bit dirty logging. Specifically, TDP MMU SPTEs must be write-protected when the TDP MMU is being used to run an L2 (i.e. L1 has disabled EPT) and PML is enabled. KVM always disables PML when running L2, even when L1 and L2 GPAs are in the some domain, so failing to write-protect TDP MMU SPTEs will cause writes made by L2 to not be reflected in the dirty log. Reported-by: syzbot+900d58a45dcaab9e4821@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=900d58a45dcaab9e4821 Fixes: 5982a5392663 ("KVM: x86/mmu: Use kvm_ad_enabled() to determine if TDP MMU SPTEs need wrprot") Cc: stable@vger.kernel.org Cc: Vipin Sharma Cc: Sean Christopherson Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240315230541.1635322-2-dmatlack@google.com [sean: massage shortlog and changelog, tweak ternary op formatting] Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d078157e62aa4..cf7d607a01252 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1548,6 +1548,16 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, } } +static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp) +{ + /* + * All TDP MMU shadow pages share the same role as their root, aside + * from level, so it is valid to key off any shadow page to determine if + * write protection is needed for an entire tree. + */ + return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled(); +} + /* * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. @@ -1558,7 +1568,8 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end) { - u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK; + const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK : + shadow_dirty_mask; struct tdp_iter iter; bool spte_set = false; @@ -1573,7 +1584,7 @@ retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - KVM_MMU_WARN_ON(kvm_ad_enabled() && + KVM_MMU_WARN_ON(dbit == shadow_dirty_mask && spte_ad_need_write_protect(iter.old_spte)); if (!(iter.old_spte & dbit)) @@ -1620,8 +1631,8 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { - u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK : - shadow_dirty_mask; + const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK : + shadow_dirty_mask; struct tdp_iter iter; lockdep_assert_held_write(&kvm->mmu_lock); @@ -1633,7 +1644,7 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, if (!mask) break; - KVM_MMU_WARN_ON(kvm_ad_enabled() && + KVM_MMU_WARN_ON(dbit == shadow_dirty_mask && spte_ad_need_write_protect(iter.old_spte)); if (iter.level > PG_LEVEL_4K || -- GitLab From feac19aa4c26dc028d358bf23fa8f228ab46699e Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 15 Mar 2024 16:05:39 -0700 Subject: [PATCH 794/989] KVM: x86/mmu: Remove function comments above clear_dirty_{gfn_range,pt_masked}() Drop the comments above clear_dirty_gfn_range() and clear_dirty_pt_masked(), since each is word-for-word identical to the comment above their parent function. Leave the comment on the parent functions since they are APIs called by the KVM/x86 MMU. No functional change intended. Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240315230541.1635322-3-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index cf7d607a01252..23759b2bca5e6 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1558,13 +1558,6 @@ static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp) return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled(); } -/* - * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If - * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. - * If AD bits are not enabled, this will require clearing the writable bit on - * each SPTE. Returns true if an SPTE has been changed and the TLBs need to - * be flushed. - */ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end) { @@ -1621,13 +1614,6 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, return spte_set; } -/* - * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is - * set in mask, starting at gfn. The given memslot is expected to contain all - * the GFNs represented by set bits in the mask. If AD bits are enabled, - * clearing the dirty status will involve clearing the dirty bit on each SPTE - * or, if AD bits are not enabled, clearing the writable bit on each SPTE. - */ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { -- GitLab From b1a8d2b02b69c7d7685f2e19f32034065310dbae Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 15 Mar 2024 16:05:40 -0700 Subject: [PATCH 795/989] KVM: x86/mmu: Fix and clarify comments about clearing D-bit vs. write-protecting Drop the "If AD bits are enabled/disabled" verbiage from the comments above kvm_tdp_mmu_clear_dirty_{slot,pt_masked}() since TDP MMU SPTEs may need to be write-protected even when A/D bits are enabled. i.e. These comments aren't technically correct. No functional change intended. Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240315230541.1635322-4-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 23759b2bca5e6..04c1f0957fea8 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1594,11 +1594,9 @@ retry: } /* - * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If - * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. - * If AD bits are not enabled, this will require clearing the writable bit on - * each SPTE. Returns true if an SPTE has been changed and the TLBs need to - * be flushed. + * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the + * memslot. Returns true if an SPTE has been changed and the TLBs need to be + * flushed. */ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot) @@ -1656,11 +1654,9 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, } /* - * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is - * set in mask, starting at gfn. The given memslot is expected to contain all - * the GFNs represented by set bits in the mask. If AD bits are enabled, - * clearing the dirty status will involve clearing the dirty bit on each SPTE - * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for + * which a bit is set in mask, starting at gfn. The given memslot is expected to + * contain all the GFNs represented by set bits in the mask. */ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, -- GitLab From 40e0ee6338f0c042c0dabe1f17eb76eac37b5425 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 15 Mar 2024 16:05:41 -0700 Subject: [PATCH 796/989] KVM: selftests: Add coverage of EPT-disabled to vmx_dirty_log_test Extend vmx_dirty_log_test to include accesses made by L2 when EPT is disabled. This commit adds explicit coverage of a bug caught by syzkaller, where the TDP MMU would clear D-bits instead of write-protecting SPTEs being used to map an L2, which only happens when L1 does not enable EPT, causing writes made by L2 to not be reflected in the dirty log when PML is enabled: $ ./vmx_dirty_log_test Nested EPT: disabled ==== Test Assertion Failure ==== x86_64/vmx_dirty_log_test.c:151: test_bit(0, bmap) pid=72052 tid=72052 errno=4 - Interrupted system call (stack trace empty) Page 0 incorrectly reported clean Opportunistically replace the volatile casts with {READ,WRITE}_ONCE(). Link: https://lore.kernel.org/kvm/000000000000c6526f06137f18cc@google.com/ Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240315230541.1635322-5-dmatlack@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/vmx_dirty_log_test.c | 60 ++++++++++++++----- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index 7f6f5f23fb9b6..977948fd52e6b 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -28,16 +28,16 @@ #define NESTED_TEST_MEM1 0xc0001000 #define NESTED_TEST_MEM2 0xc0002000 -static void l2_guest_code(void) +static void l2_guest_code(u64 *a, u64 *b) { - *(volatile uint64_t *)NESTED_TEST_MEM1; - *(volatile uint64_t *)NESTED_TEST_MEM1 = 1; + READ_ONCE(*a); + WRITE_ONCE(*a, 1); GUEST_SYNC(true); GUEST_SYNC(false); - *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + WRITE_ONCE(*b, 1); GUEST_SYNC(true); - *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + WRITE_ONCE(*b, 1); GUEST_SYNC(true); GUEST_SYNC(false); @@ -45,17 +45,33 @@ static void l2_guest_code(void) vmcall(); } +static void l2_guest_code_ept_enabled(void) +{ + l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); +} + +static void l2_guest_code_ept_disabled(void) +{ + /* Access the same L1 GPAs as l2_guest_code_ept_enabled() */ + l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); +} + void l1_guest_code(struct vmx_pages *vmx) { #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + void *l2_rip; GUEST_ASSERT(vmx->vmcs_gpa); GUEST_ASSERT(prepare_for_vmx_operation(vmx)); GUEST_ASSERT(load_vmcs(vmx)); - prepare_vmcs(vmx, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + if (vmx->eptp_gpa) + l2_rip = l2_guest_code_ept_enabled; + else + l2_rip = l2_guest_code_ept_disabled; + + prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); GUEST_SYNC(false); GUEST_ASSERT(!vmlaunch()); @@ -64,7 +80,7 @@ void l1_guest_code(struct vmx_pages *vmx) GUEST_DONE(); } -int main(int argc, char *argv[]) +static void test_vmx_dirty_log(bool enable_ept) { vm_vaddr_t vmx_pages_gva = 0; struct vmx_pages *vmx; @@ -76,8 +92,7 @@ int main(int argc, char *argv[]) struct ucall uc; bool done = false; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - TEST_REQUIRE(kvm_cpu_has_ept()); + pr_info("Nested EPT: %s\n", enable_ept ? "enabled" : "disabled"); /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); @@ -103,11 +118,16 @@ int main(int argc, char *argv[]) * * Note that prepare_eptp should be called only L1's GPA map is done, * meaning after the last call to virt_map. + * + * When EPT is disabled, the L2 guest code will still access the same L1 + * GPAs as the EPT enabled case. */ - prepare_eptp(vmx, vm, 0); - nested_map_memslot(vmx, vm, 0); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + if (enable_ept) { + prepare_eptp(vmx, vm, 0); + nested_map_memslot(vmx, vm, 0); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + } bmap = bitmap_zalloc(TEST_MEM_PAGES); host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); @@ -148,3 +168,15 @@ int main(int argc, char *argv[]) } } } + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + test_vmx_dirty_log(/*enable_ept=*/false); + + if (kvm_cpu_has_ept()) + test_vmx_dirty_log(/*enable_ept=*/true); + + return 0; +} -- GitLab From eefb85b3f0310c2f4149c50cb9b13094ed1dde25 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 4 Mar 2024 16:37:42 -0800 Subject: [PATCH 797/989] KVM: Drop unused @may_block param from gfn_to_pfn_cache_invalidate_start() Remove gfn_to_pfn_cache_invalidate_start()'s unused @may_block parameter, which was leftover from KVM's abandoned (for now) attempt to support guest usage of gfn_to_pfn caches. Fixes: a4bff3df5147 ("KVM: pfncache: remove KVM_GUEST_USES_PFN usage") Reported-by: Like Xu Cc: Paul Durrant Cc: David Woodhouse Reviewed-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240305003742.245767-1-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 3 +-- virt/kvm/kvm_mm.h | 6 ++---- virt/kvm/pfncache.c | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fb49c2a602002..ff0a20565f908 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -832,8 +832,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * mn_active_invalidate_count (see above) instead of * mmu_invalidate_in_progress. */ - gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end, - hva_range.may_block); + gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end); /* * If one or more memslots were found and thus zapped, notify arch code diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index ecefc7ec51af8..715f19669d01f 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -26,13 +26,11 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, #ifdef CONFIG_HAVE_KVM_PFNCACHE void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, - unsigned long end, - bool may_block); + unsigned long end); #else static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, - unsigned long end, - bool may_block) + unsigned long end) { } #endif /* HAVE_KVM_PFNCACHE */ diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index f618719644e04..e3453e869e92c 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -23,7 +23,7 @@ * MMU notifier 'invalidate_range_start' hook. */ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, - unsigned long end, bool may_block) + unsigned long end) { struct gfn_to_pfn_cache *gpc; -- GitLab From 17f8dc2db52185460f212052f3a692c1fdc167ba Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 9 Apr 2024 08:56:03 +0800 Subject: [PATCH 798/989] ceph: switch to use cap_delay_lock for the unlink delay list The same list item will be used in both cap_delay_list and cap_unlink_delay_list, so it's buggy to use two different locks to protect them. Cc: stable@vger.kernel.org Fixes: dbc347ef7f0c ("ceph: add ceph_cap_unlink_work to fire check_caps() immediately") Link: https://lists.ceph.io/hyperkitty/list/ceph-users@ceph.io/thread/AODC76VXRAMXKLFDCTK4TKFDDPWUSCN5 Reported-by: Marc Ruhmann Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Tested-by: Marc Ruhmann Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 4 ++-- fs/ceph/mds_client.c | 9 ++++----- fs/ceph/mds_client.h | 3 +-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 55051ad09c191..c4941ba245ac3 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4783,13 +4783,13 @@ int ceph_drop_caps_for_unlink(struct inode *inode) doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); - spin_lock(&mdsc->cap_unlink_delay_lock); + spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) list_del_init(&ci->i_cap_delay_list); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_unlink_delay_list); - spin_unlock(&mdsc->cap_unlink_delay_lock); + spin_unlock(&mdsc->cap_delay_lock); /* * Fire the work immediately, because the MDS maybe diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3ab9c268a8bb3..360b686c3c67c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2504,7 +2504,7 @@ static void ceph_cap_unlink_work(struct work_struct *work) struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "begin\n"); - spin_lock(&mdsc->cap_unlink_delay_lock); + spin_lock(&mdsc->cap_delay_lock); while (!list_empty(&mdsc->cap_unlink_delay_list)) { struct ceph_inode_info *ci; struct inode *inode; @@ -2516,15 +2516,15 @@ static void ceph_cap_unlink_work(struct work_struct *work) inode = igrab(&ci->netfs.inode); if (inode) { - spin_unlock(&mdsc->cap_unlink_delay_lock); + spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "on %p %llx.%llx\n", inode, ceph_vinop(inode)); ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); - spin_lock(&mdsc->cap_unlink_delay_lock); + spin_lock(&mdsc->cap_delay_lock); } } - spin_unlock(&mdsc->cap_unlink_delay_lock); + spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "done\n"); } @@ -5404,7 +5404,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_LIST_HEAD(&mdsc->cap_wait_list); spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); - spin_lock_init(&mdsc->cap_unlink_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); mdsc->last_cap_flush_tid = 1; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 03f8ff00874f7..b88e804152241 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -461,9 +461,8 @@ struct ceph_mds_client { struct delayed_work delayed_work; /* delayed work */ unsigned long last_renew_caps; /* last time we renewed our caps */ struct list_head cap_delay_list; /* caps with delayed release */ - spinlock_t cap_delay_lock; /* protects cap_delay_list */ struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */ - spinlock_t cap_unlink_delay_lock; /* protects cap_unlink_delay_list */ + spinlock_t cap_delay_lock; /* protects cap_delay_list and cap_unlink_delay_list */ struct list_head snap_flush_list; /* cap_snaps ready to flush */ spinlock_t snap_flush_lock; -- GitLab From d3e0469306793972fd2b1ea016fa7ab0658c9849 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 9 Apr 2024 07:01:57 -0400 Subject: [PATCH 799/989] MAINTAINERS: remove myself as a Reviewer for Ceph It has been a couple of years since I stepped down as CephFS maintainer. I'm not involved in any meaningful way with the project these days, so while I'm happy to help review the occasional patch, I don't need to be cc'ed on all of them. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index aea47e04c3a52..c45e2c3f6ff95 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4869,7 +4869,6 @@ F: drivers/power/supply/cw2015_battery.c CEPH COMMON CODE (LIBCEPH) M: Ilya Dryomov M: Xiubo Li -R: Jeff Layton L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ @@ -4881,7 +4880,6 @@ F: net/ceph/ CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH) M: Xiubo Li M: Ilya Dryomov -R: Jeff Layton L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ -- GitLab From 28e0947651ce6a2200b9a7eceb93282e97d7e51a Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 6 Apr 2024 23:16:08 -0500 Subject: [PATCH 800/989] smb3: fix Open files on server counter going negative We were decrementing the count of open files on server twice for the case where we were closing cached directories. Fixes: 8e843bf38f7b ("cifs: return a single-use cfid if we did not get a lease") Cc: stable@vger.kernel.org Acked-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cached_dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 13a9d7acf8f8e..0ff2491c311d8 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -433,8 +433,8 @@ smb2_close_cached_fid(struct kref *ref) if (cfid->is_open) { rc = SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, cfid->fid.volatile_fid); - if (rc != -EBUSY && rc != -EAGAIN) - atomic_dec(&cfid->tcon->num_remote_opens); + if (rc) /* should we retry on -EBUSY or -EAGAIN? */ + cifs_dbg(VFS, "close cached dir rc %d\n", rc); } free_cached_dir(cfid); -- GitLab From c6ff459037b2e35450af2351037eac4c8aca1d6b Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 9 Apr 2024 11:28:59 -0300 Subject: [PATCH 801/989] smb: client: instantiate when creating SFU files In cifs_sfu_make_node(), on success, instantiate rather than leave it with dentry unhashed negative to support callers that expect mknod(2) to always instantiate. This fixes the following test case: mount.cifs //srv/share /mnt -o ...,sfu mkfifo /mnt/fifo ./xfstests/ltp/growfiles -b -W test -e 1 -u -i 0 -L 30 /mnt/fifo ... BUG: unable to handle page fault for address: 000000034cec4e58 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 1 PREEMPT SMP PTI CPU: 0 PID: 138098 Comm: growfiles Kdump: loaded Not tainted 5.14.0-436.3987_1240945149.el9.x86_64 #1 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:_raw_callee_save__kvm_vcpu_is_preempted+0x0/0x20 Code: e8 15 d9 61 00 e9 63 ff ff ff 41 bd ea ff ff ff e9 58 ff ff ff e8 d0 71 c0 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 <48> 8b 04 fd 60 2b c1 99 80 b8 90 50 03 00 00 0f 95 c0 c3 cc cc cc RSP: 0018:ffffb6a143cf7cf8 EFLAGS: 00010206 RAX: ffff8a9bc30fb038 RBX: ffff8a9bc666a200 RCX: ffff8a9cc0260000 RDX: 00000000736f622e RSI: ffff8a9bc30fb038 RDI: 000000007665645f RBP: ffffb6a143cf7d70 R08: 0000000000001000 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000000 R12: ffff8a9bc666a200 R13: 0000559a302a12b0 R14: 0000000000001000 R15: 0000000000000000 FS: 00007fbed1dbb740(0000) GS:ffff8a9cf0000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000034cec4e58 CR3: 0000000128ec6006 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? show_trace_log_lvl+0x1c4/0x2df ? show_trace_log_lvl+0x1c4/0x2df ? __mutex_lock.constprop.0+0x5f7/0x6a0 ? __die_body.cold+0x8/0xd ? page_fault_oops+0x134/0x170 ? exc_page_fault+0x62/0x150 ? asm_exc_page_fault+0x22/0x30 ? _pfx_raw_callee_save__kvm_vcpu_is_preempted+0x10/0x10 __mutex_lock.constprop.0+0x5f7/0x6a0 ? __mod_memcg_lruvec_state+0x84/0xd0 pipe_write+0x47/0x650 ? do_anonymous_page+0x258/0x410 ? inode_security+0x22/0x60 ? selinux_file_permission+0x108/0x150 vfs_write+0x2cb/0x410 ksys_write+0x5f/0xe0 do_syscall_64+0x5c/0xf0 ? syscall_exit_to_user_mode+0x22/0x40 ? do_syscall_64+0x6b/0xf0 ? sched_clock_cpu+0x9/0xc0 ? exc_page_fault+0x62/0x150 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Cc: stable@vger.kernel.org Fixes: 72bc63f5e23a ("smb3: fix creating FIFOs when mounting with "sfu" mount option") Suggested-by: Al Viro Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 94 ++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index b156eefa75d7c..78c94d0350fe9 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4964,68 +4964,84 @@ static int smb2_next_header(struct TCP_Server_Info *server, char *buf, return 0; } -int cifs_sfu_make_node(unsigned int xid, struct inode *inode, - struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, umode_t mode, dev_t dev) +static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) { - struct cifs_open_info_data buf = {}; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; struct cifs_io_parms io_parms = {}; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_fid fid; unsigned int bytes_written; - struct win_dev *pdev; + struct win_dev pdev = {}; struct kvec iov[2]; __u32 oplock = server->oplocks ? REQ_OPLOCK : 0; int rc; - if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode)) + switch (mode & S_IFMT) { + case S_IFCHR: + strscpy(pdev.type, "IntxCHR"); + pdev.major = cpu_to_le64(MAJOR(dev)); + pdev.minor = cpu_to_le64(MINOR(dev)); + break; + case S_IFBLK: + strscpy(pdev.type, "IntxBLK"); + pdev.major = cpu_to_le64(MAJOR(dev)); + pdev.minor = cpu_to_le64(MINOR(dev)); + break; + case S_IFIFO: + strscpy(pdev.type, "LnxFIFO"); + break; + default: return -EPERM; + } - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .cifs_sb = cifs_sb, - .desired_access = GENERIC_WRITE, - .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | - CREATE_OPTION_SPECIAL), - .disposition = FILE_CREATE, - .path = full_path, - .fid = &fid, - }; + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, GENERIC_WRITE, + FILE_CREATE, CREATE_NOT_DIR | + CREATE_OPTION_SPECIAL, ACL_NO_MODE); + oparms.fid = &fid; - rc = server->ops->open(xid, &oparms, &oplock, &buf); + rc = server->ops->open(xid, &oparms, &oplock, NULL); if (rc) return rc; - /* - * BB Do not bother to decode buf since no local inode yet to put - * timestamps in, but we can reuse it safely. - */ - pdev = (struct win_dev *)&buf.fi; io_parms.pid = current->tgid; io_parms.tcon = tcon; - io_parms.length = sizeof(*pdev); - iov[1].iov_base = pdev; - iov[1].iov_len = sizeof(*pdev); - if (S_ISCHR(mode)) { - memcpy(pdev->type, "IntxCHR", 8); - pdev->major = cpu_to_le64(MAJOR(dev)); - pdev->minor = cpu_to_le64(MINOR(dev)); - } else if (S_ISBLK(mode)) { - memcpy(pdev->type, "IntxBLK", 8); - pdev->major = cpu_to_le64(MAJOR(dev)); - pdev->minor = cpu_to_le64(MINOR(dev)); - } else if (S_ISFIFO(mode)) { - memcpy(pdev->type, "LnxFIFO", 8); - } + io_parms.length = sizeof(pdev); + iov[1].iov_base = &pdev; + iov[1].iov_len = sizeof(pdev); rc = server->ops->sync_write(xid, &fid, &io_parms, &bytes_written, iov, 1); server->ops->close(xid, tcon, &fid); - d_drop(dentry); - /* FIXME: add code here to set EAs */ - cifs_free_open_info(&buf); + return rc; +} + +int cifs_sfu_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) +{ + struct inode *new = NULL; + int rc; + + rc = __cifs_sfu_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); + if (rc) + return rc; + + if (tcon->posix_extensions) { + rc = smb311_posix_get_inode_info(&new, full_path, NULL, + inode->i_sb, xid); + } else if (tcon->unix_ext) { + rc = cifs_get_inode_info_unix(&new, full_path, + inode->i_sb, xid); + } else { + rc = cifs_get_inode_info(&new, full_path, NULL, + inode->i_sb, xid, NULL); + } + if (!rc) + d_instantiate(dentry, new); return rc; } -- GitLab From 35f834265e0dc78b003aa0d1af65cafb89666b76 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 4 Apr 2024 18:06:56 -0500 Subject: [PATCH 802/989] smb3: fix broken reconnect when password changing on the server by allowing password rotation There are various use cases that are becoming more common in which password changes are scheduled on a server(s) periodically but the clients connected to this server need to stay connected (even in the face of brief network reconnects) due to mounts which can not be easily unmounted and mounted at will, and servers that do password rotation do not always have the ability to tell the clients exactly when to the new password will be effective, so add support for an alt password ("password2=") on mount (and also remount) so that we can anticipate the upcoming change to the server without risking breaking existing mounts. An alternative would have been to use the kernel keyring for this but the processes doing the reconnect do not have access to the keyring but do have access to the ses structure. Reviewed-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 1 + fs/smb/client/connect.c | 8 ++++++++ fs/smb/client/fs_context.c | 21 +++++++++++++++++++++ fs/smb/client/fs_context.h | 2 ++ fs/smb/client/misc.c | 1 + fs/smb/client/smb2pdu.c | 11 +++++++++++ 6 files changed, 44 insertions(+) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index f6a302205f89c..d6669ce4ae87f 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1077,6 +1077,7 @@ struct cifs_ses { and after mount option parsing we fill it */ char *domainName; char *password; + char *password2; /* When key rotation used, new password may be set before it expires */ char workstation_name[CIFS_MAX_WORKSTATION_LEN]; struct session_key auth_key; struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 85679ae106fd5..4e35970681bf0 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2183,6 +2183,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) } ++delim; + /* BB consider adding support for password2 (Key Rotation) for multiuser in future */ ctx->password = kstrndup(delim, len, GFP_KERNEL); if (!ctx->password) { cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n", @@ -2206,6 +2207,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) kfree(ctx->username); ctx->username = NULL; kfree_sensitive(ctx->password); + /* no need to free ctx->password2 since not allocated in this path */ ctx->password = NULL; goto out_key_put; } @@ -2317,6 +2319,12 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) if (!ses->password) goto get_ses_fail; } + /* ctx->password freed at unmount */ + if (ctx->password2) { + ses->password2 = kstrdup(ctx->password2, GFP_KERNEL); + if (!ses->password2) + goto get_ses_fail; + } if (ctx->domainname) { ses->domainName = kstrdup(ctx->domainname, GFP_KERNEL); if (!ses->domainName) diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index b7bfe705b2c49..6c727d8c31e87 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -162,6 +162,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_string("username", Opt_user), fsparam_string("pass", Opt_pass), fsparam_string("password", Opt_pass), + fsparam_string("password2", Opt_pass2), fsparam_string("ip", Opt_ip), fsparam_string("addr", Opt_ip), fsparam_string("domain", Opt_domain), @@ -345,6 +346,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx new_ctx->nodename = NULL; new_ctx->username = NULL; new_ctx->password = NULL; + new_ctx->password2 = NULL; new_ctx->server_hostname = NULL; new_ctx->domainname = NULL; new_ctx->UNC = NULL; @@ -357,6 +359,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(prepath); DUP_CTX_STR(username); DUP_CTX_STR(password); + DUP_CTX_STR(password2); DUP_CTX_STR(server_hostname); DUP_CTX_STR(UNC); DUP_CTX_STR(source); @@ -905,6 +908,8 @@ static int smb3_reconfigure(struct fs_context *fc) else { kfree_sensitive(ses->password); ses->password = kstrdup(ctx->password, GFP_KERNEL); + kfree_sensitive(ses->password2); + ses->password2 = kstrdup(ctx->password2, GFP_KERNEL); } STEAL_STRING(cifs_sb, ctx, domainname); STEAL_STRING(cifs_sb, ctx, nodename); @@ -1305,6 +1310,18 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; } break; + case Opt_pass2: + kfree_sensitive(ctx->password2); + ctx->password2 = NULL; + if (strlen(param->string) == 0) + break; + + ctx->password2 = kstrdup(param->string, GFP_KERNEL); + if (ctx->password2 == NULL) { + cifs_errorf(fc, "OOM when copying password2 string\n"); + goto cifs_parse_mount_err; + } + break; case Opt_ip: if (strlen(param->string) == 0) { ctx->got_ip = false; @@ -1608,6 +1625,8 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_parse_mount_err: kfree_sensitive(ctx->password); ctx->password = NULL; + kfree_sensitive(ctx->password2); + ctx->password2 = NULL; return -EINVAL; } @@ -1713,6 +1732,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) ctx->username = NULL; kfree_sensitive(ctx->password); ctx->password = NULL; + kfree_sensitive(ctx->password2); + ctx->password2 = NULL; kfree(ctx->server_hostname); ctx->server_hostname = NULL; kfree(ctx->UNC); diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 8a35645e0b65b..a947bddeba273 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -145,6 +145,7 @@ enum cifs_param { Opt_source, Opt_user, Opt_pass, + Opt_pass2, Opt_ip, Opt_domain, Opt_srcaddr, @@ -177,6 +178,7 @@ struct smb3_fs_context { char *username; char *password; + char *password2; char *domainname; char *source; char *server_hostname; diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 33ac4f8f5050c..7d15a1969b818 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -98,6 +98,7 @@ sesInfoFree(struct cifs_ses *buf_to_free) kfree(buf_to_free->serverDomain); kfree(buf_to_free->serverNOS); kfree_sensitive(buf_to_free->password); + kfree_sensitive(buf_to_free->password2); kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); kfree_sensitive(buf_to_free->auth_key.response); diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index c0c4933af5fc3..86c647a947ccd 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -367,6 +367,17 @@ again: } rc = cifs_setup_session(0, ses, server, nls_codepage); + if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) { + /* + * Try alternate password for next reconnect (key rotation + * could be enabled on the server e.g.) if an alternate + * password is available and the current password is expired, + * but do not swap on non pwd related errors like host down + */ + if (ses->password2) + swap(ses->password2, ses->password); + } + if ((rc == -EACCES) && !tcon->retry) { mutex_unlock(&ses->session_mutex); rc = -EHOSTDOWN; -- GitLab From a8fa658eebe8b17fc852482da52f8841be8931d6 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 22 Mar 2024 14:26:04 +0800 Subject: [PATCH 803/989] eventfs: Fix kernel-doc comments to functions This commit fix kernel-doc style comments with complete parameter descriptions for the lookup_file(),lookup_dir_entry() and lookup_file_dentry(). Link: https://lore.kernel.org/linux-trace-kernel/20240322062604.28862-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index dc067eeb63874..894c6ca1e5002 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -336,6 +336,7 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode, /** * lookup_file - look up a file in the tracefs filesystem + * @parent_ei: Pointer to the eventfs_inode that represents parent of the file * @dentry: the dentry to look up * @mode: the permission that the file should have. * @attr: saved attributes changed by user @@ -389,6 +390,7 @@ static struct dentry *lookup_file(struct eventfs_inode *parent_ei, /** * lookup_dir_entry - look up a dir in the tracefs filesystem * @dentry: the directory to look up + * @pei: Pointer to the parent eventfs_inode if available * @ei: the eventfs_inode that represents the directory to create * * This function will look up a dentry for a directory represented by @@ -478,16 +480,20 @@ void eventfs_d_release(struct dentry *dentry) /** * lookup_file_dentry - create a dentry for a file of an eventfs_inode + * @dentry: The parent dentry under which the new file's dentry will be created * @ei: the eventfs_inode that the file will be created under * @idx: the index into the entry_attrs[] of the @ei - * @parent: The parent dentry of the created file. - * @name: The name of the file to create * @mode: The mode of the file. * @data: The data to use to set the inode of the file with on open() * @fops: The fops of the file to be created. * - * Create a dentry for a file of an eventfs_inode @ei and place it into the - * address located at @e_dentry. + * This function creates a dentry for a file associated with an + * eventfs_inode @ei. It uses the entry attributes specified by @idx, + * if available. The file will have the specified @mode and its inode will be + * set up with @data upon open. The file operations will be set to @fops. + * + * Return: Returns a pointer to the newly created file's dentry or an error + * pointer. */ static struct dentry * lookup_file_dentry(struct dentry *dentry, -- GitLab From d96c36004e31e2baaf8ea1b449b7d0b2c2bfb41a Mon Sep 17 00:00:00 2001 From: Prasad Pandit Date: Fri, 22 Mar 2024 17:48:01 +0530 Subject: [PATCH 804/989] tracing: Fix FTRACE_RECORD_RECURSION_SIZE Kconfig entry Fix FTRACE_RECORD_RECURSION_SIZE entry, replace tab with a space character. It helps Kconfig parsers to read file without error. Link: https://lore.kernel.org/linux-trace-kernel/20240322121801.1803948-1-ppandit@redhat.com Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Fixes: 773c16705058 ("ftrace: Add recording of functions that caused recursion") Signed-off-by: Prasad Pandit Reviewed-by: Randy Dunlap Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 61c541c36596d..47345bf1d4a9f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -965,7 +965,7 @@ config FTRACE_RECORD_RECURSION config FTRACE_RECORD_RECURSION_SIZE int "Max number of recursed functions to record" - default 128 + default 128 depends on FTRACE_RECORD_RECURSION help This defines the limit of number of functions that can be -- GitLab From 5281ec83454d70d98b71f1836fb16512566c01cd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2024 10:06:24 +0200 Subject: [PATCH 805/989] tracing: hide unused ftrace_event_id_fops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_PERF_EVENTS, a 'make W=1' build produces a warning about the unused ftrace_event_id_fops variable: kernel/trace/trace_events.c:2155:37: error: 'ftrace_event_id_fops' defined but not used [-Werror=unused-const-variable=] 2155 | static const struct file_operations ftrace_event_id_fops = { Hide this in the same #ifdef as the reference to it. Link: https://lore.kernel.org/linux-trace-kernel/20240403080702.3509288-7-arnd@kernel.org Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Mathieu Desnoyers Cc: Zheng Yejian Cc: Kees Cook Cc: Ajay Kaher Cc: Jinjie Ruan Cc: Clément Léger Cc: Dan Carpenter Cc: "Tzvetomir Stoyanov (VMware)" Fixes: 620a30e97feb ("tracing: Don't pass file_operations array to event_create_dir()") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7c364b87352ee..52f75c36bbca4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1670,6 +1670,7 @@ static int trace_format_open(struct inode *inode, struct file *file) return 0; } +#ifdef CONFIG_PERF_EVENTS static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -1684,6 +1685,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } +#endif static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, @@ -2152,10 +2154,12 @@ static const struct file_operations ftrace_event_format_fops = { .release = seq_release, }; +#ifdef CONFIG_PERF_EVENTS static const struct file_operations ftrace_event_id_fops = { .read = event_id_read, .llseek = default_llseek, }; +#endif static const struct file_operations ftrace_event_filter_fops = { .open = tracing_open_file_tr, -- GitLab From ffe3986fece696cf65e0ef99e74c75f848be8e30 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 9 Apr 2024 15:13:09 -0400 Subject: [PATCH 806/989] ring-buffer: Only update pages_touched when a new page is touched The "buffer_percent" logic that is used by the ring buffer splice code to only wake up the tasks when there's no data after the buffer is filled to the percentage of the "buffer_percent" file is dependent on three variables that determine the amount of data that is in the ring buffer: 1) pages_read - incremented whenever a new sub-buffer is consumed 2) pages_lost - incremented every time a writer overwrites a sub-buffer 3) pages_touched - incremented when a write goes to a new sub-buffer The percentage is the calculation of: (pages_touched - (pages_lost + pages_read)) / nr_pages Basically, the amount of data is the total number of sub-bufs that have been touched, minus the number of sub-bufs lost and sub-bufs consumed. This is divided by the total count to give the buffer percentage. When the percentage is greater than the value in the "buffer_percent" file, it wakes up splice readers waiting for that amount. It was observed that over time, the amount read from the splice was constantly decreasing the longer the trace was running. That is, if one asked for 60%, it would read over 60% when it first starts tracing, but then it would be woken up at under 60% and would slowly decrease the amount of data read after being woken up, where the amount becomes much less than the buffer percent. This was due to an accounting of the pages_touched incrementation. This value is incremented whenever a writer transfers to a new sub-buffer. But the place where it was incremented was incorrect. If a writer overflowed the current sub-buffer it would go to the next one. If it gets preempted by an interrupt at that time, and the interrupt performs a trace, it too will end up going to the next sub-buffer. But only one should increment the counter. Unfortunately, that was not the case. Change the cmpxchg() that does the real switch of the tail-page into a try_cmpxchg(), and on success, perform the increment of pages_touched. This will only increment the counter once for when the writer moves to a new sub-buffer, and not when there's a race and is incremented for when a writer and its preempting writer both move to the same new sub-buffer. Link: https://lore.kernel.org/linux-trace-kernel/20240409151309.0d0e5056@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Fixes: 2c2b0a78b3739 ("ring-buffer: Add percentage of ring buffer full to wake up reader") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 25476ead681b8..6511dc3a00da8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1393,7 +1393,6 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); - local_inc(&cpu_buffer->pages_touched); /* * Just make sure we have seen our old_write and synchronize * with any interrupts that come in. @@ -1430,8 +1429,9 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, */ local_set(&next_page->page->commit, 0); - /* Again, either we update tail_page or an interrupt does */ - (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page); + /* Either we update tail_page or an interrupt does */ + if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page)) + local_inc(&cpu_buffer->pages_touched); } } -- GitLab From 824f06ff81464f823cd6259ff2ec8fbeceb2afa5 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Thu, 11 Apr 2024 23:36:33 +0000 Subject: [PATCH 807/989] fs/9p: Revert "fs/9p: fix dups even in uncached mode" This reverts commit be57855f505003c5cafff40338d5d0f23b00ba4d. It caused a regression involving duplicate inode numbers in some tester trees. The bad behavior seems to be dependent on inode reuse policy in underlying file system, so it did not trigger in my test setup. Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_super.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 4236058c7bbd1..eeac06c2a84cf 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -244,6 +244,21 @@ done: return res; } +static int v9fs_drop_inode(struct inode *inode) +{ + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) + return generic_drop_inode(inode); + /* + * in case of non cached mode always drop the + * inode because we want the inode attribute + * to always match that on the server. + */ + return 1; +} + static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { @@ -278,6 +293,7 @@ static const struct super_operations v9fs_super_ops_dotl = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = v9fs_statfs, + .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, -- GitLab From 7fd524b9bd1be210fe79035800f4bd78a41b349f Mon Sep 17 00:00:00 2001 From: Joakim Sindholt Date: Mon, 18 Mar 2024 12:22:32 +0100 Subject: [PATCH 808/989] fs/9p: drop inodes immediately on non-.L too Signed-off-by: Joakim Sindholt Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index eeac06c2a84cf..55e67e36ae682 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -283,6 +283,7 @@ static const struct super_operations v9fs_super_ops = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = simple_statfs, + .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, -- GitLab From 3b0daecfeac0103aba8b293df07a0cbaf8b43f29 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 12 Apr 2024 06:11:25 +1000 Subject: [PATCH 809/989] amdkfd: use calloc instead of kzalloc to avoid integer overflow This uses calloc instead of doing the multiplication which might overflow. Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f9631f4b1a02c..55aa74cbc5325 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -779,8 +779,8 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp, * nodes, but not more than args->num_of_nodes as that is * the amount of memory allocated by user */ - pa = kzalloc((sizeof(struct kfd_process_device_apertures) * - args->num_of_nodes), GFP_KERNEL); + pa = kcalloc(args->num_of_nodes, sizeof(struct kfd_process_device_apertures), + GFP_KERNEL); if (!pa) return -ENOMEM; -- GitLab From 2b3e79fea66e166622a454715ce981432ac8c6e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 01:01:11 -0400 Subject: [PATCH 810/989] bcachefs: Don't use bch2_btree_node_lock_write_nofail() in btree split path It turns out - btree splits happen with the rest of the transaction still locked, to avoid unnecessary restarts, which means using nofail doesn't work here - we can deadlock. Fortunately, we now have the ability to return errors here. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 41 ++++++++++++++++++----------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c4a5e83a56a43..29a5bc7d87898 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1280,23 +1280,29 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) bch2_recalc_btree_reserve(c); } -static void bch2_btree_set_root(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +static int bch2_btree_set_root(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + bool nofail) { struct bch_fs *c = as->c; - struct btree *old; trace_and_count(c, btree_node_set_root, trans, b); - old = btree_node_root(c, b); + struct btree *old = btree_node_root(c, b); /* * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write_nofail(trans, path, &old->c); + if (nofail) { + bch2_btree_node_lock_write_nofail(trans, path, &old->c); + } else { + int ret = bch2_btree_node_lock_write(trans, path, &old->c); + if (ret) + return ret; + } bch2_btree_set_root_inmem(c, b); @@ -1310,6 +1316,7 @@ static void bch2_btree_set_root(struct btree_update *as, * depend on the new root would have to update the new root. */ bch2_btree_node_unlock_write(trans, path, old); + return 0; } /* Interior node updates: */ @@ -1652,15 +1659,16 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (parent) { /* Split a non root node */ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); - if (ret) - goto err; } else if (n3) { - bch2_btree_set_root(as, trans, trans->paths + path, n3); + ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(as, trans, trans->paths + path, n1); + ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false); } + if (ret) + goto err; + if (n3) { bch2_btree_update_get_open_buckets(as, n3); bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); @@ -1863,7 +1871,9 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans * bch2_keylist_add(&as->parent_keys, &b->key); btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys); - bch2_btree_set_root(as, trans, path, n); + int ret = bch2_btree_set_root(as, trans, path, n, true); + BUG_ON(ret); + bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); bch2_trans_node_add(trans, path, n); @@ -2106,12 +2116,13 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); - if (ret) - goto err; } else { - bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n); + ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false); } + if (ret) + goto err; + bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); -- GitLab From 210cfef579260ed6c3b700e7baeae51a5e183f43 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 4 Apr 2024 17:02:09 -0500 Subject: [PATCH 811/989] selftests/powerpc/papr-vpd: Fix missing variable initialization The "close handle without consuming VPD" testcase has inconsistent results because it fails to initialize the location code object it passes to ioctl() to create a VPD handle. Initialize the location code to the empty string as intended. Signed-off-by: Nathan Lynch Fixes: 9118c5d32bdd ("powerpc/selftests: Add test for papr-vpd") Reported-by: Geetika Moolchandani Signed-off-by: Michael Ellerman Link: https://msgid.link/20240404-papr-vpd-test-uninit-lc-v2-1-37bff46c65a5@linux.ibm.com --- tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c index 505294da1b9fb..d6f99eb9be659 100644 --- a/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c +++ b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c @@ -154,7 +154,7 @@ static int dev_papr_vpd_null_handle(void) static int papr_vpd_close_handle_without_reading(void) { const int devfd = open(DEVPATH, O_RDONLY); - struct papr_location_code lc; + struct papr_location_code lc = { .str = "", }; int fd; SKIP_IF_MSG(devfd < 0 && errno == ENOENT, -- GitLab From 84b1cec4fac5889b6d138d4b5ff6f2d5ef126c5e Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 4 Apr 2024 10:27:17 +0000 Subject: [PATCH 812/989] iommu/amd: Fix possible irq lock inversion dependency issue LOCKDEP detector reported below warning: ---------------------------------------- [ 23.796949] ======================================================== [ 23.796950] WARNING: possible irq lock inversion dependency detected [ 23.796952] 6.8.0fix+ #811 Not tainted [ 23.796954] -------------------------------------------------------- [ 23.796954] kworker/0:1/8 just changed the state of lock: [ 23.796956] ff365325e084a9b8 (&domain->lock){..-.}-{3:3}, at: amd_iommu_flush_iotlb_all+0x1f/0x50 [ 23.796969] but this lock took another, SOFTIRQ-unsafe lock in the past: [ 23.796970] (pd_bitmap_lock){+.+.}-{3:3} [ 23.796972] and interrupts could create inverse lock ordering between them. [ 23.796973] other info that might help us debug this: [ 23.796974] Chain exists of: &domain->lock --> &dev_data->lock --> pd_bitmap_lock [ 23.796980] Possible interrupt unsafe locking scenario: [ 23.796981] CPU0 CPU1 [ 23.796982] ---- ---- [ 23.796983] lock(pd_bitmap_lock); [ 23.796985] local_irq_disable(); [ 23.796985] lock(&domain->lock); [ 23.796988] lock(&dev_data->lock); [ 23.796990] [ 23.796991] lock(&domain->lock); Fix this issue by disabling interrupt when acquiring pd_bitmap_lock. Note that this is temporary fix. We have a plan to replace custom bitmap allocator with IDA allocator. Fixes: 87a6f1f22c97 ("iommu/amd: Introduce per-device domain ID to fix potential TLB aliasing issue") Reviewed-by: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20240404102717.6705-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d35c1b8c8e65c..e692217fcb280 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1692,26 +1692,29 @@ int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, static u16 domain_id_alloc(void) { + unsigned long flags; int id; - spin_lock(&pd_bitmap_lock); + spin_lock_irqsave(&pd_bitmap_lock, flags); id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); BUG_ON(id == 0); if (id > 0 && id < MAX_DOMAIN_ID) __set_bit(id, amd_iommu_pd_alloc_bitmap); else id = 0; - spin_unlock(&pd_bitmap_lock); + spin_unlock_irqrestore(&pd_bitmap_lock, flags); return id; } static void domain_id_free(int id) { - spin_lock(&pd_bitmap_lock); + unsigned long flags; + + spin_lock_irqsave(&pd_bitmap_lock, flags); if (id > 0 && id < MAX_DOMAIN_ID) __clear_bit(id, amd_iommu_pd_alloc_bitmap); - spin_unlock(&pd_bitmap_lock); + spin_unlock_irqrestore(&pd_bitmap_lock, flags); } static void free_gcr3_tbl_level1(u64 *tbl) -- GitLab From b650b38b006053ef96e90b8e3f7e6edc7845cd57 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 10 Apr 2024 08:57:02 +0000 Subject: [PATCH 813/989] iommu/amd: Do not enable SNP when V2 page table is enabled DTE[Mode]=0 is not supported when SNP is enabled in the host. That means to support SNP, IOMMU must be configured with V1 page table (See IOMMU spec [1] for the details). If user passes kernel command line to configure IOMMU domains with v2 page table (amd_iommu=pgtbl_v2) then disable SNP as the user asked by not forcing the page table to v1. [1] https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/specifications/48882_IOMMU.pdf Cc: Ashish Kalra Cc: Michael Roth Cc: Tom Lendacky Signed-off-by: Vasant Hegde Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240410085702.31869-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 33228c1c8980f..a714eb08c08b5 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3232,28 +3232,29 @@ static void iommu_snp_enable(void) return; /* * The SNP support requires that IOMMU must be enabled, and is - * not configured in the passthrough mode. + * configured with V1 page table (DTE[Mode] = 0 is not supported). */ if (no_iommu || iommu_default_passthrough()) { pr_err("SNP: IOMMU disabled or configured in passthrough mode, SNP cannot be supported.\n"); - cc_platform_clear(CC_ATTR_HOST_SEV_SNP); - return; + goto disable_snp; + } + + if (amd_iommu_pgtable != AMD_IOMMU_V1) { + pr_warn("SNP: IOMMU is configured with V2 page table mode, SNP cannot be supported.\n"); + goto disable_snp; } amd_iommu_snp_en = check_feature(FEATURE_SNP); if (!amd_iommu_snp_en) { pr_err("SNP: IOMMU SNP feature not enabled, SNP cannot be supported.\n"); - cc_platform_clear(CC_ATTR_HOST_SEV_SNP); - return; + goto disable_snp; } pr_info("IOMMU SNP support enabled.\n"); + return; - /* Enforce IOMMU v1 pagetable when SNP is enabled. */ - if (amd_iommu_pgtable != AMD_IOMMU_V1) { - pr_warn("Forcing use of AMD IOMMU v1 page table due to SNP.\n"); - amd_iommu_pgtable = AMD_IOMMU_V1; - } +disable_snp: + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); #endif } -- GitLab From 7537e31df80cb58c27f3b6fef702534ea87a5957 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 10 Apr 2024 18:41:09 +0200 Subject: [PATCH 814/989] iommu: mtk: fix module autoloading Add MODULE_DEVICE_TABLE(), so modules could be properly autoloaded based on the alias from of_device_id table. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240410164109.233308-1-krzk@kernel.org Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu.c | 1 + drivers/iommu/mtk_iommu_v1.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index b8c47f18bc261..6a2707fe7a78c 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -1790,6 +1790,7 @@ static const struct of_device_id mtk_iommu_of_ids[] = { { .compatible = "mediatek,mt8365-m4u", .data = &mt8365_data}, {} }; +MODULE_DEVICE_TABLE(of, mtk_iommu_of_ids); static struct platform_driver mtk_iommu_driver = { .probe = mtk_iommu_probe, diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index a9fa2a54dc9b3..d6e4002200bd3 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -600,6 +600,7 @@ static const struct of_device_id mtk_iommu_v1_of_ids[] = { { .compatible = "mediatek,mt2701-m4u", }, {} }; +MODULE_DEVICE_TABLE(of, mtk_iommu_v1_of_ids); static const struct component_master_ops mtk_iommu_v1_com_ops = { .bind = mtk_iommu_v1_bind, -- GitLab From 36d4fe147c870f6d3f6602befd7ef44393a1c87a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:50 -0700 Subject: [PATCH 815/989] x86/bugs: Remove CONFIG_BHI_MITIGATION_AUTO and spectre_bhi=auto Unlike most other mitigations' "auto" options, spectre_bhi=auto only mitigates newer systems, which is confusing and not particularly useful. Remove it. Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Nikolay Borisov Cc: Sean Christopherson Cc: Linus Torvalds Link: https://lore.kernel.org/r/412e9dc87971b622bbbaf64740ebc1f140bff343.1712813475.git.jpoimboe@kernel.org --- Documentation/admin-guide/hw-vuln/spectre.rst | 4 ---- Documentation/admin-guide/kernel-parameters.txt | 3 --- arch/x86/Kconfig | 5 ----- arch/x86/kernel/cpu/bugs.c | 10 +--------- 4 files changed, 1 insertion(+), 21 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 5a39acf824835..25a04cda4c2c0 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -669,10 +669,6 @@ kernel command line. needed. off Disable the mitigation. - auto - Enable the HW mitigation if needed, but - *don't* enable the SW mitigation except for KVM. - The system may be vulnerable. For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a3874cc97892d..902ecd92a29fb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6072,9 +6072,6 @@ on - (default) Enable the HW or SW mitigation as needed. off - Disable the mitigation. - auto - Enable the HW mitigation if needed, but - *don't* enable the SW mitigation except - for KVM. The system may be vulnerable. spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 10a6251f58f3e..b63b6767a63dc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2651,11 +2651,6 @@ config SPECTRE_BHI_OFF bool "off" help Equivalent to setting spectre_bhi=off command line parameter. -config SPECTRE_BHI_AUTO - bool "auto" - depends on BROKEN - help - Equivalent to setting spectre_bhi=auto command line parameter. endchoice diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9eeb60f5fbb35..6af4780a18ed3 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1625,13 +1625,10 @@ static bool __init spec_ctrl_bhi_dis(void) enum bhi_mitigations { BHI_MITIGATION_OFF, BHI_MITIGATION_ON, - BHI_MITIGATION_AUTO, }; static enum bhi_mitigations bhi_mitigation __ro_after_init = - IS_ENABLED(CONFIG_SPECTRE_BHI_ON) ? BHI_MITIGATION_ON : - IS_ENABLED(CONFIG_SPECTRE_BHI_OFF) ? BHI_MITIGATION_OFF : - BHI_MITIGATION_AUTO; + IS_ENABLED(CONFIG_SPECTRE_BHI_ON) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; static int __init spectre_bhi_parse_cmdline(char *str) { @@ -1642,8 +1639,6 @@ static int __init spectre_bhi_parse_cmdline(char *str) bhi_mitigation = BHI_MITIGATION_OFF; else if (!strcmp(str, "on")) bhi_mitigation = BHI_MITIGATION_ON; - else if (!strcmp(str, "auto")) - bhi_mitigation = BHI_MITIGATION_AUTO; else pr_err("Ignoring unknown spectre_bhi option (%s)", str); @@ -1673,9 +1668,6 @@ static void __init bhi_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n"); - if (bhi_mitigation == BHI_MITIGATION_AUTO) - return; - /* Mitigate syscalls when the mitigation is forced =on */ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n"); -- GitLab From 4f511739c54b549061993b53fc0380f48dfca23b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:51 -0700 Subject: [PATCH 816/989] x86/bugs: Replace CONFIG_SPECTRE_BHI_{ON,OFF} with CONFIG_MITIGATION_SPECTRE_BHI For consistency with the other CONFIG_MITIGATION_* options, replace the CONFIG_SPECTRE_BHI_{ON,OFF} options with a single CONFIG_MITIGATION_SPECTRE_BHI option. [ mingo: Fix ] Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Sean Christopherson Cc: Linus Torvalds Cc: Nikolay Borisov Link: https://lore.kernel.org/r/3833812ea63e7fdbe36bf8b932e63f70d18e2a2a.1712813475.git.jpoimboe@kernel.org --- arch/x86/Kconfig | 17 +++-------------- arch/x86/kernel/cpu/bugs.c | 2 +- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b63b6767a63dc..4474bf32d0a49 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2633,27 +2633,16 @@ config MITIGATION_RFDS stored in floating point, vector and integer registers. See also -choice - prompt "Clear branch history" +config MITIGATION_SPECTRE_BHI + bool "Mitigate Spectre-BHB (Branch History Injection)" depends on CPU_SUP_INTEL - default SPECTRE_BHI_ON + default y help Enable BHI mitigations. BHI attacks are a form of Spectre V2 attacks where the branch history buffer is poisoned to speculatively steer indirect branches. See -config SPECTRE_BHI_ON - bool "on" - help - Equivalent to setting spectre_bhi=on command line parameter. -config SPECTRE_BHI_OFF - bool "off" - help - Equivalent to setting spectre_bhi=off command line parameter. - -endchoice - endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 6af4780a18ed3..ca295b0c1eeee 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1628,7 +1628,7 @@ enum bhi_mitigations { }; static enum bhi_mitigations bhi_mitigation __ro_after_init = - IS_ENABLED(CONFIG_SPECTRE_BHI_ON) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; static int __init spectre_bhi_parse_cmdline(char *str) { -- GitLab From 1b3108f6898ef2e03973d65255182792e94e2240 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Apr 2024 21:45:27 +0200 Subject: [PATCH 817/989] x86/cpu/amd: Make the CPUID 0x80000008 parser correct CPUID 0x80000008 ECX.cpu_nthreads describes the number of threads in the package. The parser uses this value to initialize the SMT domain level. That's wrong because cpu_nthreads does not describe the number of threads per physical core. So this needs to set the CORE domain level and let the later parsers set the SMT shift if available. Preset the SMT domain level with the assumption of one thread per core, which is correct ifrt here are no other CPUID leafs to parse, and propagate cpu_nthreads and the core level APIC bitwidth into the CORE domain. Fixes: f7fb3b2dd92c ("x86/cpu: Provide an AMD/HYGON specific topology parser") Reported-by: "kernelci.org bot" Reported-by: Laura Nao Signed-off-by: Thomas Gleixner Tested-by: Laura Nao Link: https://lore.kernel.org/r/20240410194311.535206450@linutronix.de --- arch/x86/kernel/cpu/topology_amd.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 1a8b3ad493afe..79a85a4814ca1 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -29,11 +29,21 @@ static bool parse_8000_0008(struct topo_scan *tscan) if (!sft) sft = get_count_order(ecx.cpu_nthreads + 1); - topology_set_dom(tscan, TOPO_SMT_DOMAIN, sft, ecx.cpu_nthreads + 1); + /* + * cpu_nthreads describes the number of threads in the package + * sft is the number of APIC ID bits per package + * + * As the number of actual threads per core is not described in + * this leaf, just set the CORE domain shift and let the later + * parsers set SMT shift. Assume one thread per core by default + * which is correct if there are no other CPUID leafs to parse. + */ + topology_update_dom(tscan, TOPO_SMT_DOMAIN, 0, 1); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, sft, ecx.cpu_nthreads + 1); return true; } -static void store_node(struct topo_scan *tscan, unsigned int nr_nodes, u16 node_id) +static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) { /* * Starting with Fam 17h the DIE domain could probably be used to @@ -73,12 +83,14 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) tscan->c->topo.initial_apicid = leaf.ext_apic_id; /* - * If leaf 0xb is available, then SMT shift is set already. If not - * take it from ecx.threads_per_core and use topo_update_dom() - - * topology_set_dom() would propagate and overwrite the already - * propagated CORE level. + * If leaf 0xb is available, then the domain shifts are set + * already and nothing to do here. */ if (!has_0xb) { + /* + * Leaf 0x80000008 set the CORE domain shift already. + * Update the SMT domain, but do not propagate it. + */ unsigned int nthreads = leaf.core_nthreads + 1; topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads); -- GitLab From c064b536a8f9ab7c8e204da8f5a22f7420d0b56c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Apr 2024 21:45:28 +0200 Subject: [PATCH 818/989] x86/cpu/amd: Make the NODEID_MSR union actually work A system with NODEID_MSR was reported to crash during early boot without any output. The reason is that the union which is used for accessing the bitfields in the MSR is written wrongly and the resulting executable code accesses the wrong part of the MSR data. As a consequence a later division by that value results in 0 and that result is used for another division as divisor, which obviously does not work well. The magic world of C, unions and bitfields: union { u64 bita : 3, bitb : 3; u64 all; } x; x.all = foo(); a = x.bita; b = x.bitb; results in the effective executable code of: a = b = x.bita; because bita and bitb are treated as union members and therefore both end up at bit offset 0. Wrapping the bitfield into an anonymous struct: union { struct { u64 bita : 3, bitb : 3; }; u64 all; } x; works like expected. Rework the NODEID_MSR union in exactly that way to cure the problem. Fixes: f7fb3b2dd92c ("x86/cpu: Provide an AMD/HYGON specific topology parser") Reported-by: "kernelci.org bot" Reported-by: Laura Nao Signed-off-by: Thomas Gleixner Tested-by: Laura Nao Link: https://lore.kernel.org/r/20240410194311.596282919@linutronix.de Closes: https://lore.kernel.org/all/20240322175210.124416-1-laura.nao@collabora.com/ --- arch/x86/kernel/cpu/topology_amd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 79a85a4814ca1..7f999aef19659 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -121,13 +121,13 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) static bool parse_fam10h_node_id(struct topo_scan *tscan) { - struct { - union { + union { + struct { u64 node_id : 3, nodes_per_pkg : 3, unused : 58; - u64 msr; }; + u64 msr; } nid; if (!boot_cpu_has(X86_FEATURE_NODEID_MSR)) -- GitLab From 7211274fe0ee352332255e41ab5e628b86e83994 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Apr 2024 18:55:38 +0200 Subject: [PATCH 819/989] x86/cpu/amd: Move TOPOEXT enablement into the topology parser The topology rework missed that early_init_amd() tries to re-enable the Topology Extensions when the BIOS disabled them. The new parser is invoked before early_init_amd() so the re-enable attempt happens too late. Move it into the AMD specific topology parser code where it belongs. Fixes: f7fb3b2dd92c ("x86/cpu: Provide an AMD/HYGON specific topology parser") Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/878r1j260l.ffs@tglx --- arch/x86/kernel/cpu/amd.c | 15 --------------- arch/x86/kernel/cpu/topology_amd.c | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9bf17c9c29dad..cb9eece55904d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -535,7 +535,6 @@ clear_sev: static void early_init_amd(struct cpuinfo_x86 *c) { - u64 value; u32 dummy; if (c->x86 >= 0xf) @@ -603,20 +602,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_detect_mem_encrypt(c); - /* Re-enable TopologyExtensions if switched off by BIOS */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x6f) && - !cpu_has(c, X86_FEATURE_TOPOEXT)) { - - if (msr_set_bit(0xc0011005, 54) > 0) { - rdmsrl(0xc0011005, value); - if (value & BIT_64(54)) { - set_cpu_cap(c, X86_FEATURE_TOPOEXT); - pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); - } - } - } - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 7f999aef19659..a7aa6eff4ae5b 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -147,6 +147,26 @@ static void legacy_set_llc(struct topo_scan *tscan) tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN]; } +static void topoext_fixup(struct topo_scan *tscan) +{ + struct cpuinfo_x86 *c = tscan->c; + u64 msrval; + + /* Try to re-enable TopologyExtensions if switched off by BIOS */ + if (cpu_has(c, X86_FEATURE_TOPOEXT) || c->x86_vendor != X86_VENDOR_AMD || + c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) + return; + + if (msr_set_bit(0xc0011005, 54) <= 0) + return; + + rdmsrl(0xc0011005, msrval); + if (msrval & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } +} + static void parse_topology_amd(struct topo_scan *tscan) { bool has_0xb = false; @@ -176,6 +196,7 @@ static void parse_topology_amd(struct topo_scan *tscan) void cpu_parse_topology_amd(struct topo_scan *tscan) { tscan->amd_nodes_per_pkg = 1; + topoext_fixup(tscan); parse_topology_amd(tscan); if (tscan->amd_nodes_per_pkg > 1) -- GitLab From 5b3625a4f6422e8982f90f0c11b5546149c962b8 Mon Sep 17 00:00:00 2001 From: Xuchun Shang Date: Thu, 11 Apr 2024 11:07:42 +0800 Subject: [PATCH 820/989] iommu/vt-d: Fix wrong use of pasid config The commit "iommu/vt-d: Add IOMMU perfmon support" introduce IOMMU PMU feature, but use the wrong config when set pasid filter. Fixes: 7232ab8b89e9 ("iommu/vt-d: Add IOMMU perfmon support") Signed-off-by: Xuchun Shang Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20240401060753.3321318-1-xuchun.shang@linux.alibaba.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/perfmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c index cf43e798eca49..44083d01852db 100644 --- a/drivers/iommu/intel/perfmon.c +++ b/drivers/iommu/intel/perfmon.c @@ -438,7 +438,7 @@ static int iommu_pmu_assign_event(struct iommu_pmu *iommu_pmu, iommu_pmu_set_filter(domain, event->attr.config1, IOMMU_PMU_FILTER_DOMAIN, idx, event->attr.config1); - iommu_pmu_set_filter(pasid, event->attr.config1, + iommu_pmu_set_filter(pasid, event->attr.config2, IOMMU_PMU_FILTER_PASID, idx, event->attr.config1); iommu_pmu_set_filter(ats, event->attr.config2, -- GitLab From a34f3e20ddff02c4f12df2c0635367394e64c63d Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 11 Apr 2024 11:07:43 +0800 Subject: [PATCH 821/989] iommu/vt-d: Allocate local memory for page request queue The page request queue is per IOMMU, its allocation should be made NUMA-aware for performance reasons. Fixes: a222a7f0bb6c ("iommu/vt-d: Implement page request handling") Signed-off-by: Jacob Pan Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240403214007.985600-1-jacob.jun.pan@linux.intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index c1bed89b10261..ee3b469e2da15 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -66,7 +66,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) struct page *pages; int irq, ret; - pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER); + pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO, PRQ_ORDER); if (!pages) { pr_warn("IOMMU: %s: Failed to allocate page request queue\n", iommu->name); -- GitLab From 89436f4f54125b1297aec1f466efd8acb4ec613d Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 11 Apr 2024 11:07:44 +0800 Subject: [PATCH 822/989] iommu/vt-d: Fix WARN_ON in iommu probe path Commit 1a75cc710b95 ("iommu/vt-d: Use rbtree to track iommu probed devices") adds all devices probed by the iommu driver in a rbtree indexed by the source ID of each device. It assumes that each device has a unique source ID. This assumption is incorrect and the VT-d spec doesn't state this requirement either. The reason for using a rbtree to track devices is to look up the device with PCI bus and devfunc in the paths of handling ATS invalidation time out error and the PRI I/O page faults. Both are PCI ATS feature related. Only track the devices that have PCI ATS capabilities in the rbtree to avoid unnecessary WARN_ON in the iommu probe path. Otherwise, on some platforms below kernel splat will be displayed and the iommu probe results in failure. WARNING: CPU: 3 PID: 166 at drivers/iommu/intel/iommu.c:158 intel_iommu_probe_device+0x319/0xd90 Call Trace: ? __warn+0x7e/0x180 ? intel_iommu_probe_device+0x319/0xd90 ? report_bug+0x1f8/0x200 ? handle_bug+0x3c/0x70 ? exc_invalid_op+0x18/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? intel_iommu_probe_device+0x319/0xd90 ? debug_mutex_init+0x37/0x50 __iommu_probe_device+0xf2/0x4f0 iommu_probe_device+0x22/0x70 iommu_bus_notifier+0x1e/0x40 notifier_call_chain+0x46/0x150 blocking_notifier_call_chain+0x42/0x60 bus_notify+0x2f/0x50 device_add+0x5ed/0x7e0 platform_device_add+0xf5/0x240 mfd_add_devices+0x3f9/0x500 ? preempt_count_add+0x4c/0xa0 ? up_write+0xa2/0x1b0 ? __debugfs_create_file+0xe3/0x150 intel_lpss_probe+0x49f/0x5b0 ? pci_conf1_write+0xa3/0xf0 intel_lpss_pci_probe+0xcf/0x110 [intel_lpss_pci] pci_device_probe+0x95/0x120 really_probe+0xd9/0x370 ? __pfx___driver_attach+0x10/0x10 __driver_probe_device+0x73/0x150 driver_probe_device+0x19/0xa0 __driver_attach+0xb6/0x180 ? __pfx___driver_attach+0x10/0x10 bus_for_each_dev+0x77/0xd0 bus_add_driver+0x114/0x210 driver_register+0x5b/0x110 ? __pfx_intel_lpss_pci_driver_init+0x10/0x10 [intel_lpss_pci] do_one_initcall+0x57/0x2b0 ? kmalloc_trace+0x21e/0x280 ? do_init_module+0x1e/0x210 do_init_module+0x5f/0x210 load_module+0x1d37/0x1fc0 ? init_module_from_file+0x86/0xd0 init_module_from_file+0x86/0xd0 idempotent_init_module+0x17c/0x230 __x64_sys_finit_module+0x56/0xb0 do_syscall_64+0x6e/0x140 entry_SYSCALL_64_after_hwframe+0x71/0x79 Fixes: 1a75cc710b95 ("iommu/vt-d: Use rbtree to track iommu probed devices") Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10689 Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20240407011429.136282-1-baolu.lu@linux.intel.com Reviewed-by: Kevin Tian Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 50eb9aed47cc5..a7ecd90303dc4 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4299,9 +4299,11 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) } dev_iommu_priv_set(dev, info); - ret = device_rbtree_insert(iommu, info); - if (ret) - goto free; + if (pdev && pci_ats_supported(pdev)) { + ret = device_rbtree_insert(iommu, info); + if (ret) + goto free; + } if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { ret = intel_pasid_alloc_table(dev); @@ -4336,7 +4338,8 @@ static void intel_iommu_release_device(struct device *dev) struct intel_iommu *iommu = info->iommu; mutex_lock(&iommu->iopf_lock); - device_rbtree_remove(info); + if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) + device_rbtree_remove(info); mutex_unlock(&iommu->iopf_lock); if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && -- GitLab From b8246a2ad80a810cafbeddb30525278f9d64bca3 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 10 Apr 2024 10:16:43 +0000 Subject: [PATCH 823/989] iommu/amd: Change log message severity Use consistent log severity (pr_warn) to log all messages in SNP enable path. Suggested-by: Tom Lendacky Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20240410101643.32309-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index a714eb08c08b5..ac6754a85f350 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3235,7 +3235,7 @@ static void iommu_snp_enable(void) * configured with V1 page table (DTE[Mode] = 0 is not supported). */ if (no_iommu || iommu_default_passthrough()) { - pr_err("SNP: IOMMU disabled or configured in passthrough mode, SNP cannot be supported.\n"); + pr_warn("SNP: IOMMU disabled or configured in passthrough mode, SNP cannot be supported.\n"); goto disable_snp; } @@ -3246,7 +3246,7 @@ static void iommu_snp_enable(void) amd_iommu_snp_en = check_feature(FEATURE_SNP); if (!amd_iommu_snp_en) { - pr_err("SNP: IOMMU SNP feature not enabled, SNP cannot be supported.\n"); + pr_warn("SNP: IOMMU SNP feature not enabled, SNP cannot be supported.\n"); goto disable_snp; } -- GitLab From e4a6bceac98eba3c00e874892736b34ea5fdaca3 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Apr 2024 16:26:28 -0700 Subject: [PATCH 824/989] selftests: timers: Fix posix_timers ksft_print_msg() warning After commit 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") the following warning occurs when building with an older gcc: posix_timers.c:250:2: warning: format not a string literal and no format arguments [-Wformat-security] 250 | ksft_print_msg(errmsg); | ^~~~~~~~~~~~~~ Fix this up by changing it to ksft_print_msg("%s", errmsg) Fixes: 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Acked-by: Justin Stitt Acked-by: Shuah Khan Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240410232637.4135564-1-jstultz@google.com --- tools/testing/selftests/timers/posix_timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index d86a0e00711e4..348f47176e0a3 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -247,7 +247,7 @@ static int check_timer_distribution(void) ksft_test_result_skip("check signal distribution (old kernel)\n"); return 0; err: - ksft_print_msg(errmsg); + ksft_print_msg("%s", errmsg); return -1; } -- GitLab From f7d5bcd35d427daac7e206b1073ca14f5db85c27 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 11 Apr 2024 11:45:40 -0700 Subject: [PATCH 825/989] selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn After commit 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()"), clang warns: tools/testing/selftests/timers/../kselftest.h:398:6: warning: variable 'major' is used uninitialized whenever '||' condition is true [-Wsometimes-uninitialized] 398 | if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2) | ^~~~~~~~~~~~ tools/testing/selftests/timers/../kselftest.h:401:9: note: uninitialized use occurs here 401 | return major > min_major || (major == min_major && minor >= min_minor); | ^~~~~ tools/testing/selftests/timers/../kselftest.h:398:6: note: remove the '||' if its condition is always false 398 | if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2) | ^~~~~~~~~~~~~~~ tools/testing/selftests/timers/../kselftest.h:395:20: note: initialize the variable 'major' to silence this warning 395 | unsigned int major, minor; | ^ | = 0 This is a false positive because if uname() fails, ksft_exit_fail_msg() will be called, which unconditionally calls exit(), a noreturn function. However, clang does not know that ksft_exit_fail_msg() will call exit() at the point in the pipeline that the warning is emitted because inlining has not occurred, so it assumes control flow will resume normally after ksft_exit_fail_msg() is called. Make it clear to clang that all of the functions that call exit() unconditionally in kselftest.h are noreturn transitively by marking them explicitly with '__attribute__((__noreturn__))', which clears up the warning above and any future warnings that may appear for the same reason. Fixes: 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") Reported-by: John Stultz Signed-off-by: Nathan Chancellor Signed-off-by: Thomas Gleixner Acked-by: Shuah Khan Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240411-mark-kselftest-exit-funcs-noreturn-v1-1-b027c948f586@kernel.org Closes: https://lore.kernel.org/all/20240410232637.4135564-2-jstultz@google.com/ --- tools/testing/selftests/kselftest.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 973b18e156b28..0591974b57e06 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -80,6 +80,9 @@ #define KSFT_XPASS 3 #define KSFT_SKIP 4 +#ifndef __noreturn +#define __noreturn __attribute__((__noreturn__)) +#endif #define __printf(a, b) __attribute__((format(printf, a, b))) /* counters */ @@ -300,13 +303,13 @@ void ksft_test_result_code(int exit_code, const char *test_name, va_end(args); } -static inline int ksft_exit_pass(void) +static inline __noreturn int ksft_exit_pass(void) { ksft_print_cnts(); exit(KSFT_PASS); } -static inline int ksft_exit_fail(void) +static inline __noreturn int ksft_exit_fail(void) { ksft_print_cnts(); exit(KSFT_FAIL); @@ -333,7 +336,7 @@ static inline int ksft_exit_fail(void) ksft_cnt.ksft_xfail + \ ksft_cnt.ksft_xskip) -static inline __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) +static inline __noreturn __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) { int saved_errno = errno; va_list args; @@ -348,19 +351,19 @@ static inline __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) exit(KSFT_FAIL); } -static inline int ksft_exit_xfail(void) +static inline __noreturn int ksft_exit_xfail(void) { ksft_print_cnts(); exit(KSFT_XFAIL); } -static inline int ksft_exit_xpass(void) +static inline __noreturn int ksft_exit_xpass(void) { ksft_print_cnts(); exit(KSFT_XPASS); } -static inline __printf(1, 2) int ksft_exit_skip(const char *msg, ...) +static inline __noreturn __printf(1, 2) int ksft_exit_skip(const char *msg, ...) { int saved_errno = errno; va_list args; -- GitLab From ed366de8ec89d4f960d66c85fc37d9de22f7bf6d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Apr 2024 16:26:30 -0700 Subject: [PATCH 826/989] selftests: timers: Fix abs() warning in posix_timers test Building with clang results in the following warning: posix_timers.c:69:6: warning: absolute value function 'abs' given an argument of type 'long long' but has parameter of type 'int' which may cause truncation of value [-Wabsolute-value] if (abs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { ^ So switch to using llabs() instead. Fixes: 0bc4b0cf1570 ("selftests: add basic posix timers selftests") Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240410232637.4135564-3-jstultz@google.com --- tools/testing/selftests/timers/posix_timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 348f47176e0a3..c001dd79179d5 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -66,7 +66,7 @@ static int check_diff(struct timeval start, struct timeval end) diff = end.tv_usec - start.tv_usec; diff += (end.tv_sec - start.tv_sec) * USECS_PER_SEC; - if (abs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { + if (llabs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { printf("Diff too high: %lld..", diff); return -1; } -- GitLab From 607638faf2ff1cede37458111496e7cc6c977f6f Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Wed, 10 Apr 2024 11:46:18 +0200 Subject: [PATCH 827/989] s390/qdio: handle deferred cc1 A deferred condition code 1 response indicates that I/O was not started and should be retried. The current QDIO implementation handles a cc1 response as I/O error, resulting in a failed QDIO setup. This can happen for example when a path verification request arrives at the same time as QDIO setup I/O is started. Fix this by retrying the QDIO setup I/O when a cc1 response is received. Note that since commit 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") commit 5ef1dc40ffa6 ("s390/cio: fix invalid -EBUSY on ccw_device_start") deferred cc1 responses are much more likely to occur. See the commit message of the latter for more background information. Fixes: 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") Reviewed-by: Alexandra Winter Signed-off-by: Peter Oberparleiter Signed-off-by: Alexander Gordeev --- drivers/s390/cio/qdio_main.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 3d9f0834c78bf..a1cb39f4b7a27 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -722,8 +722,8 @@ static void qdio_handle_activate_check(struct qdio_irq *irq_ptr, lgr_info_log(); } -static void qdio_establish_handle_irq(struct qdio_irq *irq_ptr, int cstat, - int dstat) +static int qdio_establish_handle_irq(struct qdio_irq *irq_ptr, int cstat, + int dstat, int dcc) { DBF_DEV_EVENT(DBF_INFO, irq_ptr, "qest irq"); @@ -731,15 +731,18 @@ static void qdio_establish_handle_irq(struct qdio_irq *irq_ptr, int cstat, goto error; if (dstat & ~(DEV_STAT_DEV_END | DEV_STAT_CHN_END)) goto error; + if (dcc == 1) + return -EAGAIN; if (!(dstat & DEV_STAT_DEV_END)) goto error; qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ESTABLISHED); - return; + return 0; error: DBF_ERROR("%4x EQ:error", irq_ptr->schid.sch_no); DBF_ERROR("ds: %2x cs:%2x", dstat, cstat); qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR); + return -EIO; } /* qdio interrupt handler */ @@ -748,7 +751,7 @@ void qdio_int_handler(struct ccw_device *cdev, unsigned long intparm, { struct qdio_irq *irq_ptr = cdev->private->qdio_data; struct subchannel_id schid; - int cstat, dstat; + int cstat, dstat, rc, dcc; if (!intparm || !irq_ptr) { ccw_device_get_schid(cdev, &schid); @@ -768,10 +771,12 @@ void qdio_int_handler(struct ccw_device *cdev, unsigned long intparm, qdio_irq_check_sense(irq_ptr, irb); cstat = irb->scsw.cmd.cstat; dstat = irb->scsw.cmd.dstat; + dcc = scsw_cmd_is_valid_cc(&irb->scsw) ? irb->scsw.cmd.cc : 0; + rc = 0; switch (irq_ptr->state) { case QDIO_IRQ_STATE_INACTIVE: - qdio_establish_handle_irq(irq_ptr, cstat, dstat); + rc = qdio_establish_handle_irq(irq_ptr, cstat, dstat, dcc); break; case QDIO_IRQ_STATE_CLEANUP: qdio_set_state(irq_ptr, QDIO_IRQ_STATE_INACTIVE); @@ -785,12 +790,25 @@ void qdio_int_handler(struct ccw_device *cdev, unsigned long intparm, if (cstat || dstat) qdio_handle_activate_check(irq_ptr, intparm, cstat, dstat); + else if (dcc == 1) + rc = -EAGAIN; break; case QDIO_IRQ_STATE_STOPPED: break; default: WARN_ON_ONCE(1); } + + if (rc == -EAGAIN) { + DBF_DEV_EVENT(DBF_INFO, irq_ptr, "qint retry"); + rc = ccw_device_start(cdev, irq_ptr->ccw, intparm, 0, 0); + if (!rc) + return; + DBF_ERROR("%4x RETRY ERR", irq_ptr->schid.sch_no); + DBF_ERROR("rc:%4x", rc); + qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR); + } + wake_up(&cdev->private->wait_q); } -- GitLab From 2d8527f2f911fab84aec04df4788c0c23af3df48 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Wed, 10 Apr 2024 11:46:19 +0200 Subject: [PATCH 828/989] s390/cio: fix race condition during online processing A race condition exists in ccw_device_set_online() that can cause the online process to fail, leaving the affected device in an inconsistent state. As a result, subsequent attempts to set that device online fail with return code ENODEV. The problem occurs when a path verification request arrives after a wait for final device state completed, but before the result state is evaluated. Fix this by ensuring that the CCW-device lock is held between determining final state and checking result state. Note that since: commit 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") path verification requests are much more likely to occur during boot, resulting in an increased chance of this race condition occurring. Fixes: 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") Reviewed-by: Alexandra Winter Reviewed-by: Vineeth Vijayan Signed-off-by: Peter Oberparleiter Signed-off-by: Alexander Gordeev --- drivers/s390/cio/device.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index f95d12345d98a..920f550bc313b 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -363,10 +363,8 @@ int ccw_device_set_online(struct ccw_device *cdev) spin_lock_irq(cdev->ccwlock); ret = ccw_device_online(cdev); - spin_unlock_irq(cdev->ccwlock); - if (ret == 0) - wait_event(cdev->private->wait_q, dev_fsm_final_state(cdev)); - else { + if (ret) { + spin_unlock_irq(cdev->ccwlock); CIO_MSG_EVENT(0, "ccw_device_online returned %d, " "device 0.%x.%04x\n", ret, cdev->private->dev_id.ssid, @@ -375,7 +373,12 @@ int ccw_device_set_online(struct ccw_device *cdev) put_device(&cdev->dev); return ret; } - spin_lock_irq(cdev->ccwlock); + /* Wait until a final state is reached */ + while (!dev_fsm_final_state(cdev)) { + spin_unlock_irq(cdev->ccwlock); + wait_event(cdev->private->wait_q, dev_fsm_final_state(cdev)); + spin_lock_irq(cdev->ccwlock); + } /* Check if online processing was successful */ if ((cdev->private->state != DEV_STATE_ONLINE) && (cdev->private->state != DEV_STATE_W4SENSE)) { -- GitLab From 6f76592ef63a1ffd8949f0828d24da7913ddb6d8 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Wed, 10 Apr 2024 11:46:20 +0200 Subject: [PATCH 829/989] s390/cio: log fake IRB events Add traces when queueing and delivering fake IRBs. These are significant events that might have an impact on device driver processing and are therefore relevant for problem analysis. Reviewed-by: Vineeth Vijayan Signed-off-by: Peter Oberparleiter Signed-off-by: Alexander Gordeev --- drivers/s390/cio/device_fsm.c | 5 +++++ drivers/s390/cio/device_ops.c | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index 65d8b2cfd6262..42791fa0b80e2 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -504,6 +504,11 @@ callback: ccw_device_done(cdev, DEV_STATE_ONLINE); /* Deliver fake irb to device driver, if needed. */ if (cdev->private->flags.fake_irb) { + CIO_MSG_EVENT(2, "fakeirb: deliver device 0.%x.%04x intparm %lx type=%d\n", + cdev->private->dev_id.ssid, + cdev->private->dev_id.devno, + cdev->private->intparm, + cdev->private->flags.fake_irb); create_fake_irb(&cdev->private->dma_area->irb, cdev->private->flags.fake_irb); cdev->private->flags.fake_irb = 0; diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c index 40c97f8730751..acd6790dba4dd 100644 --- a/drivers/s390/cio/device_ops.c +++ b/drivers/s390/cio/device_ops.c @@ -208,6 +208,10 @@ int ccw_device_start_timeout_key(struct ccw_device *cdev, struct ccw1 *cpa, if (!cdev->private->flags.fake_irb) { cdev->private->flags.fake_irb = FAKE_CMD_IRB; cdev->private->intparm = intparm; + CIO_MSG_EVENT(2, "fakeirb: queue device 0.%x.%04x intparm %lx type=%d\n", + cdev->private->dev_id.ssid, + cdev->private->dev_id.devno, intparm, + cdev->private->flags.fake_irb); return 0; } else /* There's already a fake I/O around. */ @@ -551,6 +555,10 @@ int ccw_device_tm_start_timeout_key(struct ccw_device *cdev, struct tcw *tcw, if (!cdev->private->flags.fake_irb) { cdev->private->flags.fake_irb = FAKE_TM_IRB; cdev->private->intparm = intparm; + CIO_MSG_EVENT(2, "fakeirb: queue device 0.%x.%04x intparm %lx type=%d\n", + cdev->private->dev_id.ssid, + cdev->private->dev_id.devno, intparm, + cdev->private->flags.fake_irb); return 0; } else /* There's already a fake I/O around. */ -- GitLab From 3ec4848913d695245716ea45ca4872d9dff097a5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 11 Apr 2024 11:23:48 +0800 Subject: [PATCH 830/989] block: fix that blk_time_get_ns() doesn't update time after schedule While monitoring the throttle time of IO from iocost, it's found that such time is always zero after the io_schedule() from ioc_rqos_throttle, for example, with the following debug patch: + printk("%s-%d: %s enter %llu\n", current->comm, current->pid, __func__, blk_time_get_ns()); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); if (wait.committed) break; io_schedule(); } + printk("%s-%d: %s exit %llu\n", current->comm, current->pid, __func__, blk_time_get_ns()); It can be observerd that blk_time_get_ns() always return the same time: [ 1068.096579] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.272587] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.274389] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.472690] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.474485] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.672656] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.674451] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.872655] fio-1268: ioc_rqos_throttle exit 1067901962288 And I think the root cause is that 'PF_BLOCK_TS' is always cleared by blk_flush_plug() before scheduel(), hence blk_plug_invalidate_ts() will never be called: blk_time_get_ns plug->cur_ktime = ktime_get_ns(); current->flags |= PF_BLOCK_TS; io_schedule: io_schedule_prepare blk_flush_plug __blk_flush_plug /* the flag is cleared, while time is not */ current->flags &= ~PF_BLOCK_TS; schedule sched_update_worker /* the flag is not set, hence plug->cur_ktime is not cleared */ if (tsk->flags & PF_BLOCK_TS) blk_plug_invalidate_ts() blk_time_get_ns /* got the time stashed before schedule */ return plug->cur_ktime; Fix the problem by clearing cached time in __blk_flush_plug(). Fixes: 06b23f92af87 ("block: update cached timestamp post schedule/preemption") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240411032349.3051233-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-core.c b/block/blk-core.c index 3a6f5603fb44b..b795ac177281a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1197,6 +1197,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); + plug->cur_ktime = 0; current->flags &= ~PF_BLOCK_TS; } -- GitLab From 16767502aa990cca2cb7d1372b31d328c4c85b40 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 12 Apr 2024 14:35:36 +0200 Subject: [PATCH 831/989] selftests: kselftest: Fix build failure with NOLIBC As Mark explains ksft_min_kernel_version() can't be compiled with nolibc, it doesn't implement uname(). Fixes: 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") Reported-by: Mark Brown Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240412123536.GA32444@redhat.com Closes: https://lore.kernel.org/all/f0523b3a-ea08-4615-b0fb-5b504a2d39df@sirena.org.uk/ --- tools/testing/selftests/kselftest.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 0591974b57e06..7b89362da4bfd 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -395,6 +395,10 @@ static inline __noreturn __printf(1, 2) int ksft_exit_skip(const char *msg, ...) static inline int ksft_min_kernel_version(unsigned int min_major, unsigned int min_minor) { +#ifdef NOLIBC + ksft_print_msg("NOLIBC: Can't check kernel version: Function not implemented\n"); + return 0; +#else unsigned int major, minor; struct utsname info; @@ -402,6 +406,7 @@ static inline int ksft_min_kernel_version(unsigned int min_major, ksft_exit_fail_msg("Can't parse kernel version\n"); return major > min_major || (major == min_major && minor >= min_minor); +#endif } #endif /* __KSELFTEST_H */ -- GitLab From 46dad3c1e57897ab9228332f03e1c14798d2d3b9 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Fri, 12 Apr 2024 16:17:32 +0800 Subject: [PATCH 832/989] init/main.c: Fix potential static_command_line memory overflow We allocate memory of size 'xlen + strlen(boot_command_line) + 1' for static_command_line, but the strings copied into static_command_line are extra_command_line and command_line, rather than extra_command_line and boot_command_line. When strlen(command_line) > strlen(boot_command_line), static_command_line will overflow. This patch just recovers strlen(command_line) which was miss-consolidated with strlen(boot_command_line) in the commit f5c7310ac73e ("init/main: add checks for the return value of memblock_alloc*()") Link: https://lore.kernel.org/all/20240412081733.35925-2-ytcoode@gmail.com/ Fixes: f5c7310ac73e ("init/main: add checks for the return value of memblock_alloc*()") Cc: stable@vger.kernel.org Signed-off-by: Yuntao Wang Signed-off-by: Masami Hiramatsu (Google) --- init/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/init/main.c b/init/main.c index 881f6230ee59e..5dcf5274c09c7 100644 --- a/init/main.c +++ b/init/main.c @@ -636,6 +636,8 @@ static void __init setup_command_line(char *command_line) if (!saved_command_line) panic("%s: Failed to allocate %zu bytes\n", __func__, len + ilen); + len = xlen + strlen(command_line) + 1; + static_command_line = memblock_alloc(len, SMP_CACHE_BYTES); if (!static_command_line) panic("%s: Failed to allocate %zu bytes\n", __func__, len); -- GitLab From d5cf50dafc9dd5faa1e61e7021e3496ddf7fd61e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 12 Apr 2024 10:05:10 -0700 Subject: [PATCH 833/989] Kconfig: add some hidden tabs on purpose Commit d96c36004e31 ("tracing: Fix FTRACE_RECORD_RECURSION_SIZE Kconfig entry") removed a hidden tab because it apparently showed breakage in some third-party kernel config parsing tool. It wasn't clear what tool it was, but let's make sure it gets fixed. Because if you can't parse tabs as whitespace, you should not be parsing the kernel Kconfig files. In fact, let's make such breakage more obvious than some esoteric ftrace record size option. If you can't parse tabs, you can't have page sizes. Yes, tab-vs-space confusion is sadly a traditional Unix thing, and 'make' is famous for being broken in this regard. But no, that does not mean that it's ok. I'd add more random tabs to our Kconfig files, but I don't want to make things uglier than necessary. But it *might* bbe necessary if it turns out we see more of this kind of silly tooling. Fixes: d96c36004e31 ("tracing: Fix FTRACE_RECORD_RECURSION_SIZE Kconfig entry") Link: https://lore.kernel.org/lkml/CAHk-=wj-hLLN_t_m5OL4dXLaxvXKy_axuoJYXif7iczbfgAevQ@mail.gmail.com/ Signed-off-by: Linus Torvalds --- arch/Kconfig | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 9f066785bb71d..65afb1de48b36 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1172,12 +1172,12 @@ config PAGE_SIZE_LESS_THAN_256KB config PAGE_SHIFT int - default 12 if PAGE_SIZE_4KB - default 13 if PAGE_SIZE_8KB - default 14 if PAGE_SIZE_16KB - default 15 if PAGE_SIZE_32KB - default 16 if PAGE_SIZE_64KB - default 18 if PAGE_SIZE_256KB + default 12 if PAGE_SIZE_4KB + default 13 if PAGE_SIZE_8KB + default 14 if PAGE_SIZE_16KB + default 15 if PAGE_SIZE_32KB + default 16 if PAGE_SIZE_64KB + default 18 if PAGE_SIZE_256KB # This allows to use a set of generic functions to determine mmap base # address by giving priority to top-down scheme only if the process -- GitLab From 11baa36d317321f5d54059f07d243c5a1dbbfbb2 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 10 Apr 2024 19:03:05 +0200 Subject: [PATCH 834/989] gpio: lpc32xx: fix module autoloading Add MODULE_DEVICE_TABLE(), so the module could be properly autoloaded based on the alias from of_device_id table. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-lpc32xx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-lpc32xx.c b/drivers/gpio/gpio-lpc32xx.c index 5ef8af8249806..c097e310c9e84 100644 --- a/drivers/gpio/gpio-lpc32xx.c +++ b/drivers/gpio/gpio-lpc32xx.c @@ -529,6 +529,7 @@ static const struct of_device_id lpc32xx_gpio_of_match[] = { { .compatible = "nxp,lpc3220-gpio", }, { }, }; +MODULE_DEVICE_TABLE(of, lpc32xx_gpio_of_match); static struct platform_driver lpc32xx_gpio_driver = { .driver = { -- GitLab From 79336504781e7fee5ddaf046dcc186c8dfdf60b1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 12 Apr 2024 08:41:15 +0900 Subject: [PATCH 835/989] ata: libata-scsi: Fix ata_scsi_dev_rescan() error path Commit 0c76106cb975 ("scsi: sd: Fix TCG OPAL unlock on system resume") incorrectly handles failures of scsi_resume_device() in ata_scsi_dev_rescan(), leading to a double call to spin_unlock_irqrestore() to unlock a device port. Fix this by redefining the goto labels used in case of errors and only unlock the port scsi_scan_mutex when scsi_resume_device() fails. Bug found with the Smatch static checker warning: drivers/ata/libata-scsi.c:4774 ata_scsi_dev_rescan() error: double unlocked 'ap->lock' (orig line 4757) Reported-by: Dan Carpenter Fixes: 0c76106cb975 ("scsi: sd: Fix TCG OPAL unlock on system resume") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel --- drivers/ata/libata-scsi.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 2f4c588376410..e954976891a9f 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -4745,7 +4745,7 @@ void ata_scsi_dev_rescan(struct work_struct *work) * bail out. */ if (ap->pflags & ATA_PFLAG_SUSPENDED) - goto unlock; + goto unlock_ap; if (!sdev) continue; @@ -4758,7 +4758,7 @@ void ata_scsi_dev_rescan(struct work_struct *work) if (do_resume) { ret = scsi_resume_device(sdev); if (ret == -EWOULDBLOCK) - goto unlock; + goto unlock_scan; dev->flags &= ~ATA_DFLAG_RESUMING; } ret = scsi_rescan_device(sdev); @@ -4766,12 +4766,13 @@ void ata_scsi_dev_rescan(struct work_struct *work) spin_lock_irqsave(ap->lock, flags); if (ret) - goto unlock; + goto unlock_ap; } } -unlock: +unlock_ap: spin_unlock_irqrestore(ap->lock, flags); +unlock_scan: mutex_unlock(&ap->scsi_scan_mutex); /* Reschedule with a delay if scsi_rescan_device() returned an error */ -- GitLab From c0297e7dd50795d559f3534887a6de1756b35d0f Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Thu, 11 Apr 2024 20:12:24 +0000 Subject: [PATCH 836/989] ata: libata-core: Allow command duration limits detection for ACS-4 drives Even though the command duration limits (CDL) feature was first added in ACS-5 (major version 12), there are some ACS-4 (major version 11) drives that implement CDL as well. IDENTIFY_DEVICE, SUPPORTED_CAPABILITIES, and CURRENT_SETTINGS log pages are mandatory in the ACS-4 standard so it should be safe to read these log pages on older drives implementing the ACS-4 standard. Fixes: 62e4a60e0cdb ("scsi: ata: libata: Detect support for command duration limits") Cc: stable@vger.kernel.org Signed-off-by: Igor Pylypiv Signed-off-by: Damien Le Moal --- drivers/ata/libata-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index be3412cdb22e7..c449d60d9bb96 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2539,7 +2539,7 @@ static void ata_dev_config_cdl(struct ata_device *dev) bool cdl_enabled; u64 val; - if (ata_id_major_version(dev->id) < 12) + if (ata_id_major_version(dev->id) < 11) goto not_supported; if (!ata_log_supported(dev, ATA_LOG_IDENTIFY_DEVICE) || -- GitLab From 283454c8a123072e5c386a5a2b5fc576aa455b6f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 10 Apr 2024 10:10:15 -0700 Subject: [PATCH 837/989] af_unix: Call manage_oob() for every skb in unix_stream_read_generic(). When we call recv() for AF_UNIX socket, we first peek one skb and calls manage_oob() to check if the skb is sent with MSG_OOB. However, when we fetch the next (and the following) skb, manage_oob() is not called now, leading a wrong behaviour. Let's say a socket send()s "hello" with MSG_OOB and the peer tries to recv() 5 bytes with MSG_PEEK. Here, we should get only "hell" without 'o', but actually not: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'hello', MSG_OOB) 5 >>> c2.recv(5, MSG_PEEK) b'hello' The first skb fills 4 bytes, and the next skb is peeked but not properly checked by manage_oob(). Let's move up the again label to call manage_oob() for evry skb. With this patch: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'hello', MSG_OOB) 5 >>> c2.recv(5, MSG_PEEK) b'hell' Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240410171016.7621-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d032eb5fa6df1..f297320438bfd 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2741,6 +2741,7 @@ redo: last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0; +again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); @@ -2752,7 +2753,6 @@ redo: } } #endif -again: if (skb == NULL) { if (copied >= target) goto unlock; -- GitLab From 22dd70eb2c3d754862964377a75abafd3167346b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 10 Apr 2024 10:10:16 -0700 Subject: [PATCH 838/989] af_unix: Don't peek OOB data without MSG_OOB. Currently, we can read OOB data without MSG_OOB by using MSG_PEEK when OOB data is sitting on the front row, which is apparently wrong. >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'a', MSG_OOB) 1 >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) b'a' If manage_oob() is called when no data has been copied, we only check if the socket enables SO_OOBINLINE or MSG_PEEK is not used. Otherwise, the skb is returned as is. However, here we should return NULL if MSG_PEEK is set and no data has been copied. Also, in such a case, we should not jump to the redo label because we will be caught in the loop and hog the CPU until normal data comes in. Then, we need to handle skb == NULL case with the if-clause below the manage_oob() block. With this patch: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'a', MSG_OOB) 1 >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) Traceback (most recent call last): File "", line 1, in BlockingIOError: [Errno 11] Resource temporarily unavailable Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240410171016.7621-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f297320438bfd..9a6ad5974dff5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2663,7 +2663,9 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, WRITE_ONCE(u->oob_skb, NULL); consume_skb(skb); } - } else if (!(flags & MSG_PEEK)) { + } else if (flags & MSG_PEEK) { + skb = NULL; + } else { skb_unlink(skb, &sk->sk_receive_queue); WRITE_ONCE(u->oob_skb, NULL); if (!WARN_ON_ONCE(skb_unref(skb))) @@ -2745,11 +2747,9 @@ again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); - if (!skb) { + if (!skb && copied) { unix_state_unlock(sk); - if (copied) - break; - goto redo; + break; } } #endif -- GitLab From 68aba00483c7c4102429bcdfdece7289a8ab5c8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Thu, 11 Apr 2024 11:13:18 +0000 Subject: [PATCH 839/989] net: sparx5: flower: fix fragment flags handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I noticed that only 3 out of the 4 input bits were used, mt.key->flags & FLOW_DIS_IS_FRAGMENT was never checked. In order to avoid a complicated maze, I converted it to use a 16 byte mapping table. As shown in the table below the old heuristics doesn't always do the right thing, ie. when FLOW_DIS_IS_FRAGMENT=1/1 then it used to only match follow-up fragment packets. Here are all the combinations, and their resulting new/old VCAP key/mask filter: /- FLOW_DIS_IS_FRAGMENT (key/mask) | /- FLOW_DIS_FIRST_FRAG (key/mask) | | /-- new VCAP fragment (key/mask) v v v v- old VCAP fragment (key/mask) 0/0 0/0 -/- -/- impossible (due to entry cond. on mask) 0/0 0/1 -/- 0/3 !! invalid (can't match non-fragment + follow-up frag) 0/0 1/0 -/- -/- impossible (key > mask) 0/0 1/1 1/3 1/3 first fragment 0/1 0/0 0/3 3/3 !! not fragmented 0/1 0/1 0/3 3/3 !! not fragmented (+ not first fragment) 0/1 1/0 -/- -/- impossible (key > mask) 0/1 1/1 -/- 1/3 !! invalid (non-fragment and first frag) 1/0 0/0 -/- -/- impossible (key > mask) 1/0 0/1 -/- -/- impossible (key > mask) 1/0 1/0 -/- -/- impossible (key > mask) 1/0 1/1 -/- -/- impossible (key > mask) 1/1 0/0 1/1 3/3 !! some fragment 1/1 0/1 3/3 3/3 follow-up fragment 1/1 1/0 -/- -/- impossible (key > mask) 1/1 1/1 1/3 1/3 first fragment In the datasheet the VCAP fragment values are documented as: 0 = no fragment 1 = initial fragment 2 = suspicious fragment 3 = valid follow-up fragment Result: 3 combinations match the old behavior, 3 combinations have been corrected, 2 combinations are now invalid, and fail, 8 combinations are impossible. It should now be aligned with how FLOW_DIS_IS_FRAGMENT and FLOW_DIS_FIRST_FRAG is set in __skb_flow_dissect() in net/core/flow_dissector.c Since the VCAP fragment values are not a bitfield, we have to ignore the suspicious fragment value, eg. when matching on any kind of fragment with FLOW_DIS_IS_FRAGMENT=1/1. Only compile tested, and logic tested in userspace, as I unfortunately don't have access to this switch chip (yet). Fixes: d6c2964db3fe ("net: microchip: sparx5: Adding more tc flower keys for the IS2 VCAP") Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Steen Hegelund Tested-by: Daniel Machon Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20240411111321.114095-1-ast@fiberby.net Signed-off-by: Jakub Kicinski --- .../microchip/sparx5/sparx5_tc_flower.c | 61 ++++++++++++------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c b/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c index 523e0c470894f..55f255a3c9db6 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c @@ -36,6 +36,27 @@ struct sparx5_tc_flower_template { u16 l3_proto; /* protocol specified in the template */ }; +/* SparX-5 VCAP fragment types: + * 0 = no fragment, 1 = initial fragment, + * 2 = suspicious fragment, 3 = valid follow-up fragment + */ +enum { /* key / mask */ + FRAG_NOT = 0x03, /* 0 / 3 */ + FRAG_SOME = 0x11, /* 1 / 1 */ + FRAG_FIRST = 0x13, /* 1 / 3 */ + FRAG_LATER = 0x33, /* 3 / 3 */ + FRAG_INVAL = 0xff, /* invalid */ +}; + +/* Flower fragment flag to VCAP fragment type mapping */ +static const u8 sparx5_vcap_frag_map[4][4] = { /* is_frag */ + { FRAG_INVAL, FRAG_INVAL, FRAG_INVAL, FRAG_FIRST }, /* 0/0 */ + { FRAG_NOT, FRAG_NOT, FRAG_INVAL, FRAG_INVAL }, /* 0/1 */ + { FRAG_INVAL, FRAG_INVAL, FRAG_INVAL, FRAG_INVAL }, /* 1/0 */ + { FRAG_SOME, FRAG_LATER, FRAG_INVAL, FRAG_FIRST } /* 1/1 */ + /* 0/0 0/1 1/0 1/1 <-- first_frag */ +}; + static int sparx5_tc_flower_es0_tpid(struct vcap_tc_flower_parse_usage *st) { @@ -145,29 +166,27 @@ sparx5_tc_flower_handler_control_usage(struct vcap_tc_flower_parse_usage *st) flow_rule_match_control(st->frule, &mt); if (mt.mask->flags) { - if (mt.mask->flags & FLOW_DIS_FIRST_FRAG) { - if (mt.key->flags & FLOW_DIS_FIRST_FRAG) { - value = 1; /* initial fragment */ - mask = 0x3; - } else { - if (mt.mask->flags & FLOW_DIS_IS_FRAGMENT) { - value = 3; /* follow up fragment */ - mask = 0x3; - } else { - value = 0; /* no fragment */ - mask = 0x3; - } - } - } else { - if (mt.mask->flags & FLOW_DIS_IS_FRAGMENT) { - value = 3; /* follow up fragment */ - mask = 0x3; - } else { - value = 0; /* no fragment */ - mask = 0x3; - } + u8 is_frag_key = !!(mt.key->flags & FLOW_DIS_IS_FRAGMENT); + u8 is_frag_mask = !!(mt.mask->flags & FLOW_DIS_IS_FRAGMENT); + u8 is_frag_idx = (is_frag_key << 1) | is_frag_mask; + + u8 first_frag_key = !!(mt.key->flags & FLOW_DIS_FIRST_FRAG); + u8 first_frag_mask = !!(mt.mask->flags & FLOW_DIS_FIRST_FRAG); + u8 first_frag_idx = (first_frag_key << 1) | first_frag_mask; + + /* Lookup verdict based on the 2 + 2 input bits */ + u8 vdt = sparx5_vcap_frag_map[is_frag_idx][first_frag_idx]; + + if (vdt == FRAG_INVAL) { + NL_SET_ERR_MSG_MOD(st->fco->common.extack, + "Match on invalid fragment flag combination"); + return -EINVAL; } + /* Extract VCAP fragment key and mask from verdict */ + value = (vdt >> 4) & 0x3; + mask = vdt & 0x3; + err = vcap_rule_add_key_u32(st->vrule, VCAP_KF_L3_FRAGMENT_TYPE, value, mask); -- GitLab From 37cc10da3a50e6d0cb9808a90b7da9b4868794dd Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 11 Apr 2024 14:54:39 +0300 Subject: [PATCH 840/989] net/mlx5: Lag, restore buckets number to default after hash LAG deactivation The cited patch introduces the concept of buckets in LAG in hash mode. However, the patch doesn't clear the number of buckets in the LAG deactivation. This results in using the wrong number of buckets in case user create a hash mode LAG and afterwards create a non-hash mode LAG. Hence, restore buckets number to default after hash mode LAG deactivation. Fixes: 352899f384d4 ("net/mlx5: Lag, use buckets in hash mode") Signed-off-by: Shay Drory Reviewed-by: Maor Gottlieb Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index d14459e5c04fc..69d482f7c5a29 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -703,8 +703,10 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev) return err; } - if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) + if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) { mlx5_lag_port_sel_destroy(ldev); + ldev->buckets = 1; + } if (mlx5_lag_has_drop_rule(ldev)) mlx5_lag_drop_rule_cleanup(ldev); -- GitLab From aa4ac90d04f4371466000825adb44935ecb5c974 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 11 Apr 2024 14:54:40 +0300 Subject: [PATCH 841/989] net/mlx5: SD, Handle possible devcom ERR_PTR Check if devcom holds an error pointer and return immediately. This fixes Smatch static checker warning: drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c:221 sd_register() error: 'devcom' dereferencing possible ERR_PTR() Enhance mlx5_devcom_register_component() so it stops returning NULL, making it easier for its callers. Fixes: d3d057666090 ("net/mlx5: SD, Implement devcom communication and primary election") Reported-by: Dan Carpenter Link: https://lore.kernel.org/all/f09666c8-e604-41f6-958b-4cc55c73faf9@gmail.com/T/ Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Link: https://lore.kernel.org/r/20240411115444.374475-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b375ef268671a..319930c04093b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -209,8 +209,8 @@ static int mlx5e_devcom_init_mpv(struct mlx5e_priv *priv, u64 *data) *data, mlx5e_devcom_event_mpv, priv); - if (IS_ERR_OR_NULL(priv->devcom)) - return -EOPNOTSUPP; + if (IS_ERR(priv->devcom)) + return PTR_ERR(priv->devcom); if (mlx5_core_is_mp_master(priv->mdev)) { mlx5_devcom_send_event(priv->devcom, MPV_DEVCOM_MASTER_UP, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 1f60954c12f72..844d3e3a65ddf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3060,7 +3060,7 @@ void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw, u64 key) key, mlx5_esw_offloads_devcom_event, esw); - if (IS_ERR_OR_NULL(esw->devcom)) + if (IS_ERR(esw->devcom)) return; mlx5_devcom_send_event(esw->devcom, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c index e7d59cfa8708e..7b0766c89f4cf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c @@ -220,7 +220,7 @@ mlx5_devcom_register_component(struct mlx5_devcom_dev *devc, struct mlx5_devcom_comp *comp; if (IS_ERR_OR_NULL(devc)) - return NULL; + return ERR_PTR(-EINVAL); mutex_lock(&comp_list_lock); comp = devcom_component_get(devc, id, key, handler); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index 5b28084e8a03c..dd5d186dc6148 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -213,8 +213,8 @@ static int sd_register(struct mlx5_core_dev *dev) sd = mlx5_get_sd(dev); devcom = mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_SD_GROUP, sd->group_id, NULL, dev); - if (!devcom) - return -ENOMEM; + if (IS_ERR(devcom)) + return PTR_ERR(devcom); sd->devcom = devcom; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 59806553889e9..a39c4b25ba280 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -956,7 +956,7 @@ static void mlx5_register_hca_devcom_comp(struct mlx5_core_dev *dev) mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_HCA_PORTS, mlx5_query_nic_system_image_guid(dev), NULL, dev); - if (IS_ERR_OR_NULL(dev->priv.hca_devcom_comp)) + if (IS_ERR(dev->priv.hca_devcom_comp)) mlx5_core_err(dev, "Failed to register devcom HCA component\n"); } -- GitLab From bf729988303a27833a86acb561f42b9a3cc12728 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 11 Apr 2024 14:54:41 +0300 Subject: [PATCH 842/989] net/mlx5: Restore mistakenly dropped parts in register devlink flow Code parts from cited commit were mistakenly dropped while rebasing before submission. Add them here. Fixes: c6e77aa9dd82 ("net/mlx5: Register devlink first under devlink lock") Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 5 ++++- drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index a39c4b25ba280..331ce47f51a17 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1699,12 +1699,15 @@ int mlx5_init_one_light(struct mlx5_core_dev *dev) err = mlx5_devlink_params_register(priv_to_devlink(dev)); if (err) { mlx5_core_warn(dev, "mlx5_devlink_param_reg err = %d\n", err); - goto query_hca_caps_err; + goto params_reg_err; } devl_unlock(devlink); return 0; +params_reg_err: + devl_unregister(devlink); + devl_unlock(devlink); query_hca_caps_err: devl_unregister(devlink); devl_unlock(devlink); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c index e3bf8c7e4baa6..7ebe712808275 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -75,7 +75,6 @@ static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxilia goto peer_devlink_set_err; } - devlink_register(devlink); return 0; peer_devlink_set_err: -- GitLab From 6c685bdb9e1af966ec0278dbd4068ec39ae88c2d Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Thu, 11 Apr 2024 14:54:42 +0300 Subject: [PATCH 843/989] net/mlx5e: Use channel mdev reference instead of global mdev instance for coalescing Channels can potentially have independent mdev instances. Do not refer to the global mdev instance in the mlx5e_priv instance for channel FW operations related to coalescing. CQ numbers that would be valid on the channel's mdev instance may not be correctly referenced if using the mlx5e_priv instance. Fixes: 67936e138586 ("net/mlx5e: Let channels be SD-aware") Signed-off-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 8f101181648c6..67a29826bb570 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -589,12 +589,12 @@ static int mlx5e_get_coalesce(struct net_device *netdev, static void mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) { - struct mlx5_core_dev *mdev = priv->mdev; int tc; int i; for (i = 0; i < priv->channels.num; ++i) { struct mlx5e_channel *c = priv->channels.c[i]; + struct mlx5_core_dev *mdev = c->mdev; for (tc = 0; tc < c->num_tc; tc++) { mlx5_core_modify_cq_moderation(mdev, @@ -608,11 +608,11 @@ mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coal static void mlx5e_set_priv_channels_rx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) { - struct mlx5_core_dev *mdev = priv->mdev; int i; for (i = 0; i < priv->channels.num; ++i) { struct mlx5e_channel *c = priv->channels.c[i]; + struct mlx5_core_dev *mdev = c->mdev; mlx5_core_modify_cq_moderation(mdev, &c->rq.cq.mcq, coal->rx_coalesce_usecs, -- GitLab From fdce06bda7e56b2a34c53ea58bad7af2fae96da1 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Apr 2024 14:54:43 +0300 Subject: [PATCH 844/989] net/mlx5e: Acquire RTNL lock before RQs/SQs activation/deactivation netif_queue_set_napi asserts whether RTNL lock is held if the netdev is initialized. Acquire the RTNL lock before activating or deactivating RQs/SQs if the lock has not been held before in the flow. Fixes: f25e7b82635f ("net/mlx5e: link NAPI instances to queues and IRQs") Cc: Joe Damato Signed-off-by: Carolina Jubran Reviewed-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 0ab9db3195302..22918b2ef7f12 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -108,7 +108,10 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) mlx5e_reset_txqsq_cc_pc(sq); sq->stats->recover++; clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); + rtnl_lock(); mlx5e_activate_txqsq(sq); + rtnl_unlock(); + if (sq->channel) mlx5e_trigger_napi_icosq(sq->channel); else @@ -179,12 +182,16 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx) carrier_ok = netif_carrier_ok(netdev); netif_carrier_off(netdev); + rtnl_lock(); mlx5e_deactivate_priv_channels(priv); + rtnl_unlock(); mlx5e_ptp_close(chs->ptp); err = mlx5e_ptp_open(priv, &chs->params, chs->c[0]->lag_port, &chs->ptp); + rtnl_lock(); mlx5e_activate_priv_channels(priv); + rtnl_unlock(); /* return carrier back if needed */ if (carrier_ok) -- GitLab From fef965764cf562f28afb997b626fc7c3cec99693 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Apr 2024 14:54:44 +0300 Subject: [PATCH 845/989] net/mlx5e: Prevent deadlock while disabling aRFS When disabling aRFS under the `priv->state_lock`, any scheduled aRFS works are canceled using the `cancel_work_sync` function, which waits for the work to end if it has already started. However, while waiting for the work handler, the handler will try to acquire the `state_lock` which is already acquired. The worker acquires the lock to delete the rules if the state is down, which is not the worker's responsibility since disabling aRFS deletes the rules. Add an aRFS state variable, which indicates whether the aRFS is enabled and prevent adding rules when the aRFS is disabled. Kernel log: ====================================================== WARNING: possible circular locking dependency detected 6.7.0-rc4_net_next_mlx5_5483eb2 #1 Tainted: G I ------------------------------------------------------ ethtool/386089 is trying to acquire lock: ffff88810f21ce68 ((work_completion)(&rule->arfs_work)){+.+.}-{0:0}, at: __flush_work+0x74/0x4e0 but task is already holding lock: ffff8884a1808cc0 (&priv->state_lock){+.+.}-{3:3}, at: mlx5e_ethtool_set_channels+0x53/0x200 [mlx5_core] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&priv->state_lock){+.+.}-{3:3}: __mutex_lock+0x80/0xc90 arfs_handle_work+0x4b/0x3b0 [mlx5_core] process_one_work+0x1dc/0x4a0 worker_thread+0x1bf/0x3c0 kthread+0xd7/0x100 ret_from_fork+0x2d/0x50 ret_from_fork_asm+0x11/0x20 -> #0 ((work_completion)(&rule->arfs_work)){+.+.}-{0:0}: __lock_acquire+0x17b4/0x2c80 lock_acquire+0xd0/0x2b0 __flush_work+0x7a/0x4e0 __cancel_work_timer+0x131/0x1c0 arfs_del_rules+0x143/0x1e0 [mlx5_core] mlx5e_arfs_disable+0x1b/0x30 [mlx5_core] mlx5e_ethtool_set_channels+0xcb/0x200 [mlx5_core] ethnl_set_channels+0x28f/0x3b0 ethnl_default_set_doit+0xec/0x240 genl_family_rcv_msg_doit+0xd0/0x120 genl_rcv_msg+0x188/0x2c0 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1a1/0x270 netlink_sendmsg+0x214/0x460 __sock_sendmsg+0x38/0x60 __sys_sendto+0x113/0x170 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&priv->state_lock); lock((work_completion)(&rule->arfs_work)); lock(&priv->state_lock); lock((work_completion)(&rule->arfs_work)); *** DEADLOCK *** 3 locks held by ethtool/386089: #0: ffffffff82ea7210 (cb_lock){++++}-{3:3}, at: genl_rcv+0x15/0x40 #1: ffffffff82e94c88 (rtnl_mutex){+.+.}-{3:3}, at: ethnl_default_set_doit+0xd3/0x240 #2: ffff8884a1808cc0 (&priv->state_lock){+.+.}-{3:3}, at: mlx5e_ethtool_set_channels+0x53/0x200 [mlx5_core] stack backtrace: CPU: 15 PID: 386089 Comm: ethtool Tainted: G I 6.7.0-rc4_net_next_mlx5_5483eb2 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x60/0xa0 check_noncircular+0x144/0x160 __lock_acquire+0x17b4/0x2c80 lock_acquire+0xd0/0x2b0 ? __flush_work+0x74/0x4e0 ? save_trace+0x3e/0x360 ? __flush_work+0x74/0x4e0 __flush_work+0x7a/0x4e0 ? __flush_work+0x74/0x4e0 ? __lock_acquire+0xa78/0x2c80 ? lock_acquire+0xd0/0x2b0 ? mark_held_locks+0x49/0x70 __cancel_work_timer+0x131/0x1c0 ? mark_held_locks+0x49/0x70 arfs_del_rules+0x143/0x1e0 [mlx5_core] mlx5e_arfs_disable+0x1b/0x30 [mlx5_core] mlx5e_ethtool_set_channels+0xcb/0x200 [mlx5_core] ethnl_set_channels+0x28f/0x3b0 ethnl_default_set_doit+0xec/0x240 genl_family_rcv_msg_doit+0xd0/0x120 genl_rcv_msg+0x188/0x2c0 ? ethnl_ops_begin+0xb0/0xb0 ? genl_family_rcv_msg_dumpit+0xf0/0xf0 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1a1/0x270 netlink_sendmsg+0x214/0x460 __sock_sendmsg+0x38/0x60 __sys_sendto+0x113/0x170 ? do_user_addr_fault+0x53f/0x8f0 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e Fixes: 45bf454ae884 ("net/mlx5e: Enabling aRFS mechanism") Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_arfs.c | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c index c7f542d0b8f08..93cf23278d93c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -46,6 +46,10 @@ struct arfs_table { struct hlist_head rules_hash[ARFS_HASH_SIZE]; }; +enum { + MLX5E_ARFS_STATE_ENABLED, +}; + enum arfs_type { ARFS_IPV4_TCP, ARFS_IPV6_TCP, @@ -60,6 +64,7 @@ struct mlx5e_arfs_tables { spinlock_t arfs_lock; int last_filter_id; struct workqueue_struct *wq; + unsigned long state; }; struct arfs_tuple { @@ -170,6 +175,8 @@ int mlx5e_arfs_enable(struct mlx5e_flow_steering *fs) return err; } } + set_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state); + return 0; } @@ -455,6 +462,8 @@ static void arfs_del_rules(struct mlx5e_flow_steering *fs) int i; int j; + clear_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state); + spin_lock_bh(&arfs->arfs_lock); mlx5e_for_each_arfs_rule(rule, htmp, arfs->arfs_tables, i, j) { hlist_del_init(&rule->hlist); @@ -627,17 +636,8 @@ static void arfs_handle_work(struct work_struct *work) struct mlx5_flow_handle *rule; arfs = mlx5e_fs_get_arfs(priv->fs); - mutex_lock(&priv->state_lock); - if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { - spin_lock_bh(&arfs->arfs_lock); - hlist_del(&arfs_rule->hlist); - spin_unlock_bh(&arfs->arfs_lock); - - mutex_unlock(&priv->state_lock); - kfree(arfs_rule); - goto out; - } - mutex_unlock(&priv->state_lock); + if (!test_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state)) + return; if (!arfs_rule->rule) { rule = arfs_add_rule(priv, arfs_rule); @@ -753,6 +753,11 @@ int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, return -EPROTONOSUPPORT; spin_lock_bh(&arfs->arfs_lock); + if (!test_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state)) { + spin_unlock_bh(&arfs->arfs_lock); + return -EPERM; + } + arfs_rule = arfs_find_rule(arfs_t, &fk); if (arfs_rule) { if (arfs_rule->rxq == rxq_index || work_busy(&arfs_rule->arfs_work)) { -- GitLab From 58caa786f1c02fd84919fb6db9eaecb22e8f7983 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 17:47:42 -0400 Subject: [PATCH 846/989] bcachefs: Fix UAFs of btree_insert_entry array The btree paths array is now dynamically resizable - and as well the btree_insert_entries array, as it needs to be the same size. The merge path (and interior update path) allocates new btree paths, thus can trigger a resize; thus we need to not retain direct pointers after invoking merge; similarly when running btree node triggers. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index aa9da49707404..9609ad370d71c 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -499,9 +499,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - struct btree_insert_entry *btree_id_start) + unsigned btree_id_start) { - struct btree_insert_entry *i; bool trans_trigger_run; int ret, overwrite; @@ -514,13 +513,13 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, do { trans_trigger_run = false; - for (i = btree_id_start; - i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + for (unsigned i = btree_id_start; + i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; i++) { - if (i->btree_id != btree_id) + if (trans->updates[i].btree_id != btree_id) continue; - ret = run_one_trans_trigger(trans, i, overwrite); + ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); if (ret < 0) return ret; if (ret) @@ -534,8 +533,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - struct btree_insert_entry *btree_id_start = trans->updates; - unsigned btree_id = 0; + unsigned btree_id = 0, btree_id_start = 0; int ret = 0; /* @@ -549,8 +547,8 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) if (btree_id == BTREE_ID_alloc) continue; - while (btree_id_start < trans->updates + trans->nr_updates && - btree_id_start->btree_id < btree_id) + while (btree_id_start < trans->nr_updates && + trans->updates[btree_id_start].btree_id < btree_id) btree_id_start++; ret = run_btree_triggers(trans, btree_id, btree_id_start); @@ -558,11 +556,13 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) return ret; } - trans_for_each_update(trans, i) { + for (unsigned idx = 0; idx < trans->nr_updates; idx++) { + struct btree_insert_entry *i = trans->updates + idx; + if (i->btree_id > BTREE_ID_alloc) break; if (i->btree_id == BTREE_ID_alloc) { - ret = run_btree_triggers(trans, BTREE_ID_alloc, i); + ret = run_btree_triggers(trans, BTREE_ID_alloc, idx); if (ret) return ret; break; @@ -826,7 +826,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags struct bch_fs *c = trans->c; int ret = 0, u64s_delta = 0; - trans_for_each_update(trans, i) { + for (unsigned idx = 0; idx < trans->nr_updates; idx++) { + struct btree_insert_entry *i = trans->updates + idx; if (i->cached) continue; -- GitLab From 031ad9e7dbd18c63e671fc4d98be3082189b8a63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 21:30:43 -0400 Subject: [PATCH 847/989] bcachefs: Check for packed bkeys that are too big add missing validation; fixes assertion pop in bkey unpack Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 6 ++++++ fs/bcachefs/btree_io.c | 15 ++++++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index cf23ff47bed8b..3a45d128f608d 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -314,6 +314,12 @@ static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, return bkey_packed(k) ? format->key_u64s : BKEY_U64s; } +static inline bool bkeyp_u64s_valid(const struct bkey_format *f, + const struct bkey_packed *k) +{ + return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s); +} + static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, const struct bkey_packed *k) { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d7de82ac38935..b1aaf0550644a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -831,7 +831,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); } -static bool __bkey_valid(struct bch_fs *c, struct btree *b, +static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, struct bset *i, struct bkey_packed *k) { if (bkey_p_next(k) > vstruct_last(i)) @@ -840,7 +840,7 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b, if (k->format > KEY_FORMAT_CURRENT) return false; - if (k->u64s < bkeyp_key_u64s(&b->format, k)) + if (!bkeyp_u64s_valid(&b->format, k)) return false; struct printbuf buf = PRINTBUF; @@ -884,11 +884,13 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, "invalid bkey format %u", k->format)) goto drop_this_key; - if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k), + if (btree_err_on(!bkeyp_u64s_valid(&b->format, k), -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, btree_node_bkey_bad_u64s, - "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k))) + "bad k->u64s %u (min %u max %lu)", k->u64s, + bkeyp_key_u64s(&b->format, k), + U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k))) goto drop_this_key; if (!write) @@ -947,13 +949,12 @@ drop_this_key: * do */ - if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { + if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { for (next_good_key = 1; next_good_key < (u64 *) vstruct_last(i) - (u64 *) k; next_good_key++) - if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) + if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) goto got_good_key; - } /* -- GitLab From 87cb0239c87f608fd48fb50f9f53f129dcfd73f4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 23:38:07 -0400 Subject: [PATCH 848/989] bcachefs: btree node scan: handle encrypted nodes Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_node_scan.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 556f76f5c84e1..20f2b37c4474c 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -133,9 +133,19 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, if (le64_to_cpu(bn->magic) != bset_magic(c)) return; + if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { + struct nonce nonce = btree_nonce(&bn->keys, 0); + unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + + bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes); + } + if (btree_id_is_alloc(BTREE_NODE_ID(bn))) return; + if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) + return; + rcu_read_lock(); struct found_btree_node n = { .btree_id = BTREE_NODE_ID(bn), -- GitLab From dc32c118ec6b1032693c489a0aa9e011f0acdb1a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 21:20:27 -0400 Subject: [PATCH 849/989] bcachefs: fix unsafety in bch2_extent_ptr_to_text() Need to check if we have a valid bucket before checking if ptr is stale Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 0e3ca99fbd2de..36d12d2adb81e 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -998,7 +998,9 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_str(out, " cached"); if (ptr->unwritten) prt_str(out, " unwritten"); - if (ca && ptr_stale(ca, ptr)) + if (b >= ca->mi.first_bucket && + b < ca->mi.nbuckets && + ptr_stale(ca, ptr)) prt_printf(out, " stale"); } } -- GitLab From 2aeed876d7c2c2fb4c6b92f59bdf7e73cfe5e098 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 23:37:24 -0400 Subject: [PATCH 850/989] bcachefs: fix unsafety in bch2_stripe_to_text() .to_text() functions need to work on key values that didn't pass .valid Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 46 +++++++++++++++++++++++++--------------------- fs/bcachefs/ec.h | 2 ++ 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 082075244e16a..c0b2b2f180e01 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -131,29 +131,33 @@ fsck_err: void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned i, nr_data = s->nr_blocks - s->nr_redundant; + const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v; + struct bch_stripe s = {}; + + memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k))); + + unsigned nr_data = s.nr_blocks - s.nr_redundant; prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", - s->algorithm, - le16_to_cpu(s->sectors), - nr_data, - s->nr_redundant, - s->csum_type, - 1U << s->csum_granularity_bits); - - for (i = 0; i < s->nr_blocks; i++) { - const struct bch_extent_ptr *ptr = s->ptrs + i; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - u32 offset; - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - - prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); - if (i < nr_data) - prt_printf(out, "#%u", stripe_blockcount_get(s, i)); - prt_printf(out, " gen %u", ptr->gen); - if (ptr_stale(ca, ptr)) - prt_printf(out, " stale"); + s.algorithm, + le16_to_cpu(s.sectors), + nr_data, + s.nr_redundant, + s.csum_type, + 1U << s.csum_granularity_bits); + + for (unsigned i = 0; i < s.nr_blocks; i++) { + const struct bch_extent_ptr *ptr = sp->ptrs + i; + + if ((void *) ptr >= bkey_val_end(k)) + break; + + bch2_extent_ptr_to_text(out, c, ptr); + + if (s.csum_type < BCH_CSUM_NR && + i < nr_data && + stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k)) + prt_printf(out, "#%u", stripe_blockcount_get(sp, i)); } } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index f4369b02e805f..f042616888b0a 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -32,6 +32,8 @@ static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) static inline unsigned stripe_csum_offset(const struct bch_stripe *s, unsigned dev, unsigned csum_idx) { + EBUG_ON(s->csum_type >= BCH_CSUM_NR); + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; return sizeof(struct bch_stripe) + -- GitLab From 7b4c4ccf848b359762c1525bab5a12f008f14a65 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 23:58:36 -0400 Subject: [PATCH 851/989] bcachefs: fix race in bch2_btree_node_evict() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 84474324dba9b..c7f156320a35d 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1148,6 +1148,8 @@ wait_on_io: btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + if (unlikely(b->hash_val != btree_ptr_hash_val(k))) + goto out; if (btree_node_dirty(b)) { __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); @@ -1162,7 +1164,7 @@ wait_on_io: btree_node_data_free(c, b); bch2_btree_node_hash_remove(bc, b); mutex_unlock(&bc->lock); - +out: six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } -- GitLab From ba8ed36e72033dd6b89d44145f323e6c4508bfc9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 00:09:08 -0400 Subject: [PATCH 852/989] bcachefs: don't queue btree nodes for rewrites during scan many nodes found during scan will be old nodes, overwritten by newer nodes Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b1aaf0550644a..9678b2375bedd 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1340,7 +1340,9 @@ start: rb->start_time); bio_put(&rb->bio); - if (saw_error && !btree_node_read_error(b)) { + if (saw_error && + !btree_node_read_error(b) && + c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->key.k.p); bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", -- GitLab From 9abb6dd7ce5a261f7aebf6f396b50a63db71133f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 15:17:00 -0400 Subject: [PATCH 853/989] bcachefs: Standardize helpers for printing enum strs with bounds checks Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 4 ++-- fs/bcachefs/buckets.h | 8 -------- fs/bcachefs/checksum.c | 23 ++++++++++++++--------- fs/bcachefs/checksum.h | 5 +++-- fs/bcachefs/compress.h | 8 -------- fs/bcachefs/ec.c | 14 ++++++-------- fs/bcachefs/extents.c | 7 ++++--- fs/bcachefs/journal_io.c | 17 +++++++++-------- fs/bcachefs/opts.c | 29 +++++++++++++++++++++++++---- fs/bcachefs/opts.h | 10 ++++++---- 10 files changed, 69 insertions(+), 56 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 364ae42022af1..9480f1b44f071 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1314,7 +1314,7 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(write_buffer_keys, 11) \ x(datetime, 12) -enum { +enum bch_jset_entry_type { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, BCH_JSET_ENTRY_TYPES() #undef x @@ -1360,7 +1360,7 @@ struct jset_entry_blacklist_v2 { x(inodes, 1) \ x(key_version, 2) -enum { +enum bch_fs_usage_type { #define x(f, nr) BCH_FS_USAGE_##f = nr, BCH_FS_USAGE_TYPES() #undef x diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 00aaf4bb51397..f9af5adabe836 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -395,14 +395,6 @@ static inline const char *bch2_data_type_str(enum bch_data_type type) : "(invalid data type)"; } -static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type) -{ - if (type < BCH_DATA_NR) - prt_str(out, __bch2_data_types[type]); - else - prt_printf(out, "(invalid data type %u)", type); -} - /* disk reservations: */ static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 4701457f6381c..7ed779b411f61 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -429,15 +429,20 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, extent_nonce(version, crc_old), bio); if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { - bch_err(c, "checksum error in %s() (memory corruption or bug?)\n" - "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", - __func__, - crc_old.csum.hi, - crc_old.csum.lo, - merged.hi, - merged.lo, - bch2_csum_types[crc_old.csum_type], - bch2_csum_types[new_csum_type]); + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n" + "expected %0llx:%0llx got %0llx:%0llx (old type ", + __func__, + crc_old.csum.hi, + crc_old.csum.lo, + merged.hi, + merged.lo); + bch2_prt_csum_type(&buf, crc_old.csum_type); + prt_str(&buf, " new type "); + bch2_prt_csum_type(&buf, new_csum_type); + prt_str(&buf, ")"); + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); return -EIO; } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 1b8c2c1016dc6..e40499fde9a40 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -61,11 +61,12 @@ static inline void bch2_csum_err_msg(struct printbuf *out, struct bch_csum expected, struct bch_csum got) { - prt_printf(out, "checksum error: got "); + prt_str(out, "checksum error, type "); + bch2_prt_csum_type(out, type); + prt_str(out, ": got "); bch2_csum_to_text(out, type, got); prt_str(out, " should be "); bch2_csum_to_text(out, type, expected); - prt_printf(out, " type %s", bch2_csum_types[type]); } int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 58c2eb45570ff..607fd5e232c90 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -47,14 +47,6 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } -static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type) -{ - if (type < BCH_COMPRESSION_TYPE_NR) - prt_str(out, __bch2_compression_types[type]); - else - prt_printf(out, "(invalid compression type %u)", type); -} - int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, struct bch_extent_crc_unpacked *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c0b2b2f180e01..556a217108d32 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -138,13 +138,13 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, unsigned nr_data = s.nr_blocks - s.nr_redundant; - prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", + prt_printf(out, "algo %u sectors %u blocks %u:%u csum ", s.algorithm, le16_to_cpu(s.sectors), nr_data, - s.nr_redundant, - s.csum_type, - 1U << s.csum_granularity_bits); + s.nr_redundant); + bch2_prt_csum_type(out, s.csum_type); + prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); for (unsigned i = 0; i < s.nr_blocks; i++) { const struct bch_extent_ptr *ptr = sp->ptrs + i; @@ -611,10 +611,8 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct printbuf err = PRINTBUF; struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev); - prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n", - want.hi, want.lo, - got.hi, got.lo, - bch2_csum_types[v->csum_type]); + prt_str(&err, "stripe "); + bch2_csum_err_msg(&err, v->csum_type, want, got); prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); bch_err_ratelimited(ca, "%s", err.buf); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 36d12d2adb81e..1a331e5392048 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1030,11 +1030,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ", + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", crc.compressed_size, crc.uncompressed_size, - crc.offset, crc.nonce, - bch2_csum_types[crc.csum_type]); + crc.offset, crc.nonce); + bch2_prt_csum_type(out, crc.csum_type); + prt_str(out, " compress "); bch2_prt_compression_type(out, crc.compression_type); break; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 725fcf46f6312..9aa28b52ab926 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -247,7 +247,7 @@ static void journal_entry_err_msg(struct printbuf *out, if (entry) { prt_str(out, " type="); - prt_str(out, bch2_jset_entry_types[entry->type]); + bch2_prt_jset_entry_type(out, entry->type); } if (!jset) { @@ -403,7 +403,8 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs jset_entry_for_each_key(entry, k) { if (!first) { prt_newline(out); - prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + bch2_prt_jset_entry_type(out, entry->type); + prt_str(out, ": "); } prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); @@ -563,9 +564,9 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - prt_printf(out, "type=%s v=%llu", - bch2_fs_usage_types[u->entry.btree_id], - le64_to_cpu(u->v)); + prt_str(out, "type="); + bch2_prt_fs_usage_type(out, u->entry.btree_id); + prt_printf(out, " v=%llu", le64_to_cpu(u->v)); } static int journal_entry_data_usage_validate(struct bch_fs *c, @@ -827,11 +828,11 @@ int bch2_journal_entry_validate(struct bch_fs *c, void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { + bch2_prt_jset_entry_type(out, entry->type); + if (entry->type < BCH_JSET_ENTRY_NR) { - prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + prt_str(out, ": "); bch2_jset_entry_ops[entry->type].to_text(out, c, entry); - } else { - prt_printf(out, "(unknown type %u)", entry->type); } } diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index e1800c4119b5f..bb068fd724656 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -43,7 +43,7 @@ const char * const __bch2_btree_ids[] = { NULL }; -const char * const bch2_csum_types[] = { +static const char * const __bch2_csum_types[] = { BCH_CSUM_TYPES() NULL }; @@ -53,7 +53,7 @@ const char * const bch2_csum_opts[] = { NULL }; -const char * const __bch2_compression_types[] = { +static const char * const __bch2_compression_types[] = { BCH_COMPRESSION_TYPES() NULL }; @@ -83,18 +83,39 @@ const char * const bch2_member_states[] = { NULL }; -const char * const bch2_jset_entry_types[] = { +static const char * const __bch2_jset_entry_types[] = { BCH_JSET_ENTRY_TYPES() NULL }; -const char * const bch2_fs_usage_types[] = { +static const char * const __bch2_fs_usage_types[] = { BCH_FS_USAGE_TYPES() NULL }; #undef x +static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], + unsigned nr, const char *type, unsigned idx) +{ + if (idx < nr) + prt_str(out, opts[idx]); + else + prt_printf(out, "(unknown %s %u)", type, idx); +} + +#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \ +void bch2_prt_##name(struct printbuf *out, type t) \ +{ \ + prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\ +} + +PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); +PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); +PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); +PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); +PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); + static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, struct printbuf *err) { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 1ac4135cca1c3..84e452835a17d 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -16,18 +16,20 @@ extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; -extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; -extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; -extern const char * const bch2_jset_entry_types[]; -extern const char * const bch2_fs_usage_types[]; extern const char * const bch2_d_types[]; +void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); +void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); +void bch2_prt_data_type(struct printbuf *, enum bch_data_type); +void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); +void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); + static inline const char *bch2_d_type_str(unsigned d_type) { return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; -- GitLab From 4518e80adfdbfdec1cc79c98bc73677ff44d96bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 14:05:36 -0400 Subject: [PATCH 854/989] bcachefs: Go rw if running any explicit recovery passes This fixes a bug where we fail to start when upgrading/downgrading because we forgot we needed to go rw. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index cb501460d6152..0cec0f7d97035 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -44,7 +44,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || c->opts.fsck || !c->sb.clean) + if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit) return bch2_fs_read_write_early(c); return 0; } -- GitLab From 82cf18f23e1ae17053983e325e550194f947e1f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 21:07:05 -0400 Subject: [PATCH 855/989] bcachefs: Fix deadlock in journal replay btree_key_can_insert_cached() should be checking the watermark - BCH_TRANS_COMMIT_journal_replay really means nonblocking mode when watermark < reclaim, it was being used incorrectly. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 9609ad370d71c..bbec91e8e6506 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -397,12 +397,13 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags struct bkey_cached *ck = (void *) path->l[0].b; unsigned new_u64s; struct bkey_i *new_k; + unsigned watermark = flags & BCH_WATERMARK_MASK; EBUG_ON(path->level); - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(c) && - !(flags & BCH_TRANS_COMMIT_journal_reclaim)) + if (watermark < BCH_WATERMARK_reclaim && + !test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bch2_btree_key_cache_must_wait(c)) return -BCH_ERR_btree_insert_need_journal_reclaim; /* -- GitLab From 9e203c43dc1cbaefb3888ee0ba885b2d20d47526 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 00:26:01 -0400 Subject: [PATCH 856/989] bcachefs: Fix missing write refs in fs fio paths bch2_journal_flush_seq requires us to have a write ref Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/fs-io-direct.c | 19 +++++++++++++------ fs/bcachefs/fs-io.c | 16 ++++++++-------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a31a5f706929e..91c3c1fef233d 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -709,6 +709,8 @@ struct btree_trans_buf { x(stripe_delete) \ x(reflink) \ x(fallocate) \ + x(fsync) \ + x(dio_write) \ x(discard) \ x(discard_fast) \ x(invalidate) \ diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index f49e6c0f0f683..b889370a50881 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -387,6 +387,8 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); + bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write); + /* inode->i_dio_count is our ref on inode and thus bch_fs */ inode_dio_end(&inode->v); @@ -590,22 +592,25 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) prefetch(&inode->ei_inode); prefetch((void *) &inode->ei_inode + 64); + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write)) + return -EROFS; + inode_lock(&inode->v); ret = generic_write_checks(req, iter); if (unlikely(ret <= 0)) - goto err; + goto err_put_write_ref; ret = file_remove_privs(file); if (unlikely(ret)) - goto err; + goto err_put_write_ref; ret = file_update_time(file); if (unlikely(ret)) - goto err; + goto err_put_write_ref; if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) - goto err; + goto err_put_write_ref; inode_dio_begin(&inode->v); bch2_pagecache_block_get(inode); @@ -645,7 +650,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) } ret = bch2_dio_write_loop(dio); -err: +out: if (locked) inode_unlock(&inode->v); return ret; @@ -653,7 +658,9 @@ err_put_bio: bch2_pagecache_block_put(inode); bio_put(bio); inode_dio_end(&inode->v); - goto err; +err_put_write_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); + goto out; } void bch2_fs_fs_io_direct_exit(struct bch_fs *c) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 8c70123b6a0c8..20b40477425f4 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -174,18 +174,18 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, static int bch2_flush_inode(struct bch_fs *c, struct bch_inode_info *inode) { - struct bch_inode_unpacked u; - int ret; - if (c->opts.journal_flush_disabled) return 0; - ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u); - if (ret) - return ret; + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) + return -EROFS; - return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: - bch2_inode_flush_nocow_writes(c, inode); + struct bch_inode_unpacked u; + int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: + bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: + bch2_inode_flush_nocow_writes(c, inode); + bch2_write_ref_put(c, BCH_WRITE_REF_fsync); + return ret; } int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) -- GitLab From 9054ef2ea944528b7935d0ce3f540d4dc0bc37ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 16:13:13 -0400 Subject: [PATCH 857/989] bcachefs: Run merges at BCH_WATERMARK_btree This fixes a deadlock where the interior update path during journal replay ends up doing a ton of merges on the backpointers btree, and deadlocking. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 29a5bc7d87898..1a7537ee43297 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1926,6 +1926,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); + flags &= ~BCH_WATERMARK_MASK; + b = trans->paths[path].l[level].b; if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || @@ -2071,6 +2073,10 @@ err: bch2_path_put(trans, new_path, true); bch2_path_put(trans, sib_path, true); bch2_trans_verify_locks(trans); + if (ret == -BCH_ERR_journal_reclaim_would_deadlock) + ret = 0; + if (!ret) + ret = bch2_trans_relock(trans); return ret; err_free_update: bch2_btree_node_free_never_used(as, trans, n); -- GitLab From 3f10048973c8366e27147d72bf3fd8d0e1d929f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 18:39:03 -0400 Subject: [PATCH 858/989] bcachefs: Disable merges from interior update path There's been a bug in the btree write buffer where it wasn't triggering btree node merges - and leaving behind a bunch of nearly empty btree nodes. Then during journal replay, when updates to the backpointers btree aren't using the btree write buffer (because we require synchronization with journal replay), we end up doing those merges all at once. Then if it's the interior update path running them, we deadlock because those run with the highest watermark. There's no real need for the interior update path to be doing btree node merges; other code paths can handle that at lower watermarks. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1a7537ee43297..82ac00aa0f7b0 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1926,6 +1926,16 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); + /* + * Work around a deadlock caused by the btree write buffer not doing + * merges and leaving tons of merges for us to do - we really don't need + * to be doing merges at all from the interior update path, and if the + * interior update path is generating too many new interior updates we + * deadlock: + */ + if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates) + return 0; + flags &= ~BCH_WATERMARK_MASK; b = trans->paths[path].l[level].b; -- GitLab From 86dbf8c566417afd62087ecd3bedcf84f91ee4e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Dec 2023 22:42:34 -0500 Subject: [PATCH 859/989] bcachefs: Fix btree node merging on write buffer btrees The btree write buffer flush fastpath that avoids the main transaction commit path had the unfortunate side effect of not doing btree node merging. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index baf63e2fddb64..36a6f42aba5e6 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -316,6 +316,16 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { bch2_btree_node_unlock_write(trans, path, path->l[0].b); write_locked = false; + + ret = lockrestart_do(trans, + bch2_btree_iter_traverse(&iter) ?: + bch2_foreground_maybe_merge(trans, iter.path, 0, + BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc)); + if (ret) + goto err; } } @@ -382,10 +392,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) ret = commit_do(trans, NULL, NULL, BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_journal_res| - BCH_TRANS_COMMIT_journal_reclaim, + BCH_TRANS_COMMIT_no_journal_res , btree_write_buffered_insert(trans, i)); if (ret) goto err; -- GitLab From 69129794d94c544810e68b2b4eaa7e44063f9bf2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 12 Apr 2024 11:10:33 -0700 Subject: [PATCH 860/989] x86/bugs: Fix BHI retpoline check Confusingly, X86_FEATURE_RETPOLINE doesn't mean retpolines are enabled, as it also includes the original "AMD retpoline" which isn't a retpoline at all. Also replace cpu_feature_enabled() with boot_cpu_has() because this is before alternatives are patched and cpu_feature_enabled()'s fallback path is slower than plain old boot_cpu_has(). Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Pawan Gupta Cc: Borislav Petkov Cc: Linus Torvalds Link: https://lore.kernel.org/r/ad3807424a3953f0323c011a643405619f2a4927.1712944776.git.jpoimboe@kernel.org --- arch/x86/kernel/cpu/bugs.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ca295b0c1eeee..ab18185894dfd 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1652,7 +1652,8 @@ static void __init bhi_select_mitigation(void) return; /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */ - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { + if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) { spec_ctrl_disable_kernel_rrsba(); if (rrsba_disabled) return; @@ -2804,11 +2805,13 @@ static const char *spectre_bhi_state(void) { if (!boot_cpu_has_bug(X86_BUG_BHI)) return "; BHI: Not affected"; - else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW)) + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW)) return "; BHI: BHI_DIS_S"; - else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) return "; BHI: SW loop, KVM: SW loop"; - else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && rrsba_disabled) + else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) && + rrsba_disabled) return "; BHI: Retpoline"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) return "; BHI: Vulnerable, KVM: SW loop"; -- GitLab From 16b52bbee4823b01ab7fe3919373c981a38f3797 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 5 Apr 2024 17:56:35 +0300 Subject: [PATCH 861/989] kernfs: annotate different lockdep class for of->mutex of writable files The writable file /sys/power/resume may call vfs lookup helpers for arbitrary paths and readonly files can be read by overlayfs from vfs helpers when sysfs is a lower layer of overalyfs. To avoid a lockdep warning of circular dependency between overlayfs inode lock and kernfs of->mutex, use a different lockdep class for writable and readonly kernfs files. Reported-by: syzbot+9a5b0ced8b1bfb238b56@syzkaller.appspotmail.com Fixes: 0fedefd4c4e3 ("kernfs: sysfs: support custom llseek method for sysfs entries") Suggested-by: Al Viro Signed-off-by: Amir Goldstein Signed-off-by: Al Viro --- fs/kernfs/file.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e9df2f87072c6..8502ef68459b9 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -636,11 +636,18 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) * each file a separate locking class. Let's differentiate on * whether the file has mmap or not for now. * - * Both paths of the branch look the same. They're supposed to + * For similar reasons, writable and readonly files are given different + * lockdep key, because the writable file /sys/power/resume may call vfs + * lookup helpers for arbitrary paths and readonly files can be read by + * overlayfs from vfs helpers when sysfs is a lower layer of overalyfs. + * + * All three cases look the same. They're supposed to * look that way and give @of->mutex different static lockdep keys. */ if (has_mmap) mutex_init(&of->mutex); + else if (file->f_mode & FMODE_WRITE) + mutex_init(&of->mutex); else mutex_init(&of->mutex); -- GitLab From 89f9a1e876b5a7ad884918c03a46831af202c8a0 Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Sun, 14 Apr 2024 19:49:45 +0800 Subject: [PATCH 862/989] bootconfig: use memblock_free_late to free xbc memory to buddy On the time to free xbc memory in xbc_exit(), memblock may has handed over memory to buddy allocator. So it doesn't make sense to free memory back to memblock. memblock_free() called by xbc_exit() even causes UAF bugs on architectures with CONFIG_ARCH_KEEP_MEMBLOCK disabled like x86. Following KASAN logs shows this case. This patch fixes the xbc memory free problem by calling memblock_free() in early xbc init error rewind path and calling memblock_free_late() in xbc exit path to free memory to buddy allocator. [ 9.410890] ================================================================== [ 9.418962] BUG: KASAN: use-after-free in memblock_isolate_range+0x12d/0x260 [ 9.426850] Read of size 8 at addr ffff88845dd30000 by task swapper/0/1 [ 9.435901] CPU: 9 PID: 1 Comm: swapper/0 Tainted: G U 6.9.0-rc3-00208-g586b5dfb51b9 #5 [ 9.446403] Hardware name: Intel Corporation RPLP LP5 (CPU:RaptorLake)/RPLP LP5 (ID:13), BIOS IRPPN02.01.01.00.00.19.015.D-00000000 Dec 28 2023 [ 9.460789] Call Trace: [ 9.463518] [ 9.465859] dump_stack_lvl+0x53/0x70 [ 9.469949] print_report+0xce/0x610 [ 9.473944] ? __virt_addr_valid+0xf5/0x1b0 [ 9.478619] ? memblock_isolate_range+0x12d/0x260 [ 9.483877] kasan_report+0xc6/0x100 [ 9.487870] ? memblock_isolate_range+0x12d/0x260 [ 9.493125] memblock_isolate_range+0x12d/0x260 [ 9.498187] memblock_phys_free+0xb4/0x160 [ 9.502762] ? __pfx_memblock_phys_free+0x10/0x10 [ 9.508021] ? mutex_unlock+0x7e/0xd0 [ 9.512111] ? __pfx_mutex_unlock+0x10/0x10 [ 9.516786] ? kernel_init_freeable+0x2d4/0x430 [ 9.521850] ? __pfx_kernel_init+0x10/0x10 [ 9.526426] xbc_exit+0x17/0x70 [ 9.529935] kernel_init+0x38/0x1e0 [ 9.533829] ? _raw_spin_unlock_irq+0xd/0x30 [ 9.538601] ret_from_fork+0x2c/0x50 [ 9.542596] ? __pfx_kernel_init+0x10/0x10 [ 9.547170] ret_from_fork_asm+0x1a/0x30 [ 9.551552] [ 9.555649] The buggy address belongs to the physical page: [ 9.561875] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x45dd30 [ 9.570821] flags: 0x200000000000000(node=0|zone=2) [ 9.576271] page_type: 0xffffffff() [ 9.580167] raw: 0200000000000000 ffffea0011774c48 ffffea0012ba1848 0000000000000000 [ 9.588823] raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000 [ 9.597476] page dumped because: kasan: bad access detected [ 9.605362] Memory state around the buggy address: [ 9.610714] ffff88845dd2ff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 9.618786] ffff88845dd2ff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 9.626857] >ffff88845dd30000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 9.634930] ^ [ 9.638534] ffff88845dd30080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 9.646605] ffff88845dd30100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 9.654675] ================================================================== Link: https://lore.kernel.org/all/20240414114944.1012359-1-qiang4.zhang@linux.intel.com/ Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed") Cc: Stable@vger.kernel.org Signed-off-by: Qiang Zhang Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- include/linux/bootconfig.h | 7 ++++++- lib/bootconfig.c | 19 +++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index e5ee2c694401e..3f4b4ac527ca2 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -288,7 +288,12 @@ int __init xbc_init(const char *buf, size_t size, const char **emsg, int *epos); int __init xbc_get_info(int *node_size, size_t *data_size); /* XBC cleanup data structures */ -void __init xbc_exit(void); +void __init _xbc_exit(bool early); + +static inline void xbc_exit(void) +{ + _xbc_exit(false); +} /* XBC embedded bootconfig data in kernel */ #ifdef CONFIG_BOOT_CONFIG_EMBED diff --git a/lib/bootconfig.c b/lib/bootconfig.c index c59d26068a640..8841554432d5b 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -61,9 +61,12 @@ static inline void * __init xbc_alloc_mem(size_t size) return memblock_alloc(size, SMP_CACHE_BYTES); } -static inline void __init xbc_free_mem(void *addr, size_t size) +static inline void __init xbc_free_mem(void *addr, size_t size, bool early) { - memblock_free(addr, size); + if (early) + memblock_free(addr, size); + else if (addr) + memblock_free_late(__pa(addr), size); } #else /* !__KERNEL__ */ @@ -73,7 +76,7 @@ static inline void *xbc_alloc_mem(size_t size) return malloc(size); } -static inline void xbc_free_mem(void *addr, size_t size) +static inline void xbc_free_mem(void *addr, size_t size, bool early) { free(addr); } @@ -904,13 +907,13 @@ static int __init xbc_parse_tree(void) * If you need to reuse xbc_init() with new boot config, you can * use this. */ -void __init xbc_exit(void) +void __init _xbc_exit(bool early) { - xbc_free_mem(xbc_data, xbc_data_size); + xbc_free_mem(xbc_data, xbc_data_size, early); xbc_data = NULL; xbc_data_size = 0; xbc_node_num = 0; - xbc_free_mem(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX); + xbc_free_mem(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX, early); xbc_nodes = NULL; brace_index = 0; } @@ -963,7 +966,7 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) if (!xbc_nodes) { if (emsg) *emsg = "Failed to allocate bootconfig nodes"; - xbc_exit(); + _xbc_exit(true); return -ENOMEM; } memset(xbc_nodes, 0, sizeof(struct xbc_node) * XBC_NODE_MAX); @@ -977,7 +980,7 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) *epos = xbc_err_pos; if (emsg) *emsg = xbc_err_msg; - xbc_exit(); + _xbc_exit(true); } else ret = xbc_node_num; -- GitLab From 1382e3b6a3500c245e5278c66d210c02926f804f Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Thu, 11 Apr 2024 08:11:24 +0300 Subject: [PATCH 863/989] net: change maximum number of UDP segments to 128 The commit fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") adds check of potential number of UDP segments vs UDP_MAX_SEGMENTS in linux/virtio_net.h. After this change certification test of USO guest-to-guest transmit on Windows driver for virtio-net device fails, for example with packet size of ~64K and mss of 536 bytes. In general the USO should not be more restrictive than TSO. Indeed, in case of unreasonably small mss a lot of segments can cause queue overflow and packet loss on the destination. Limit of 128 segments is good for any practical purpose, with minimal meaningful mss of 536 the maximal UDP packet will be divided to ~120 segments. The number of segments for UDP packets is validated vs UDP_MAX_SEGMENTS also in udp.c (v4,v6), this does not affect quest-to-guest path but does affect packets sent to host, for example. It is important to mention that UDP_MAX_SEGMENTS is kernel-only define and not available to user mode socket applications. In order to request MSS smaller than MTU the applications just uses setsockopt with SOL_UDP and UDP_SEGMENT and there is no limitations on socket API level. Fixes: fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") Signed-off-by: Yuri Benditovich Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 2 +- tools/testing/selftests/net/udpgso.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 17539d0896661..e398e1dbd2d36 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,7 +108,7 @@ struct udp_sock { #define udp_assign_bit(nr, sk, val) \ assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val) -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk) diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index 1d975bf52af33..85b3baa3f7f34 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -34,7 +34,7 @@ #endif #ifndef UDP_MAX_SEGMENTS -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #endif #define CONST_MTU_TEST 1500 -- GitLab From 8541323285994528ad5be2c1bdc759e6c83b936e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 4 Apr 2024 21:05:14 -0300 Subject: [PATCH 864/989] iommufd: Add missing IOMMUFD_DRIVER kconfig for the selftest Some kconfigs don't automatically include this symbol which results in sub functions for some of the dirty tracking related things that are non-functional. Thus the test suite will fail. select IOMMUFD_DRIVER in the IOMMUFD_TEST kconfig to fix it. Fixes: a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP") Link: https://lore.kernel.org/r/20240327182050.GA1363414@ziepe.ca Tested-by: Muhammad Usama Anjum Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 99d4b075df49e..76656fe0470d7 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -37,6 +37,7 @@ config IOMMUFD_TEST depends on DEBUG_KERNEL depends on FAULT_INJECTION depends on RUNTIME_TESTING_MENU + select IOMMUFD_DRIVER default n help This is dangerous, do not enable unless running -- GitLab From 2760c51b8040d7cffedc337939e7475a17cc4b19 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 25 Mar 2024 14:00:48 +0500 Subject: [PATCH 865/989] iommufd: Add config needed for iommufd_fail_nth Add FAULT_INJECTION_DEBUG_FS and FAILSLAB configurations to the kconfig fragment for the iommfd selftests. These kconfigs are needed by the iommufd_fail_nth test. Fixes: a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP") Link: https://lore.kernel.org/r/20240325090048.1423908-1-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/iommu/config b/tools/testing/selftests/iommu/config index 110d73917615d..02a2a1b267c1e 100644 --- a/tools/testing/selftests/iommu/config +++ b/tools/testing/selftests/iommu/config @@ -1,3 +1,5 @@ CONFIG_IOMMUFD=y +CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FAULT_INJECTION=y CONFIG_IOMMUFD_TEST=y +CONFIG_FAILSLAB=y -- GitLab From 0bbac3facb5d6cc0171c45c9873a2dc96bea9680 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Apr 2024 13:38:39 -0700 Subject: [PATCH 866/989] Linux 6.9-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e1bf12891cb0e..59d8a7f95d0a8 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 9 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- GitLab From bceb86be9e970bc6d77bfd1f8e5272d9a2007c16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 23:59:06 -0400 Subject: [PATCH 867/989] bcachefs: add missing bounds check in __bch2_bkey_val_invalid() Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 5e52684764eb1..e4481e4d5f38f 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -175,7 +175,10 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", - bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]); + bch2_btree_node_type_str(type), + k.k->type < KEY_TYPE_MAX + ? bch2_bkey_types[k.k->type] + : "(unknown)"); if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { bkey_fsck_err_on(k.k->size == 0, c, err, -- GitLab From d789e9a7d5e2799f4d5425b0b620210d2fcad529 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 23:59:28 -0400 Subject: [PATCH 868/989] bcachefs: Interior known are required to have known key types For forwards compatibilyt, we allow bkeys of unknown type in leaf nodes; we can simply ignore metadata we don't understand. Pointers to btree nodes must always be of known types, howwever. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index e4481e4d5f38f..db336a43fc083 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -171,7 +171,8 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (type >= BKEY_TYPE_NR) return 0; - bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && + bkey_fsck_err_on((type == BKEY_TYPE_btree || + (flags & BKEY_INVALID_COMMIT)) && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", -- GitLab From 8cf2036e7b557282667a437d409e6307d55366ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 15:34:14 -0400 Subject: [PATCH 869/989] bcachefs: add safety checks in bch2_btree_node_fill() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index c7f156320a35d..c347d08d80bcf 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -711,7 +711,30 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, struct btree *b; u32 seq; - BUG_ON(level + 1 >= BTREE_MAX_DEPTH); + if (unlikely(level >= BTREE_MAX_DEPTH)) { + int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u", + level, BTREE_MAX_DEPTH); + return ERR_PTR(ret); + } + + if (unlikely(!bkey_is_btree_ptr(&k->k))) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf); + printbuf_exit(&buf); + return ERR_PTR(ret); + } + + if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf); + printbuf_exit(&buf); + return ERR_PTR(ret); + } + /* * Parent node must be locked, else we could read in a btree node that's * been freed: -- GitLab From e879389f5777de5d94f27f8d88d7e92341afa3ef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 15:54:33 -0400 Subject: [PATCH 870/989] bcachefs: Fix bch2_btree_node_fill() for !path We shouldn't be doing the unlock/relock dance when we're not using a path - this fixes an assertion pop when called from btree node scan. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 44 ++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index c347d08d80bcf..02c70e813face 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -709,7 +709,6 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - u32 seq; if (unlikely(level >= BTREE_MAX_DEPTH)) { int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u", @@ -775,34 +774,26 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, } set_btree_node_read_in_flight(b); - six_unlock_write(&b->c.lock); - seq = six_lock_seq(&b->c.lock); - six_unlock_intent(&b->c.lock); - /* Unlock before doing IO: */ - if (path && sync) - bch2_trans_unlock_noassert(trans); - - bch2_btree_node_read(trans, b, sync); + if (path) { + u32 seq = six_lock_seq(&b->c.lock); - if (!sync) - return NULL; + /* Unlock before doing IO: */ + six_unlock_intent(&b->c.lock); + bch2_trans_unlock_noassert(trans); - if (path) { - int ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - BUG_ON(!trans->restarted); - return ERR_PTR(ret); - } - } + bch2_btree_node_read(trans, b, sync); - if (!six_relock_type(&b->c.lock, lock_type, seq)) { - BUG_ON(!path); + if (!sync) + return NULL; - trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); + if (!six_relock_type(&b->c.lock, lock_type, seq)) + b = NULL; + } else { + bch2_btree_node_read(trans, b, sync); + if (lock_type == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); } return b; @@ -1135,18 +1126,19 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; - struct btree *b; BUG_ON(path && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); - b = btree_cache_find(bc, k); + struct btree *b = btree_cache_find(bc, k); if (b) return 0; b = bch2_btree_node_fill(trans, path, k, btree_id, level, SIX_LOCK_read, false); - return PTR_ERR_OR_ZERO(b); + if (!IS_ERR_OR_NULL(b)) + six_unlock_read(&b->c.lock); + return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b); } void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) -- GitLab From bdae2a7e6020e1cf864a30dab2af323575569dc7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 22:43:11 -0400 Subject: [PATCH 871/989] bcachefs: sysfs internal/trigger_journal_flush Add a sysfs knob for immediately flushing the entire journal. Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b18b0cc81b594..5be92fe3f4ea4 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -25,6 +25,7 @@ #include "ec.h" #include "inode.h" #include "journal.h" +#include "journal_reclaim.h" #include "keylist.h" #include "move.h" #include "movinggc.h" @@ -138,6 +139,7 @@ do { \ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); +write_attribute(trigger_journal_flush); write_attribute(prune_cache); write_attribute(btree_wakeup); rw_attribute(btree_gc_periodic); @@ -500,7 +502,7 @@ STORE(bch2_fs) /* Debugging: */ - if (!test_bit(BCH_FS_rw, &c->flags)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) return -EROFS; if (attr == &sysfs_prune_cache) { @@ -533,6 +535,11 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_invalidates) bch2_do_invalidates(c); + if (attr == &sysfs_trigger_journal_flush) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_meta(&c->journal); + } + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -553,6 +560,7 @@ STORE(bch2_fs) size = ret; } #endif + bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); return size; } SYSFS_OPS(bch2_fs); @@ -651,6 +659,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_gc, &sysfs_trigger_discards, &sysfs_trigger_invalidates, + &sysfs_trigger_journal_flush, &sysfs_prune_cache, &sysfs_btree_wakeup, -- GitLab From 27c15ed297cb71c2e7a839439b5a097081a32605 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 18:45:47 -0400 Subject: [PATCH 872/989] bcachefs: bch_member.btree_allocated_bitmap This adds a small (64 bit) per-device bitmap that tracks ranges that have btree nodes, for accelerating btree node scan if it is ever needed. - New helpers, bch2_dev_btree_bitmap_marked() and bch2_dev_bitmap_mark(), for checking and updating the bitmap - Interior btree update path updates the bitmaps when required - The check_allocations pass has a new fsck_err check, btree_bitmap_not_marked - New on disk format version, mi_btree_mitmap, which indicates the new bitmap is present - Upgrade table lists the required recovery pass and expected fsck error - Btree node scan uses the bitmap to skip ranges if we're on the new version Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 7 ++-- fs/bcachefs/btree_gc.c | 13 +++++++ fs/bcachefs/btree_node_scan.c | 9 +++-- fs/bcachefs/btree_update_interior.c | 24 +++++++++++++ fs/bcachefs/sb-downgrade.c | 5 ++- fs/bcachefs/sb-errors_types.h | 3 +- fs/bcachefs/sb-members.c | 53 +++++++++++++++++++++++++++++ fs/bcachefs/sb-members.h | 21 ++++++++++++ fs/bcachefs/super_types.h | 2 ++ 9 files changed, 131 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 9480f1b44f071..085987435a5ea 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -578,7 +578,8 @@ struct bch_member { __le64 nbuckets; /* device size */ __le16 first_bucket; /* index of first bucket used */ __le16 bucket_size; /* sectors */ - __le32 pad; + __u8 btree_bitmap_shift; + __u8 pad[3]; __le64 last_mount; /* time_t */ __le64 flags; @@ -587,6 +588,7 @@ struct bch_member { __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; __le64 errors_reset_time; __le64 seq; + __le64 btree_allocated_bitmap; }; #define BCH_MEMBER_V1_BYTES 56 @@ -876,7 +878,8 @@ struct bch_sb_field_downgrade { x(rebalance_work, BCH_VERSION(1, 3)) \ x(member_seq, BCH_VERSION(1, 4)) \ x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ - x(btree_subvolume_children, BCH_VERSION(1, 6)) + x(btree_subvolume_children, BCH_VERSION(1, 6)) \ + x(mi_btree_bitmap, BCH_VERSION(1, 7)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index d2555da55c6da..ecbd9598f69fd 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -828,6 +828,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, struct bch_fs *c = trans->c; struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + struct printbuf buf = PRINTBUF; int ret = 0; deleted.p = k->k->p; @@ -848,11 +849,23 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (ret) goto err; + if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, *k), + c, btree_bitmap_not_marked, + "btree ptr not marked in member info btree allocated bitmap\n %s", + (bch2_bkey_val_to_text(&buf, c, *k), + buf.buf))) { + mutex_lock(&c->sb_lock); + bch2_dev_btree_bitmap_mark(c, *k); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + ret = commit_do(trans, NULL, NULL, 0, bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); fsck_err: err: + printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 20f2b37c4474c..866bd278439f8 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -205,8 +205,13 @@ static int read_btree_nodes_worker(void *p) last_print = jiffies; } - try_read_btree_node(w->f, ca, bio, buf, - bucket * ca->mi.bucket_size + bucket_offset); + u64 sector = bucket * ca->mi.bucket_size + bucket_offset; + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && + !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) + continue; + + try_read_btree_node(w->f, ca, bio, buf, sector); } err: bio_put(bio); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 82ac00aa0f7b0..6030c396754f6 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -21,6 +21,7 @@ #include "keylist.h" #include "recovery_passes.h" #include "replicas.h" +#include "sb-members.h" #include "super-io.h" #include "trace.h" @@ -605,6 +606,26 @@ static void btree_update_add_key(struct btree_update *as, bch2_keylist_push(keys); } +static bool btree_update_new_nodes_marked_sb(struct btree_update *as) +{ + for_each_keylist_key(&as->new_keys, k) + if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k))) + return false; + return true; +} + +static void btree_update_new_nodes_mark_sb(struct btree_update *as) +{ + struct bch_fs *c = as->c; + + mutex_lock(&c->sb_lock); + for_each_keylist_key(&as->new_keys, k) + bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k)); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + /* * The transactional part of an interior btree node update, where we journal the * update we did to the interior node and update alloc info: @@ -662,6 +683,9 @@ static void btree_update_nodes_written(struct btree_update *as) if (ret) goto err; + if (!btree_update_new_nodes_marked_sb(as)) + btree_update_new_nodes_mark_sb(as); + /* * Wait for any in flight writes to finish before we free the old nodes * on disk: diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index d6f81179c3a29..a98ef940b7a32 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -51,7 +51,10 @@ BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ x(btree_subvolume_children, \ BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ - BCH_FSCK_ERR_subvol_children_not_set) + BCH_FSCK_ERR_subvol_children_not_set) \ + x(mi_btree_bitmap, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_btree_bitmap_not_marked) #define DOWNGRADE_TABLE() diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index d7d609131030a..5b600c6d7aca2 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -270,7 +270,8 @@ x(btree_ptr_v2_min_key_bad, 262) \ x(btree_root_unreadable_and_scan_found_nothing, 263) \ x(snapshot_node_missing, 264) \ - x(dup_backpointer_to_bad_csum_extent, 265) + x(dup_backpointer_to_bad_csum_extent, 265) \ + x(btree_bitmap_not_marked, 266) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index eff5ce18c69c0..522a969345e52 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" #include "disk_groups.h" #include "opts.h" #include "replicas.h" @@ -426,3 +427,55 @@ void bch2_dev_errors_reset(struct bch_dev *ca) bch2_write_super(c); mutex_unlock(&c->sb_lock); } + +/* + * Per member "range has btree nodes" bitmap: + * + * This is so that if we ever have to run the btree node scan to repair we don't + * have to scan full devices: + */ + +bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) +{ + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) + if (!bch2_dev_btree_bitmap_marked_sectors(bch_dev_bkey_exists(c, ptr->dev), + ptr->offset, btree_sectors(c))) + return false; + return true; +} + +static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, + u64 start, unsigned sectors) +{ + struct bch_member *m = __bch2_members_v2_get_mut(mi, dev); + u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap); + + u64 end = start + sectors; + + int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6); + if (resize > 0) { + u64 new_bitmap = 0; + + for (unsigned i = 0; i < 64; i++) + if (bitmap & BIT_ULL(i)) + new_bitmap |= BIT_ULL(i >> resize); + bitmap = new_bitmap; + m->btree_bitmap_shift += resize; + } + + for (unsigned bit = sectors >> m->btree_bitmap_shift; + bit << m->btree_bitmap_shift < end; + bit++) + bitmap |= BIT_ULL(bit); + + m->btree_allocated_bitmap = cpu_to_le64(bitmap); +} + +void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) +{ + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) + __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index be0a941832715..b27c3e4467cf2 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -3,6 +3,7 @@ #define _BCACHEFS_SB_MEMBERS_H #include "darray.h" +#include "bkey_types.h" extern char * const bch2_member_error_strs[]; @@ -220,6 +221,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), .valid = bch2_member_exists(mi), + .btree_bitmap_shift = mi->btree_bitmap_shift, + .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), }; } @@ -228,4 +231,22 @@ void bch2_sb_members_from_cpu(struct bch_fs *); void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); void bch2_dev_errors_reset(struct bch_dev *); +static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors) +{ + u64 end = start + sectors; + + if (end > 64 << ca->mi.btree_bitmap_shift) + return false; + + for (unsigned bit = sectors >> ca->mi.btree_bitmap_shift; + bit << ca->mi.btree_bitmap_shift < end; + bit++) + if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit))) + return false; + return true; +} + +bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); +void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); + #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index ec784d975f665..11bcef170c2c2 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -37,6 +37,8 @@ struct bch_member_cpu { u8 durability; u8 freespace_initialized; u8 valid; + u8 btree_bitmap_shift; + u64 btree_allocated_bitmap; }; #endif /* _BCACHEFS_SUPER_TYPES_H */ -- GitLab From f0a73d4fde5b285d94a702026216d9fd1fd2733d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 14 Apr 2024 00:51:48 -0400 Subject: [PATCH 873/989] bcachefs: Check for backpointer bucket_offset >= bucket size Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 8 +++++--- fs/bcachefs/backpointers.h | 9 +++------ fs/bcachefs/sb-errors_types.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 114328acde720..fadb1078903d2 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -49,13 +49,15 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, if (!bch2_dev_exists2(c, bp.k->p.inode)) return 0; + struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode); struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); int ret = 0; - bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), + bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || + !bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), c, err, - backpointer_pos_wrong, - "backpointer at wrong pos"); + backpointer_bucket_offset_wrong, + "backpointer bucket_offset wrong"); fsck_err: return ret; } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index da012ca7daee5..85949b9fd880c 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -53,14 +53,11 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, u64 bucket_offset) { struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); - struct bpos ret; - - ret = POS(bucket.inode, - (bucket_to_sector(ca, bucket.offset) << - MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); + struct bpos ret = POS(bucket.inode, + (bucket_to_sector(ca, bucket.offset) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); - return ret; } diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 5b600c6d7aca2..4ca6e7b0d8aae 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -130,7 +130,7 @@ x(bucket_gens_nonzero_for_invalid_buckets, 122) \ x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ x(need_discard_freespace_key_bad, 124) \ - x(backpointer_pos_wrong, 125) \ + x(backpointer_bucket_offset_wrong, 125) \ x(backpointer_to_missing_device, 126) \ x(backpointer_to_missing_alloc, 127) \ x(backpointer_to_missing_ptr, 128) \ -- GitLab From 356952b13af5b2c338df1e06889fd1b5e12cbbf4 Mon Sep 17 00:00:00 2001 From: bolan wang Date: Wed, 6 Mar 2024 19:03:39 +0800 Subject: [PATCH 874/989] USB: serial: option: add Fibocom FM135-GL variants Update the USB serial option driver support for the Fibocom FM135-GL LTE modules. - VID:PID 2cb7:0115, FM135-GL for laptop debug M.2 cards(with MBIM interface for /Linux/Chrome OS) 0x0115: mbim, diag, at, pipe Here are the outputs of usb-devices: T: Bus=01 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 16 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2cb7 ProdID=0115 Rev=05.15 S: Manufacturer=Fibocom Wireless Inc. S: Product=Fibocom Module S: SerialNumber=12345678 C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: bolan wang Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 55a65d941ccbf..cedf93a3eaea5 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2272,6 +2272,8 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x010b, 0xff, 0xff, 0x30) }, /* Fibocom FG150 Diag */ { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x010b, 0xff, 0, 0) }, /* Fibocom FG150 AT */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0111, 0xff) }, /* Fibocom FM160 (MBIM mode) */ + { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0115, 0xff), /* Fibocom FM135 (laptop MBIM) */ + .driver_info = RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a0, 0xff) }, /* Fibocom NL668-AM/NL652-EU (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a2, 0xff) }, /* Fibocom FM101-GL (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a3, 0xff) }, /* Fibocom FM101-GL (laptop MBIM) */ -- GitLab From c840244aba7ad2b83ed904378b36bd6aef25511c Mon Sep 17 00:00:00 2001 From: Jerry Meng Date: Mon, 15 Apr 2024 15:04:29 +0800 Subject: [PATCH 875/989] USB: serial: option: support Quectel EM060K sub-models EM060K_129, EM060K_12a, EM060K_12b and EM0060K_12c are EM060K's sub-models, having the same name "Quectel EM060K-GL" and the same interface layout. MBIM + GNSS + DIAG + NMEA + AT + QDSS + DPL T: Bus=03 Lev=01 Prnt=01 Port=01 Cnt=02 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2c7c ProdID=0129 Rev= 5.04 S: Manufacturer=Quectel S: Product=Quectel EM060K-GL S: SerialNumber=f6fa08b6 C:* #Ifs= 8 Cfg#= 1 Atr=a0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=0e Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=8f(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Jerry Meng Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index cedf93a3eaea5..16cf0ddaae9b7 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -255,6 +255,10 @@ static void option_instat_callback(struct urb *urb); #define QUECTEL_PRODUCT_EM061K_LMS 0x0124 #define QUECTEL_PRODUCT_EC25 0x0125 #define QUECTEL_PRODUCT_EM060K_128 0x0128 +#define QUECTEL_PRODUCT_EM060K_129 0x0129 +#define QUECTEL_PRODUCT_EM060K_12a 0x012a +#define QUECTEL_PRODUCT_EM060K_12b 0x012b +#define QUECTEL_PRODUCT_EM060K_12c 0x012c #define QUECTEL_PRODUCT_EG91 0x0191 #define QUECTEL_PRODUCT_EG95 0x0195 #define QUECTEL_PRODUCT_BG96 0x0296 @@ -1218,6 +1222,18 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_128, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_128, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_128, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_129, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_129, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_129, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12a, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12a, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12a, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12b, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12b, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12b, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12c, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12c, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12c, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LCN, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LCN, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LCN, 0xff, 0xff, 0x40) }, -- GitLab From 4864a6dd8320ad856698f93009c89f66ccb1653f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 7 Apr 2024 18:57:56 +0300 Subject: [PATCH 876/989] fuse: fix wrong ff->iomode state changes from parallel dio write There is a confusion with fuse_file_uncached_io_{start,end} interface. These helpers do two things when called from passthrough open()/release(): 1. Take/drop negative refcount of fi->iocachectr (inode uncached io mode) 2. State change ff->iomode IOM_NONE <-> IOM_UNCACHED (file uncached open) The calls from parallel dio write path need to take a reference on fi->iocachectr, but they should not be changing ff->iomode state, because in this case, the fi->iocachectr reference does not stick around until file release(). Factor out helpers fuse_inode_uncached_io_{start,end}, to be used from parallel dio write path and rename fuse_file_*cached_io_{start,end} helpers to fuse_file_*cached_io_{open,release} to clarify the difference. Fixes: 205c1d802683 ("fuse: allow parallel dio writes with FUSE_DIRECT_IO_ALLOW_MMAP") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 12 ++++++----- fs/fuse/fuse_i.h | 7 +++--- fs/fuse/inode.c | 1 + fs/fuse/iomode.c | 56 +++++++++++++++++++++++++++++++++--------------- 4 files changed, 51 insertions(+), 25 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a56e7bffd0004..b57ce41576407 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1362,7 +1362,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, bool *exclusive) { struct inode *inode = file_inode(iocb->ki_filp); - struct fuse_file *ff = iocb->ki_filp->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); if (*exclusive) { @@ -1377,7 +1377,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, * have raced, so check it again. */ if (fuse_io_past_eof(iocb, from) || - fuse_file_uncached_io_start(inode, ff, NULL) != 0) { + fuse_inode_uncached_io_start(fi, NULL) != 0) { inode_unlock_shared(inode); inode_lock(inode); *exclusive = true; @@ -1388,13 +1388,13 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) { struct inode *inode = file_inode(iocb->ki_filp); - struct fuse_file *ff = iocb->ki_filp->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); if (exclusive) { inode_unlock(inode); } else { /* Allow opens in caching mode after last parallel dio end */ - fuse_file_uncached_io_end(inode, ff); + fuse_inode_uncached_io_end(fi); inode_unlock_shared(inode); } } @@ -2574,8 +2574,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) * First mmap of direct_io file enters caching inode io mode. * Also waits for parallel dio writers to go into serial mode * (exclusive instead of shared lock). + * After first mmap, the inode stays in caching io mode until + * the direct_io file release. */ - rc = fuse_file_cached_io_start(inode, ff); + rc = fuse_file_cached_io_open(inode, ff); if (rc) return rc; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b24084b60864e..f239196103137 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1394,9 +1394,10 @@ int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); /* iomode.c */ -int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff); -int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb); -void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff); +int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff); +int fuse_inode_uncached_io_start(struct fuse_inode *fi, + struct fuse_backing *fb); +void fuse_inode_uncached_io_end(struct fuse_inode *fi); int fuse_file_io_open(struct file *file, struct inode *inode); void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3a5d888783353..99e44ea7d8756 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -175,6 +175,7 @@ static void fuse_evict_inode(struct inode *inode) } } if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { + WARN_ON(fi->iocachectr != 0); WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); } diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c index c653ddcf05787..98f1fd523dae6 100644 --- a/fs/fuse/iomode.c +++ b/fs/fuse/iomode.c @@ -21,12 +21,13 @@ static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) } /* - * Start cached io mode. + * Called on cached file open() and on first mmap() of direct_io file. + * Takes cached_io inode mode reference to be dropped on file release. * * Blocks new parallel dio writes and waits for the in-progress parallel dio * writes to complete. */ -int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) +int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -67,10 +68,9 @@ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) return 0; } -static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) +static void fuse_file_cached_io_release(struct fuse_file *ff, + struct fuse_inode *fi) { - struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fi->lock); WARN_ON(fi->iocachectr <= 0); WARN_ON(ff->iomode != IOM_CACHED); @@ -82,9 +82,8 @@ static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) } /* Start strictly uncached io mode where cache access is not allowed */ -int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb) +int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) { - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_backing *oldfb; int err = 0; @@ -99,9 +98,7 @@ int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struc err = -ETXTBSY; goto unlock; } - WARN_ON(ff->iomode != IOM_NONE); fi->iocachectr--; - ff->iomode = IOM_UNCACHED; /* fuse inode holds a single refcount of backing file */ if (!oldfb) { @@ -115,15 +112,29 @@ unlock: return err; } -void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) +/* Takes uncached_io inode mode reference to be dropped on file release */ +static int fuse_file_uncached_io_open(struct inode *inode, + struct fuse_file *ff, + struct fuse_backing *fb) { struct fuse_inode *fi = get_fuse_inode(inode); + int err; + + err = fuse_inode_uncached_io_start(fi, fb); + if (err) + return err; + + WARN_ON(ff->iomode != IOM_NONE); + ff->iomode = IOM_UNCACHED; + return 0; +} + +void fuse_inode_uncached_io_end(struct fuse_inode *fi) +{ struct fuse_backing *oldfb = NULL; spin_lock(&fi->lock); WARN_ON(fi->iocachectr >= 0); - WARN_ON(ff->iomode != IOM_UNCACHED); - ff->iomode = IOM_NONE; fi->iocachectr++; if (!fi->iocachectr) { wake_up(&fi->direct_io_waitq); @@ -134,6 +145,15 @@ void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) fuse_backing_put(oldfb); } +/* Drop uncached_io reference from passthrough open */ +static void fuse_file_uncached_io_release(struct fuse_file *ff, + struct fuse_inode *fi) +{ + WARN_ON(ff->iomode != IOM_UNCACHED); + ff->iomode = IOM_NONE; + fuse_inode_uncached_io_end(fi); +} + /* * Open flags that are allowed in combination with FOPEN_PASSTHROUGH. * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write @@ -163,7 +183,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file) return PTR_ERR(fb); /* First passthrough file open denies caching inode io mode */ - err = fuse_file_uncached_io_start(inode, ff, fb); + err = fuse_file_uncached_io_open(inode, ff, fb); if (!err) return 0; @@ -216,7 +236,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode) if (ff->open_flags & FOPEN_PASSTHROUGH) err = fuse_file_passthrough_open(inode, file); else - err = fuse_file_cached_io_start(inode, ff); + err = fuse_file_cached_io_open(inode, ff); if (err) goto fail; @@ -236,8 +256,10 @@ fail: /* No more pending io and no new io possible to inode via open/mmapped file */ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + /* - * Last parallel dio close allows caching inode io mode. + * Last passthrough file close allows caching inode io mode. * Last caching file close exits caching inode io mode. */ switch (ff->iomode) { @@ -245,10 +267,10 @@ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) /* Nothing to do */ break; case IOM_UNCACHED: - fuse_file_uncached_io_end(inode, ff); + fuse_file_uncached_io_release(ff, fi); break; case IOM_CACHED: - fuse_file_cached_io_end(inode, ff); + fuse_file_cached_io_release(ff, fi); break; } } -- GitLab From 7cc911262835419fe469ebfae89891c0e97c62ef Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 7 Apr 2024 18:57:57 +0300 Subject: [PATCH 877/989] fuse: fix parallel dio write on file open in passthrough mode Parallel dio write takes a negative refcount of fi->iocachectr and so does open of file in passthrough mode. The refcount of passthrough mode is associated with attach/detach of a fuse_backing object to fuse inode. For parallel dio write, the backing file is irrelevant, so the call to fuse_inode_uncached_io_start() passes a NULL fuse_backing object. Passing a NULL fuse_backing will result in false -EBUSY error if the file is already open in passthrough mode. Allow taking negative fi->iocachectr refcount with NULL fuse_backing, because it does not conflict with an already attached fuse_backing object. Fixes: 4a90451bbc7f ("fuse: implement open in passthrough mode") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/iomode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c index 98f1fd523dae6..c99e285f3183e 100644 --- a/fs/fuse/iomode.c +++ b/fs/fuse/iomode.c @@ -90,7 +90,7 @@ int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) spin_lock(&fi->lock); /* deny conflicting backing files on same fuse inode */ oldfb = fuse_inode_backing(fi); - if (oldfb && oldfb != fb) { + if (fb && oldfb && oldfb != fb) { err = -EBUSY; goto unlock; } @@ -101,7 +101,7 @@ int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) fi->iocachectr--; /* fuse inode holds a single refcount of backing file */ - if (!oldfb) { + if (fb && !oldfb) { oldfb = fuse_inode_backing_set(fi, fb); WARN_ON_ONCE(oldfb != NULL); } else { -- GitLab From eb4b691b9115fae4c844f5941418335575cf667f Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sat, 13 Apr 2024 17:34:31 -0700 Subject: [PATCH 878/989] fuse: fix leaked ENOSYS error on first statx call FUSE attempts to detect server support for statx by trying it once and setting no_statx=1 if it fails with ENOSYS, but consider the following scenario: - Userspace (e.g. sh) calls stat() on a file * succeeds - Userspace (e.g. lsd) calls statx(BTIME) on the same file - request_mask = STATX_BASIC_STATS | STATX_BTIME - first pass: sync=true due to differing cache_mask - statx fails and returns ENOSYS - set no_statx and retry - retry sets mask = STATX_BASIC_STATS - now mask == cache_mask; sync=false (time_before: still valid) - so we take the "else if (stat)" path - "err" is still ENOSYS from the failed statx call Fix this by zeroing "err" before retrying the failed call. Fixes: d3045530bdd2 ("fuse: implement statx") Cc: stable@vger.kernel.org # v6.6 Signed-off-by: Danny Lin Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 4a6df591add61..2b0d4781f3948 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1321,6 +1321,7 @@ retry: err = fuse_do_statx(inode, file, stat); if (err == -ENOSYS) { fc->no_statx = 1; + err = 0; goto retry; } } else { -- GitLab From fff1386cc889d8fb4089d285f883f8cba62d82ce Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 11 Apr 2024 11:15:09 +1000 Subject: [PATCH 879/989] nouveau: fix instmem race condition around ptr stores Running a lot of VK CTS in parallel against nouveau, once every few hours you might see something like this crash. BUG: kernel NULL pointer dereference, address: 0000000000000008 PGD 8000000114e6e067 P4D 8000000114e6e067 PUD 109046067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 7 PID: 53891 Comm: deqp-vk Not tainted 6.8.0-rc6+ #27 Hardware name: Gigabyte Technology Co., Ltd. Z390 I AORUS PRO WIFI/Z390 I AORUS PRO WIFI-CF, BIOS F8 11/05/2021 RIP: 0010:gp100_vmm_pgt_mem+0xe3/0x180 [nouveau] Code: c7 48 01 c8 49 89 45 58 85 d2 0f 84 95 00 00 00 41 0f b7 46 12 49 8b 7e 08 89 da 42 8d 2c f8 48 8b 47 08 41 83 c7 01 48 89 ee <48> 8b 40 08 ff d0 0f 1f 00 49 8b 7e 08 48 89 d9 48 8d 75 04 48 c1 RSP: 0000:ffffac20c5857838 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 00000000004d8001 RCX: 0000000000000001 RDX: 00000000004d8001 RSI: 00000000000006d8 RDI: ffffa07afe332180 RBP: 00000000000006d8 R08: ffffac20c5857ad0 R09: 0000000000ffff10 R10: 0000000000000001 R11: ffffa07af27e2de0 R12: 000000000000001c R13: ffffac20c5857ad0 R14: ffffa07a96fe9040 R15: 000000000000001c FS: 00007fe395eed7c0(0000) GS:ffffa07e2c980000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 000000011febe001 CR4: 00000000003706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ... ? gp100_vmm_pgt_mem+0xe3/0x180 [nouveau] ? gp100_vmm_pgt_mem+0x37/0x180 [nouveau] nvkm_vmm_iter+0x351/0xa20 [nouveau] ? __pfx_nvkm_vmm_ref_ptes+0x10/0x10 [nouveau] ? __pfx_gp100_vmm_pgt_mem+0x10/0x10 [nouveau] ? __pfx_gp100_vmm_pgt_mem+0x10/0x10 [nouveau] ? __lock_acquire+0x3ed/0x2170 ? __pfx_gp100_vmm_pgt_mem+0x10/0x10 [nouveau] nvkm_vmm_ptes_get_map+0xc2/0x100 [nouveau] ? __pfx_nvkm_vmm_ref_ptes+0x10/0x10 [nouveau] ? __pfx_gp100_vmm_pgt_mem+0x10/0x10 [nouveau] nvkm_vmm_map_locked+0x224/0x3a0 [nouveau] Adding any sort of useful debug usually makes it go away, so I hand wrote the function in a line, and debugged the asm. Every so often pt->memory->ptrs is NULL. This ptrs ptr is set in the nv50_instobj_acquire called from nvkm_kmap. If Thread A and Thread B both get to nv50_instobj_acquire around the same time, and Thread A hits the refcount_set line, and in lockstep thread B succeeds at refcount_inc_not_zero, there is a chance the ptrs value won't have been stored since refcount_set is unordered. Force a memory barrier here, I picked smp_mb, since we want it on all CPUs and it's write followed by a read. v2: use paired smp_rmb/smp_wmb. Cc: Fixes: be55287aa5ba ("drm/nouveau/imem/nv50: embed nvkm_instobj directly into nv04_instobj") Signed-off-by: Dave Airlie Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240411011510.2546857-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c index a7f3fc342d87e..dd5b5a17ece0b 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c @@ -222,8 +222,11 @@ nv50_instobj_acquire(struct nvkm_memory *memory) void __iomem *map = NULL; /* Already mapped? */ - if (refcount_inc_not_zero(&iobj->maps)) + if (refcount_inc_not_zero(&iobj->maps)) { + /* read barrier match the wmb on refcount set */ + smp_rmb(); return iobj->map; + } /* Take the lock, and re-check that another thread hasn't * already mapped the object in the meantime. @@ -250,6 +253,8 @@ nv50_instobj_acquire(struct nvkm_memory *memory) iobj->base.memory.ptrs = &nv50_instobj_fast; else iobj->base.memory.ptrs = &nv50_instobj_slow; + /* barrier to ensure the ptrs are written before refcount is set */ + smp_wmb(); refcount_set(&iobj->maps, 1); } -- GitLab From cf92bb778eda7830e79452c6917efa8474a30c1e Mon Sep 17 00:00:00 2001 From: Mikhail Kobuk Date: Thu, 11 Apr 2024 14:08:52 +0300 Subject: [PATCH 880/989] drm: nv04: Fix out of bounds access When Output Resource (dcb->or) value is assigned in fabricate_dcb_output(), there may be out of bounds access to dac_users array in case dcb->or is zero because ffs(dcb->or) is used as index there. The 'or' argument of fabricate_dcb_output() must be interpreted as a number of bit to set, not value. Utilize macros from 'enum nouveau_or' in calls instead of hardcoding. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 2e5702aff395 ("drm/nouveau: fabricate DCB encoder table for iMac G4") Fixes: 670820c0e6a9 ("drm/nouveau: Workaround incorrect DCB entry on a GeForce3 Ti 200.") Signed-off-by: Mikhail Kobuk Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240411110854.16701-1-m.kobuk@ispras.ru --- drivers/gpu/drm/nouveau/nouveau_bios.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_bios.c b/drivers/gpu/drm/nouveau/nouveau_bios.c index 479effcf607e2..79cfab53f80e2 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bios.c +++ b/drivers/gpu/drm/nouveau/nouveau_bios.c @@ -23,6 +23,7 @@ */ #include "nouveau_drv.h" +#include "nouveau_bios.h" #include "nouveau_reg.h" #include "dispnv04/hw.h" #include "nouveau_encoder.h" @@ -1677,7 +1678,7 @@ apply_dcb_encoder_quirks(struct drm_device *dev, int idx, u32 *conn, u32 *conf) */ if (nv_match_device(dev, 0x0201, 0x1462, 0x8851)) { if (*conn == 0xf2005014 && *conf == 0xffffffff) { - fabricate_dcb_output(dcb, DCB_OUTPUT_TMDS, 1, 1, 1); + fabricate_dcb_output(dcb, DCB_OUTPUT_TMDS, 1, 1, DCB_OUTPUT_B); return false; } } @@ -1763,26 +1764,26 @@ fabricate_dcb_encoder_table(struct drm_device *dev, struct nvbios *bios) #ifdef __powerpc__ /* Apple iMac G4 NV17 */ if (of_machine_is_compatible("PowerMac4,5")) { - fabricate_dcb_output(dcb, DCB_OUTPUT_TMDS, 0, all_heads, 1); - fabricate_dcb_output(dcb, DCB_OUTPUT_ANALOG, 1, all_heads, 2); + fabricate_dcb_output(dcb, DCB_OUTPUT_TMDS, 0, all_heads, DCB_OUTPUT_B); + fabricate_dcb_output(dcb, DCB_OUTPUT_ANALOG, 1, all_heads, DCB_OUTPUT_C); return; } #endif /* Make up some sane defaults */ fabricate_dcb_output(dcb, DCB_OUTPUT_ANALOG, - bios->legacy.i2c_indices.crt, 1, 1); + bios->legacy.i2c_indices.crt, 1, DCB_OUTPUT_B); if (nv04_tv_identify(dev, bios->legacy.i2c_indices.tv) >= 0) fabricate_dcb_output(dcb, DCB_OUTPUT_TV, bios->legacy.i2c_indices.tv, - all_heads, 0); + all_heads, DCB_OUTPUT_A); else if (bios->tmds.output0_script_ptr || bios->tmds.output1_script_ptr) fabricate_dcb_output(dcb, DCB_OUTPUT_TMDS, bios->legacy.i2c_indices.panel, - all_heads, 1); + all_heads, DCB_OUTPUT_B); } static int -- GitLab From 09492cb45100cab909cabe164deb7cdc14e38634 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 15 Mar 2024 16:02:53 +0800 Subject: [PATCH 881/989] cuse: add kernel-doc comments to cuse_process_init_reply() This commit adds kernel-doc style comments with complete parameter descriptions for the function cuse_process_init_reply. Signed-off-by: Yang Li Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b6cad106c37e4..0b2da7b7e2ad0 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -310,6 +310,10 @@ struct cuse_init_args { /** * cuse_process_init_reply - finish initializing CUSE channel * + * @fm: The fuse mount information containing the CUSE connection. + * @args: The arguments passed to the init reply. + * @error: The error code signifying if any error occurred during the process. + * * This function creates the character device and sets up all the * required data structures for it. Please read the comment at the * top of this file for high level overview. -- GitLab From 460b0d33cf10eee33de651381d3170ef13241650 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Apr 2024 11:02:02 -0700 Subject: [PATCH 882/989] inet: bring NLM_DONE out to a separate recv() again Commit under Fixes optimized the number of recv() calls needed during RTM_GETROUTE dumps, but we got multiple reports of applications hanging on recv() calls. Applications expect that a route dump will be terminated with a recv() reading an individual NLM_DONE message. Coalescing NLM_DONE is perfectly legal in netlink, but even tho reporters fixed the code in respective projects, chances are it will take time for those applications to get updated. So revert to old behavior (for now)? Old kernel (5.19): $ ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \ --dump getroute --json '{"rtm-family": 2}' Recv: read 692 bytes, 11 messages nl_len = 68 (52) nl_flags = 0x22 nl_type = 24 ... nl_len = 60 (44) nl_flags = 0x22 nl_type = 24 Recv: read 20 bytes, 1 messages nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 Before (6.9-rc2): $ ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \ --dump getroute --json '{"rtm-family": 2}' Recv: read 712 bytes, 12 messages nl_len = 68 (52) nl_flags = 0x22 nl_type = 24 ... nl_len = 60 (44) nl_flags = 0x22 nl_type = 24 nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 After: $ ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \ --dump getroute --json '{"rtm-family": 2}' Recv: read 692 bytes, 11 messages nl_len = 68 (52) nl_flags = 0x22 nl_type = 24 ... nl_len = 60 (44) nl_flags = 0x22 nl_type = 24 Recv: read 20 bytes, 1 messages nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 Reported-by: Stefano Brivio Link: https://lore.kernel.org/all/20240315124808.033ff58d@elisabeth Reported-by: Ilya Maximets Link: https://lore.kernel.org/all/02b50aae-f0e9-47a4-8365-a977a85975d3@ovn.org Fixes: 4ce5dc9316de ("inet: switch inet_dump_fib() to RCU protection") Signed-off-by: Jakub Kicinski Tested-by: Ilya Maximets Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 48741352a88a7..c484b1c0fc00a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1050,6 +1050,11 @@ next: e++; } } + + /* Don't let NLM_DONE coalesce into a message, even if it could. + * Some user space expects NLM_DONE in a separate recv(). + */ + err = skb->len; out: cb->args[1] = e; -- GitLab From 75ce9506ee3dc66648a7d74ab3b0acfa364d6d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Fri, 12 Apr 2024 12:02:56 +0000 Subject: [PATCH 883/989] octeontx2-pf: fix FLOW_DIS_IS_FRAGMENT implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upon reviewing the flower control flags handling in this driver, I notice that the key wasn't being used, only the mask. Ie. `tc flower ... ip_flags nofrag` was hardware offloaded as `... ip_flags frag`. Only compile tested, no access to HW. Fixes: c672e3727989 ("octeontx2-pf: Add support to filter packet based on IP fragment") Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c index 87bdb93cb066e..f4655a8c0705d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c @@ -689,6 +689,7 @@ static int otx2_tc_prepare_flow(struct otx2_nic *nic, struct otx2_tc_flow *node, if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) { struct flow_match_control match; + u32 val; flow_rule_match_control(rule, &match); if (match.mask->flags & FLOW_DIS_FIRST_FRAG) { @@ -697,12 +698,14 @@ static int otx2_tc_prepare_flow(struct otx2_nic *nic, struct otx2_tc_flow *node, } if (match.mask->flags & FLOW_DIS_IS_FRAGMENT) { + val = match.key->flags & FLOW_DIS_IS_FRAGMENT; if (ntohs(flow_spec->etype) == ETH_P_IP) { - flow_spec->ip_flag = IPV4_FLAG_MORE; + flow_spec->ip_flag = val ? IPV4_FLAG_MORE : 0; flow_mask->ip_flag = IPV4_FLAG_MORE; req->features |= BIT_ULL(NPC_IPFRAG_IPV4); } else if (ntohs(flow_spec->etype) == ETH_P_IPV6) { - flow_spec->next_header = IPPROTO_FRAGMENT; + flow_spec->next_header = val ? + IPPROTO_FRAGMENT : 0; flow_mask->next_header = 0xff; req->features |= BIT_ULL(NPC_IPFRAG_IPV6); } else { -- GitLab From b6976f323a8687cc0d55bc92c2086fd934324ed5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 15 Apr 2024 15:48:21 +0200 Subject: [PATCH 884/989] drm/ttm: stop pooling cached NUMA pages v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We only pool write combined and uncached allocations because they require extra overhead on allocation and release. If we also pool cached NUMA it not only means some extra unnecessary overhead, but also that under memory pressure it can happen that pages from the wrong NUMA node enters the pool and are re-used over and over again. This can lead to performance reduction after running into memory pressure. v2: restructure and cleanup the code a bit from the internal hack to test this. Signed-off-by: Christian König Fixes: 4482d3c94d7f ("drm/ttm: add NUMA node id to the pool") CC: stable@vger.kernel.org Reviewed-by: Felix Kuehling Link: https://patchwork.freedesktop.org/patch/msgid/20240415134821.1919-1-christian.koenig@amd.com --- drivers/gpu/drm/ttm/ttm_pool.c | 38 +++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index 112438d965ffb..6e1fd6985ffcb 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -288,17 +288,23 @@ static struct ttm_pool_type *ttm_pool_select_type(struct ttm_pool *pool, enum ttm_caching caching, unsigned int order) { - if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) + if (pool->use_dma_alloc) return &pool->caching[caching].orders[order]; #ifdef CONFIG_X86 switch (caching) { case ttm_write_combined: + if (pool->nid != NUMA_NO_NODE) + return &pool->caching[caching].orders[order]; + if (pool->use_dma32) return &global_dma32_write_combined[order]; return &global_write_combined[order]; case ttm_uncached: + if (pool->nid != NUMA_NO_NODE) + return &pool->caching[caching].orders[order]; + if (pool->use_dma32) return &global_dma32_uncached[order]; @@ -566,11 +572,17 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev, pool->use_dma_alloc = use_dma_alloc; pool->use_dma32 = use_dma32; - if (use_dma_alloc || nid != NUMA_NO_NODE) { - for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j < NR_PAGE_ORDERS; ++j) - ttm_pool_type_init(&pool->caching[i].orders[j], - pool, i, j); + for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) { + for (j = 0; j < NR_PAGE_ORDERS; ++j) { + struct ttm_pool_type *pt; + + /* Initialize only pool types which are actually used */ + pt = ttm_pool_select_type(pool, i, j); + if (pt != &pool->caching[i].orders[j]) + continue; + + ttm_pool_type_init(pt, pool, i, j); + } } } EXPORT_SYMBOL(ttm_pool_init); @@ -599,10 +611,16 @@ void ttm_pool_fini(struct ttm_pool *pool) { unsigned int i, j; - if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) { - for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j < NR_PAGE_ORDERS; ++j) - ttm_pool_type_fini(&pool->caching[i].orders[j]); + for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) { + for (j = 0; j < NR_PAGE_ORDERS; ++j) { + struct ttm_pool_type *pt; + + pt = ttm_pool_select_type(pool, i, j); + if (pt != &pool->caching[i].orders[j]) + continue; + + ttm_pool_type_fini(pt); + } } /* We removed the pool types from the LRU, but we need to also make sure -- GitLab From a2ac1cbc5397eb4e400efa66c3337886d9a63026 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Mon, 15 Apr 2024 13:10:51 +0530 Subject: [PATCH 885/989] pwm: dwc: allow suspend/resume for 16 channels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With 16 channel pwm support, we're registering two instances of pwm_chip with 8 channels each. We need to update PM functions to use both instances of pwm_chip during power state transitions. Introduce struct dwc_pwm_drvdata and use it as driver_data, which will maintain both instances of pwm_chip along with dwc_pwm_info and allow us to use them inside suspend/resume handles. Fixes: ebf2c89eb95e ("pwm: dwc: Add 16 channel support for Intel Elkhart Lake") Signed-off-by: Raag Jadav Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240415074051.14681-1-raag.jadav@intel.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-dwc-core.c | 1 - drivers/pwm/pwm-dwc.c | 88 ++++++++++++++++++++++++-------------- drivers/pwm/pwm-dwc.h | 6 +++ 3 files changed, 63 insertions(+), 32 deletions(-) diff --git a/drivers/pwm/pwm-dwc-core.c b/drivers/pwm/pwm-dwc-core.c index 043736972cb92..c8425493b95d8 100644 --- a/drivers/pwm/pwm-dwc-core.c +++ b/drivers/pwm/pwm-dwc-core.c @@ -172,7 +172,6 @@ struct pwm_chip *dwc_pwm_alloc(struct device *dev) dwc->clk_ns = 10; chip->ops = &dwc_pwm_ops; - dev_set_drvdata(dev, chip); return chip; } EXPORT_SYMBOL_GPL(dwc_pwm_alloc); diff --git a/drivers/pwm/pwm-dwc.c b/drivers/pwm/pwm-dwc.c index 676eaf8d7a53f..fb3eadf6fbc46 100644 --- a/drivers/pwm/pwm-dwc.c +++ b/drivers/pwm/pwm-dwc.c @@ -31,26 +31,34 @@ static const struct dwc_pwm_info ehl_pwm_info = { .size = 0x1000, }; -static int dwc_pwm_init_one(struct device *dev, void __iomem *base, unsigned int offset) +static int dwc_pwm_init_one(struct device *dev, struct dwc_pwm_drvdata *ddata, unsigned int idx) { struct pwm_chip *chip; struct dwc_pwm *dwc; + int ret; chip = dwc_pwm_alloc(dev); if (IS_ERR(chip)) return PTR_ERR(chip); dwc = to_dwc_pwm(chip); - dwc->base = base + offset; + dwc->base = ddata->io_base + (ddata->info->size * idx); - return devm_pwmchip_add(dev, chip); + ret = devm_pwmchip_add(dev, chip); + if (ret) + return ret; + + ddata->chips[idx] = chip; + return 0; } static int dwc_pwm_probe(struct pci_dev *pci, const struct pci_device_id *id) { const struct dwc_pwm_info *info; struct device *dev = &pci->dev; - int i, ret; + struct dwc_pwm_drvdata *ddata; + unsigned int idx; + int ret; ret = pcim_enable_device(pci); if (ret) @@ -63,17 +71,25 @@ static int dwc_pwm_probe(struct pci_dev *pci, const struct pci_device_id *id) return dev_err_probe(dev, ret, "Failed to iomap PCI BAR\n"); info = (const struct dwc_pwm_info *)id->driver_data; - - for (i = 0; i < info->nr; i++) { - /* - * No need to check for pcim_iomap_table() failure, - * pcim_iomap_regions() already does it for us. - */ - ret = dwc_pwm_init_one(dev, pcim_iomap_table(pci)[0], i * info->size); + ddata = devm_kzalloc(dev, struct_size(ddata, chips, info->nr), GFP_KERNEL); + if (!ddata) + return -ENOMEM; + + /* + * No need to check for pcim_iomap_table() failure, + * pcim_iomap_regions() already does it for us. + */ + ddata->io_base = pcim_iomap_table(pci)[0]; + ddata->info = info; + + for (idx = 0; idx < ddata->info->nr; idx++) { + ret = dwc_pwm_init_one(dev, ddata, idx); if (ret) return ret; } + dev_set_drvdata(dev, ddata); + pm_runtime_put(dev); pm_runtime_allow(dev); @@ -88,19 +104,24 @@ static void dwc_pwm_remove(struct pci_dev *pci) static int dwc_pwm_suspend(struct device *dev) { - struct pwm_chip *chip = dev_get_drvdata(dev); - struct dwc_pwm *dwc = to_dwc_pwm(chip); - int i; - - for (i = 0; i < DWC_TIMERS_TOTAL; i++) { - if (chip->pwms[i].state.enabled) { - dev_err(dev, "PWM %u in use by consumer (%s)\n", - i, chip->pwms[i].label); - return -EBUSY; + struct dwc_pwm_drvdata *ddata = dev_get_drvdata(dev); + unsigned int idx; + + for (idx = 0; idx < ddata->info->nr; idx++) { + struct pwm_chip *chip = ddata->chips[idx]; + struct dwc_pwm *dwc = to_dwc_pwm(chip); + unsigned int i; + + for (i = 0; i < DWC_TIMERS_TOTAL; i++) { + if (chip->pwms[i].state.enabled) { + dev_err(dev, "PWM %u in use by consumer (%s)\n", + i, chip->pwms[i].label); + return -EBUSY; + } + dwc->ctx[i].cnt = dwc_pwm_readl(dwc, DWC_TIM_LD_CNT(i)); + dwc->ctx[i].cnt2 = dwc_pwm_readl(dwc, DWC_TIM_LD_CNT2(i)); + dwc->ctx[i].ctrl = dwc_pwm_readl(dwc, DWC_TIM_CTRL(i)); } - dwc->ctx[i].cnt = dwc_pwm_readl(dwc, DWC_TIM_LD_CNT(i)); - dwc->ctx[i].cnt2 = dwc_pwm_readl(dwc, DWC_TIM_LD_CNT2(i)); - dwc->ctx[i].ctrl = dwc_pwm_readl(dwc, DWC_TIM_CTRL(i)); } return 0; @@ -108,14 +129,19 @@ static int dwc_pwm_suspend(struct device *dev) static int dwc_pwm_resume(struct device *dev) { - struct pwm_chip *chip = dev_get_drvdata(dev); - struct dwc_pwm *dwc = to_dwc_pwm(chip); - int i; - - for (i = 0; i < DWC_TIMERS_TOTAL; i++) { - dwc_pwm_writel(dwc, dwc->ctx[i].cnt, DWC_TIM_LD_CNT(i)); - dwc_pwm_writel(dwc, dwc->ctx[i].cnt2, DWC_TIM_LD_CNT2(i)); - dwc_pwm_writel(dwc, dwc->ctx[i].ctrl, DWC_TIM_CTRL(i)); + struct dwc_pwm_drvdata *ddata = dev_get_drvdata(dev); + unsigned int idx; + + for (idx = 0; idx < ddata->info->nr; idx++) { + struct pwm_chip *chip = ddata->chips[idx]; + struct dwc_pwm *dwc = to_dwc_pwm(chip); + unsigned int i; + + for (i = 0; i < DWC_TIMERS_TOTAL; i++) { + dwc_pwm_writel(dwc, dwc->ctx[i].cnt, DWC_TIM_LD_CNT(i)); + dwc_pwm_writel(dwc, dwc->ctx[i].cnt2, DWC_TIM_LD_CNT2(i)); + dwc_pwm_writel(dwc, dwc->ctx[i].ctrl, DWC_TIM_CTRL(i)); + } } return 0; diff --git a/drivers/pwm/pwm-dwc.h b/drivers/pwm/pwm-dwc.h index a8b074841ae80..c6e2df5a61227 100644 --- a/drivers/pwm/pwm-dwc.h +++ b/drivers/pwm/pwm-dwc.h @@ -38,6 +38,12 @@ struct dwc_pwm_info { unsigned int size; }; +struct dwc_pwm_drvdata { + const struct dwc_pwm_info *info; + void __iomem *io_base; + struct pwm_chip *chips[]; +}; + struct dwc_pwm_ctx { u32 cnt; u32 cnt2; -- GitLab From fb7c3d8ba039df877886fd457538d8b24ca9c84b Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Thu, 4 Apr 2024 10:18:08 +0200 Subject: [PATCH 886/989] dt-bindings: pwm: mediatek,pwm-disp: Document power-domains property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow the power-domains property to the PWM_DISP block as on some SoCs this does need at most one power domain. Fixes: b09b179bac0a ("dt-bindings: pwm: Convert pwm-mtk-disp.txt to mediatek,pwm-disp.yaml format") Signed-off-by: AngeloGioacchino Del Regno Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240404081808.92199-1-angelogioacchino.delregno@collabora.com Signed-off-by: Uwe Kleine-König --- Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml index afcdeed4e88af..bc813fe74faba 100644 --- a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml +++ b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml @@ -52,6 +52,9 @@ properties: - const: main - const: mm + power-domains: + maxItems: 1 + required: - compatible - reg -- GitLab From b32233accefff1338806f064fb9b62cf5bc0609f Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Thu, 11 Apr 2024 22:55:09 -0400 Subject: [PATCH 887/989] drm/vmwgfx: Fix prime import/export vmwgfx never supported prime import of external buffers. Furthermore the driver exposes two different objects to userspace: vmw_surface's and gem buffers but prime import/export only worked with vmw_surfaces. Because gem buffers are used through the dumb_buffer interface this meant that the driver created buffers couldn't have been prime exported or imported. Fix prime import/export. Makes IGT's kms_prime pass. Signed-off-by: Zack Rusin Fixes: 8afa13a0583f ("drm/vmwgfx: Implement DRIVER_GEM") Cc: # v6.6+ Reviewed-by: Martin Krastev Link: https://patchwork.freedesktop.org/patch/msgid/20240412025511.78553-4-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_blit.c | 35 +++++++++++++++-- drivers/gpu/drm/vmwgfx/vmwgfx_bo.c | 7 ++-- drivers/gpu/drm/vmwgfx/vmwgfx_bo.h | 2 + drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 1 + drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 3 ++ drivers/gpu/drm/vmwgfx/vmwgfx_gem.c | 32 ++++++++++++++++ drivers/gpu/drm/vmwgfx/vmwgfx_prime.c | 15 +++++++- drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 44 +++++++++++++++------- 8 files changed, 117 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c index c52c7bf1485b1..717d624e9a052 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c @@ -456,8 +456,10 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, .no_wait_gpu = false }; u32 j, initial_line = dst_offset / dst_stride; - struct vmw_bo_blit_line_data d; + struct vmw_bo_blit_line_data d = {0}; int ret = 0; + struct page **dst_pages = NULL; + struct page **src_pages = NULL; /* Buffer objects need to be either pinned or reserved: */ if (!(dst->pin_count)) @@ -477,12 +479,35 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, return ret; } + if (!src->ttm->pages && src->ttm->sg) { + src_pages = kvmalloc_array(src->ttm->num_pages, + sizeof(struct page *), GFP_KERNEL); + if (!src_pages) + return -ENOMEM; + ret = drm_prime_sg_to_page_array(src->ttm->sg, src_pages, + src->ttm->num_pages); + if (ret) + goto out; + } + if (!dst->ttm->pages && dst->ttm->sg) { + dst_pages = kvmalloc_array(dst->ttm->num_pages, + sizeof(struct page *), GFP_KERNEL); + if (!dst_pages) { + ret = -ENOMEM; + goto out; + } + ret = drm_prime_sg_to_page_array(dst->ttm->sg, dst_pages, + dst->ttm->num_pages); + if (ret) + goto out; + } + d.mapped_dst = 0; d.mapped_src = 0; d.dst_addr = NULL; d.src_addr = NULL; - d.dst_pages = dst->ttm->pages; - d.src_pages = src->ttm->pages; + d.dst_pages = dst->ttm->pages ? dst->ttm->pages : dst_pages; + d.src_pages = src->ttm->pages ? src->ttm->pages : src_pages; d.dst_num_pages = PFN_UP(dst->resource->size); d.src_num_pages = PFN_UP(src->resource->size); d.dst_prot = ttm_io_prot(dst, dst->resource, PAGE_KERNEL); @@ -504,6 +529,10 @@ out: kunmap_atomic(d.src_addr); if (d.dst_addr) kunmap_atomic(d.dst_addr); + if (src_pages) + kvfree(src_pages); + if (dst_pages) + kvfree(dst_pages); return ret; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c index bfd41ce3c8f4f..e5eb21a471a60 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c @@ -377,7 +377,8 @@ static int vmw_bo_init(struct vmw_private *dev_priv, { struct ttm_operation_ctx ctx = { .interruptible = params->bo_type != ttm_bo_type_kernel, - .no_wait_gpu = false + .no_wait_gpu = false, + .resv = params->resv, }; struct ttm_device *bdev = &dev_priv->bdev; struct drm_device *vdev = &dev_priv->drm; @@ -394,8 +395,8 @@ static int vmw_bo_init(struct vmw_private *dev_priv, vmw_bo_placement_set(vmw_bo, params->domain, params->busy_domain); ret = ttm_bo_init_reserved(bdev, &vmw_bo->tbo, params->bo_type, - &vmw_bo->placement, 0, &ctx, NULL, - NULL, destroy); + &vmw_bo->placement, 0, &ctx, + params->sg, params->resv, destroy); if (unlikely(ret)) return ret; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h index 0d496dc9c6af7..f349642e6190d 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h @@ -55,6 +55,8 @@ struct vmw_bo_params { enum ttm_bo_type bo_type; size_t size; bool pin; + struct dma_resv *resv; + struct sg_table *sg; }; /** diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index 0a304706e0132..58fb40c93100a 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -1628,6 +1628,7 @@ static const struct drm_driver driver = { .prime_fd_to_handle = vmw_prime_fd_to_handle, .prime_handle_to_fd = vmw_prime_handle_to_fd, + .gem_prime_import_sg_table = vmw_prime_import_sg_table, .fops = &vmwgfx_driver_fops, .name = VMWGFX_DRIVER_NAME, diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h index 12efecc17df66..b019a1a1787af 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h @@ -1130,6 +1130,9 @@ extern int vmw_prime_handle_to_fd(struct drm_device *dev, struct drm_file *file_priv, uint32_t handle, uint32_t flags, int *prime_fd); +struct drm_gem_object *vmw_prime_import_sg_table(struct drm_device *dev, + struct dma_buf_attachment *attach, + struct sg_table *table); /* * MemoryOBject management - vmwgfx_mob.c diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c index 12787bb9c111d..d6bcaf078b1f4 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c @@ -149,6 +149,38 @@ out_no_bo: return ret; } +struct drm_gem_object *vmw_prime_import_sg_table(struct drm_device *dev, + struct dma_buf_attachment *attach, + struct sg_table *table) +{ + int ret; + struct vmw_private *dev_priv = vmw_priv(dev); + struct drm_gem_object *gem = NULL; + struct vmw_bo *vbo; + struct vmw_bo_params params = { + .domain = (dev_priv->has_mob) ? VMW_BO_DOMAIN_SYS : VMW_BO_DOMAIN_VRAM, + .busy_domain = VMW_BO_DOMAIN_SYS, + .bo_type = ttm_bo_type_sg, + .size = attach->dmabuf->size, + .pin = false, + .resv = attach->dmabuf->resv, + .sg = table, + + }; + + dma_resv_lock(params.resv, NULL); + + ret = vmw_bo_create(dev_priv, ¶ms, &vbo); + if (ret != 0) + goto out_no_bo; + + vbo->tbo.base.funcs = &vmw_gem_object_funcs; + + gem = &vbo->tbo.base; +out_no_bo: + dma_resv_unlock(params.resv); + return gem; +} int vmw_gem_object_create_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c b/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c index 2d72a5ee7c0c7..c99cad4449915 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c @@ -75,8 +75,12 @@ int vmw_prime_fd_to_handle(struct drm_device *dev, int fd, u32 *handle) { struct ttm_object_file *tfile = vmw_fpriv(file_priv)->tfile; + int ret = ttm_prime_fd_to_handle(tfile, fd, handle); - return ttm_prime_fd_to_handle(tfile, fd, handle); + if (ret) + ret = drm_gem_prime_fd_to_handle(dev, file_priv, fd, handle); + + return ret; } int vmw_prime_handle_to_fd(struct drm_device *dev, @@ -85,5 +89,12 @@ int vmw_prime_handle_to_fd(struct drm_device *dev, int *prime_fd) { struct ttm_object_file *tfile = vmw_fpriv(file_priv)->tfile; - return ttm_prime_handle_to_fd(tfile, handle, flags, prime_fd); + int ret; + + if (handle > VMWGFX_NUM_MOB) + ret = ttm_prime_handle_to_fd(tfile, handle, flags, prime_fd); + else + ret = drm_gem_prime_handle_to_fd(dev, file_priv, handle, flags, prime_fd); + + return ret; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c index 4d23d0a70bcb7..621d98b376bbb 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c @@ -188,13 +188,18 @@ static int vmw_ttm_map_dma(struct vmw_ttm_tt *vmw_tt) switch (dev_priv->map_mode) { case vmw_dma_map_bind: case vmw_dma_map_populate: - vsgt->sgt = &vmw_tt->sgt; - ret = sg_alloc_table_from_pages_segment( - &vmw_tt->sgt, vsgt->pages, vsgt->num_pages, 0, - (unsigned long)vsgt->num_pages << PAGE_SHIFT, - dma_get_max_seg_size(dev_priv->drm.dev), GFP_KERNEL); - if (ret) - goto out_sg_alloc_fail; + if (vmw_tt->dma_ttm.page_flags & TTM_TT_FLAG_EXTERNAL) { + vsgt->sgt = vmw_tt->dma_ttm.sg; + } else { + vsgt->sgt = &vmw_tt->sgt; + ret = sg_alloc_table_from_pages_segment(&vmw_tt->sgt, + vsgt->pages, vsgt->num_pages, 0, + (unsigned long)vsgt->num_pages << PAGE_SHIFT, + dma_get_max_seg_size(dev_priv->drm.dev), + GFP_KERNEL); + if (ret) + goto out_sg_alloc_fail; + } ret = vmw_ttm_map_for_dma(vmw_tt); if (unlikely(ret != 0)) @@ -209,8 +214,9 @@ static int vmw_ttm_map_dma(struct vmw_ttm_tt *vmw_tt) return 0; out_map_fail: - sg_free_table(vmw_tt->vsgt.sgt); - vmw_tt->vsgt.sgt = NULL; + drm_warn(&dev_priv->drm, "VSG table map failed!"); + sg_free_table(vsgt->sgt); + vsgt->sgt = NULL; out_sg_alloc_fail: return ret; } @@ -356,15 +362,17 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) static int vmw_ttm_populate(struct ttm_device *bdev, struct ttm_tt *ttm, struct ttm_operation_ctx *ctx) { - int ret; + bool external = (ttm->page_flags & TTM_TT_FLAG_EXTERNAL) != 0; - /* TODO: maybe completely drop this ? */ if (ttm_tt_is_populated(ttm)) return 0; - ret = ttm_pool_alloc(&bdev->pool, ttm, ctx); + if (external && ttm->sg) + return drm_prime_sg_to_dma_addr_array(ttm->sg, + ttm->dma_address, + ttm->num_pages); - return ret; + return ttm_pool_alloc(&bdev->pool, ttm, ctx); } static void vmw_ttm_unpopulate(struct ttm_device *bdev, @@ -372,6 +380,10 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev, { struct vmw_ttm_tt *vmw_tt = container_of(ttm, struct vmw_ttm_tt, dma_ttm); + bool external = (ttm->page_flags & TTM_TT_FLAG_EXTERNAL) != 0; + + if (external) + return; vmw_ttm_unbind(bdev, ttm); @@ -390,6 +402,7 @@ static struct ttm_tt *vmw_ttm_tt_create(struct ttm_buffer_object *bo, { struct vmw_ttm_tt *vmw_be; int ret; + bool external = bo->type == ttm_bo_type_sg; vmw_be = kzalloc(sizeof(*vmw_be), GFP_KERNEL); if (!vmw_be) @@ -398,7 +411,10 @@ static struct ttm_tt *vmw_ttm_tt_create(struct ttm_buffer_object *bo, vmw_be->dev_priv = vmw_priv_from_ttm(bo->bdev); vmw_be->mob = NULL; - if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent) + if (external) + page_flags |= TTM_TT_FLAG_EXTERNAL | TTM_TT_FLAG_EXTERNAL_MAPPABLE; + + if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent || external) ret = ttm_sg_tt_init(&vmw_be->dma_ttm, bo, page_flags, ttm_cached); else -- GitLab From a60ccade88f926e871a57176e86a34bbf0db0098 Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Thu, 11 Apr 2024 22:55:10 -0400 Subject: [PATCH 888/989] drm/vmwgfx: Fix crtc's atomic check conditional The conditional was supposed to prevent enabling of a crtc state without a set primary plane. Accidently it also prevented disabling crtc state with a set primary plane. Neither is correct. Fix the conditional and just driver-warn when a crtc state has been enabled without a primary plane which will help debug broken userspace. Fixes IGT's kms_atomic_interruptible and kms_atomic_transition tests. Signed-off-by: Zack Rusin Fixes: 06ec41909e31 ("drm/vmwgfx: Add and connect CRTC helper functions") Cc: Broadcom internal kernel review list Cc: dri-devel@lists.freedesktop.org Cc: # v4.12+ Reviewed-by: Ian Forbes Reviewed-by: Martin Krastev Link: https://patchwork.freedesktop.org/patch/msgid/20240412025511.78553-5-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index cd4925346ed45..84ae4e10a2ebe 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -933,6 +933,7 @@ int vmw_du_cursor_plane_atomic_check(struct drm_plane *plane, int vmw_du_crtc_atomic_check(struct drm_crtc *crtc, struct drm_atomic_state *state) { + struct vmw_private *vmw = vmw_priv(crtc->dev); struct drm_crtc_state *new_state = drm_atomic_get_new_crtc_state(state, crtc); struct vmw_display_unit *du = vmw_crtc_to_du(new_state->crtc); @@ -940,9 +941,13 @@ int vmw_du_crtc_atomic_check(struct drm_crtc *crtc, bool has_primary = new_state->plane_mask & drm_plane_mask(crtc->primary); - /* We always want to have an active plane with an active CRTC */ - if (has_primary != new_state->enable) - return -EINVAL; + /* + * This is fine in general, but broken userspace might expect + * some actual rendering so give a clue as why it's blank. + */ + if (new_state->enable && !has_primary) + drm_dbg_driver(&vmw->drm, + "CRTC without a primary plane will be blank.\n"); if (new_state->connector_mask != connector_mask && -- GitLab From d4c972bff3129a9dd4c22a3999fd8eba1a81531a Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Thu, 11 Apr 2024 22:55:11 -0400 Subject: [PATCH 889/989] drm/vmwgfx: Sort primary plane formats by order of preference The table of primary plane formats wasn't sorted at all, leading to applications picking our least desirable formats by defaults. Sort the primary plane formats according to our order of preference. Nice side-effect of this change is that it makes IGT's kms_atomic plane-invalid-params pass because the test picks the first format which for vmwgfx was DRM_FORMAT_XRGB1555 and uses fb's with odd sizes which make Pixman, which IGT depends on assert due to the fact that our 16bpp formats aren't 32 bit aligned like Pixman requires all formats to be. Signed-off-by: Zack Rusin Fixes: 36cc79bc9077 ("drm/vmwgfx: Add universal plane support") Cc: Broadcom internal kernel review list Cc: dri-devel@lists.freedesktop.org Cc: # v4.12+ Acked-by: Pekka Paalanen Link: https://patchwork.freedesktop.org/patch/msgid/20240412025511.78553-6-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_kms.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h index a94947b588e85..19a843da87b78 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h @@ -243,10 +243,10 @@ struct vmw_framebuffer_bo { static const uint32_t __maybe_unused vmw_primary_plane_formats[] = { - DRM_FORMAT_XRGB1555, - DRM_FORMAT_RGB565, DRM_FORMAT_XRGB8888, DRM_FORMAT_ARGB8888, + DRM_FORMAT_RGB565, + DRM_FORMAT_XRGB1555, }; static const uint32_t __maybe_unused vmw_cursor_plane_formats[] = { -- GitLab From 015a12a4a6708d9cadcaea8334e270747923394c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 15 Apr 2024 15:10:03 +0530 Subject: [PATCH 890/989] arm64/hugetlb: Fix page table walk in huge_pte_alloc() Currently normal HugeTLB fault ends up crashing the kernel, as p4dp derived from p4d_offset() is an invalid address when PGTABLE_LEVEL = 5. A p4d level entry needs to be allocated when not available while walking the page table during HugeTLB faults. Let's call p4d_alloc() to allocate such entries when required instead of current p4d_offset(). Unable to handle kernel paging request at virtual address ffffffff80000000 Mem abort info: ESR = 0x0000000096000005 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x05: level 1 translation fault Data abort info: ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 swapper pgtable: 4k pages, 52-bit VAs, pgdp=0000000081da9000 [ffffffff80000000] pgd=1000000082cec003, p4d=0000000082c32003, pud=0000000000000000 Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 108 Comm: high_addr_hugep Not tainted 6.9.0-rc4 #48 Hardware name: Foundation-v8A (DT) pstate: 01402005 (nzcv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : huge_pte_alloc+0xd4/0x334 lr : hugetlb_fault+0x1b8/0xc68 sp : ffff8000833bbc20 x29: ffff8000833bbc20 x28: fff000080080cb58 x27: ffff800082a7cc58 x26: 0000000000000000 x25: fff0000800378e40 x24: fff00008008d6c60 x23: 00000000de9dbf07 x22: fff0000800378e40 x21: 0004000000000000 x20: 0004000000000000 x19: ffffffff80000000 x18: 1ffe00010011d7a1 x17: 0000000000000001 x16: ffffffffffffffff x15: 0000000000000001 x14: 0000000000000000 x13: ffff8000816120d0 x12: ffffffffffffffff x11: 0000000000000000 x10: fff00008008ebd0c x9 : 0004000000000000 x8 : 0000000000001255 x7 : fff00008003e2000 x6 : 00000000061d54b0 x5 : 0000000000001000 x4 : ffffffff80000000 x3 : 0000000000200000 x2 : 0000000000000004 x1 : 0000000080000000 x0 : 0000000000000000 Call trace: huge_pte_alloc+0xd4/0x334 hugetlb_fault+0x1b8/0xc68 handle_mm_fault+0x260/0x29c do_page_fault+0xfc/0x47c do_translation_fault+0x68/0x74 do_mem_abort+0x44/0x94 el0_da+0x2c/0x9c el0t_64_sync_handler+0x70/0xc4 el0t_64_sync+0x190/0x194 Code: aa000084 cb010084 b24c2c84 8b130c93 (f9400260) ---[ end trace 0000000000000000 ]--- Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Fixes: a6bbf5d4d9d1 ("arm64: mm: Add definitions to support 5 levels of paging") Reported-by: Dev Jain Acked-by: Ard Biesheuvel Acked-by: Mark Rutland Signed-off-by: Anshuman Khandual Reviewed-by: Ryan Roberts Link: https://lore.kernel.org/r/20240415094003.1812018-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/hugetlbpage.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 0f0e10bb0a954..b872b003a55f3 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -276,7 +276,10 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep = NULL; pgdp = pgd_offset(mm, addr); - p4dp = p4d_offset(pgdp, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + return NULL; + pudp = pud_alloc(mm, p4dp, addr); if (!pudp) return NULL; -- GitLab From 3078e059a5e984663c5c2b04485375c84c2700f9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 12 Apr 2024 14:36:38 +0800 Subject: [PATCH 891/989] bcachefs: fix error path of __bch2_read_super() In __bch2_read_super(), if kstrdup() fails, it needs to release memory in sb->holder, fix to call bch2_free_super() in the error path. Signed-off-by: Chao Yu Reviewed-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e0aa3655b63b4..648986a7dbdee 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -700,8 +700,11 @@ retry: return -ENOMEM; sb->sb_name = kstrdup(path, GFP_KERNEL); - if (!sb->sb_name) - return -ENOMEM; + if (!sb->sb_name) { + ret = -ENOMEM; + prt_printf(&err, "error allocating memory for sb_name"); + goto err; + } #ifndef __KERNEL__ if (opt_get(*opts, direct_io) == false) -- GitLab From ad29cf999a9161e7849aa229d2028854f90728c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 18:02:15 -0400 Subject: [PATCH 892/989] bcachefs: set_btree_iter_dontneed also clears should_be_locked This is part of a larger series cleaning up the semantics of should_be_locked and adding assertions around it; if we don't need an iterator/path anymore, it clearly doesn't need to be locked. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1d58d447b386c..1c70836dd7cce 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -498,8 +498,13 @@ static inline void set_btree_iter_dontneed(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; - if (!trans->restarted) - btree_iter_path(trans, iter)->preserve = false; + if (!iter->path || trans->restarted) + return; + + struct btree_path *path = btree_iter_path(trans, iter); + path->preserve = false; + if (path->ref == 1) + path->should_be_locked = false; } void *__bch2_trans_kmalloc(struct btree_trans *, size_t); -- GitLab From 35f4f8c9fc972248055096d63b782060e473311b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Wed, 3 Apr 2024 17:24:50 -0300 Subject: [PATCH 893/989] drm/v3d: Don't increment `enabled_ns` twice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 509433d8146c ("drm/v3d: Expose the total GPU usage stats on sysfs") introduced the calculation of global GPU stats. For the regards, it used the already existing infrastructure provided by commit 09a93cc4f7d1 ("drm/v3d: Implement show_fdinfo() callback for GPU usage stats"). While adding global GPU stats calculation ability, the author forgot to delete the existing one. Currently, the value of `enabled_ns` is incremented twice by the end of the job, when it should be added just once. Therefore, delete the leftovers from commit 509433d8146c ("drm/v3d: Expose the total GPU usage stats on sysfs"). Fixes: 509433d8146c ("drm/v3d: Expose the total GPU usage stats on sysfs") Reported-by: Tvrtko Ursulin Signed-off-by: Maíra Canal Reviewed-by: Tvrtko Ursulin Reviewed-by: Jose Maria Casanova Crespo Link: https://patchwork.freedesktop.org/patch/msgid/20240403203517.731876-2-mcanal@igalia.com --- drivers/gpu/drm/v3d/v3d_irq.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_irq.c b/drivers/gpu/drm/v3d/v3d_irq.c index 2e04f6cb661e4..ce6b2fb341d1f 100644 --- a/drivers/gpu/drm/v3d/v3d_irq.c +++ b/drivers/gpu/drm/v3d/v3d_irq.c @@ -105,7 +105,6 @@ v3d_irq(int irq, void *arg) struct v3d_file_priv *file = v3d->bin_job->base.file->driver_priv; u64 runtime = local_clock() - file->start_ns[V3D_BIN]; - file->enabled_ns[V3D_BIN] += local_clock() - file->start_ns[V3D_BIN]; file->jobs_sent[V3D_BIN]++; v3d->queue[V3D_BIN].jobs_sent++; @@ -126,7 +125,6 @@ v3d_irq(int irq, void *arg) struct v3d_file_priv *file = v3d->render_job->base.file->driver_priv; u64 runtime = local_clock() - file->start_ns[V3D_RENDER]; - file->enabled_ns[V3D_RENDER] += local_clock() - file->start_ns[V3D_RENDER]; file->jobs_sent[V3D_RENDER]++; v3d->queue[V3D_RENDER].jobs_sent++; @@ -147,7 +145,6 @@ v3d_irq(int irq, void *arg) struct v3d_file_priv *file = v3d->csd_job->base.file->driver_priv; u64 runtime = local_clock() - file->start_ns[V3D_CSD]; - file->enabled_ns[V3D_CSD] += local_clock() - file->start_ns[V3D_CSD]; file->jobs_sent[V3D_CSD]++; v3d->queue[V3D_CSD].jobs_sent++; @@ -195,7 +192,6 @@ v3d_hub_irq(int irq, void *arg) struct v3d_file_priv *file = v3d->tfu_job->base.file->driver_priv; u64 runtime = local_clock() - file->start_ns[V3D_TFU]; - file->enabled_ns[V3D_TFU] += local_clock() - file->start_ns[V3D_TFU]; file->jobs_sent[V3D_TFU]++; v3d->queue[V3D_TFU].jobs_sent++; -- GitLab From e048d668f2969cf2b76e0fa21882a1b3bb323eca Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 11 Apr 2024 11:11:06 -0700 Subject: [PATCH 894/989] configs/hardening: Fix disabling UBSAN configurations The initial change that added kernel/configs/hardening.config attempted to disable all UBSAN sanitizers except for the array bounds one while turning on UBSAN_TRAP. Unfortunately, it only got the syntax for CONFIG_UBSAN_SHIFT correct, so configurations that are on by default with CONFIG_UBSAN=y such as CONFIG_UBSAN_{BOOL,ENUM} do not get disabled properly. CONFIG_ARCH_HAS_UBSAN=y CONFIG_UBSAN=y CONFIG_UBSAN_TRAP=y CONFIG_CC_HAS_UBSAN_BOUNDS_STRICT=y CONFIG_UBSAN_BOUNDS=y CONFIG_UBSAN_BOUNDS_STRICT=y # CONFIG_UBSAN_SHIFT is not set # CONFIG_UBSAN_DIV_ZERO is not set # CONFIG_UBSAN_UNREACHABLE is not set CONFIG_UBSAN_SIGNED_WRAP=y CONFIG_UBSAN_BOOL=y CONFIG_UBSAN_ENUM=y # CONFIG_TEST_UBSAN is not set Add the missing 'is not set' to each configuration that needs it so that they get disabled as intended. CONFIG_ARCH_HAS_UBSAN=y CONFIG_UBSAN=y CONFIG_UBSAN_TRAP=y CONFIG_CC_HAS_UBSAN_BOUNDS_STRICT=y CONFIG_UBSAN_BOUNDS=y CONFIG_UBSAN_BOUNDS_STRICT=y # CONFIG_UBSAN_SHIFT is not set # CONFIG_UBSAN_DIV_ZERO is not set # CONFIG_UBSAN_UNREACHABLE is not set CONFIG_UBSAN_SIGNED_WRAP=y # CONFIG_UBSAN_BOOL is not set # CONFIG_UBSAN_ENUM is not set # CONFIG_TEST_UBSAN is not set Fixes: 215199e3d9f3 ("hardening: Provide Kconfig fragments for basic options") Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240411-fix-ubsan-in-hardening-config-v1-1-e0177c80ffaa@kernel.org Signed-off-by: Kees Cook --- kernel/configs/hardening.config | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 7a5bbfc024b7d..d6f6dc45628aa 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -39,11 +39,11 @@ CONFIG_UBSAN=y CONFIG_UBSAN_TRAP=y CONFIG_UBSAN_BOUNDS=y # CONFIG_UBSAN_SHIFT is not set -# CONFIG_UBSAN_DIV_ZERO -# CONFIG_UBSAN_UNREACHABLE -# CONFIG_UBSAN_BOOL -# CONFIG_UBSAN_ENUM -# CONFIG_UBSAN_ALIGNMENT +# CONFIG_UBSAN_DIV_ZERO is not set +# CONFIG_UBSAN_UNREACHABLE is not set +# CONFIG_UBSAN_BOOL is not set +# CONFIG_UBSAN_ENUM is not set +# CONFIG_UBSAN_ALIGNMENT is not set # Sampling-based heap out-of-bounds and use-after-free detection. CONFIG_KFENCE=y -- GitLab From 7fcb91d94e897413c0345bb32ea11293f33efbb1 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 11 Apr 2024 11:11:07 -0700 Subject: [PATCH 895/989] configs/hardening: Disable CONFIG_UBSAN_SIGNED_WRAP kernel/configs/hardening.config turns on UBSAN for the bounds sanitizer, as that in combination with trapping can stop the exploitation of buffer overflows within the kernel. At the same time, hardening.config turns off every other UBSAN sanitizer because trapping means all UBSAN reports will be fatal and the problems brought up by other sanitizers generally do not have security implications. The signed integer overflow sanitizer was recently added back to the kernel and it is default on with just CONFIG_UBSAN=y, meaning that it gets enabled when merging hardening.config into another configuration. While this sanitizer does have security implications like the array bounds sanitizer, work to clean up enough instances to allow this to run in production environments is still ramping up, which means regular users and testers may be broken by these instances with CONFIG_UBSAN_TRAP=y. Disable CONFIG_UBSAN_SIGNED_WRAP in hardening.config to avoid this situation. Fixes: 557f8c582a9b ("ubsan: Reintroduce signed overflow sanitizer") Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240411-fix-ubsan-in-hardening-config-v1-2-e0177c80ffaa@kernel.org Signed-off-by: Kees Cook --- kernel/configs/hardening.config | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index d6f6dc45628aa..4b4cfcba31901 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -41,6 +41,7 @@ CONFIG_UBSAN_BOUNDS=y # CONFIG_UBSAN_SHIFT is not set # CONFIG_UBSAN_DIV_ZERO is not set # CONFIG_UBSAN_UNREACHABLE is not set +# CONFIG_UBSAN_SIGNED_WRAP is not set # CONFIG_UBSAN_BOOL is not set # CONFIG_UBSAN_ENUM is not set # CONFIG_UBSAN_ALIGNMENT is not set -- GitLab From ee7e980dc7c9f22c142807c5f582a6524138f57a Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Thu, 4 Apr 2024 19:35:53 -0400 Subject: [PATCH 896/989] drm/nouveau/kms/nv50-: Disable AUX bus for disconnected DP ports GSP has its own state for keeping track of whether or not a given display connector is plugged in or not, and enforces this state on the driver. In particular, AUX transactions on a DisplayPort connector which GSP says is disconnected can never succeed - and can in some cases even cause unexpected timeouts, which can trickle up to cause other problems. A good example of this is runtime power management: where we can actually get stuck trying to resume the GPU if a userspace application like fwupd tries accessing a drm_aux_dev for a disconnected port. This was an issue I hit a few times with my Slimbook Executive 16 - where trying to offload something to the discrete GPU would wake it up, and then potentially cause it to timeout as fwupd tried to immediately access the dp_aux_dev nodes for nouveau. Likewise: we don't really have any cases I know of where we'd want to ignore this state and try an aux transaction anyway - and failing pointless aux transactions immediately can even speed things up. So - let's start enabling/disabling the aux bus in nouveau_dp_detect() to fix this. We enable the aux bus during connector probing, and leave it enabled if we discover something is actually on the connector. Otherwise, we just shut it off. This should fix some people's runtime PM issues (like myself), and also get rid of quite of a lot of GSP error spam in dmesg. Signed-off-by: Lyude Paul Reviewed-by: Dave Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20240404233736.7946-2-lyude@redhat.com (cherry picked from commit 9c8a10bf1f3467b2c16f6848249bdc7692ace825) --- drivers/gpu/drm/nouveau/nouveau_dp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_dp.c b/drivers/gpu/drm/nouveau/nouveau_dp.c index 7de7707ec6a89..3f6ccce20ea02 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dp.c +++ b/drivers/gpu/drm/nouveau/nouveau_dp.c @@ -232,6 +232,9 @@ nouveau_dp_detect(struct nouveau_connector *nv_connector, dpcd[DP_DPCD_REV] != 0) return NOUVEAU_DP_SST; + // Ensure that the aux bus is enabled for probing + drm_dp_dpcd_set_powered(&nv_connector->aux, true); + mutex_lock(&nv_encoder->dp.hpd_irq_lock); if (mstm) { /* If we're not ready to handle MST state changes yet, just @@ -293,6 +296,13 @@ out: if (mstm && !mstm->suspended && ret != NOUVEAU_DP_MST) nv50_mstm_remove(mstm); + /* GSP doesn't like when we try to do aux transactions on a port it considers disconnected, + * and since we don't really have a usecase for that anyway - just disable the aux bus here + * if we've decided the connector is disconnected + */ + if (ret == NOUVEAU_DP_NONE) + drm_dp_dpcd_set_powered(&nv_connector->aux, false); + mutex_unlock(&nv_encoder->dp.hpd_irq_lock); return ret; } -- GitLab From bf52d7f9b2067f02efe7e32697479097aba4a055 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Thu, 4 Apr 2024 19:35:54 -0400 Subject: [PATCH 897/989] drm/nouveau/dp: Don't probe eDP ports twice harder I didn't pay close enough attention the last time I tried to fix this problem - while we currently do correctly take care to make sure we don't probe a connected eDP port more then once, we don't do the same thing for eDP ports we found to be disconnected. So, fix this and make sure we only ever probe eDP ports once and then leave them at that connector state forever (since without HPD, it's not going to change on its own anyway). This should get rid of the last few GSP errors getting spit out during runtime suspend and resume on some machines, as we tried to reprobe eDP ports in response to ACPI hotplug probe events. Signed-off-by: Lyude Paul Reviewed-by: Dave Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20240404233736.7946-3-lyude@redhat.com (cherry picked from commit fe6660b661c3397af0867d5d098f5b26581f1290) --- drivers/gpu/drm/nouveau/nouveau_dp.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dp.c b/drivers/gpu/drm/nouveau/nouveau_dp.c index 3f6ccce20ea02..a72c45809484a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dp.c +++ b/drivers/gpu/drm/nouveau/nouveau_dp.c @@ -225,12 +225,15 @@ nouveau_dp_detect(struct nouveau_connector *nv_connector, u8 *dpcd = nv_encoder->dp.dpcd; int ret = NOUVEAU_DP_NONE, hpd; - /* If we've already read the DPCD on an eDP device, we don't need to - * reread it as it won't change + /* eDP ports don't support hotplugging - so there's no point in probing eDP ports unless we + * haven't probed them once before. */ - if (connector->connector_type == DRM_MODE_CONNECTOR_eDP && - dpcd[DP_DPCD_REV] != 0) - return NOUVEAU_DP_SST; + if (connector->connector_type == DRM_MODE_CONNECTOR_eDP) { + if (connector->status == connector_status_connected) + return NOUVEAU_DP_SST; + else if (connector->status == connector_status_disconnected) + return NOUVEAU_DP_NONE; + } // Ensure that the aux bus is enabled for probing drm_dp_dpcd_set_powered(&nv_connector->aux, true); -- GitLab From f4626c12e4b538f757a73d08f4d86d564175b4f7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 15 Apr 2024 11:28:35 -0700 Subject: [PATCH 898/989] ubsan: Add awareness of signed integer overflow traps On arm64, UBSAN traps can be decoded from the trap instruction. Add the add, sub, and mul overflow trap codes now that CONFIG_UBSAN_SIGNED_WRAP exists. Seen under clang 19: Internal error: UBSAN: unrecognized failure code: 00000000f2005515 [#1] PREEMPT SMP Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/lkml/20240411-fix-ubsan-in-hardening-config-v1-0-e0177c80ffaa@kernel.org Fixes: 557f8c582a9b ("ubsan: Reintroduce signed overflow sanitizer") Tested-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240415182832.work.932-kees@kernel.org Signed-off-by: Kees Cook --- lib/ubsan.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/lib/ubsan.c b/lib/ubsan.c index 5fc107f61934c..a1c983d148f16 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -44,9 +44,10 @@ const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type) case ubsan_shift_out_of_bounds: return "UBSAN: shift out of bounds"; #endif -#ifdef CONFIG_UBSAN_DIV_ZERO +#if defined(CONFIG_UBSAN_DIV_ZERO) || defined(CONFIG_UBSAN_SIGNED_WRAP) /* - * SanitizerKind::IntegerDivideByZero emits + * SanitizerKind::IntegerDivideByZero and + * SanitizerKind::SignedIntegerOverflow emit * SanitizerHandler::DivremOverflow. */ case ubsan_divrem_overflow: @@ -77,6 +78,19 @@ const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type) return "UBSAN: alignment assumption"; case ubsan_type_mismatch: return "UBSAN: type mismatch"; +#endif +#ifdef CONFIG_UBSAN_SIGNED_WRAP + /* + * SanitizerKind::SignedIntegerOverflow emits + * SanitizerHandler::AddOverflow, SanitizerHandler::SubOverflow, + * or SanitizerHandler::MulOverflow. + */ + case ubsan_add_overflow: + return "UBSAN: integer addition overflow"; + case ubsan_sub_overflow: + return "UBSAN: integer subtraction overflow"; + case ubsan_mul_overflow: + return "UBSAN: integer multiplication overflow"; #endif default: return "UBSAN: unrecognized failure code"; -- GitLab From 4225dfa4535f219b03ae14147d9c6e7e82ec8df4 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:52 +0100 Subject: [PATCH 899/989] selftests/tcp_ao: Make RST tests less flaky Currently, "active reset" cases are flaky, because select() is called for 3 sockets, while only 2 are expected to receive RST. The idea of the third socket was to get into request_sock_queue, but the test mistakenly attempted to connect() after the listener socket was shut down. Repair this test, it's important to check the different kernel code-paths for signing RST TCP-AO segments. Fixes: c6df7b2361d7 ("selftests/net: Add TCP-AO RST test") Reported-by: Jakub Kicinski Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/rst.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/tcp_ao/rst.c b/tools/testing/selftests/net/tcp_ao/rst.c index 7df8b8700e39e..a2fe88d35ac06 100644 --- a/tools/testing/selftests/net/tcp_ao/rst.c +++ b/tools/testing/selftests/net/tcp_ao/rst.c @@ -256,8 +256,6 @@ static int test_wait_fds(int sk[], size_t nr, bool is_writable[], static void test_client_active_rst(unsigned int port) { - /* one in queue, another accept()ed */ - unsigned int wait_for = backlog + 2; int i, sk[3], err; bool is_writable[ARRAY_SIZE(sk)] = {false}; unsigned int last = ARRAY_SIZE(sk) - 1; @@ -275,16 +273,20 @@ static void test_client_active_rst(unsigned int port) for (i = 0; i < last; i++) { err = _test_connect_socket(sk[i], this_ip_dest, port, (i == 0) ? TEST_TIMEOUT_SEC : -1); - if (err < 0) test_error("failed to connect()"); } - synchronize_threads(); /* 2: connection accept()ed, another queued */ - err = test_wait_fds(sk, last, is_writable, wait_for, TEST_TIMEOUT_SEC); + synchronize_threads(); /* 2: two connections: one accept()ed, another queued */ + err = test_wait_fds(sk, last, is_writable, last, TEST_TIMEOUT_SEC); if (err < 0) test_error("test_wait_fds(): %d", err); + /* async connect() with third sk to get into request_sock_queue */ + err = _test_connect_socket(sk[last], this_ip_dest, port, -1); + if (err < 0) + test_error("failed to connect()"); + synchronize_threads(); /* 3: close listen socket */ if (test_client_verify(sk[0], packet_sz, quota / packet_sz, TEST_TIMEOUT_SEC)) test_fail("Failed to send data on connected socket"); @@ -292,13 +294,14 @@ static void test_client_active_rst(unsigned int port) test_ok("Verified established tcp connection"); synchronize_threads(); /* 4: finishing up */ - err = _test_connect_socket(sk[last], this_ip_dest, port, -1); - if (err < 0) - test_error("failed to connect()"); synchronize_threads(); /* 5: closed active sk */ - err = test_wait_fds(sk, ARRAY_SIZE(sk), NULL, - wait_for, TEST_TIMEOUT_SEC); + /* + * Wait for 2 connections: one accepted, another in the accept queue, + * the one in request_sock_queue won't get fully established, so + * doesn't receive an active RST, see inet_csk_listen_stop(). + */ + err = test_wait_fds(sk, last, NULL, last, TEST_TIMEOUT_SEC); if (err < 0) test_error("select(): %d", err); -- GitLab From b089b3bead532419cdcbd8e4e0a3e23c49d11573 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:53 +0100 Subject: [PATCH 900/989] selftests/tcp_ao: Zero-init tcp_ao_info_opt The structure is on the stack and has to be zero-initialized as the kernel checks for: > if (in.reserved != 0 || in.reserved2 != 0) > return -EINVAL; Fixes: b26660531cf6 ("selftests/net: Add test for TCP-AO add setsockopt() command") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/setsockopt-closed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c index 452de131fa3a9..517930f9721bd 100644 --- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c +++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c @@ -21,7 +21,7 @@ static void make_listen(int sk) static void test_vefify_ao_info(int sk, struct tcp_ao_info_opt *info, const char *tst) { - struct tcp_ao_info_opt tmp; + struct tcp_ao_info_opt tmp = {}; socklen_t len = sizeof(tmp); if (getsockopt(sk, IPPROTO_TCP, TCP_AO_INFO, &tmp, &len)) -- GitLab From beb78cd1329d039d73487ca05633d1b92e1ab2ea Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:54 +0100 Subject: [PATCH 901/989] selftests/tcp_ao: Fix fscanf() call for format-security MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On my new laptop with packages from nixos-unstable, gcc 12.3.0 produces: > lib/proc.c: In function ‘netstat_read_type’: > lib/proc.c:89:9: error: format not a string literal and no format arguments [-Werror=format-security] > 89 | if (fscanf(fnetstat, type->header_name) == EOF) > | ^~ > cc1: some warnings being treated as errors Here the selftests lib parses header name, while expectes non-space word ending with a column. Fixes: cfbab37b3da0 ("selftests/net: Add TCP-AO library") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Reported-by: Muhammad Usama Anjum Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/lib/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/tcp_ao/lib/proc.c b/tools/testing/selftests/net/tcp_ao/lib/proc.c index 2fb6dd8adba69..8b984fa042869 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/proc.c +++ b/tools/testing/selftests/net/tcp_ao/lib/proc.c @@ -86,7 +86,7 @@ static void netstat_read_type(FILE *fnetstat, struct netstat **dest, char *line) pos = strchr(line, ' ') + 1; - if (fscanf(fnetstat, type->header_name) == EOF) + if (fscanf(fnetstat, "%[^ :]", type->header_name) == EOF) test_error("fscanf(%s)", type->header_name); if (fread(&tmp, 1, 1, fnetstat) != 1 || tmp != ':') test_error("Unexpected netstat format (%c)", tmp); -- GitLab From b476c93654d748c13624f7c7d0ba191c56a8092e Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:55 +0100 Subject: [PATCH 902/989] selftests/tcp_ao: Printing fixes to confirm with format-security MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On my new laptop with packages from nixos-unstable, gcc 12.3.0 produces > lib/setup.c: In function ‘__test_msg’: > lib/setup.c:20:9: error: format not a string literal and no format arguments [-Werror=format-security] > 20 | ksft_print_msg(buf); > | ^~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_ok’: > lib/setup.c:26:9: error: format not a string literal and no format arguments [-Werror=format-security] > 26 | ksft_test_result_pass(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_fail’: > lib/setup.c:32:9: error: format not a string literal and no format arguments [-Werror=format-security] > 32 | ksft_test_result_fail(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_xfail’: > lib/setup.c:38:9: error: format not a string literal and no format arguments [-Werror=format-security] > 38 | ksft_test_result_xfail(buf); > | ^~~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_error’: > lib/setup.c:44:9: error: format not a string literal and no format arguments [-Werror=format-security] > 44 | ksft_test_result_error(buf); > | ^~~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_skip’: > lib/setup.c:50:9: error: format not a string literal and no format arguments [-Werror=format-security] > 50 | ksft_test_result_skip(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > cc1: some warnings being treated as errors As the buffer was already pre-printed into, print it as a string rather than a format-string. Fixes: cfbab37b3da0 ("selftests/net: Add TCP-AO library") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Reported-by: Muhammad Usama Anjum Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/lib/setup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index 92276f916f2f3..e408b9243b2c5 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -17,37 +17,37 @@ static pthread_mutex_t ksft_print_lock = PTHREAD_MUTEX_INITIALIZER; void __test_msg(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_print_msg(buf); + ksft_print_msg("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_ok(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_pass(buf); + ksft_test_result_pass("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_fail(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_fail(buf); + ksft_test_result_fail("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_xfail(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_xfail(buf); + ksft_test_result_xfail("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_error(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_error(buf); + ksft_test_result_error("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_skip(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_skip(buf); + ksft_test_result_skip("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } -- GitLab From fe90f3967bdb3e13f133e5f44025e15f943a99c5 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 15 Apr 2024 11:21:13 -0400 Subject: [PATCH 903/989] sched: Add missing memory barrier in switch_mm_cid Many architectures' switch_mm() (e.g. arm64) do not have an smp_mb() which the core scheduler code has depended upon since commit: commit 223baf9d17f25 ("sched: Fix performance regression introduced by mm_cid") If switch_mm() doesn't call smp_mb(), sched_mm_cid_remote_clear() can unset the actively used cid when it fails to observe active task after it sets lazy_put. There *is* a memory barrier between storing to rq->curr and _return to userspace_ (as required by membarrier), but the rseq mm_cid has stricter requirements: the barrier needs to be issued between store to rq->curr and switch_mm_cid(), which happens earlier than: - spin_unlock(), - switch_to(). So it's fine when the architecture switch_mm() happens to have that barrier already, but less so when the architecture only provides the full barrier in switch_to() or spin_unlock(). It is a bug in the rseq switch_mm_cid() implementation. All architectures that don't have memory barriers in switch_mm(), but rather have the full barrier either in finish_lock_switch() or switch_to() have them too late for the needs of switch_mm_cid(). Introduce a new smp_mb__after_switch_mm(), defined as smp_mb() in the generic barrier.h header, and use it in switch_mm_cid() for scheduler transitions where switch_mm() is expected to provide a memory barrier. Architectures can override smp_mb__after_switch_mm() if their switch_mm() implementation provides an implicit memory barrier. Override it with a no-op on x86 which implicitly provide this memory barrier by writing to CR3. Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid") Reported-by: levi.yun Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar Reviewed-by: Catalin Marinas # for arm64 Acked-by: Dave Hansen # for x86 Cc: # 6.4.x Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240415152114.59122-2-mathieu.desnoyers@efficios.com --- arch/x86/include/asm/barrier.h | 3 +++ include/asm-generic/barrier.h | 8 ++++++++ kernel/sched/sched.h | 20 ++++++++++++++------ 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index fe1e7e3cc844a..63bdc6b852197 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -79,6 +79,9 @@ do { \ #define __smp_mb__before_atomic() do { } while (0) #define __smp_mb__after_atomic() do { } while (0) +/* Writing to CR3 provides a full memory barrier in switch_mm(). */ +#define smp_mb__after_switch_mm() do { } while (0) + #include #endif /* _ASM_X86_BARRIER_H */ diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 0c0695763bea3..d4f581c1e21da 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -294,5 +294,13 @@ do { \ #define io_stop_wc() do { } while (0) #endif +/* + * Architectures that guarantee an implicit smp_mb() in switch_mm() + * can override smp_mb__after_switch_mm. + */ +#ifndef smp_mb__after_switch_mm +# define smp_mb__after_switch_mm() smp_mb() +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_GENERIC_BARRIER_H */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d2242679239ec..ae50f212775e5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -79,6 +79,8 @@ # include #endif +#include + #include "cpupri.h" #include "cpudeadline.h" @@ -3445,13 +3447,19 @@ static inline void switch_mm_cid(struct rq *rq, * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. * Provide it here. */ - if (!prev->mm) // from kernel + if (!prev->mm) { // from kernel smp_mb(); - /* - * user -> user transition guarantees a memory barrier through - * switch_mm() when current->mm changes. If current->mm is - * unchanged, no barrier is needed. - */ + } else { // from user + /* + * user->user transition relies on an implicit + * memory barrier in switch_mm() when + * current->mm changes. If the architecture + * switch_mm() does not have an implicit memory + * barrier, it is emitted here. If current->mm + * is unchanged, no barrier is needed. + */ + smp_mb__after_switch_mm(); + } } if (prev->mm_cid_active) { mm_cid_snapshot_time(rq, prev->mm); -- GitLab From 03cea821b82cbf0ee4a285f9dca6cc1d660dbef3 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 10 Apr 2024 09:09:54 -0500 Subject: [PATCH 904/989] platform/x86/amd: pmf: Decrease error message to debug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ASUS ROG Zephyrus G14 doesn't have _CRS in AMDI0102 device and so there are no resources to walk. This is expected behavior because it doesn't support Smart PC. Decrease error message to debug. Link: https://bugzilla.kernel.org/show_bug.cgi?id=218685 Signed-off-by: Mario Limonciello Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240410140956.385-1-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd/pmf/acpi.c b/drivers/platform/x86/amd/pmf/acpi.c index d0cf46e2fc8e8..60fc71c9fb0fa 100644 --- a/drivers/platform/x86/amd/pmf/acpi.c +++ b/drivers/platform/x86/amd/pmf/acpi.c @@ -437,7 +437,7 @@ int apmf_check_smart_pc(struct amd_pmf_dev *pmf_dev) status = acpi_walk_resources(ahandle, METHOD_NAME__CRS, apmf_walk_resources, pmf_dev); if (ACPI_FAILURE(status)) { - dev_err(pmf_dev->dev, "acpi_walk_resources failed :%d\n", status); + dev_dbg(pmf_dev->dev, "acpi_walk_resources failed :%d\n", status); return -EINVAL; } -- GitLab From ed13f622bcd594d6cefd6239b1722ed8b84ba98f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 10 Apr 2024 09:09:55 -0500 Subject: [PATCH 905/989] platform/x86/amd: pmf: Add infrastructure for quirking supported funcs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the event of a BIOS bug add infrastructure that will be utilized to override the return value for supported_funcs to avoid enabling features. Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20240410140956.385-2-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/Makefile | 2 +- drivers/platform/x86/amd/pmf/acpi.c | 5 ++- drivers/platform/x86/amd/pmf/core.c | 1 + drivers/platform/x86/amd/pmf/pmf-quirks.c | 43 +++++++++++++++++++++++ drivers/platform/x86/amd/pmf/pmf.h | 3 ++ 5 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 drivers/platform/x86/amd/pmf/pmf-quirks.c diff --git a/drivers/platform/x86/amd/pmf/Makefile b/drivers/platform/x86/amd/pmf/Makefile index 6b26e48ce8ad2..7d6079b02589c 100644 --- a/drivers/platform/x86/amd/pmf/Makefile +++ b/drivers/platform/x86/amd/pmf/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_AMD_PMF) += amd-pmf.o amd-pmf-objs := core.o acpi.o sps.o \ auto-mode.o cnqf.o \ - tee-if.o spc.o + tee-if.o spc.o pmf-quirks.o diff --git a/drivers/platform/x86/amd/pmf/acpi.c b/drivers/platform/x86/amd/pmf/acpi.c index 60fc71c9fb0fa..1157ec148880b 100644 --- a/drivers/platform/x86/amd/pmf/acpi.c +++ b/drivers/platform/x86/amd/pmf/acpi.c @@ -343,7 +343,10 @@ static int apmf_if_verify_interface(struct amd_pmf_dev *pdev) if (err) return err; - pdev->supported_func = output.supported_functions; + /* only set if not already set by a quirk */ + if (!pdev->supported_func) + pdev->supported_func = output.supported_functions; + dev_dbg(pdev->dev, "supported functions:0x%x notifications:0x%x version:%u\n", output.supported_functions, output.notification_mask, output.version); diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index 5d4f80698a8b8..64e6e34a2a9ac 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -445,6 +445,7 @@ static int amd_pmf_probe(struct platform_device *pdev) mutex_init(&dev->lock); mutex_init(&dev->update_mutex); + amd_pmf_quirks_init(dev); apmf_acpi_init(dev); platform_set_drvdata(pdev, dev); amd_pmf_dbgfs_register(dev); diff --git a/drivers/platform/x86/amd/pmf/pmf-quirks.c b/drivers/platform/x86/amd/pmf/pmf-quirks.c new file mode 100644 index 0000000000000..9f3790eaaa300 --- /dev/null +++ b/drivers/platform/x86/amd/pmf/pmf-quirks.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Platform Management Framework Driver Quirks + * + * Copyright (c) 2024, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Mario Limonciello + */ + +#include + +#include "pmf.h" + +struct quirk_entry { + u32 supported_func; +}; + +static struct quirk_entry quirk_no_sps_bug = { + .supported_func = 0x4003, +}; + +static const struct dmi_system_id fwbug_list[] = { + {} +}; + +void amd_pmf_quirks_init(struct amd_pmf_dev *dev) +{ + const struct dmi_system_id *dmi_id; + struct quirk_entry *quirks; + + dmi_id = dmi_first_match(fwbug_list); + if (!dmi_id) + return; + + quirks = dmi_id->driver_data; + if (quirks->supported_func) { + dev->supported_func = quirks->supported_func; + pr_info("Using supported funcs quirk to avoid %s platform firmware bug\n", + dmi_id->ident); + } +} + diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 8c4df5753f40d..eeedd0c0395a8 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -720,4 +720,7 @@ int apmf_check_smart_pc(struct amd_pmf_dev *pmf_dev); void amd_pmf_populate_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in); void amd_pmf_dump_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in); +/* Quirk infrastructure */ +void amd_pmf_quirks_init(struct amd_pmf_dev *dev); + #endif /* PMF_H */ -- GitLab From 9d893061ed68820de24b572d1e193b5e4737f2e0 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 10 Apr 2024 09:09:56 -0500 Subject: [PATCH 906/989] platform/x86/amd: pmf: Add quirk for ROG Zephyrus G14 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ROG Zephyrus G14 advertises support for SPS notifications to the BIOS but doesn't actually use them. Instead the asus-nb-wmi driver utilizes such events. Add a quirk to prevent the system from registering for ACPI platform profile when this system is found to avoid conflicts. Reported-by: al0uette@outlook.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218685 Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20240410140956.385-3-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/pmf-quirks.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/platform/x86/amd/pmf/pmf-quirks.c b/drivers/platform/x86/amd/pmf/pmf-quirks.c index 9f3790eaaa300..0b2eb0ae85feb 100644 --- a/drivers/platform/x86/amd/pmf/pmf-quirks.c +++ b/drivers/platform/x86/amd/pmf/pmf-quirks.c @@ -21,6 +21,14 @@ static struct quirk_entry quirk_no_sps_bug = { }; static const struct dmi_system_id fwbug_list[] = { + { + .ident = "ROG Zephyrus G14", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "GA403UV"), + }, + .driver_data = &quirk_no_sps_bug, + }, {} }; -- GitLab From d8c2d38c4d1dee8fe8e015b9ebf65bdd8e4da99b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 15 Apr 2024 14:28:53 -0700 Subject: [PATCH 907/989] platform/x86: ISST: Add Granite Rapids-D to HPM CPU list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Granite Rapids-D to hpm_cpu_ids, so that MSR 0x54 can be used. Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20240415212853.2820470-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/speed_select_if/isst_if_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c index 08df9494603c5..30951f7131cd9 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c @@ -719,6 +719,7 @@ static struct miscdevice isst_if_char_driver = { }; static const struct x86_cpu_id hpm_cpu_ids[] = { + X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, NULL), X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, NULL), X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, NULL), {} -- GitLab From bc774d46b41482534c7ba92f6342ca0a355c13af Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 15 Apr 2024 15:06:25 -0700 Subject: [PATCH 908/989] platform/x86/intel-uncore-freq: Increase minor number support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No new changes will be added for minor version 2. Change the minor version number to 2 and stop displaying log message for unsupported minor version 2. Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20240415220625.2828339-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../x86/intel/uncore-frequency/uncore-frequency-tpmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c index bd75d61ff8a66..ef730200a04bd 100644 --- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c +++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c @@ -29,7 +29,7 @@ #include "uncore-frequency-common.h" #define UNCORE_MAJOR_VERSION 0 -#define UNCORE_MINOR_VERSION 1 +#define UNCORE_MINOR_VERSION 2 #define UNCORE_HEADER_INDEX 0 #define UNCORE_FABRIC_CLUSTER_OFFSET 8 @@ -329,7 +329,7 @@ static int uncore_probe(struct auxiliary_device *auxdev, const struct auxiliary_ goto remove_clusters; } - if (TPMI_MINOR_VERSION(pd_info->ufs_header_ver) != UNCORE_MINOR_VERSION) + if (TPMI_MINOR_VERSION(pd_info->ufs_header_ver) > UNCORE_MINOR_VERSION) dev_info(&auxdev->dev, "Uncore: Ignore: Unsupported minor version:%lx\n", TPMI_MINOR_VERSION(pd_info->ufs_header_ver)); -- GitLab From 0ebd96f5da4410c0cb8fc75e44f1009530b2f90b Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 12 Apr 2024 21:03:14 +0300 Subject: [PATCH 909/989] net: stmmac: Apply half-duplex-less constraint for DW QoS Eth only There are three DW MAC IP-cores which can have the multiple Tx/Rx queues enabled: DW GMAC v3.7+ with AV feature, DW QoS Eth v4.x/v5.x, DW XGMAC/XLGMAC Based on the respective HW databooks, only the DW QoS Eth IP-core doesn't support the half-duplex link mode in case if more than one queues enabled: "In multiple queue/channel configurations, for half-duplex operation, enable only the Q0/CH0 on Tx and Rx. For single queue/channel in full-duplex operation, any queue/channel can be enabled." The rest of the IP-cores don't have such constraint. Thus in order to have the constraint applied for the DW QoS Eth MACs only, let's move the it' implementation to the respective MAC-capabilities getter and make sure the getter is called in the queues re-init procedure. Fixes: b6cfffa7ad92 ("stmmac: fix DMA channel hang in half-duplex mode") Signed-off-by: Serge Semin Reviewed-by: Romain Gantois Signed-off-by: Paolo Abeni --- .../net/ethernet/stmicro/stmmac/dwmac4_core.c | 7 +++++++ .../net/ethernet/stmicro/stmmac/stmmac_main.c | 19 +++---------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index cef25efbdff99..ec6a13e644b36 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -71,6 +71,13 @@ static void dwmac4_core_init(struct mac_device_info *hw, static void dwmac4_phylink_get_caps(struct stmmac_priv *priv) { priv->phylink_config.mac_capabilities |= MAC_2500FD; + + if (priv->plat->tx_queues_to_use > 1) + priv->phylink_config.mac_capabilities &= + ~(MAC_10HD | MAC_100HD | MAC_1000HD); + else + priv->phylink_config.mac_capabilities |= + (MAC_10HD | MAC_100HD | MAC_1000HD); } static void dwmac4_rx_queue_enable(struct mac_device_info *hw, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 24cd80490d19c..dd58c21b53eec 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1198,17 +1198,6 @@ static int stmmac_init_phy(struct net_device *dev) return ret; } -static void stmmac_set_half_duplex(struct stmmac_priv *priv) -{ - /* Half-Duplex can only work with single tx queue */ - if (priv->plat->tx_queues_to_use > 1) - priv->phylink_config.mac_capabilities &= - ~(MAC_10HD | MAC_100HD | MAC_1000HD); - else - priv->phylink_config.mac_capabilities |= - (MAC_10HD | MAC_100HD | MAC_1000HD); -} - static int stmmac_phy_setup(struct stmmac_priv *priv) { struct stmmac_mdio_bus_data *mdio_bus_data; @@ -1237,10 +1226,7 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) priv->phylink_config.supported_interfaces); priv->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | - MAC_10FD | MAC_100FD | - MAC_1000FD; - - stmmac_set_half_duplex(priv); + MAC_10 | MAC_100 | MAC_1000; /* Get the MAC specific capabilities */ stmmac_mac_phylink_get_caps(priv); @@ -7355,7 +7341,8 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) priv->rss.table[i] = ethtool_rxfh_indir_default(i, rx_cnt); - stmmac_set_half_duplex(priv); + stmmac_mac_phylink_get_caps(priv); + stmmac_napi_add(dev); if (netif_running(dev)) -- GitLab From 59c3d6ca6cbded6c6599e975b42a9d6a27fcbaf2 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 12 Apr 2024 21:03:15 +0300 Subject: [PATCH 910/989] net: stmmac: Fix max-speed being ignored on queue re-init It's possible to have the maximum link speed being artificially limited on the platform-specific basis. It's done either by setting up the plat_stmmacenet_data::max_speed field or by specifying the "max-speed" DT-property. In such cases it's required that any specific MAC-capabilities re-initializations would take the limit into account. In particular the link speed capabilities may change during the number of active Tx/Rx queues re-initialization. But the currently implemented procedure doesn't take the speed limit into account. Fix that by calling phylink_limit_mac_speed() in the stmmac_reinit_queues() method if the speed limitation was required in the same way as it's done in the stmmac_phy_setup() function. Fixes: 95201f36f395 ("net: stmmac: update MAC capabilities when tx queues are updated") Signed-off-by: Serge Semin Reviewed-by: Romain Gantois Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index dd58c21b53eec..b8a1f02398ee9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7328,6 +7328,7 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) { struct stmmac_priv *priv = netdev_priv(dev); int ret = 0, i; + int max_speed; if (netif_running(dev)) stmmac_release(dev); @@ -7343,6 +7344,10 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) stmmac_mac_phylink_get_caps(priv); + max_speed = priv->plat->max_speed; + if (max_speed) + phylink_limit_mac_speed(&priv->phylink_config, max_speed); + stmmac_napi_add(dev); if (netif_running(dev)) -- GitLab From 9cb54af214a7cdc91577ec083e5569f2ce2c86d8 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 12 Apr 2024 21:03:16 +0300 Subject: [PATCH 911/989] net: stmmac: Fix IP-cores specific MAC capabilities Here is the list of the MAC capabilities specific to the particular DW MAC IP-cores currently supported by the driver: DW MAC100: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 DW GMAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000 Allwinner sun8i MAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000 DW QoS Eth: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000 | MAC_2500FD if there is more than 1 active Tx/Rx queues: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10FD | MAC_100FD | MAC_1000FD | MAC_2500FD DW XGMAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_1000FD | MAC_2500FD | MAC_5000FD | MAC_10000FD DW XLGMAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_1000FD | MAC_2500FD | MAC_5000FD | MAC_10000FD | MAC_25000FD | MAC_40000FD | MAC_50000FD | MAC_100000FD As you can see there are only two common capabilities: MAC_ASYM_PAUSE | MAC_SYM_PAUSE. Meanwhile what is currently implemented defines 10/100/1000 link speeds for all IP-cores, which is definitely incorrect for DW MAC100, DW XGMAC and DW XLGMAC devices. Seeing the flow-control is implemented as a callback for each MAC IP-core (see dwmac100_flow_ctrl(), dwmac1000_flow_ctrl(), sun8i_dwmac_flow_ctrl(), etc) and since the MAC-specific setup() method is supposed to be called for each available DW MAC-based device, the capabilities initialization can be freely moved to these setup() functions, thus correctly setting up the MAC-capabilities for each IP-core (including the Allwinner Sun8i). A new stmmac_link::caps field was specifically introduced for that so to have all link-specific info preserved in a single structure. Note the suggested change fixes three earlier commits at a time. The commit 5b0d7d7da64b ("net: stmmac: Add the missing speeds that XGMAC supports") permitted the 10-100 link speeds and 1G half-duplex mode for DW XGMAC IP-core even though it doesn't support them. The commit df7699c70c1b ("net: stmmac: Do not cut down 1G modes") incorrectly added the MAC1000 capability to the DW MAC100 IP-core. Similarly to the DW XGMAC the commit 8a880936e902 ("net: stmmac: Add XLGMII support") incorrectly permitted the 10-100 link speeds and 1G half-duplex mode for DW XLGMAC IP-core. Fixes: 5b0d7d7da64b ("net: stmmac: Add the missing speeds that XGMAC supports") Fixes: df7699c70c1b ("net: stmmac: Do not cut down 1G modes") Fixes: 8a880936e902 ("net: stmmac: Add XLGMII support") Suggested-by: Russell King (Oracle) Signed-off-by: Serge Semin Reviewed-by: Romain Gantois Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/common.h | 1 + .../net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 2 ++ .../ethernet/stmicro/stmmac/dwmac1000_core.c | 2 ++ .../ethernet/stmicro/stmmac/dwmac100_core.c | 2 ++ .../net/ethernet/stmicro/stmmac/dwmac4_core.c | 10 ++++------ .../ethernet/stmicro/stmmac/dwxgmac2_core.c | 18 ++++++++---------- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 7 ++++--- 7 files changed, 23 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index a6fefe675ef15..3b7d4ac1e7be0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -553,6 +553,7 @@ extern const struct stmmac_hwtimestamp stmmac_ptp; extern const struct stmmac_mode_ops dwmac4_ring_mode_ops; struct mac_link { + u32 caps; u32 speed_mask; u32 speed10; u32 speed100; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index b21d99faa2d04..e1b761dcfa1dd 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1096,6 +1096,8 @@ static struct mac_device_info *sun8i_dwmac_setup(void *ppriv) priv->dev->priv_flags |= IFF_UNICAST_FLT; + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100 | MAC_1000; /* The loopback bit seems to be re-set when link change * Simply mask it each time * Speed 10/100/1000 are set in BIT(2)/BIT(3) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index 3927609abc441..8555299443f4e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -539,6 +539,8 @@ int dwmac1000_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100 | MAC_1000; mac->link.duplex = GMAC_CONTROL_DM; mac->link.speed10 = GMAC_CONTROL_PS; mac->link.speed100 = GMAC_CONTROL_PS | GMAC_CONTROL_FES; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c index a6e8d7bd95886..7667d103cd0eb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c @@ -175,6 +175,8 @@ int dwmac100_setup(struct stmmac_priv *priv) dev_info(priv->device, "\tDWMAC100\n"); mac->pcsr = priv->ioaddr; + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100; mac->link.duplex = MAC_CONTROL_F; mac->link.speed10 = 0; mac->link.speed100 = 0; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index ec6a13e644b36..a38226d7cc6a9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -70,14 +70,10 @@ static void dwmac4_core_init(struct mac_device_info *hw, static void dwmac4_phylink_get_caps(struct stmmac_priv *priv) { - priv->phylink_config.mac_capabilities |= MAC_2500FD; - if (priv->plat->tx_queues_to_use > 1) - priv->phylink_config.mac_capabilities &= - ~(MAC_10HD | MAC_100HD | MAC_1000HD); + priv->hw->link.caps &= ~(MAC_10HD | MAC_100HD | MAC_1000HD); else - priv->phylink_config.mac_capabilities |= - (MAC_10HD | MAC_100HD | MAC_1000HD); + priv->hw->link.caps |= (MAC_10HD | MAC_100HD | MAC_1000HD); } static void dwmac4_rx_queue_enable(struct mac_device_info *hw, @@ -1385,6 +1381,8 @@ int dwmac4_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100 | MAC_1000 | MAC_2500FD; mac->link.duplex = GMAC_CONFIG_DM; mac->link.speed10 = GMAC_CONFIG_PS; mac->link.speed100 = GMAC_CONFIG_FES | GMAC_CONFIG_PS; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index e841e312077ef..f8e7775bb6336 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -47,14 +47,6 @@ static void dwxgmac2_core_init(struct mac_device_info *hw, writel(XGMAC_INT_DEFAULT_EN, ioaddr + XGMAC_INT_EN); } -static void xgmac_phylink_get_caps(struct stmmac_priv *priv) -{ - priv->phylink_config.mac_capabilities |= MAC_2500FD | MAC_5000FD | - MAC_10000FD | MAC_25000FD | - MAC_40000FD | MAC_50000FD | - MAC_100000FD; -} - static void dwxgmac2_set_mac(void __iomem *ioaddr, bool enable) { u32 tx = readl(ioaddr + XGMAC_TX_CONFIG); @@ -1540,7 +1532,6 @@ static void dwxgmac3_fpe_configure(void __iomem *ioaddr, struct stmmac_fpe_cfg * const struct stmmac_ops dwxgmac210_ops = { .core_init = dwxgmac2_core_init, - .phylink_get_caps = xgmac_phylink_get_caps, .set_mac = dwxgmac2_set_mac, .rx_ipc = dwxgmac2_rx_ipc, .rx_queue_enable = dwxgmac2_rx_queue_enable, @@ -1601,7 +1592,6 @@ static void dwxlgmac2_rx_queue_enable(struct mac_device_info *hw, u8 mode, const struct stmmac_ops dwxlgmac2_ops = { .core_init = dwxgmac2_core_init, - .phylink_get_caps = xgmac_phylink_get_caps, .set_mac = dwxgmac2_set_mac, .rx_ipc = dwxgmac2_rx_ipc, .rx_queue_enable = dwxlgmac2_rx_queue_enable, @@ -1661,6 +1651,9 @@ int dwxgmac2_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_1000FD | MAC_2500FD | MAC_5000FD | + MAC_10000FD; mac->link.duplex = 0; mac->link.speed10 = XGMAC_CONFIG_SS_10_MII; mac->link.speed100 = XGMAC_CONFIG_SS_100_MII; @@ -1698,6 +1691,11 @@ int dwxlgmac2_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_1000FD | MAC_2500FD | MAC_5000FD | + MAC_10000FD | MAC_25000FD | + MAC_40000FD | MAC_50000FD | + MAC_100000FD; mac->link.duplex = 0; mac->link.speed1000 = XLGMAC_CONFIG_SS_1000; mac->link.speed2500 = XLGMAC_CONFIG_SS_2500; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index b8a1f02398ee9..7c6fb14b55550 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1225,12 +1225,11 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) xpcs_get_interfaces(priv->hw->xpcs, priv->phylink_config.supported_interfaces); - priv->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | - MAC_10 | MAC_100 | MAC_1000; - /* Get the MAC specific capabilities */ stmmac_mac_phylink_get_caps(priv); + priv->phylink_config.mac_capabilities = priv->hw->link.caps; + max_speed = priv->plat->max_speed; if (max_speed) phylink_limit_mac_speed(&priv->phylink_config, max_speed); @@ -7344,6 +7343,8 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) stmmac_mac_phylink_get_caps(priv); + priv->phylink_config.mac_capabilities = priv->hw->link.caps; + max_speed = priv->plat->max_speed; if (max_speed) phylink_limit_mac_speed(&priv->phylink_config, max_speed); -- GitLab From 428051600cb4e5a61d81aba3f8009b6c4f5e7582 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 15 Mar 2024 12:08:20 +0100 Subject: [PATCH 912/989] ice: tc: check src_vsi in case of traffic from VF In case of traffic going from the VF (so ingress for port representor) source VSI should be consider during packet classification. It is needed for hardware to not match packets from different ports with filters added on other port. It is only for "from VF" traffic, because other traffic direction doesn't have source VSI. Set correct ::src_vsi in rule_info to pass it to the hardware filter. For example this rule should drop only ipv4 packets from eth10, not from the others VF PRs. It is needed to check source VSI in this case. $tc filter add dev eth10 ingress protocol ip flower skip_sw action drop Fixes: 0d08a441fb1a ("ice: ndo_setup_tc implementation for PF") Reviewed-by: Jedrzej Jagielski Reviewed-by: Sridhar Samudrala Signed-off-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index b890410a2bc0b..49ed5fd7db107 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -28,6 +28,8 @@ ice_tc_count_lkups(u32 flags, struct ice_tc_flower_lyr_2_4_hdrs *headers, * - ICE_TC_FLWR_FIELD_VLAN_TPID (present if specified) * - Tunnel flag (present if tunnel) */ + if (fltr->direction == ICE_ESWITCH_FLTR_EGRESS) + lkups_cnt++; if (flags & ICE_TC_FLWR_FIELD_TENANT_ID) lkups_cnt++; @@ -363,6 +365,11 @@ ice_tc_fill_rules(struct ice_hw *hw, u32 flags, /* Always add direction metadata */ ice_rule_add_direction_metadata(&list[ICE_TC_METADATA_LKUP_IDX]); + if (tc_fltr->direction == ICE_ESWITCH_FLTR_EGRESS) { + ice_rule_add_src_vsi_metadata(&list[i]); + i++; + } + rule_info->tun_type = ice_sw_type_from_tunnel(tc_fltr->tunnel_type); if (tc_fltr->tunnel_type != TNL_LAST) { i = ice_tc_fill_tunnel_outer(flags, tc_fltr, list, i); @@ -820,6 +827,7 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) /* specify the cookie as filter_rule_id */ rule_info.fltr_rule_id = fltr->cookie; + rule_info.src_vsi = vsi->idx; ret = ice_add_adv_rule(hw, list, lkups_cnt, &rule_info, &rule_added); if (ret == -EEXIST) { -- GitLab From 73278715725a8347032acf233082ca4eb31e6a56 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 15 Mar 2024 12:08:21 +0100 Subject: [PATCH 913/989] ice: tc: allow zero flags in parsing tc flower The check for flags is done to not pass empty lookups to adding switch rule functions. Since metadata is always added to lookups there is no need to check against the flag. It is also fixing the problem with such rule: $ tc filter add dev gtp_dev ingress protocol ip prio 0 flower \ enc_dst_port 2123 action drop Switch block in case of GTP can't parse the destination port, because it should always be set to GTP specific value. The same with ethertype. The result is that there is no other matching criteria than GTP tunnel. In this case flags is 0, rule can't be added only because of defensive check against flags. Fixes: 9a225f81f540 ("ice: Support GTP-U and GTP-C offload in switchdev") Reviewed-by: Wojciech Drewek Signed-off-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index 49ed5fd7db107..bcbcfc67e5606 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -779,7 +779,7 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) int ret; int i; - if (!flags || (flags & ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT)) { + if (flags & ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT) { NL_SET_ERR_MSG_MOD(fltr->extack, "Unsupported encap field(s)"); return -EOPNOTSUPP; } -- GitLab From 2cca35f5dd78b9f8297c879c5db5ab137c5d86c3 Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Tue, 9 Apr 2024 17:45:44 +0200 Subject: [PATCH 914/989] ice: Fix checking for unsupported keys on non-tunnel device Add missing FLOW_DISSECTOR_KEY_ENC_* checks to TC flower filter parsing. Without these checks, it would be possible to add filters with tunnel options on non-tunnel devices. enc_* options are only valid for tunnel devices. Example: devlink dev eswitch set $PF1_PCI mode switchdev echo 1 > /sys/class/net/$PF1/device/sriov_numvfs tc qdisc add dev $VF1_PR ingress ethtool -K $PF1 hw-tc-offload on tc filter add dev $VF1_PR ingress flower enc_ttl 12 skip_sw action drop Fixes: 9e300987d4a8 ("ice: VXLAN and Geneve TC support") Reviewed-by: Michal Swiatkowski Signed-off-by: Marcin Szycik Reviewed-by: Jacob Keller Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index bcbcfc67e5606..688ccb0615ab9 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -1489,7 +1489,10 @@ ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, (BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS))) { + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IP) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_OPTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL))) { NL_SET_ERR_MSG_MOD(fltr->extack, "Tunnel key used, but device isn't a tunnel"); return -EOPNOTSUPP; } else { -- GitLab From 9e4d3f4f34455abbaa9930bf6b7575a5cd081496 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 4 Apr 2024 13:07:59 +0300 Subject: [PATCH 915/989] drm/panel: visionox-rm69299: don't unregister DSI device The DSI device for the panel was registered by the DSI host, so it is an error to unregister it from the panel driver. Drop the call to mipi_dsi_device_unregister(). Fixes: c7f66d32dd43 ("drm/panel: add support for rm69299 visionox panel") Reviewed-by: Jessica Zhang Signed-off-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20240404-drop-panel-unregister-v1-1-9f56953c5fb9@linaro.org --- drivers/gpu/drm/panel/panel-visionox-rm69299.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-visionox-rm69299.c b/drivers/gpu/drm/panel/panel-visionox-rm69299.c index 775144695283f..b15ca56a09a74 100644 --- a/drivers/gpu/drm/panel/panel-visionox-rm69299.c +++ b/drivers/gpu/drm/panel/panel-visionox-rm69299.c @@ -253,8 +253,6 @@ static void visionox_rm69299_remove(struct mipi_dsi_device *dsi) struct visionox_rm69299 *ctx = mipi_dsi_get_drvdata(dsi); mipi_dsi_detach(ctx->dsi); - mipi_dsi_device_unregister(ctx->dsi); - drm_panel_remove(&ctx->panel); } -- GitLab From 941c0bdbc176df825adf77052263b2d63db6fef7 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 4 Apr 2024 13:08:00 +0300 Subject: [PATCH 916/989] drm/panel: novatek-nt36682e: don't unregister DSI device The DSI device for the panel was registered by the DSI host, so it is an error to unregister it from the panel driver. Drop the call to mipi_dsi_device_unregister(). Fixes: ea4f9975625a ("drm/panel: Add support for Novatek NT36672E panel driver") Reviewed-by: Jessica Zhang Signed-off-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20240404-drop-panel-unregister-v1-2-9f56953c5fb9@linaro.org --- drivers/gpu/drm/panel/panel-novatek-nt36672e.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-novatek-nt36672e.c b/drivers/gpu/drm/panel/panel-novatek-nt36672e.c index cb7406d744669..c39fe0fc5d69c 100644 --- a/drivers/gpu/drm/panel/panel-novatek-nt36672e.c +++ b/drivers/gpu/drm/panel/panel-novatek-nt36672e.c @@ -614,8 +614,6 @@ static void nt36672e_panel_remove(struct mipi_dsi_device *dsi) struct nt36672e_panel *ctx = mipi_dsi_get_drvdata(dsi); mipi_dsi_detach(ctx->dsi); - mipi_dsi_device_unregister(ctx->dsi); - drm_panel_remove(&ctx->panel); } -- GitLab From 631426ba1d45a8672b177ee85ad4cabe760dd131 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 14 Mar 2024 17:12:59 +0100 Subject: [PATCH 917/989] mm/madvise: make MADV_POPULATE_(READ|WRITE) handle VM_FAULT_RETRY properly Darrick reports that in some cases where pread() would fail with -EIO and mmap()+access would generate a SIGBUS signal, MADV_POPULATE_READ / MADV_POPULATE_WRITE will keep retrying forever and not fail with -EFAULT. While the madvise() call can be interrupted by a signal, this is not the desired behavior. MADV_POPULATE_READ / MADV_POPULATE_WRITE should behave like page faults in that case: fail and not retry forever. A reproducer can be found at [1]. The reason is that __get_user_pages(), as called by faultin_vma_page_range(), will not handle VM_FAULT_RETRY in a proper way: it will simply return 0 when VM_FAULT_RETRY happened, making madvise_populate()->faultin_vma_page_range() retry again and again, never setting FOLL_TRIED->FAULT_FLAG_TRIED for __get_user_pages(). __get_user_pages_locked() does what we want, but duplicating that logic in faultin_vma_page_range() feels wrong. So let's use __get_user_pages_locked() instead, that will detect VM_FAULT_RETRY and set FOLL_TRIED when retrying, making the fault handler return VM_FAULT_SIGBUS (VM_FAULT_ERROR) at some point, propagating -EFAULT from faultin_page() to __get_user_pages(), all the way to madvise_populate(). But, there is an issue: __get_user_pages_locked() will end up re-taking the MM lock and then __get_user_pages() will do another VMA lookup. In the meantime, the VMA layout could have changed and we'd fail with different error codes than we'd want to. As __get_user_pages() will currently do a new VMA lookup either way, let it do the VMA handling in a different way, controlled by a new FOLL_MADV_POPULATE flag, effectively moving these checks from madvise_populate() + faultin_page_range() in there. With this change, Darricks reproducer properly fails with -EFAULT, as documented for MADV_POPULATE_READ / MADV_POPULATE_WRITE. [1] https://lore.kernel.org/all/20240313171936.GN1927156@frogsfrogsfrogs/ Link: https://lkml.kernel.org/r/20240314161300.382526-1-david@redhat.com Link: https://lkml.kernel.org/r/20240314161300.382526-2-david@redhat.com Fixes: 4ca9b3859dac ("mm/madvise: introduce MADV_POPULATE_(READ|WRITE) to prefault page tables") Signed-off-by: David Hildenbrand Reported-by: Darrick J. Wong Closes: https://lore.kernel.org/all/20240311223815.GW1927156@frogsfrogsfrogs/ Cc: Darrick J. Wong Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 54 ++++++++++++++++++++++++++++++--------------------- mm/internal.h | 10 ++++++---- mm/madvise.c | 17 ++-------------- 3 files changed, 40 insertions(+), 41 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index af8edadc05d1b..1611e73b1121b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1206,6 +1206,22 @@ static long __get_user_pages(struct mm_struct *mm, /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { + /* + * MADV_POPULATE_(READ|WRITE) wants to handle VMA + * lookups+error reporting differently. + */ + if (gup_flags & FOLL_MADV_POPULATE) { + vma = vma_lookup(mm, start); + if (!vma) { + ret = -ENOMEM; + goto out; + } + if (check_vma_flags(vma, gup_flags)) { + ret = -EINVAL; + goto out; + } + goto retry; + } vma = gup_vma_lookup(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, @@ -1685,35 +1701,35 @@ long populate_vma_page_range(struct vm_area_struct *vma, } /* - * faultin_vma_page_range() - populate (prefault) page tables inside the - * given VMA range readable/writable + * faultin_page_range() - populate (prefault) page tables inside the + * given range readable/writable * * This takes care of mlocking the pages, too, if VM_LOCKED is set. * - * @vma: target vma + * @mm: the mm to populate page tables in * @start: start address * @end: end address * @write: whether to prefault readable or writable * @locked: whether the mmap_lock is still held * - * Returns either number of processed pages in the vma, or a negative error - * code on error (see __get_user_pages()). + * Returns either number of processed pages in the MM, or a negative error + * code on error (see __get_user_pages()). Note that this function reports + * errors related to VMAs, such as incompatible mappings, as expected by + * MADV_POPULATE_(READ|WRITE). * - * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and - * covered by the VMA. If it's released, *@locked will be set to 0. + * The range must be page-aligned. + * + * mm->mmap_lock must be held. If it's released, *@locked will be set to 0. */ -long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, bool write, int *locked) +long faultin_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool write, int *locked) { - struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* @@ -1725,19 +1741,13 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, * a poisoned page. * !FOLL_FORCE: Require proper access permissions. */ - gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE; + gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE | + FOLL_MADV_POPULATE; if (write) gup_flags |= FOLL_WRITE; - /* - * We want to report -EINVAL instead of -EFAULT for any permission - * problems or incompatible mappings. - */ - if (check_vma_flags(vma, gup_flags)) - return -EINVAL; - - ret = __get_user_pages(mm, start, nr_pages, gup_flags, - NULL, locked); + ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked, + gup_flags); lru_add_drain(); return ret; } diff --git a/mm/internal.h b/mm/internal.h index 7e486f2c502ce..07ad2675a88b4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -686,9 +686,8 @@ struct anon_vma *folio_anon_vma(struct folio *folio); void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); -extern long faultin_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - bool write, int *locked); +extern long faultin_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool write, int *locked); extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); @@ -1127,10 +1126,13 @@ enum { FOLL_FAST_ONLY = 1 << 20, /* allow unlocking the mmap lock */ FOLL_UNLOCKABLE = 1 << 21, + /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */ + FOLL_MADV_POPULATE = 1 << 22, }; #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ - FOLL_FAST_ONLY | FOLL_UNLOCKABLE) + FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \ + FOLL_MADV_POPULATE) /* * Indicates for which pages that are write-protected in the page table, diff --git a/mm/madvise.c b/mm/madvise.c index 44a498c94158c..1a073fcc4c0c0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -908,27 +908,14 @@ static long madvise_populate(struct vm_area_struct *vma, { const bool write = behavior == MADV_POPULATE_WRITE; struct mm_struct *mm = vma->vm_mm; - unsigned long tmp_end; int locked = 1; long pages; *prev = vma; while (start < end) { - /* - * We might have temporarily dropped the lock. For example, - * our VMA might have been split. - */ - if (!vma || start >= vma->vm_end) { - vma = vma_lookup(mm, start); - if (!vma) - return -ENOMEM; - } - - tmp_end = min_t(unsigned long, end, vma->vm_end); /* Populate (prefault) page tables readable/writable. */ - pages = faultin_vma_page_range(vma, start, tmp_end, write, - &locked); + pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; @@ -949,7 +936,7 @@ static long madvise_populate(struct vm_area_struct *vma, pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); fallthrough; - case -ENOMEM: + case -ENOMEM: /* No VMA or out of memory. */ return -ENOMEM; } } -- GitLab From c0205eaf3af9f5db14d4b5ee4abacf4a583c3c50 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 4 Apr 2024 10:17:26 -0700 Subject: [PATCH 918/989] userfaultfd: change src_folio after ensuring it's unpinned in UFFDIO_MOVE Commit d7a08838ab74 ("mm: userfaultfd: fix unexpected change to src_folio when UFFDIO_MOVE fails") moved the src_folio->{mapping, index} changing to after clearing the page-table and ensuring that it's not pinned. This avoids failure of swapout+migration and possibly memory corruption. However, the commit missed fixing it in the huge-page case. Link: https://lkml.kernel.org/r/20240404171726.2302435-1-lokeshgidra@google.com Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") Signed-off-by: Lokesh Gidra Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Kalesh Singh Cc: Lokesh Gidra Cc: Nicolas Geoffray Cc: Peter Xu Cc: Qi Zheng Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9859aa4f75538..89f58c7603b25 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2259,9 +2259,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm goto unlock_ptls; } - folio_move_anon_rmap(src_folio, dst_vma); - WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); - src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); /* Folio got pinned from under us. Put it back and fail the move. */ if (folio_maybe_dma_pinned(src_folio)) { @@ -2270,6 +2267,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm goto unlock_ptls; } + folio_move_anon_rmap(src_folio, dst_vma); + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); -- GitLab From ea4b5b33bf8a4936cfb07759d926db697c43fb1e Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 4 Apr 2024 09:06:59 +0200 Subject: [PATCH 919/989] mm,page_owner: update metadata for tail pages Patch series "page_owner: Fix refcount imbalance and print fixup", v4. This series consists of a refactoring/correctness of updating the metadata of tail pages, a couple of fixups for the refcounting part and a fixup for the stack_start() function. From this series on, instead of counting the stacks, we count the outstanding nr_base_pages each stack has, which gives us a much better memory overview. The other fixup is for the migration part. A more detailed explanation can be found in the changelog of the respective patches. This patch (of 4): __set_page_owner_handle() and __reset_page_owner() update the metadata of all pages when the page is of a higher-order, but we miss to do the same when the pages are migrated. __folio_copy_owner() only updates the metadata of the head page, meaning that the information stored in the first page and the tail pages will not match. Strictly speaking that is not a big problem because 1) we do not print tail pages and 2) upon splitting all tail pages will inherit the metadata of the head page, but it is better to have all metadata in check should there be any problem, so it can ease debugging. For that purpose, a couple of helpers are created __update_page_owner_handle() which updates the metadata on allocation, and __update_page_owner_free_handle() which does the same when the page is freed. __folio_copy_owner() will make use of both as it needs to entirely replace the page_owner metadata for the new page. Link: https://lkml.kernel.org/r/20240404070702.2744-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20240404070702.2744-2-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Tested-by: Kefeng Wang Cc: Alexander Potapenko Cc: Alexandre Ghiti Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Alexandre Ghiti Signed-off-by: Andrew Morton --- mm/page_owner.c | 137 ++++++++++++++++++++++++++---------------------- 1 file changed, 74 insertions(+), 63 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index d17d1351ec84a..52d1ced0b57fb 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -228,9 +228,58 @@ static void dec_stack_record_count(depot_stack_handle_t handle) refcount_dec(&stack_record->count); } -void __reset_page_owner(struct page *page, unsigned short order) +static inline void __update_page_owner_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned short order, + gfp_t gfp_mask, + short last_migrate_reason, u64 ts_nsec, + pid_t pid, pid_t tgid, char *comm) { int i; + struct page_owner *page_owner; + + for (i = 0; i < (1 << order); i++) { + page_owner = get_page_owner(page_ext); + page_owner->handle = handle; + page_owner->order = order; + page_owner->gfp_mask = gfp_mask; + page_owner->last_migrate_reason = last_migrate_reason; + page_owner->pid = pid; + page_owner->tgid = tgid; + page_owner->ts_nsec = ts_nsec; + strscpy(page_owner->comm, comm, + sizeof(page_owner->comm)); + __set_bit(PAGE_EXT_OWNER, &page_ext->flags); + __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); + page_ext = page_ext_next(page_ext); + } +} + +static inline void __update_page_owner_free_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned short order, + pid_t pid, pid_t tgid, + u64 free_ts_nsec) +{ + int i; + struct page_owner *page_owner; + + for (i = 0; i < (1 << order); i++) { + page_owner = get_page_owner(page_ext); + /* Only __reset_page_owner() wants to clear the bit */ + if (handle) { + __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); + page_owner->free_handle = handle; + } + page_owner->free_ts_nsec = free_ts_nsec; + page_owner->free_pid = current->pid; + page_owner->free_tgid = current->tgid; + page_ext = page_ext_next(page_ext); + } +} + +void __reset_page_owner(struct page *page, unsigned short order) +{ struct page_ext *page_ext; depot_stack_handle_t handle; depot_stack_handle_t alloc_handle; @@ -245,16 +294,10 @@ void __reset_page_owner(struct page *page, unsigned short order) alloc_handle = page_owner->handle; handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); - for (i = 0; i < (1 << order); i++) { - __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); - page_owner->free_handle = handle; - page_owner->free_ts_nsec = free_ts_nsec; - page_owner->free_pid = current->pid; - page_owner->free_tgid = current->tgid; - page_ext = page_ext_next(page_ext); - page_owner = get_page_owner(page_ext); - } + __update_page_owner_free_handle(page_ext, handle, order, current->pid, + current->tgid, free_ts_nsec); page_ext_put(page_ext); + if (alloc_handle != early_handle) /* * early_handle is being set as a handle for all those @@ -266,36 +309,11 @@ void __reset_page_owner(struct page *page, unsigned short order) dec_stack_record_count(alloc_handle); } -static inline void __set_page_owner_handle(struct page_ext *page_ext, - depot_stack_handle_t handle, - unsigned short order, gfp_t gfp_mask) -{ - struct page_owner *page_owner; - int i; - u64 ts_nsec = local_clock(); - - for (i = 0; i < (1 << order); i++) { - page_owner = get_page_owner(page_ext); - page_owner->handle = handle; - page_owner->order = order; - page_owner->gfp_mask = gfp_mask; - page_owner->last_migrate_reason = -1; - page_owner->pid = current->pid; - page_owner->tgid = current->tgid; - page_owner->ts_nsec = ts_nsec; - strscpy(page_owner->comm, current->comm, - sizeof(page_owner->comm)); - __set_bit(PAGE_EXT_OWNER, &page_ext->flags); - __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); - - page_ext = page_ext_next(page_ext); - } -} - noinline void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { struct page_ext *page_ext; + u64 ts_nsec = local_clock(); depot_stack_handle_t handle; handle = save_stack(gfp_mask); @@ -303,7 +321,9 @@ noinline void __set_page_owner(struct page *page, unsigned short order, page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; - __set_page_owner_handle(page_ext, handle, order, gfp_mask); + __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1, + current->pid, current->tgid, ts_nsec, + current->comm); page_ext_put(page_ext); inc_stack_record_count(handle, gfp_mask); } @@ -342,7 +362,7 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) { struct page_ext *old_ext; struct page_ext *new_ext; - struct page_owner *old_page_owner, *new_page_owner; + struct page_owner *old_page_owner; old_ext = page_ext_get(&old->page); if (unlikely(!old_ext)) @@ -355,31 +375,21 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) } old_page_owner = get_page_owner(old_ext); - new_page_owner = get_page_owner(new_ext); - new_page_owner->order = old_page_owner->order; - new_page_owner->gfp_mask = old_page_owner->gfp_mask; - new_page_owner->last_migrate_reason = - old_page_owner->last_migrate_reason; - new_page_owner->handle = old_page_owner->handle; - new_page_owner->pid = old_page_owner->pid; - new_page_owner->tgid = old_page_owner->tgid; - new_page_owner->free_pid = old_page_owner->free_pid; - new_page_owner->free_tgid = old_page_owner->free_tgid; - new_page_owner->ts_nsec = old_page_owner->ts_nsec; - new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; - strcpy(new_page_owner->comm, old_page_owner->comm); - + __update_page_owner_handle(new_ext, old_page_owner->handle, + old_page_owner->order, old_page_owner->gfp_mask, + old_page_owner->last_migrate_reason, + old_page_owner->ts_nsec, old_page_owner->pid, + old_page_owner->tgid, old_page_owner->comm); /* - * We don't clear the bit on the old folio as it's going to be freed - * after migration. Until then, the info can be useful in case of - * a bug, and the overall stats will be off a bit only temporarily. - * Also, migrate_misplaced_transhuge_page() can still fail the - * migration and then we want the old folio to retain the info. But - * in that case we also don't need to explicitly clear the info from - * the new page, which will be freed. + * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio + * will be freed after migration. Keep them until then as they may be + * useful. */ - __set_bit(PAGE_EXT_OWNER, &new_ext->flags); - __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); + __update_page_owner_free_handle(new_ext, 0, old_page_owner->order, + old_page_owner->free_pid, + old_page_owner->free_tgid, + old_page_owner->free_ts_nsec); + page_ext_put(new_ext); page_ext_put(old_ext); } @@ -787,8 +797,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) goto ext_put_continue; /* Found early allocated page */ - __set_page_owner_handle(page_ext, early_handle, - 0, 0); + __update_page_owner_handle(page_ext, early_handle, 0, 0, + -1, local_clock(), current->pid, + current->tgid, current->comm); count++; ext_put_continue: page_ext_put(page_ext); -- GitLab From f5c12105c15f0ddf0ff37646290568dd986fa2f3 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 4 Apr 2024 09:07:00 +0200 Subject: [PATCH 920/989] mm,page_owner: fix refcount imbalance Current code does not contemplate scenarios were an allocation and free operation on the same pages do not handle it in the same amount at once. To give an example, page_alloc_exact(), where we will allocate a page of enough order to stafisfy the size request, but we will free the remainings right away. In the above example, we will increment the stack_record refcount only once, but we will decrease it the same number of times as number of unused pages we have to free. This will lead to a warning because of refcount imbalance. Fix this by recording the number of base pages in the refcount field. Link: https://lkml.kernel.org/r/20240404070702.2744-3-osalvador@suse.de Reported-by: syzbot+41bbfdb8d41003d12c0f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-mm/00000000000090e8ff0613eda0e5@google.com Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count") Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Tested-by: Alexandre Ghiti Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- Documentation/mm/page_owner.rst | 73 +++++++++++++++++---------------- mm/page_owner.c | 34 ++++++++------- 2 files changed, 58 insertions(+), 49 deletions(-) diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst index 0d0334cd51798..3a45a20fc05a1 100644 --- a/Documentation/mm/page_owner.rst +++ b/Documentation/mm/page_owner.rst @@ -24,10 +24,10 @@ fragmentation statistics can be obtained through gfp flag information of each page. It is already implemented and activated if page owner is enabled. Other usages are more than welcome. -It can also be used to show all the stacks and their outstanding -allocations, which gives us a quick overview of where the memory is going -without the need to screen through all the pages and match the allocation -and free operation. +It can also be used to show all the stacks and their current number of +allocated base pages, which gives us a quick overview of where the memory +is going without the need to screen through all the pages and match the +allocation and free operation. page owner is disabled by default. So, if you'd like to use it, you need to add "page_owner=on" to your boot cmdline. If the kernel is built @@ -75,42 +75,45 @@ Usage cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt cat stacks.txt - prep_new_page+0xa9/0x120 - get_page_from_freelist+0x7e6/0x2140 - __alloc_pages+0x18a/0x370 - new_slab+0xc8/0x580 - ___slab_alloc+0x1f2/0xaf0 - __slab_alloc.isra.86+0x22/0x40 - kmem_cache_alloc+0x31b/0x350 - __khugepaged_enter+0x39/0x100 - dup_mmap+0x1c7/0x5ce - copy_process+0x1afe/0x1c90 - kernel_clone+0x9a/0x3c0 - __do_sys_clone+0x66/0x90 - do_syscall_64+0x7f/0x160 - entry_SYSCALL_64_after_hwframe+0x6c/0x74 - stack_count: 234 + post_alloc_hook+0x177/0x1a0 + get_page_from_freelist+0xd01/0xd80 + __alloc_pages+0x39e/0x7e0 + allocate_slab+0xbc/0x3f0 + ___slab_alloc+0x528/0x8a0 + kmem_cache_alloc+0x224/0x3b0 + sk_prot_alloc+0x58/0x1a0 + sk_alloc+0x32/0x4f0 + inet_create+0x427/0xb50 + __sock_create+0x2e4/0x650 + inet_ctl_sock_create+0x30/0x180 + igmp_net_init+0xc1/0x130 + ops_init+0x167/0x410 + setup_net+0x304/0xa60 + copy_net_ns+0x29b/0x4a0 + create_new_namespaces+0x4a1/0x820 + nr_base_pages: 16 ... ... echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold cat /sys/kernel/debug/page_owner_stacks/show_stacks> stacks_7000.txt cat stacks_7000.txt - prep_new_page+0xa9/0x120 - get_page_from_freelist+0x7e6/0x2140 - __alloc_pages+0x18a/0x370 - alloc_pages_mpol+0xdf/0x1e0 - folio_alloc+0x14/0x50 - filemap_alloc_folio+0xb0/0x100 - page_cache_ra_unbounded+0x97/0x180 - filemap_fault+0x4b4/0x1200 - __do_fault+0x2d/0x110 - do_pte_missing+0x4b0/0xa30 - __handle_mm_fault+0x7fa/0xb70 - handle_mm_fault+0x125/0x300 - do_user_addr_fault+0x3c9/0x840 - exc_page_fault+0x68/0x150 - asm_exc_page_fault+0x22/0x30 - stack_count: 8248 + post_alloc_hook+0x177/0x1a0 + get_page_from_freelist+0xd01/0xd80 + __alloc_pages+0x39e/0x7e0 + alloc_pages_mpol+0x22e/0x490 + folio_alloc+0xd5/0x110 + filemap_alloc_folio+0x78/0x230 + page_cache_ra_order+0x287/0x6f0 + filemap_get_pages+0x517/0x1160 + filemap_read+0x304/0x9f0 + xfs_file_buffered_read+0xe6/0x1d0 [xfs] + xfs_file_read_iter+0x1f0/0x380 [xfs] + __kernel_read+0x3b9/0x730 + kernel_read_file+0x309/0x4d0 + __do_sys_finit_module+0x381/0x730 + do_syscall_64+0x8d/0x150 + entry_SYSCALL_64_after_hwframe+0x62/0x6a + nr_base_pages: 20824 ... cat /sys/kernel/debug/page_owner > page_owner_full.txt diff --git a/mm/page_owner.c b/mm/page_owner.c index 52d1ced0b57fb..5df0d6892bdce 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -196,7 +196,8 @@ static void add_stack_record_to_list(struct stack_record *stack_record, spin_unlock_irqrestore(&stack_list_lock, flags); } -static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask) +static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask, + int nr_base_pages) { struct stack_record *stack_record = __stack_depot_get_stack_record(handle); @@ -217,15 +218,20 @@ static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask) /* Add the new stack_record to our list */ add_stack_record_to_list(stack_record, gfp_mask); } - refcount_inc(&stack_record->count); + refcount_add(nr_base_pages, &stack_record->count); } -static void dec_stack_record_count(depot_stack_handle_t handle) +static void dec_stack_record_count(depot_stack_handle_t handle, + int nr_base_pages) { struct stack_record *stack_record = __stack_depot_get_stack_record(handle); - if (stack_record) - refcount_dec(&stack_record->count); + if (!stack_record) + return; + + if (refcount_sub_and_test(nr_base_pages, &stack_record->count)) + pr_warn("%s: refcount went to 0 for %u handle\n", __func__, + handle); } static inline void __update_page_owner_handle(struct page_ext *page_ext, @@ -306,7 +312,7 @@ void __reset_page_owner(struct page *page, unsigned short order) * the machinery is not ready yet, we cannot decrement * their refcount either. */ - dec_stack_record_count(alloc_handle); + dec_stack_record_count(alloc_handle, 1 << order); } noinline void __set_page_owner(struct page *page, unsigned short order, @@ -325,7 +331,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order, current->pid, current->tgid, ts_nsec, current->comm); page_ext_put(page_ext); - inc_stack_record_count(handle, gfp_mask); + inc_stack_record_count(handle, gfp_mask, 1 << order); } void __set_page_owner_migrate_reason(struct page *page, int reason) @@ -872,11 +878,11 @@ static void *stack_next(struct seq_file *m, void *v, loff_t *ppos) return stack; } -static unsigned long page_owner_stack_threshold; +static unsigned long page_owner_pages_threshold; static int stack_print(struct seq_file *m, void *v) { - int i, stack_count; + int i, nr_base_pages; struct stack *stack = v; unsigned long *entries; unsigned long nr_entries; @@ -887,14 +893,14 @@ static int stack_print(struct seq_file *m, void *v) nr_entries = stack_record->size; entries = stack_record->entries; - stack_count = refcount_read(&stack_record->count) - 1; + nr_base_pages = refcount_read(&stack_record->count) - 1; - if (stack_count < 1 || stack_count < page_owner_stack_threshold) + if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold) return 0; for (i = 0; i < nr_entries; i++) seq_printf(m, " %pS\n", (void *)entries[i]); - seq_printf(m, "stack_count: %d\n\n", stack_count); + seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages); return 0; } @@ -924,13 +930,13 @@ static const struct file_operations page_owner_stack_operations = { static int page_owner_threshold_get(void *data, u64 *val) { - *val = READ_ONCE(page_owner_stack_threshold); + *val = READ_ONCE(page_owner_pages_threshold); return 0; } static int page_owner_threshold_set(void *data, u64 val) { - WRITE_ONCE(page_owner_stack_threshold, val); + WRITE_ONCE(page_owner_pages_threshold, val); return 0; } -- GitLab From 718b1f3373a7999f77e617c17abdcb98a3c001ea Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 4 Apr 2024 09:07:01 +0200 Subject: [PATCH 921/989] mm,page_owner: fix accounting of pages when migrating Upon migration, new allocated pages are being given the handle of the old pages. This is problematic because it means that for the stack which allocated the old page, we will be substracting the old page + the new one when that page is freed, creating an accounting imbalance. There is an interest in keeping it that way, as otherwise the output will biased towards migration stacks should those operations occur often, but that is not really helpful. The link from the new page to the old stack is being performed by calling __update_page_owner_handle() in __folio_copy_owner(). The only thing that is left is to link the migrate stack to the old page, so the old page will be subtracted from the migrate stack, avoiding by doing so any possible imbalance. Link: https://lkml.kernel.org/r/20240404070702.2744-4-osalvador@suse.de Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count") Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Cc: Alexander Potapenko Cc: Alexandre Ghiti Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- mm/page_owner.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/page_owner.c b/mm/page_owner.c index 5df0d6892bdce..b4476f45b376e 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -366,9 +366,12 @@ void __split_page_owner(struct page *page, int old_order, int new_order) void __folio_copy_owner(struct folio *newfolio, struct folio *old) { + int i; struct page_ext *old_ext; struct page_ext *new_ext; struct page_owner *old_page_owner; + struct page_owner *new_page_owner; + depot_stack_handle_t migrate_handle; old_ext = page_ext_get(&old->page); if (unlikely(!old_ext)) @@ -381,6 +384,8 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) } old_page_owner = get_page_owner(old_ext); + new_page_owner = get_page_owner(new_ext); + migrate_handle = new_page_owner->handle; __update_page_owner_handle(new_ext, old_page_owner->handle, old_page_owner->order, old_page_owner->gfp_mask, old_page_owner->last_migrate_reason, @@ -395,6 +400,16 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) old_page_owner->free_pid, old_page_owner->free_tgid, old_page_owner->free_ts_nsec); + /* + * We linked the original stack to the new folio, we need to do the same + * for the new one and the old folio otherwise there will be an imbalance + * when subtracting those pages from the stack. + */ + for (i = 0; i < (1 << new_page_owner->order); i++) { + old_page_owner->handle = migrate_handle; + old_ext = page_ext_next(old_ext); + old_page_owner = get_page_owner(old_ext); + } page_ext_put(new_ext); page_ext_put(old_ext); -- GitLab From 74017458017127ca6bf14b1f9fda69e03f43389b Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 4 Apr 2024 09:07:02 +0200 Subject: [PATCH 922/989] mm,page_owner: fix printing of stack records When seq_* code sees that its buffer overflowed, it re-allocates a bigger onecand calls seq_operations->start() callback again. stack_start() naively though that if it got called again, it meant that the old record got already printed so it returned the next object, but that is not true. The consequence of that is that every time stack_stop() -> stack_start() get called because we needed a bigger buffer, stack_start() will skip entries, and those will not be printed. Fix it by not advancing to the next object in stack_start(). Link: https://lkml.kernel.org/r/20240404070702.2744-5-osalvador@suse.de Fixes: 765973a09803 ("mm,page_owner: display all stacks and their count") Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Cc: Alexander Potapenko Cc: Alexandre Ghiti Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- mm/page_owner.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index b4476f45b376e..9bef0b4428634 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -872,13 +872,11 @@ static void *stack_start(struct seq_file *m, loff_t *ppos) * value of stack_list. */ stack = smp_load_acquire(&stack_list); + m->private = stack; } else { stack = m->private; - stack = stack->next; } - m->private = stack; - return stack; } -- GitLab From c5977c95dff182d6ee06f4d6f60bcb0284912969 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Apr 2024 19:19:20 -0400 Subject: [PATCH 923/989] mm/userfaultfd: allow hugetlb change protection upon poison entry After UFFDIO_POISON, there can be two kinds of hugetlb pte markers, either the POISON one or UFFD_WP one. Allow change protection to run on a poisoned marker just like !hugetlb cases, ignoring the marker irrelevant of the permission. Here the two bits are mutual exclusive. For example, when install a poisoned entry it must not be UFFD_WP already (by checking pte_none() before such install). And it also means if UFFD_WP is set there must have no POISON bit set. It makes sense because UFFD_WP is a bit to reflect permission, and permissions do not apply if the pte is poisoned and destined to sigbus. So here we simply check uffd_wp bit set first, do nothing otherwise. Attach the Fixes to UFFDIO_POISON work, as before that it should not be possible to have poison entry for hugetlb (e.g., hugetlb doesn't do swap, so no chance of swapin errors). Link: https://lkml.kernel.org/r/20240405231920.1772199-1-peterx@redhat.com Link: https://lore.kernel.org/r/000000000000920d5e0615602dd1@google.com Fixes: fc71884a5f59 ("mm: userfaultfd: add new UFFDIO_POISON ioctl") Signed-off-by: Peter Xu Reported-by: syzbot+b07c8ac8eee3d4d8440f@syzkaller.appspotmail.com Reviewed-by: David Hildenbrand Reviewed-by: Axel Rasmussen Cc: [6.6+] Signed-off-by: Andrew Morton --- mm/hugetlb.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 23ef240ba48a6..31d00eee028f1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7044,9 +7044,13 @@ long hugetlb_change_protection(struct vm_area_struct *vma, if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); } else if (unlikely(is_pte_marker(pte))) { - /* No other markers apply for now. */ - WARN_ON_ONCE(!pte_marker_uffd_wp(pte)); - if (uffd_wp_resolve) + /* + * Do nothing on a poison marker; page is + * corrupted, permissons do not apply. Here + * pte_marker_uffd_wp()==true implies !poison + * because they're mutual exclusive. + */ + if (pte_marker_uffd_wp(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); } else if (!huge_pte_none(pte)) { -- GitLab From 1983184c22dd84a4d95a71e5c6775c2638557dc7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 7 Apr 2024 16:54:56 +0800 Subject: [PATCH 924/989] mm/memory-failure: fix deadlock when hugetlb_optimize_vmemmap is enabled When I did hard offline test with hugetlb pages, below deadlock occurs: ====================================================== WARNING: possible circular locking dependency detected 6.8.0-11409-gf6cef5f8c37f #1 Not tainted ------------------------------------------------------ bash/46904 is trying to acquire lock: ffffffffabe68910 (cpu_hotplug_lock){++++}-{0:0}, at: static_key_slow_dec+0x16/0x60 but task is already holding lock: ffffffffabf92ea8 (pcp_batch_high_lock){+.+.}-{3:3}, at: zone_pcp_disable+0x16/0x40 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (pcp_batch_high_lock){+.+.}-{3:3}: __mutex_lock+0x6c/0x770 page_alloc_cpu_online+0x3c/0x70 cpuhp_invoke_callback+0x397/0x5f0 __cpuhp_invoke_callback_range+0x71/0xe0 _cpu_up+0xeb/0x210 cpu_up+0x91/0xe0 cpuhp_bringup_mask+0x49/0xb0 bringup_nonboot_cpus+0xb7/0xe0 smp_init+0x25/0xa0 kernel_init_freeable+0x15f/0x3e0 kernel_init+0x15/0x1b0 ret_from_fork+0x2f/0x50 ret_from_fork_asm+0x1a/0x30 -> #0 (cpu_hotplug_lock){++++}-{0:0}: __lock_acquire+0x1298/0x1cd0 lock_acquire+0xc0/0x2b0 cpus_read_lock+0x2a/0xc0 static_key_slow_dec+0x16/0x60 __hugetlb_vmemmap_restore_folio+0x1b9/0x200 dissolve_free_huge_page+0x211/0x260 __page_handle_poison+0x45/0xc0 memory_failure+0x65e/0xc70 hard_offline_page_store+0x55/0xa0 kernfs_fop_write_iter+0x12c/0x1d0 vfs_write+0x387/0x550 ksys_write+0x64/0xe0 do_syscall_64+0xca/0x1e0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(pcp_batch_high_lock); lock(cpu_hotplug_lock); lock(pcp_batch_high_lock); rlock(cpu_hotplug_lock); *** DEADLOCK *** 5 locks held by bash/46904: #0: ffff98f6c3bb23f0 (sb_writers#5){.+.+}-{0:0}, at: ksys_write+0x64/0xe0 #1: ffff98f6c328e488 (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write_iter+0xf8/0x1d0 #2: ffff98ef83b31890 (kn->active#113){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x100/0x1d0 #3: ffffffffabf9db48 (mf_mutex){+.+.}-{3:3}, at: memory_failure+0x44/0xc70 #4: ffffffffabf92ea8 (pcp_batch_high_lock){+.+.}-{3:3}, at: zone_pcp_disable+0x16/0x40 stack backtrace: CPU: 10 PID: 46904 Comm: bash Kdump: loaded Not tainted 6.8.0-11409-gf6cef5f8c37f #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x68/0xa0 check_noncircular+0x129/0x140 __lock_acquire+0x1298/0x1cd0 lock_acquire+0xc0/0x2b0 cpus_read_lock+0x2a/0xc0 static_key_slow_dec+0x16/0x60 __hugetlb_vmemmap_restore_folio+0x1b9/0x200 dissolve_free_huge_page+0x211/0x260 __page_handle_poison+0x45/0xc0 memory_failure+0x65e/0xc70 hard_offline_page_store+0x55/0xa0 kernfs_fop_write_iter+0x12c/0x1d0 vfs_write+0x387/0x550 ksys_write+0x64/0xe0 do_syscall_64+0xca/0x1e0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 RIP: 0033:0x7fc862314887 Code: 10 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 RSP: 002b:00007fff19311268 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007fc862314887 RDX: 000000000000000c RSI: 000056405645fe10 RDI: 0000000000000001 RBP: 000056405645fe10 R08: 00007fc8623d1460 R09: 000000007fffffff R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c R13: 00007fc86241b780 R14: 00007fc862417600 R15: 00007fc862416a00 In short, below scene breaks the lock dependency chain: memory_failure __page_handle_poison zone_pcp_disable -- lock(pcp_batch_high_lock) dissolve_free_huge_page __hugetlb_vmemmap_restore_folio static_key_slow_dec cpus_read_lock -- rlock(cpu_hotplug_lock) Fix this by calling drain_all_pages() instead. This issue won't occur until commit a6b40850c442 ("mm: hugetlb: replace hugetlb_free_vmemmap_enabled with a static_key"). As it introduced rlock(cpu_hotplug_lock) in dissolve_free_huge_page() code path while lock(pcp_batch_high_lock) is already in the __page_handle_poison(). [linmiaohe@huawei.com: extend comment per Oscar] [akpm@linux-foundation.org: reflow block comment] Link: https://lkml.kernel.org/r/20240407085456.2798193-1-linmiaohe@huawei.com Fixes: a6b40850c442 ("mm: hugetlb: replace hugetlb_free_vmemmap_enabled with a static_key") Signed-off-by: Miaohe Lin Acked-by: Oscar Salvador Reviewed-by: Jane Chu Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9349948f1abfd..9e62a00b46dde 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -154,11 +154,23 @@ static int __page_handle_poison(struct page *page) { int ret; - zone_pcp_disable(page_zone(page)); + /* + * zone_pcp_disable() can't be used here. It will + * hold pcp_batch_high_lock and dissolve_free_huge_page() might hold + * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap + * optimization is enabled. This will break current lock dependency + * chain and leads to deadlock. + * Disabling pcp before dissolving the page was a deterministic + * approach because we made sure that those pages cannot end up in any + * PCP list. Draining PCP lists expels those pages to the buddy system, + * but nothing guarantees that those pages do not get back to a PCP + * queue if we need to refill those. + */ ret = dissolve_free_huge_page(page); - if (!ret) + if (!ret) { + drain_all_pages(page_zone(page)); ret = take_page_off_buddy(page); - zone_pcp_enable(page_zone(page)); + } return ret; } -- GitLab From 07a57a338adb6ec9e766d6a6790f76527f45ceb5 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Sun, 7 Apr 2024 15:05:37 +0200 Subject: [PATCH 925/989] mm,swapops: update check in is_pfn_swap_entry for hwpoison entries Tony reported that the Machine check recovery was broken in v6.9-rc1, as he was hitting a VM_BUG_ON when injecting uncorrectable memory errors to DRAM. After some more digging and debugging on his side, he realized that this went back to v6.1, with the introduction of 'commit 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry")'. That commit, among other things, introduced swp_offset_pfn(), replacing hwpoison_entry_to_pfn() in its favour. The patch also introduced a VM_BUG_ON() check for is_pfn_swap_entry(), but is_pfn_swap_entry() never got updated to cover hwpoison entries, which means that we would hit the VM_BUG_ON whenever we would call swp_offset_pfn() for such entries on environments with CONFIG_DEBUG_VM set. Fix this by updating the check to cover hwpoison entries as well, and update the comment while we are it. Link: https://lkml.kernel.org/r/20240407130537.16977-1-osalvador@suse.de Fixes: 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry") Signed-off-by: Oscar Salvador Reported-by: Tony Luck Closes: https://lore.kernel.org/all/Zg8kLSl2yAlA3o5D@agluck-desk3/ Tested-by: Tony Luck Reviewed-by: Peter Xu Reviewed-by: David Hildenbrand Acked-by: Miaohe Lin Cc: [6.1.x] Signed-off-by: Andrew Morton --- include/linux/swapops.h | 65 +++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 48b700ba1d188..a5c560a2f8c25 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -390,6 +390,35 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) } #endif /* CONFIG_MIGRATION */ +#ifdef CONFIG_MEMORY_FAILURE + +/* + * Support for hardware poisoned pages + */ +static inline swp_entry_t make_hwpoison_entry(struct page *page) +{ + BUG_ON(!PageLocked(page)); + return swp_entry(SWP_HWPOISON, page_to_pfn(page)); +} + +static inline int is_hwpoison_entry(swp_entry_t entry) +{ + return swp_type(entry) == SWP_HWPOISON; +} + +#else + +static inline swp_entry_t make_hwpoison_entry(struct page *page) +{ + return swp_entry(0, 0); +} + +static inline int is_hwpoison_entry(swp_entry_t swp) +{ + return 0; +} +#endif + typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) @@ -483,8 +512,9 @@ static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) /* * A pfn swap entry is a special type of swap entry that always has a pfn stored - * in the swap offset. They are used to represent unaddressable device memory - * and to restrict access to a page undergoing migration. + * in the swap offset. They can either be used to represent unaddressable device + * memory, to restrict access to a page undergoing migration or to represent a + * pfn which has been hwpoisoned and unmapped. */ static inline bool is_pfn_swap_entry(swp_entry_t entry) { @@ -492,7 +522,7 @@ static inline bool is_pfn_swap_entry(swp_entry_t entry) BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); return is_migration_entry(entry) || is_device_private_entry(entry) || - is_device_exclusive_entry(entry); + is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); } struct page_vma_mapped_walk; @@ -561,35 +591,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd) } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -#ifdef CONFIG_MEMORY_FAILURE - -/* - * Support for hardware poisoned pages - */ -static inline swp_entry_t make_hwpoison_entry(struct page *page) -{ - BUG_ON(!PageLocked(page)); - return swp_entry(SWP_HWPOISON, page_to_pfn(page)); -} - -static inline int is_hwpoison_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_HWPOISON; -} - -#else - -static inline swp_entry_t make_hwpoison_entry(struct page *page) -{ - return swp_entry(0, 0); -} - -static inline int is_hwpoison_entry(swp_entry_t swp) -{ - return 0; -} -#endif - static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; -- GitLab From 9253c54e01b6505d348afbc02abaa4d9f8a01395 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 8 Apr 2024 23:02:06 +0100 Subject: [PATCH 926/989] Squashfs: check the inode number is not the invalid value of zero Syskiller has produced an out of bounds access in fill_meta_index(). That out of bounds access is ultimately caused because the inode has an inode number with the invalid value of zero, which was not checked. The reason this causes the out of bounds access is due to following sequence of events: 1. Fill_meta_index() is called to allocate (via empty_meta_index()) and fill a metadata index. It however suffers a data read error and aborts, invalidating the newly returned empty metadata index. It does this by setting the inode number of the index to zero, which means unused (zero is not a valid inode number). 2. When fill_meta_index() is subsequently called again on another read operation, locate_meta_index() returns the previous index because it matches the inode number of 0. Because this index has been returned it is expected to have been filled, and because it hasn't been, an out of bounds access is performed. This patch adds a sanity check which checks that the inode number is not zero when the inode is created and returns -EINVAL if it is. [phillip@squashfs.org.uk: whitespace fix] Link: https://lkml.kernel.org/r/20240409204723.446925-1-phillip@squashfs.org.uk Link: https://lkml.kernel.org/r/20240408220206.435788-1-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Reported-by: "Ubisectech Sirius" Closes: https://lore.kernel.org/lkml/87f5c007-b8a5-41ae-8b57-431e924c5915.bugreport@ubisectech.com/ Cc: Christian Brauner Cc: Signed-off-by: Andrew Morton --- fs/squashfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index aa3411354e66d..16bd693d0b3aa 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -48,6 +48,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, gid_t i_gid; int err; + inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); + if (inode->i_ino == 0) + return -EINVAL; + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid); if (err) return err; @@ -58,7 +62,6 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); - inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0); inode_set_atime(inode, inode_get_mtime_sec(inode), 0); inode_set_ctime(inode, inode_get_mtime_sec(inode), 0); -- GitLab From 0b2cf0a45e06d9538a2371f90150297a87b20eea Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 9 Apr 2024 15:17:15 +0200 Subject: [PATCH 927/989] mm,page_owner: defer enablement of static branch Kefeng Wang reported that he was seeing some memory leaks with kmemleak with page_owner enabled. The reason is that we enable the page_owner_inited static branch and then proceed with the linking of stack_list struct to dummy_stack, which means that exists a race window between these two steps where we can have pages already being allocated calling add_stack_record_to_list(), allocating objects and linking them to stack_list, but then we set stack_list pointing to dummy_stack in init_page_owner. Which means that the objects that have been allocated during that time window are unreferenced and lost. Fix this by deferring the enablement of the branch until we have properly set up the list. Link: https://lkml.kernel.org/r/20240409131715.13632-1-osalvador@suse.de Fixes: 4bedfb314bdd ("mm,page_owner: maintain own list of stack_records structs") Signed-off-by: Oscar Salvador Reported-by: Kefeng Wang Closes: https://lore.kernel.org/linux-mm/74b147b0-718d-4d50-be75-d6afc801cd24@huawei.com/ Tested-by: Kefeng Wang Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 9bef0b4428634..742f432e5bf06 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -118,7 +118,6 @@ static __init void init_page_owner(void) register_dummy_stack(); register_failure_stack(); register_early_stack(); - static_branch_enable(&page_owner_inited); init_early_allocated_pages(); /* Initialize dummy and failure stacks and link them to stack_list */ dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle); @@ -129,6 +128,7 @@ static __init void init_page_owner(void) refcount_set(&failure_stack.stack_record->count, 1); dummy_stack.next = &failure_stack; stack_list = &dummy_stack; + static_branch_enable(&page_owner_inited); } struct page_ext_operations page_owner_ops = { -- GitLab From 1f737846aa3c45f07a06fa0d018b39e1afb8084a Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Tue, 9 Apr 2024 17:54:07 +0200 Subject: [PATCH 928/989] mm/shmem: inline shmem_is_huge() for disabled transparent hugepages In order to minimize code size (CONFIG_CC_OPTIMIZE_FOR_SIZE=y), compiler might choose to make a regular function call (out-of-line) for shmem_is_huge() instead of inlining it. When transparent hugepages are disabled (CONFIG_TRANSPARENT_HUGEPAGE=n), it can cause compilation error. mm/shmem.c: In function `shmem_getattr': ./include/linux/huge_mm.h:383:27: note: in expansion of macro `BUILD_BUG' 383 | #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) | ^~~~~~~~~ mm/shmem.c:1148:33: note: in expansion of macro `HPAGE_PMD_SIZE' 1148 | stat->blksize = HPAGE_PMD_SIZE; To prevent the possible error, always inline shmem_is_huge() when transparent hugepages are disabled. Link: https://lkml.kernel.org/r/20240409155407.2322714-1-sumanthk@linux.ibm.com Signed-off-by: Sumanth Korikkar Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ilya Leoshkevich Cc: Vasily Gorbik Cc: Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 9 +++++++++ mm/shmem.c | 6 ------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a4c15db2f5e54..3fb18f7eb73ea 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -110,8 +110,17 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags); +#else +static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags) +{ + return false; +} +#endif + #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); #else diff --git a/mm/shmem.c b/mm/shmem.c index 0aad0d9a621b8..94ab99b6b574a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -748,12 +748,6 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) -{ - return false; -} - static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) { -- GitLab From 35e351780fa9d8240dd6f7e4f245f9ea37e96c19 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 10 Apr 2024 17:14:41 +0800 Subject: [PATCH 929/989] fork: defer linking file vma until vma is fully initialized Thorvald reported a WARNING [1]. And the root cause is below race: CPU 1 CPU 2 fork hugetlbfs_fallocate dup_mmap hugetlbfs_punch_hole i_mmap_lock_write(mapping); vma_interval_tree_insert_after -- Child vma is visible through i_mmap tree. i_mmap_unlock_write(mapping); hugetlb_dup_vma_private -- Clear vma_lock outside i_mmap_rwsem! i_mmap_lock_write(mapping); hugetlb_vmdelete_list vma_interval_tree_foreach hugetlb_vma_trylock_write -- Vma_lock is cleared. tmp->vm_ops->open -- Alloc new vma_lock outside i_mmap_rwsem! hugetlb_vma_unlock_write -- Vma_lock is assigned!!! i_mmap_unlock_write(mapping); hugetlb_dup_vma_private() and hugetlb_vm_op_open() are called outside i_mmap_rwsem lock while vma lock can be used in the same time. Fix this by deferring linking file vma until vma is fully initialized. Those vmas should be initialized first before they can be used. Link: https://lkml.kernel.org/r/20240410091441.3539905-1-linmiaohe@huawei.com Fixes: 8d9bfb260814 ("hugetlb: add vma based lock for pmd sharing") Signed-off-by: Miaohe Lin Reported-by: Thorvald Natvig Closes: https://lore.kernel.org/linux-mm/20240129161735.6gmjsswx62o4pbja@revolver/T/ [1] Reviewed-by: Jane Chu Cc: Christian Brauner Cc: Heiko Carstens Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Muchun Song Cc: Oleg Nesterov Cc: Peng Zhang Cc: Tycho Andersen Cc: Signed-off-by: Andrew Morton --- kernel/fork.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 39a5046c2f0bf..aebb3e6c96dc6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -714,6 +714,23 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; vm_flags_clear(tmp, VM_LOCKED_MASK); + /* + * Copy/update hugetlb private vma information. + */ + if (is_vm_hugetlb_page(tmp)) + hugetlb_dup_vma_private(tmp); + + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ + vma_iter_bulk_store(&vmi, tmp); + + mm->map_count++; + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; @@ -730,25 +747,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, i_mmap_unlock_write(mapping); } - /* - * Copy/update hugetlb private vma information. - */ - if (is_vm_hugetlb_page(tmp)) - hugetlb_dup_vma_private(tmp); - - /* - * Link the vma into the MT. After using __mt_dup(), memory - * allocation is not necessary here, so it cannot fail. - */ - vma_iter_bulk_store(&vmi, tmp); - - mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt); - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - if (retval) { mpnt = vma_next(&vmi); goto loop_out; -- GitLab From 8247bf1db92a9697d4ee26db3259d2a1959d1366 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Sat, 13 Apr 2024 03:17:20 +0900 Subject: [PATCH 930/989] MAINTAINERS: update Naoya Horiguchi's email address My old NEC address has been removed, so update MAINTAINERS and .mailmap to map it to my gmail address. Link: https://lkml.kernel.org/r/20240412181720.18452-1-nao.horiguchi@gmail.com Signed-off-by: Naoya Horiguchi Acked-by: Miaohe Lin Cc: Oscar Salvador Signed-off-by: Andrew Morton --- .mailmap | 3 ++- MAINTAINERS | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 8284692f96107..625b496bf5f45 100644 --- a/.mailmap +++ b/.mailmap @@ -446,7 +446,8 @@ Mythri P K Nadav Amit Nadav Amit Nadia Yvette Chambers William Lee Irwin III -Naoya Horiguchi +Naoya Horiguchi +Naoya Horiguchi Nathan Chancellor Neeraj Upadhyay Neil Armstrong diff --git a/MAINTAINERS b/MAINTAINERS index c23fda1aa1f09..ee9cc2b40409e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10024,7 +10024,7 @@ F: drivers/media/platform/st/sti/hva HWPOISON MEMORY FAILURE HANDLING M: Miaohe Lin -R: Naoya Horiguchi +R: Naoya Horiguchi L: linux-mm@kvack.org S: Maintained F: mm/hwpoison-inject.c -- GitLab From c4a7dc9523b59b3e73fd522c73e95e072f876b16 Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Tue, 16 Apr 2024 03:20:48 +0900 Subject: [PATCH 931/989] nilfs2: fix OOB in nilfs_set_de_type The size of the nilfs_type_by_mode array in the fs/nilfs2/dir.c file is defined as "S_IFMT >> S_SHIFT", but the nilfs_set_de_type() function, which uses this array, specifies the index to read from the array in the same way as "(mode & S_IFMT) >> S_SHIFT". static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) { umode_t mode = inode->i_mode; de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; // oob } However, when the index is determined this way, an out-of-bounds (OOB) error occurs by referring to an index that is 1 larger than the array size when the condition "mode & S_IFMT == S_IFMT" is satisfied. Therefore, a patch to resize the nilfs_type_by_mode array should be applied to prevent OOB errors. Link: https://lkml.kernel.org/r/20240415182048.7144-1-konishi.ryusuke@gmail.com Reported-by: syzbot+2e22057de05b9f3b30d8@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2e22057de05b9f3b30d8 Fixes: 2ba466d74ed7 ("nilfs2: directory entry operations") Signed-off-by: Jeongjun Park Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index bc846b904b68d..aee40db7a036f 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -240,7 +240,7 @@ nilfs_filetype_table[NILFS_FT_MAX] = { #define S_SHIFT 12 static unsigned char -nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { +nilfs_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR, [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV, -- GitLab From f8bbc07ac535593139c875ffa19af924b1084540 Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Sun, 14 Apr 2024 22:02:46 -0400 Subject: [PATCH 932/989] tun: limit printing rate when illegal packet received by tun dev vhost_worker will call tun call backs to receive packets. If too many illegal packets arrives, tun_do_read will keep dumping packet contents. When console is enabled, it will costs much more cpu time to dump packet and soft lockup will be detected. net_ratelimit mechanism can be used to limit the dumping rate. PID: 33036 TASK: ffff949da6f20000 CPU: 23 COMMAND: "vhost-32980" #0 [fffffe00003fce50] crash_nmi_callback at ffffffff89249253 #1 [fffffe00003fce58] nmi_handle at ffffffff89225fa3 #2 [fffffe00003fceb0] default_do_nmi at ffffffff8922642e #3 [fffffe00003fced0] do_nmi at ffffffff8922660d #4 [fffffe00003fcef0] end_repeat_nmi at ffffffff89c01663 [exception RIP: io_serial_in+20] RIP: ffffffff89792594 RSP: ffffa655314979e8 RFLAGS: 00000002 RAX: ffffffff89792500 RBX: ffffffff8af428a0 RCX: 0000000000000000 RDX: 00000000000003fd RSI: 0000000000000005 RDI: ffffffff8af428a0 RBP: 0000000000002710 R8: 0000000000000004 R9: 000000000000000f R10: 0000000000000000 R11: ffffffff8acbf64f R12: 0000000000000020 R13: ffffffff8acbf698 R14: 0000000000000058 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #5 [ffffa655314979e8] io_serial_in at ffffffff89792594 #6 [ffffa655314979e8] wait_for_xmitr at ffffffff89793470 #7 [ffffa65531497a08] serial8250_console_putchar at ffffffff897934f6 #8 [ffffa65531497a20] uart_console_write at ffffffff8978b605 #9 [ffffa65531497a48] serial8250_console_write at ffffffff89796558 #10 [ffffa65531497ac8] console_unlock at ffffffff89316124 #11 [ffffa65531497b10] vprintk_emit at ffffffff89317c07 #12 [ffffa65531497b68] printk at ffffffff89318306 #13 [ffffa65531497bc8] print_hex_dump at ffffffff89650765 #14 [ffffa65531497ca8] tun_do_read at ffffffffc0b06c27 [tun] #15 [ffffa65531497d38] tun_recvmsg at ffffffffc0b06e34 [tun] #16 [ffffa65531497d68] handle_rx at ffffffffc0c5d682 [vhost_net] #17 [ffffa65531497ed0] vhost_worker at ffffffffc0c644dc [vhost] #18 [ffffa65531497f10] kthread at ffffffff892d2e72 #19 [ffffa65531497f50] ret_from_fork at ffffffff89c0022f Fixes: ef3db4a59542 ("tun: avoid BUG, dump packet on GSO errors") Signed-off-by: Lei Chen Reviewed-by: Willem de Bruijn Acked-by: Jason Wang Reviewed-by: Eric Dumazet Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20240415020247.2207781-1-lei.chen@smartx.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 0b3f21cba552f..92da8c03d960c 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2125,14 +2125,16 @@ static ssize_t tun_put_user(struct tun_struct *tun, tun_is_little_endian(tun), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb); - pr_err("unexpected GSO type: " - "0x%x, gso_size %d, hdr_len %d\n", - sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), - tun16_to_cpu(tun, gso.hdr_len)); - print_hex_dump(KERN_ERR, "tun: ", - DUMP_PREFIX_NONE, - 16, 1, skb->head, - min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); + + if (net_ratelimit()) { + netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", + sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), + tun16_to_cpu(tun, gso.hdr_len)); + print_hex_dump(KERN_ERR, "tun: ", + DUMP_PREFIX_NONE, + 16, 1, skb->head, + min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); + } WARN_ON_ONCE(1); return -EINVAL; } -- GitLab From fb1f4584b1215e8c209f6b3a4028ed8351a0e961 Mon Sep 17 00:00:00 2001 From: Chuanhong Guo Date: Tue, 12 Mar 2024 14:29:12 +0800 Subject: [PATCH 933/989] USB: serial: option: add support for Fibocom FM650/FG650 Fibocom FM650/FG650 are 5G modems with ECM/NCM/RNDIS/MBIM modes. This patch adds support to all 4 modes. In all 4 modes, the first serial port is the AT console while the other 3 appear to be diagnostic interfaces for dumping modem logs. usb-devices output for all modes: ECM: T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 5 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=2cb7 ProdID=0a04 Rev=04.04 S: Manufacturer=Fibocom Wireless Inc. S: Product=FG650 Module S: SerialNumber=0123456789ABCDEF C: #Ifs= 5 Cfg#= 1 Atr=c0 MxPwr=504mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=06 Prot=00 Driver=cdc_ether E: Ad=82(I) Atr=03(Int.) MxPS= 16 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms NCM: T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 6 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=2cb7 ProdID=0a05 Rev=04.04 S: Manufacturer=Fibocom Wireless Inc. S: Product=FG650 Module S: SerialNumber=0123456789ABCDEF C: #Ifs= 6 Cfg#= 1 Atr=c0 MxPwr=504mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0d Prot=00 Driver=cdc_ncm E: Ad=82(I) Atr=03(Int.) MxPS= 16 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=01 Driver=cdc_ncm E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms RNDIS: T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 4 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=2cb7 ProdID=0a06 Rev=04.04 S: Manufacturer=Fibocom Wireless Inc. S: Product=FG650 Module S: SerialNumber=0123456789ABCDEF C: #Ifs= 6 Cfg#= 1 Atr=c0 MxPwr=504mA I: If#= 0 Alt= 0 #EPs= 1 Cls=e0(wlcon) Sub=01 Prot=03 Driver=rndis_host E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms MBIM: T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 7 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=2cb7 ProdID=0a07 Rev=04.04 S: Manufacturer=Fibocom Wireless Inc. S: Product=FG650 Module S: SerialNumber=0123456789ABCDEF C: #Ifs= 6 Cfg#= 1 Atr=c0 MxPwr=504mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms Signed-off-by: Chuanhong Guo Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 16cf0ddaae9b7..ad0fff1e889d0 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2295,6 +2295,10 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a3, 0xff) }, /* Fibocom FM101-GL (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a4, 0xff), /* Fibocom FM101-GL (laptop MBIM) */ .driver_info = RSVD(4) }, + { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a04, 0xff) }, /* Fibocom FM650-CN (ECM mode) */ + { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a05, 0xff) }, /* Fibocom FM650-CN (NCM mode) */ + { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a06, 0xff) }, /* Fibocom FM650-CN (RNDIS mode) */ + { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a07, 0xff) }, /* Fibocom FM650-CN (MBIM mode) */ { USB_DEVICE_INTERFACE_CLASS(0x2df3, 0x9d03, 0xff) }, /* LongSung M5710 */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1404, 0xff) }, /* GosunCn GM500 RNDIS */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1405, 0xff) }, /* GosunCn GM500 MBIM */ -- GitLab From d59cf049c8378677053703e724808836f180888e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Sat, 13 Apr 2024 16:01:39 +0300 Subject: [PATCH 934/989] net: dsa: mt7530: fix mirroring frames received on local port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This switch intellectual property provides a bit on the ARL global control register which controls allowing mirroring frames which are received on the local port (monitor port). This bit is unset after reset. This ability must be enabled to fully support the port mirroring feature on this switch intellectual property. Therefore, this patch fixes the traffic not being reflected on a port, which would be configured like below: tc qdisc add dev swp0 clsact tc filter add dev swp0 ingress matchall skip_sw \ action mirred egress mirror dev swp0 As a side note, this configuration provides the hairpinning feature for a single port. Fixes: 37feab6076aa ("net: dsa: mt7530: add support for port mirroring") Signed-off-by: Arınç ÜNAL Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 6 ++++++ drivers/net/dsa/mt7530.h | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index c0d0bce0b5942..b84e1845fa028 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2480,6 +2480,9 @@ mt7530_setup(struct dsa_switch *ds) PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT)); } + /* Allow mirroring frames received on the local port (monitor port). */ + mt7530_set(priv, MT753X_AGC, LOCAL_EN); + /* Setup VLAN ID 0 for VLAN-unaware bridges */ ret = mt7530_setup_vlan0(priv); if (ret) @@ -2591,6 +2594,9 @@ mt7531_setup_common(struct dsa_switch *ds) PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT)); } + /* Allow mirroring frames received on the local port (monitor port). */ + mt7530_set(priv, MT753X_AGC, LOCAL_EN); + /* Flush the FDB table */ ret = mt7530_fdb_cmd(priv, MT7530_FDB_FLUSH, NULL); if (ret < 0) diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index 585db03c05487..a08053390b285 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -32,6 +32,10 @@ enum mt753x_id { #define SYSC_REG_RSTCTRL 0x34 #define RESET_MCM BIT(2) +/* Register for ARL global control */ +#define MT753X_AGC 0xc +#define LOCAL_EN BIT(7) + /* Registers to mac forward control for unknown frames */ #define MT7530_MFC 0x10 #define BC_FFP(x) (((x) & 0xff) << 24) -- GitLab From 2c606d138518cc69f09c35929abc414a99e3a28f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Sat, 13 Apr 2024 16:01:40 +0300 Subject: [PATCH 935/989] net: dsa: mt7530: fix port mirroring for MT7988 SoC switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "MT7988A Wi-Fi 7 Generation Router Platform: Datasheet (Open Version) v0.1" document shows bits 16 to 18 as the MIRROR_PORT field of the CPU forward control register. Currently, the MT7530 DSA subdriver configures bits 0 to 2 of the CPU forward control register which breaks the port mirroring feature for the MT7988 SoC switch. Fix this by using the MT7531_MIRROR_PORT_GET() and MT7531_MIRROR_PORT_SET() macros which utilise the correct bits. Fixes: 110c18bfed41 ("net: dsa: mt7530: introduce driver for MT7988 built-in switch") Signed-off-by: Arınç ÜNAL Acked-by: Daniel Golle Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index b84e1845fa028..8090390edaf9d 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1883,14 +1883,16 @@ mt7530_port_vlan_del(struct dsa_switch *ds, int port, static int mt753x_mirror_port_get(unsigned int id, u32 val) { - return (id == ID_MT7531) ? MT7531_MIRROR_PORT_GET(val) : - MIRROR_PORT(val); + return (id == ID_MT7531 || id == ID_MT7988) ? + MT7531_MIRROR_PORT_GET(val) : + MIRROR_PORT(val); } static int mt753x_mirror_port_set(unsigned int id, u32 val) { - return (id == ID_MT7531) ? MT7531_MIRROR_PORT_SET(val) : - MIRROR_PORT(val); + return (id == ID_MT7531 || id == ID_MT7988) ? + MT7531_MIRROR_PORT_SET(val) : + MIRROR_PORT(val); } static int mt753x_port_mirror_add(struct dsa_switch *ds, int port, -- GitLab From 13c785323b36b845300b256d0e5963c3727667d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 17 Apr 2024 11:03:27 +0200 Subject: [PATCH 936/989] serial: stm32: Return IRQ_NONE in the ISR if no handling happend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If there is a stuck irq that the handler doesn't address, returning IRQ_HANDLED unconditionally makes it impossible for the irq core to detect the problem and disable the irq. So only return IRQ_HANDLED if an event was handled. A stuck irq is still problematic, but with this change at least it only makes the UART nonfunctional instead of occupying the (usually only) CPU by 100% and so stall the whole machine. Fixes: 48a6092fb41f ("serial: stm32-usart: Add STM32 USART Driver") Cc: stable@vger.kernel.org Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/5f92603d0dfd8a5b8014b2b10a902d91e0bb881f.1713344161.git.u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/stm32-usart.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index 58d169e5c1db0..d60cbac691944 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -861,6 +861,7 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) const struct stm32_usart_offsets *ofs = &stm32_port->info->ofs; u32 sr; unsigned int size; + irqreturn_t ret = IRQ_NONE; sr = readl_relaxed(port->membase + ofs->isr); @@ -869,11 +870,14 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) (sr & USART_SR_TC)) { stm32_usart_tc_interrupt_disable(port); stm32_usart_rs485_rts_disable(port); + ret = IRQ_HANDLED; } - if ((sr & USART_SR_RTOF) && ofs->icr != UNDEF_REG) + if ((sr & USART_SR_RTOF) && ofs->icr != UNDEF_REG) { writel_relaxed(USART_ICR_RTOCF, port->membase + ofs->icr); + ret = IRQ_HANDLED; + } if ((sr & USART_SR_WUF) && ofs->icr != UNDEF_REG) { /* Clear wake up flag and disable wake up interrupt */ @@ -882,6 +886,7 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) stm32_usart_clr_bits(port, ofs->cr3, USART_CR3_WUFIE); if (irqd_is_wakeup_set(irq_get_irq_data(port->irq))) pm_wakeup_event(tport->tty->dev, 0); + ret = IRQ_HANDLED; } /* @@ -896,6 +901,7 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) uart_unlock_and_check_sysrq(port); if (size) tty_flip_buffer_push(tport); + ret = IRQ_HANDLED; } } @@ -903,6 +909,7 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) uart_port_lock(port); stm32_usart_transmit_chars(port); uart_port_unlock(port); + ret = IRQ_HANDLED; } /* Receiver timeout irq for DMA RX */ @@ -912,9 +919,10 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) uart_unlock_and_check_sysrq(port); if (size) tty_flip_buffer_push(tport); + ret = IRQ_HANDLED; } - return IRQ_HANDLED; + return ret; } static void stm32_usart_set_mctrl(struct uart_port *port, unsigned int mctrl) -- GitLab From ea2624b5b829b8f93c0dce25721d835969b34faf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 17 Apr 2024 11:03:28 +0200 Subject: [PATCH 937/989] serial: stm32: Reset .throttled state in .startup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an UART is opened that still has .throttled set from a previous open, the RX interrupt is enabled but the irq handler doesn't consider it. This easily results in a stuck irq with the effect to occupy the CPU in a tight loop. So reset the throttle state in .startup() to ensure that RX irqs are handled. Fixes: d1ec8a2eabe9 ("serial: stm32: update throttle and unthrottle ops for dma mode") Cc: stable@vger.kernel.org Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/a784f80d3414f7db723b2ec66efc56e1ad666cbf.1713344161.git.u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/stm32-usart.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index d60cbac691944..4fa5a03ebac08 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -1092,6 +1092,7 @@ static int stm32_usart_startup(struct uart_port *port) val |= USART_CR2_SWAP; writel_relaxed(val, port->membase + ofs->cr2); } + stm32_port->throttled = false; /* RX FIFO Flush */ if (ofs->rqr != UNDEF_REG) -- GitLab From e871abcda3b67d0820b4182ebe93435624e9c6a4 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 17 Apr 2024 13:38:29 +0200 Subject: [PATCH 938/989] random: handle creditable entropy from atomic process context The entropy accounting changes a static key when the RNG has initialized, since it only ever initializes once. Static key changes, however, cannot be made from atomic context, so depending on where the last creditable entropy comes from, the static key change might need to be deferred to a worker. Previously the code used the execute_in_process_context() helper function, which accounts for whether or not the caller is in_interrupt(). However, that doesn't account for the case where the caller is actually in process context but is holding a spinlock. This turned out to be the case with input_handle_event() in drivers/input/input.c contributing entropy: [] die+0xa8/0x2fc [] bug_handler+0x44/0xec [] brk_handler+0x90/0x144 [] do_debug_exception+0xa0/0x148 [] el1_dbg+0x60/0x7c [] el1h_64_sync_handler+0x38/0x90 [] el1h_64_sync+0x64/0x6c [] __might_resched+0x1fc/0x2e8 [] __might_sleep+0x44/0x7c [] cpus_read_lock+0x1c/0xec [] static_key_enable+0x14/0x38 [] crng_set_ready+0x14/0x28 [] execute_in_process_context+0xb8/0xf8 [] _credit_init_bits+0x118/0x1dc [] add_timer_randomness+0x264/0x270 [] add_input_randomness+0x38/0x48 [] input_handle_event+0x2b8/0x490 [] input_event+0x6c/0x98 According to Guoyong, it's not really possible to refactor the various drivers to never hold a spinlock there. And in_atomic() isn't reliable. So, rather than trying to be too fancy, just punt the change in the static key to a workqueue always. There's basically no drawback of doing this, as the code already needed to account for the static key not changing immediately, and given that it's just an optimization, there's not exactly a hurry to change the static key right away, so deferal is fine. Reported-by: Guoyong Wang Cc: stable@vger.kernel.org Fixes: f5bda35fba61 ("random: use static branch for crng_ready()") Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 456be28ba67cb..2597cb43f4387 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -702,7 +702,7 @@ static void extract_entropy(void *buf, size_t len) static void __cold _credit_init_bits(size_t bits) { - static struct execute_work set_ready; + static DECLARE_WORK(set_ready, crng_set_ready); unsigned int new, orig, add; unsigned long flags; @@ -718,8 +718,8 @@ static void __cold _credit_init_bits(size_t bits) if (orig < POOL_READY_BITS && new >= POOL_READY_BITS) { crng_reseed(NULL); /* Sets crng_init to CRNG_READY under base_crng.lock. */ - if (static_key_initialized) - execute_in_process_context(crng_set_ready, &set_ready); + if (static_key_initialized && system_unbound_wq) + queue_work(system_unbound_wq, &set_ready); atomic_notifier_call_chain(&random_ready_notifier, 0, NULL); wake_up_interruptible(&crng_init_wait); kill_fasync(&fasync, SIGIO, POLL_IN); @@ -890,8 +890,8 @@ void __init random_init(void) /* * If we were initialized by the cpu or bootloader before jump labels - * are initialized, then we should enable the static branch here, where - * it's guaranteed that jump labels have been initialized. + * or workqueues are initialized, then we should enable the static + * branch here, where it's guaranteed that these have been initialized. */ if (!static_branch_likely(&crng_is_ready) && crng_init >= CRNG_READY) crng_set_ready(NULL); -- GitLab From 83781384a96b95e2b6403d3c8a002b2c89031770 Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Mon, 15 Apr 2024 15:15:07 +0200 Subject: [PATCH 939/989] s390/ism: Properly fix receive message buffer allocation Since [1], dma_alloc_coherent() does not accept requests for GFP_COMP anymore, even on archs that may be able to fulfill this. Functionality that relied on the receive buffer being a compound page broke at that point: The SMC-D protocol, that utilizes the ism device driver, passes receive buffers to the splice processor in a struct splice_pipe_desc with a single entry list of struct pages. As the buffer is no longer a compound page, the splice processor now rejects requests to handle more than a page worth of data. Replace dma_alloc_coherent() and allocate a buffer with folio_alloc and create a DMA map for it with dma_map_page(). Since only receive buffers on ISM devices use DMA, qualify the mapping as FROM_DEVICE. Since ISM devices are available on arch s390, only, and on that arch all DMA is coherent, there is no need to introduce and export some kind of dma_sync_to_cpu() method to be called by the SMC-D protocol layer. Analogously, replace dma_free_coherent by a two step dma_unmap_page, then folio_put to free the receive buffer. [1] https://lore.kernel.org/all/20221113163535.884299-1-hch@lst.de/ Fixes: c08004eede4b ("s390/ism: don't pass bogus GFP_ flags to dma_alloc_coherent") Signed-off-by: Gerd Bayer Signed-off-by: David S. Miller --- drivers/s390/net/ism_drv.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 2c8e964425dc3..43778b088ffac 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -292,13 +292,16 @@ out: static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); - dma_free_coherent(&ism->pdev->dev, dmb->dmb_len, - dmb->cpu_addr, dmb->dma_addr); + dma_unmap_page(&ism->pdev->dev, dmb->dma_addr, dmb->dmb_len, + DMA_FROM_DEVICE); + folio_put(virt_to_folio(dmb->cpu_addr)); } static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { + struct folio *folio; unsigned long bit; + int rc; if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev)) return -EINVAL; @@ -315,14 +318,30 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) test_and_set_bit(dmb->sba_idx, ism->sba_bitmap)) return -EINVAL; - dmb->cpu_addr = dma_alloc_coherent(&ism->pdev->dev, dmb->dmb_len, - &dmb->dma_addr, - GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_NORETRY); - if (!dmb->cpu_addr) - clear_bit(dmb->sba_idx, ism->sba_bitmap); + folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | + __GFP_NORETRY, get_order(dmb->dmb_len)); - return dmb->cpu_addr ? 0 : -ENOMEM; + if (!folio) { + rc = -ENOMEM; + goto out_bit; + } + + dmb->cpu_addr = folio_address(folio); + dmb->dma_addr = dma_map_page(&ism->pdev->dev, + virt_to_page(dmb->cpu_addr), 0, + dmb->dmb_len, DMA_FROM_DEVICE); + if (dma_mapping_error(&ism->pdev->dev, dmb->dma_addr)) { + rc = -ENOMEM; + goto out_free; + } + + return 0; + +out_free: + kfree(dmb->cpu_addr); +out_bit: + clear_bit(dmb->sba_idx, ism->sba_bitmap); + return rc; } int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, -- GitLab From 652ead9b746a63e4e79d7ad66d3edf0a8a5b0c2f Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Thu, 4 Apr 2024 11:03:02 +0200 Subject: [PATCH 940/989] drm/xe: Fix bo leak in intel_fb_bo_framebuffer_init Add a unreference bo in the error path, to prevent leaking a bo ref. Return 0 on success to clarify the success path. Signed-off-by: Maarten Lankhorst Fixes: 44e694958b95 ("drm/xe/display: Implement display support") Cc: # v6.8+ Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240404090302.68422-1-maarten.lankhorst@linux.intel.com (cherry picked from commit a2f3d731be3893e730417ae3190760fcaffdf549) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/display/intel_fb_bo.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/display/intel_fb_bo.c b/drivers/gpu/drm/xe/display/intel_fb_bo.c index b21da7b745a5e..a9c1f9885c6bb 100644 --- a/drivers/gpu/drm/xe/display/intel_fb_bo.c +++ b/drivers/gpu/drm/xe/display/intel_fb_bo.c @@ -31,7 +31,7 @@ int intel_fb_bo_framebuffer_init(struct intel_framebuffer *intel_fb, ret = ttm_bo_reserve(&bo->ttm, true, false, NULL); if (ret) - return ret; + goto err; if (!(bo->flags & XE_BO_SCANOUT_BIT)) { /* @@ -42,12 +42,16 @@ int intel_fb_bo_framebuffer_init(struct intel_framebuffer *intel_fb, */ if (XE_IOCTL_DBG(i915, !list_empty(&bo->ttm.base.gpuva.list))) { ttm_bo_unreserve(&bo->ttm); - return -EINVAL; + ret = -EINVAL; + goto err; } bo->flags |= XE_BO_SCANOUT_BIT; } ttm_bo_unreserve(&bo->ttm); + return 0; +err: + xe_bo_put(bo); return ret; } -- GitLab From ca7c52ac7ad384bcf299d89482c45fec7cd00da9 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 12 Apr 2024 12:31:45 +0100 Subject: [PATCH 941/989] drm/xe/vm: prevent UAF with asid based lookup The asid is only erased from the xarray when the vm refcount reaches zero, however this leads to potential UAF since the xe_vm_get() only works on a vm with refcount != 0. Since the asid is allocated in the vm create ioctl, rather erase it when closing the vm, prior to dropping the potential last ref. This should also work when user closes driver fd without explicit vm destroy. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1594 Signed-off-by: Matthew Auld Cc: Matthew Brost Cc: # v6.8+ Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240412113144.259426-4-matthew.auld@intel.com (cherry picked from commit 83967c57320d0d01ae512f10e79213f81e4bf594) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_vm.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 62d1ef8867a84..3d4c8f342e215 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -1577,6 +1577,16 @@ void xe_vm_close_and_put(struct xe_vm *vm) xe->usm.num_vm_in_fault_mode--; else if (!(vm->flags & XE_VM_FLAG_MIGRATION)) xe->usm.num_vm_in_non_fault_mode--; + + if (vm->usm.asid) { + void *lookup; + + xe_assert(xe, xe->info.has_asid); + xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION)); + + lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid); + xe_assert(xe, lookup == vm); + } mutex_unlock(&xe->usm.lock); for_each_tile(tile, xe, id) @@ -1592,24 +1602,15 @@ static void vm_destroy_work_func(struct work_struct *w) struct xe_device *xe = vm->xe; struct xe_tile *tile; u8 id; - void *lookup; /* xe_vm_close_and_put was not called? */ xe_assert(xe, !vm->size); mutex_destroy(&vm->snap_mutex); - if (!(vm->flags & XE_VM_FLAG_MIGRATION)) { + if (!(vm->flags & XE_VM_FLAG_MIGRATION)) xe_device_mem_access_put(xe); - if (xe->info.has_asid && vm->usm.asid) { - mutex_lock(&xe->usm.lock); - lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid); - xe_assert(xe, lookup == vm); - mutex_unlock(&xe->usm.lock); - } - } - for_each_tile(tile, xe, id) XE_WARN_ON(vm->pt_root[id]); -- GitLab From f609e7b1b49e4d15cf107d2069673ee63860c398 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 10 Apr 2024 09:10:46 -0500 Subject: [PATCH 942/989] platform/x86/amd/pmc: Extend Framework 13 quirk to more BIOSes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BIOS 03.05 still hasn't fixed the spurious IRQ1 issue. As it's still being worked on there is still a possibility that it won't need to apply to future BIOS releases. Add a quirk for BIOS 03.05 as well. Signed-off-by: Mario Limonciello Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240410141046.433-1-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc-quirks.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c index b456370166b6b..b4f49720c87f6 100644 --- a/drivers/platform/x86/amd/pmc/pmc-quirks.c +++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c @@ -208,6 +208,15 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_BIOS_VERSION, "03.03"), } }, + { + .ident = "Framework Laptop 13 (Phoenix)", + .driver_data = &quirk_spurious_8042, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Framework"), + DMI_MATCH(DMI_PRODUCT_NAME, "Laptop 13 (AMD Ryzen 7040Series)"), + DMI_MATCH(DMI_BIOS_VERSION, "03.05"), + } + }, {} }; -- GitLab From ca7c4507ba87e9fc22e0ecfa819c3664b3e8287b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 15 Mar 2024 13:07:53 +0100 Subject: [PATCH 943/989] drm/amdgpu: remove invalid resource->start check v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The majority of those where removed in the commit aed01a68047b ("drm/amdgpu: Remove TTM resource->start visible VRAM condition v2") But this one was missed because it's working on the resource and not the BO. Since we also no longer use a fake start address for visible BOs this will now trigger invalid mapping errors. v2: also remove the unused variable Signed-off-by: Christian König Fixes: aed01a68047b ("drm/amdgpu: Remove TTM resource->start visible VRAM condition v2") CC: stable@vger.kernel.org Acked-by: Pierre-Eric Pelloux-Prayer Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index fc418e670fdae..6417cb76ccd44 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -557,7 +557,6 @@ static int amdgpu_ttm_io_mem_reserve(struct ttm_device *bdev, struct ttm_resource *mem) { struct amdgpu_device *adev = amdgpu_ttm_adev(bdev); - size_t bus_size = (size_t)mem->size; switch (mem->mem_type) { case TTM_PL_SYSTEM: @@ -568,9 +567,6 @@ static int amdgpu_ttm_io_mem_reserve(struct ttm_device *bdev, break; case TTM_PL_VRAM: mem->bus.offset = mem->start << PAGE_SHIFT; - /* check if it's visible */ - if ((mem->bus.offset + bus_size) > adev->gmc.visible_vram_size) - return -EINVAL; if (adev->mman.aper_base_kaddr && mem->placement & TTM_PL_FLAG_CONTIGUOUS) -- GitLab From 18921b205012568b45760753ad3146ddb9e2d4e2 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 10 Apr 2024 15:52:10 -0400 Subject: [PATCH 944/989] drm/amdkfd: Fix memory leak in create_process failure Fix memory leak due to a leaked mmget reference on an error handling code path that is triggered when attempting to create KFD processes while a GPU reset is in progress. Fixes: 0ab2d7532b05 ("drm/amdkfd: prepare per-process debug enable and disable") CC: Xiaogang Chen Signed-off-by: Felix Kuehling Tested-by: Harish Kasiviswanthan Reviewed-by: Mukul Joshi Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 717a60d7a4ea9..b79986412cd83 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -819,9 +819,9 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) mutex_lock(&kfd_processes_mutex); if (kfd_is_locked()) { - mutex_unlock(&kfd_processes_mutex); pr_debug("KFD is locked! Cannot create process"); - return ERR_PTR(-EINVAL); + process = ERR_PTR(-EINVAL); + goto out; } /* A prior open of /dev/kfd could have already created the process. */ -- GitLab From 91f10a3d21f2313485178d49efef8a3ba02bd8c7 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 29 Mar 2024 18:03:03 -0400 Subject: [PATCH 945/989] Revert "drm/amd/display: fix USB-C flag update after enc10 feature init" This reverts commit b5abd7f983e14054593dc91d6df2aa5f8cc67652. This change breaks DSC on 4k monitors at 144Hz over USB-C. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3254 Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher Cc: Muhammad Ahmed Cc: Tom Chung Cc: Charlene Liu Cc: Hamza Mahfooz Cc: Harry Wentland Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c | 8 +++----- .../gpu/drm/amd/display/dc/dcn35/dcn35_dio_link_encoder.c | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c index e224a028d68ac..8a0460e863097 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c @@ -248,14 +248,12 @@ void dcn32_link_encoder_construct( enc10->base.hpd_source = init_data->hpd_source; enc10->base.connector = init_data->connector; - enc10->base.preferred_engine = ENGINE_ID_UNKNOWN; - - enc10->base.features = *enc_features; if (enc10->base.connector.id == CONNECTOR_ID_USBC) enc10->base.features.flags.bits.DP_IS_USB_C = 1; - if (enc10->base.connector.id == CONNECTOR_ID_USBC) - enc10->base.features.flags.bits.DP_IS_USB_C = 1; + enc10->base.preferred_engine = ENGINE_ID_UNKNOWN; + + enc10->base.features = *enc_features; enc10->base.transmitter = init_data->transmitter; diff --git a/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_link_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_link_encoder.c index 81e349d5835bb..da94e5309fbaf 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_link_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_link_encoder.c @@ -184,6 +184,8 @@ void dcn35_link_encoder_construct( enc10->base.hpd_source = init_data->hpd_source; enc10->base.connector = init_data->connector; + if (enc10->base.connector.id == CONNECTOR_ID_USBC) + enc10->base.features.flags.bits.DP_IS_USB_C = 1; enc10->base.preferred_engine = ENGINE_ID_UNKNOWN; @@ -238,8 +240,6 @@ void dcn35_link_encoder_construct( } enc10->base.features.flags.bits.HDMI_6GB_EN = 1; - if (enc10->base.connector.id == CONNECTOR_ID_USBC) - enc10->base.features.flags.bits.DP_IS_USB_C = 1; if (bp_funcs->get_connector_speed_cap_info) result = bp_funcs->get_connector_speed_cap_info(enc10->base.ctx->dc_bios, -- GitLab From d111855ab7ffffc552f6a475259dc392f2319b6d Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 15 Apr 2024 07:52:13 +0200 Subject: [PATCH 946/989] s390/mm: Fix NULL pointer dereference The recently added check to figure out if a fault happened on gmap ASCE dereferences the gmap pointer in lowcore without checking that it is not NULL. For all non-KVM processes the pointer is NULL, so that some value from lowcore will be read. With the current layouts of struct gmap and struct lowcore the read value (aka ASCE) is zero, so that this doesn't lead to any observable bug; at least currently. Fix this by adding the missing NULL pointer check. Fixes: 64c3431808bd ("s390/entry: compare gmap asce to determine guest/host fault") Signed-off-by: Sven Schnelle Reviewed-by: Claudio Imbrenda Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/kernel/entry.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 3dc85638bc63b..6a1e0fbbaa15b 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -340,7 +340,8 @@ SYM_CODE_START(pgm_check_handler) mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK stctg %c1,%c1,__PT_CR1(%r11) #if IS_ENABLED(CONFIG_KVM) - lg %r12,__LC_GMAP + ltg %r12,__LC_GMAP + jz 5f clc __GMAP_ASCE(8,%r12), __PT_CR1(%r11) jne 5f BPENTER __SF_SIE_FLAGS(%r10),_TIF_ISOLATE_BP_GUEST -- GitLab From efefd4f00c967d00ad7abe092554ffbb70c1a793 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Apr 2024 17:43:01 +0200 Subject: [PATCH 947/989] netfilter: nf_tables: missing iterator type in lookup walk Add missing decorator type to lookup expression and tighten WARN_ON_ONCE check in pipapo to spot earlier that this is unset. Fixes: 29b359cf6d95 ("netfilter: nft_set_pipapo: walk over current view on netlink dump") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_lookup.c | 1 + net/netfilter/nft_set_pipapo.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index a0055f510e31e..b314ca728a291 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -216,6 +216,7 @@ static int nft_lookup_validate(const struct nft_ctx *ctx, return 0; iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index eeaf05ffba953..0f903d18bbea0 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2123,7 +2123,8 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_pipapo_field *f; unsigned int i, r; - WARN_ON_ONCE(iter->type == NFT_ITER_UNSPEC); + WARN_ON_ONCE(iter->type != NFT_ITER_READ && + iter->type != NFT_ITER_UPDATE); rcu_read_lock(); if (iter->type == NFT_ITER_READ) -- GitLab From e79b47a8615d42c68aaeb68971593333667382ed Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Apr 2024 17:43:11 +0200 Subject: [PATCH 948/989] netfilter: nf_tables: restore set elements when delete set fails From abort path, nft_mapelem_activate() needs to restore refcounters to the original state. Currently, it uses the set->ops->walk() to iterate over these set elements. The existing set iterator skips inactive elements in the next generation, this does not work from the abort path to restore the original state since it has to skip active elements instead (not inactive ones). This patch moves the check for inactive elements to the set iterator callback, then it reverses the logic for the .activate case which needs to skip active elements. Toggle next generation bit for elements when delete set command is invoked and call nft_clear() from .activate (abort) path to restore the next generation bit. The splat below shows an object in mappings memleak: [43929.457523] ------------[ cut here ]------------ [43929.457532] WARNING: CPU: 0 PID: 1139 at include/net/netfilter/nf_tables.h:1237 nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [...] [43929.458014] RIP: 0010:nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [43929.458076] Code: 83 f8 01 77 ab 49 8d 7c 24 08 e8 37 5e d0 de 49 8b 6c 24 08 48 8d 7d 50 e8 e9 5c d0 de 8b 45 50 8d 50 ff 89 55 50 85 c0 75 86 <0f> 0b eb 82 0f 0b eb b3 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 [43929.458081] RSP: 0018:ffff888140f9f4b0 EFLAGS: 00010246 [43929.458086] RAX: 0000000000000000 RBX: ffff8881434f5288 RCX: dffffc0000000000 [43929.458090] RDX: 00000000ffffffff RSI: ffffffffa26d28a7 RDI: ffff88810ecc9550 [43929.458093] RBP: ffff88810ecc9500 R08: 0000000000000001 R09: ffffed10281f3e8f [43929.458096] R10: 0000000000000003 R11: ffff0000ffff0000 R12: ffff8881434f52a0 [43929.458100] R13: ffff888140f9f5f4 R14: ffff888151c7a800 R15: 0000000000000002 [43929.458103] FS: 00007f0c687c4740(0000) GS:ffff888390800000(0000) knlGS:0000000000000000 [43929.458107] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [43929.458111] CR2: 00007f58dbe5b008 CR3: 0000000123602005 CR4: 00000000001706f0 [43929.458114] Call Trace: [43929.458118] [43929.458121] ? __warn+0x9f/0x1a0 [43929.458127] ? nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [43929.458188] ? report_bug+0x1b1/0x1e0 [43929.458196] ? handle_bug+0x3c/0x70 [43929.458200] ? exc_invalid_op+0x17/0x40 [43929.458211] ? nft_setelem_data_deactivate+0xd7/0xf0 [nf_tables] [43929.458271] ? nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [43929.458332] nft_mapelem_deactivate+0x24/0x30 [nf_tables] [43929.458392] nft_rhash_walk+0xdd/0x180 [nf_tables] [43929.458453] ? __pfx_nft_rhash_walk+0x10/0x10 [nf_tables] [43929.458512] ? rb_insert_color+0x2e/0x280 [43929.458520] nft_map_deactivate+0xdc/0x1e0 [nf_tables] [43929.458582] ? __pfx_nft_map_deactivate+0x10/0x10 [nf_tables] [43929.458642] ? __pfx_nft_mapelem_deactivate+0x10/0x10 [nf_tables] [43929.458701] ? __rcu_read_unlock+0x46/0x70 [43929.458709] nft_delset+0xff/0x110 [nf_tables] [43929.458769] nft_flush_table+0x16f/0x460 [nf_tables] [43929.458830] nf_tables_deltable+0x501/0x580 [nf_tables] Fixes: 628bd3e49cba ("netfilter: nf_tables: drop map element references from preparation phase") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 44 ++++++++++++++++++++++++++++++---- net/netfilter/nft_set_bitmap.c | 4 +--- net/netfilter/nft_set_hash.c | 8 ++----- net/netfilter/nft_set_pipapo.c | 5 +--- net/netfilter/nft_set_rbtree.c | 4 +--- 5 files changed, 45 insertions(+), 20 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a7a34db62ea93..d0c09f899e801 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -594,6 +594,12 @@ static int nft_mapelem_deactivate(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, elem_priv); return 0; @@ -617,6 +623,7 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; + nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, catchall->elem); break; } @@ -3880,6 +3887,9 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_data *data; int err; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; @@ -3903,17 +3913,20 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) { - u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_iter dummy_iter = { + .genmask = nft_genmask_next(ctx->net), + }; struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; int ret = 0; list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask)) + if (!nft_set_elem_active(ext, dummy_iter.genmask)) continue; - ret = nft_setelem_validate(ctx, set, NULL, catchall->elem); + ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem); if (ret < 0) return ret; } @@ -5402,6 +5415,11 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + return nft_setelem_data_validate(ctx, set, elem_priv); } @@ -5494,6 +5512,13 @@ static int nft_mapelem_activate(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + /* called from abort path, reverse check to undo changes. */ + if (nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, elem_priv); return 0; @@ -5511,6 +5536,7 @@ static void nft_map_catchall_activate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; + nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, catchall->elem); break; } @@ -5785,6 +5811,9 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_set_dump_args *args; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) return 0; @@ -6635,7 +6664,7 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set, struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_setelem_is_catchall(set, elem_priv)) { - nft_set_elem_change_active(net, set, ext); + nft_clear(net, ext); } else { set->ops->activate(net, set, elem_priv); } @@ -7317,8 +7346,12 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_trans *trans; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, sizeof(struct nft_trans_elem), GFP_ATOMIC); if (!trans) @@ -10800,6 +10833,9 @@ static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 32df7a16835da..1caa04619dc6d 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -172,7 +172,7 @@ static void nft_bitmap_activate(const struct net *net, nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 11 state. */ priv->bitmap[idx] |= (genmask << off); - nft_set_elem_change_active(net, set, &be->ext); + nft_clear(net, &be->ext); } static void nft_bitmap_flush(const struct net *net, @@ -222,8 +222,6 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, list_for_each_entry_rcu(be, &priv->list, head) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&be->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &be->priv); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6968a3b342367..daa56dda737ae 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -199,7 +199,7 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, { struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } static void nft_rhash_flush(const struct net *net, @@ -286,8 +286,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) @@ -599,7 +597,7 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set, { struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } static void nft_hash_flush(const struct net *net, @@ -652,8 +650,6 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, hlist_for_each_entry_rcu(he, &priv->table[i], node) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 0f903d18bbea0..187138afac45d 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1847,7 +1847,7 @@ static void nft_pipapo_activate(const struct net *net, { struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &e->ext); + nft_clear(net, &e->ext); } /** @@ -2149,9 +2149,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, e = f->mt[r].e; - if (!nft_set_elem_active(&e->ext, iter->genmask)) - goto cont; - iter->err = iter->fn(ctx, set, iter, &e->priv); if (iter->err < 0) goto out; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 9944fe479e536..b7ea21327549b 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -532,7 +532,7 @@ static void nft_rbtree_activate(const struct net *net, { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &rbe->ext); + nft_clear(net, &rbe->ext); } static void nft_rbtree_flush(const struct net *net, @@ -600,8 +600,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&rbe->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &rbe->priv); if (iter->err < 0) { -- GitLab From 6fef2d4c00b5b8561ad68dd2b68173f5c6af1e75 Mon Sep 17 00:00:00 2001 From: xinhui pan Date: Thu, 11 Apr 2024 11:11:38 +0800 Subject: [PATCH 949/989] drm/amdgpu: validate the parameters of bo mapping operations more clearly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verify the parameters of amdgpu_vm_bo_(map/replace_map/clearing_mappings) in one common place. Fixes: dc54d3d1744d ("drm/amdgpu: implement AMDGPU_VA_OP_CLEAR v2") Cc: stable@vger.kernel.org Reported-by: Vlad Stolyarov Suggested-by: Christian König Signed-off-by: xinhui pan Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 72 ++++++++++++++++---------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 4299ce386322e..94089069c9ada 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1613,6 +1613,37 @@ static void amdgpu_vm_bo_insert_map(struct amdgpu_device *adev, trace_amdgpu_vm_bo_map(bo_va, mapping); } +/* Validate operation parameters to prevent potential abuse */ +static int amdgpu_vm_verify_parameters(struct amdgpu_device *adev, + struct amdgpu_bo *bo, + uint64_t saddr, + uint64_t offset, + uint64_t size) +{ + uint64_t tmp, lpfn; + + if (saddr & AMDGPU_GPU_PAGE_MASK + || offset & AMDGPU_GPU_PAGE_MASK + || size & AMDGPU_GPU_PAGE_MASK) + return -EINVAL; + + if (check_add_overflow(saddr, size, &tmp) + || check_add_overflow(offset, size, &tmp) + || size == 0 /* which also leads to end < begin */) + return -EINVAL; + + /* make sure object fit at this offset */ + if (bo && offset + size > amdgpu_bo_size(bo)) + return -EINVAL; + + /* Ensure last pfn not exceed max_pfn */ + lpfn = (saddr + size - 1) >> AMDGPU_GPU_PAGE_SHIFT; + if (lpfn >= adev->vm_manager.max_pfn) + return -EINVAL; + + return 0; +} + /** * amdgpu_vm_bo_map - map bo inside a vm * @@ -1639,21 +1670,14 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev, struct amdgpu_bo *bo = bo_va->base.bo; struct amdgpu_vm *vm = bo_va->base.vm; uint64_t eaddr; + int r; - /* validate the parameters */ - if (saddr & ~PAGE_MASK || offset & ~PAGE_MASK || size & ~PAGE_MASK) - return -EINVAL; - if (saddr + size <= saddr || offset + size <= offset) - return -EINVAL; - - /* make sure object fit at this offset */ - eaddr = saddr + size - 1; - if ((bo && offset + size > amdgpu_bo_size(bo)) || - (eaddr >= adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT)) - return -EINVAL; + r = amdgpu_vm_verify_parameters(adev, bo, saddr, offset, size); + if (r) + return r; saddr /= AMDGPU_GPU_PAGE_SIZE; - eaddr /= AMDGPU_GPU_PAGE_SIZE; + eaddr = saddr + (size - 1) / AMDGPU_GPU_PAGE_SIZE; tmp = amdgpu_vm_it_iter_first(&vm->va, saddr, eaddr); if (tmp) { @@ -1706,17 +1730,9 @@ int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev, uint64_t eaddr; int r; - /* validate the parameters */ - if (saddr & ~PAGE_MASK || offset & ~PAGE_MASK || size & ~PAGE_MASK) - return -EINVAL; - if (saddr + size <= saddr || offset + size <= offset) - return -EINVAL; - - /* make sure object fit at this offset */ - eaddr = saddr + size - 1; - if ((bo && offset + size > amdgpu_bo_size(bo)) || - (eaddr >= adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT)) - return -EINVAL; + r = amdgpu_vm_verify_parameters(adev, bo, saddr, offset, size); + if (r) + return r; /* Allocate all the needed memory */ mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); @@ -1730,7 +1746,7 @@ int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev, } saddr /= AMDGPU_GPU_PAGE_SIZE; - eaddr /= AMDGPU_GPU_PAGE_SIZE; + eaddr = saddr + (size - 1) / AMDGPU_GPU_PAGE_SIZE; mapping->start = saddr; mapping->last = eaddr; @@ -1817,10 +1833,14 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev, struct amdgpu_bo_va_mapping *before, *after, *tmp, *next; LIST_HEAD(removed); uint64_t eaddr; + int r; + + r = amdgpu_vm_verify_parameters(adev, NULL, saddr, 0, size); + if (r) + return r; - eaddr = saddr + size - 1; saddr /= AMDGPU_GPU_PAGE_SIZE; - eaddr /= AMDGPU_GPU_PAGE_SIZE; + eaddr = saddr + (size - 1) / AMDGPU_GPU_PAGE_SIZE; /* Allocate all the needed memory */ before = kzalloc(sizeof(*before), GFP_KERNEL); -- GitLab From a6ff969fe9cbf369e3cd0ac54261fec1122682ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 4 Apr 2024 16:25:40 +0200 Subject: [PATCH 950/989] drm/amdgpu: fix visible VRAM handling during faults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we removed the hacky start code check we actually didn't took into account that *all* VRAM pages needs to be CPU accessible. Clean up the code and unify the handling into a single helper which checks if the whole resource is CPU accessible. The only place where a partial check would make sense is during eviction, but that is neglitible. Signed-off-by: Christian König Fixes: aed01a68047b ("drm/amdgpu: Remove TTM resource->start visible VRAM condition v2") Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher CC: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 22 ++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 22 -------- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 61 ++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 3 ++ 5 files changed, 53 insertions(+), 57 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 0a4b09709cfb1..ec888fc6ead8d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -819,7 +819,7 @@ retry: p->bytes_moved += ctx.bytes_moved; if (!amdgpu_gmc_vram_full_visible(&adev->gmc) && - amdgpu_bo_in_cpu_visible_vram(bo)) + amdgpu_res_cpu_visible(adev, bo->tbo.resource)) p->bytes_moved_vis += ctx.bytes_moved; if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 010b0cb7693c9..2099159a693fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -617,8 +617,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, return r; if (!amdgpu_gmc_vram_full_visible(&adev->gmc) && - bo->tbo.resource->mem_type == TTM_PL_VRAM && - amdgpu_bo_in_cpu_visible_vram(bo)) + amdgpu_res_cpu_visible(adev, bo->tbo.resource)) amdgpu_cs_report_moved_bytes(adev, ctx.bytes_moved, ctx.bytes_moved); else @@ -1272,23 +1271,25 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict) void amdgpu_bo_get_memory(struct amdgpu_bo *bo, struct amdgpu_mem_stats *stats) { + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); + struct ttm_resource *res = bo->tbo.resource; uint64_t size = amdgpu_bo_size(bo); struct drm_gem_object *obj; unsigned int domain; bool shared; /* Abort if the BO doesn't currently have a backing store */ - if (!bo->tbo.resource) + if (!res) return; obj = &bo->tbo.base; shared = drm_gem_object_is_shared_for_memory_stats(obj); - domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); + domain = amdgpu_mem_type_to_domain(res->mem_type); switch (domain) { case AMDGPU_GEM_DOMAIN_VRAM: stats->vram += size; - if (amdgpu_bo_in_cpu_visible_vram(bo)) + if (amdgpu_res_cpu_visible(adev, bo->tbo.resource)) stats->visible_vram += size; if (shared) stats->vram_shared += size; @@ -1389,10 +1390,7 @@ vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo) /* Remember that this BO was accessed by the CPU */ abo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; - if (bo->resource->mem_type != TTM_PL_VRAM) - return 0; - - if (amdgpu_bo_in_cpu_visible_vram(abo)) + if (amdgpu_res_cpu_visible(adev, bo->resource)) return 0; /* Can't move a pinned BO to visible VRAM */ @@ -1415,7 +1413,7 @@ vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo) /* this should never happen */ if (bo->resource->mem_type == TTM_PL_VRAM && - !amdgpu_bo_in_cpu_visible_vram(abo)) + !amdgpu_res_cpu_visible(adev, bo->resource)) return VM_FAULT_SIGBUS; ttm_bo_move_to_lru_tail_unlocked(bo); @@ -1579,6 +1577,7 @@ uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev, */ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m) { + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); struct dma_buf_attachment *attachment; struct dma_buf *dma_buf; const char *placement; @@ -1587,10 +1586,11 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m) if (dma_resv_trylock(bo->tbo.base.resv)) { unsigned int domain; + domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); switch (domain) { case AMDGPU_GEM_DOMAIN_VRAM: - if (amdgpu_bo_in_cpu_visible_vram(bo)) + if (amdgpu_res_cpu_visible(adev, bo->tbo.resource)) placement = "VRAM VISIBLE"; else placement = "VRAM"; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index be679c42b0b8c..fa03d9e4874cc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -250,28 +250,6 @@ static inline u64 amdgpu_bo_mmap_offset(struct amdgpu_bo *bo) return drm_vma_node_offset_addr(&bo->tbo.base.vma_node); } -/** - * amdgpu_bo_in_cpu_visible_vram - check if BO is (partly) in visible VRAM - */ -static inline bool amdgpu_bo_in_cpu_visible_vram(struct amdgpu_bo *bo) -{ - struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); - struct amdgpu_res_cursor cursor; - - if (!bo->tbo.resource || bo->tbo.resource->mem_type != TTM_PL_VRAM) - return false; - - amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor); - while (cursor.remaining) { - if (cursor.start < adev->gmc.visible_vram_size) - return true; - - amdgpu_res_next(&cursor, cursor.size); - } - - return false; -} - /** * amdgpu_bo_explicit_sync - return whether the bo is explicitly synced */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 6417cb76ccd44..1d71729e3f6bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -133,7 +133,7 @@ static void amdgpu_evict_flags(struct ttm_buffer_object *bo, } else if (!amdgpu_gmc_vram_full_visible(&adev->gmc) && !(abo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) && - amdgpu_bo_in_cpu_visible_vram(abo)) { + amdgpu_res_cpu_visible(adev, bo->resource)) { /* Try evicting to the CPU inaccessible part of VRAM * first, but only set GTT as busy placement, so this @@ -403,40 +403,55 @@ error: return r; } -/* - * amdgpu_mem_visible - Check that memory can be accessed by ttm_bo_move_memcpy +/** + * amdgpu_res_cpu_visible - Check that resource can be accessed by CPU + * @adev: amdgpu device + * @res: the resource to check * - * Called by amdgpu_bo_move() + * Returns: true if the full resource is CPU visible, false otherwise. */ -static bool amdgpu_mem_visible(struct amdgpu_device *adev, - struct ttm_resource *mem) +bool amdgpu_res_cpu_visible(struct amdgpu_device *adev, + struct ttm_resource *res) { - u64 mem_size = (u64)mem->size; struct amdgpu_res_cursor cursor; - u64 end; - if (mem->mem_type == TTM_PL_SYSTEM || - mem->mem_type == TTM_PL_TT) + if (!res) + return false; + + if (res->mem_type == TTM_PL_SYSTEM || res->mem_type == TTM_PL_TT || + res->mem_type == AMDGPU_PL_PREEMPT) return true; - if (mem->mem_type != TTM_PL_VRAM) + + if (res->mem_type != TTM_PL_VRAM) return false; - amdgpu_res_first(mem, 0, mem_size, &cursor); - end = cursor.start + cursor.size; + amdgpu_res_first(res, 0, res->size, &cursor); while (cursor.remaining) { + if ((cursor.start + cursor.size) >= adev->gmc.visible_vram_size) + return false; amdgpu_res_next(&cursor, cursor.size); + } - if (!cursor.remaining) - break; + return true; +} - /* ttm_resource_ioremap only supports contiguous memory */ - if (end != cursor.start) - return false; +/* + * amdgpu_res_copyable - Check that memory can be accessed by ttm_bo_move_memcpy + * + * Called by amdgpu_bo_move() + */ +static bool amdgpu_res_copyable(struct amdgpu_device *adev, + struct ttm_resource *mem) +{ + if (!amdgpu_res_cpu_visible(adev, mem)) + return false; - end = cursor.start + cursor.size; - } + /* ttm_resource_ioremap only supports contiguous memory */ + if (mem->mem_type == TTM_PL_VRAM && + !(mem->placement & TTM_PL_FLAG_CONTIGUOUS)) + return false; - return end <= adev->gmc.visible_vram_size; + return true; } /* @@ -529,8 +544,8 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict, if (r) { /* Check that all memory is CPU accessible */ - if (!amdgpu_mem_visible(adev, old_mem) || - !amdgpu_mem_visible(adev, new_mem)) { + if (!amdgpu_res_copyable(adev, old_mem) || + !amdgpu_res_copyable(adev, new_mem)) { pr_err("Move buffer fallback to memcpy unavailable\n"); return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index 65ec82141a8e0..32cf6b6f6efd9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -139,6 +139,9 @@ int amdgpu_vram_mgr_reserve_range(struct amdgpu_vram_mgr *mgr, int amdgpu_vram_mgr_query_page_status(struct amdgpu_vram_mgr *mgr, uint64_t start); +bool amdgpu_res_cpu_visible(struct amdgpu_device *adev, + struct ttm_resource *res); + int amdgpu_ttm_init(struct amdgpu_device *adev); void amdgpu_ttm_fini(struct amdgpu_device *adev); void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, -- GitLab From 0ba753bc7e79e49556e81b0d09b2de1aa558553b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Sun, 14 Apr 2024 22:06:08 -0400 Subject: [PATCH 951/989] drm/radeon: make -fstrict-flex-arrays=3 happy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver parses a union where the layout up through the first array is the same, however, the array has different sizes depending on the elements in the union. Be explicit to fix the UBSAN checker. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3323 Fixes: df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3") Acked-by: Christian König Reviewed-by: Kees Cook Signed-off-by: Alex Deucher Cc: Kees Cook --- drivers/gpu/drm/radeon/radeon_atombios.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c index bb1f0a3371ab5..10793a433bf58 100644 --- a/drivers/gpu/drm/radeon/radeon_atombios.c +++ b/drivers/gpu/drm/radeon/radeon_atombios.c @@ -923,8 +923,12 @@ bool radeon_get_atom_connector_info_from_supported_devices_table(struct max_device = ATOM_MAX_SUPPORTED_DEVICE_INFO; for (i = 0; i < max_device; i++) { - ATOM_CONNECTOR_INFO_I2C ci = - supported_devices->info.asConnInfo[i]; + ATOM_CONNECTOR_INFO_I2C ci; + + if (frev > 1) + ci = supported_devices->info_2d1.asConnInfo[i]; + else + ci = supported_devices->info.asConnInfo[i]; bios_connectors[i].valid = false; -- GitLab From 781d41fed19caf900c8405064676813dc9921d32 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 8 Apr 2024 13:30:15 -0400 Subject: [PATCH 952/989] drm/radeon: silence UBSAN warning (v3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert a variable sized array from [1] to []. v2: fix up a few more. v3: integrate comments from Kees. Reviewed-by: Kees Cook Tested-by: Jeff Johnson (v2) Acked-by: Christian König (v1) Signed-off-by: Alex Deucher Cc: keescook@chromium.org --- drivers/gpu/drm/radeon/pptable.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/radeon/pptable.h b/drivers/gpu/drm/radeon/pptable.h index 94947229888ba..b7f22597ee95e 100644 --- a/drivers/gpu/drm/radeon/pptable.h +++ b/drivers/gpu/drm/radeon/pptable.h @@ -424,7 +424,7 @@ typedef struct _ATOM_PPLIB_SUMO_CLOCK_INFO{ typedef struct _ATOM_PPLIB_STATE_V2 { //number of valid dpm levels in this state; Driver uses it to calculate the whole - //size of the state: sizeof(ATOM_PPLIB_STATE_V2) + (ucNumDPMLevels - 1) * sizeof(UCHAR) + //size of the state: struct_size(ATOM_PPLIB_STATE_V2, clockInfoIndex, ucNumDPMLevels) UCHAR ucNumDPMLevels; //a index to the array of nonClockInfos @@ -432,14 +432,14 @@ typedef struct _ATOM_PPLIB_STATE_V2 /** * Driver will read the first ucNumDPMLevels in this array */ - UCHAR clockInfoIndex[1]; + UCHAR clockInfoIndex[] __counted_by(ucNumDPMLevels); } ATOM_PPLIB_STATE_V2; typedef struct _StateArray{ //how many states we have UCHAR ucNumEntries; - ATOM_PPLIB_STATE_V2 states[1]; + ATOM_PPLIB_STATE_V2 states[] __counted_by(ucNumEntries); }StateArray; @@ -450,7 +450,7 @@ typedef struct _ClockInfoArray{ //sizeof(ATOM_PPLIB_CLOCK_INFO) UCHAR ucEntrySize; - UCHAR clockInfo[1]; + UCHAR clockInfo[] __counted_by(ucNumEntries); }ClockInfoArray; typedef struct _NonClockInfoArray{ @@ -460,7 +460,7 @@ typedef struct _NonClockInfoArray{ //sizeof(ATOM_PPLIB_NONCLOCK_INFO) UCHAR ucEntrySize; - ATOM_PPLIB_NONCLOCK_INFO nonClockInfo[1]; + ATOM_PPLIB_NONCLOCK_INFO nonClockInfo[] __counted_by(ucNumEntries); }NonClockInfoArray; typedef struct _ATOM_PPLIB_Clock_Voltage_Dependency_Record -- GitLab From 6376306adde5b252ee7c73572e35d13fb13f6f18 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Mon, 15 Apr 2024 18:15:43 +0200 Subject: [PATCH 953/989] x86/retpolines: Enable the default thunk warning only on relevant configs The using-default-thunk warning check makes sense only with configurations which actually enable the special return thunks. Otherwise, it fires on unrelated 32-bit configs on which the special return thunks won't even work (they're 64-bit only) and, what is more, those configs even go off into the weeds when booting in the alternatives patching code, leading to a dead machine. Fixes: 4461438a8405 ("x86/retpoline: Ensure default return thunk isn't used at runtime") Reported-by: Klara Modin Reported-by: Erhard Furtner Signed-off-by: Borislav Petkov (AMD) Tested-by: Klara Modin Link: https://lore.kernel.org/r/78e0d19c-b77a-4169-a80f-2eef91f4a1d6@gmail.com Link: https://lore.kernel.org/r/20240413024956.488d474e@yea --- arch/x86/lib/retpoline.S | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index e674ccf720b9f..391059b2c6fbc 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -382,8 +382,15 @@ SYM_FUNC_END(call_depth_return_thunk) SYM_CODE_START(__x86_return_thunk) UNWIND_HINT_FUNC ANNOTATE_NOENDBR +#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || \ + defined(CONFIG_MITIGATION_SRSO) || \ + defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) ALTERNATIVE __stringify(ANNOTATE_UNRET_SAFE; ret), \ "jmp warn_thunk_thunk", X86_FEATURE_ALWAYS +#else + ANNOTATE_UNRET_SAFE + ret +#endif int3 SYM_CODE_END(__x86_return_thunk) EXPORT_SYMBOL(__x86_return_thunk) -- GitLab From 298b871cd55a607037ac8af0011b9fdeb54c1e65 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 16 Apr 2024 06:44:04 +0900 Subject: [PATCH 954/989] bootconfig: Fix the kerneldoc of _xbc_exit() Fix the kerneldoc of _xbc_exit() which is updated to have an @early argument and the function name is changed. Link: https://lore.kernel.org/all/171321744474.599864.13532445969528690358.stgit@devnote2/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404150036.kPJ3HEFA-lkp@intel.com/ Fixes: 89f9a1e876b5 ("bootconfig: use memblock_free_late to free xbc memory to buddy") Signed-off-by: Masami Hiramatsu (Google) --- lib/bootconfig.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/bootconfig.c b/lib/bootconfig.c index 8841554432d5b..97f8911ea339e 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -901,7 +901,8 @@ static int __init xbc_parse_tree(void) } /** - * xbc_exit() - Clean up all parsed bootconfig + * _xbc_exit() - Clean up all parsed bootconfig + * @early: Set true if this is called before budy system is initialized. * * This clears all data structures of parsed bootconfig on memory. * If you need to reuse xbc_init() with new boot config, you can -- GitLab From 69ffed4b62523bbc85511f150500329d28aba356 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 17 Apr 2024 17:19:13 +0300 Subject: [PATCH 955/989] gpiolib: swnode: Remove wrong header inclusion The flags in the software node properties are supposed to be the GPIO lookup flags, which are provided by gpio/machine.h, as the software nodes are the kernel internal thing and doesn't need to rely to any of ABIs. Fixes: e7f9ff5dc90c ("gpiolib: add support for software nodes") Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/property.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/gpio/property.h b/include/linux/gpio/property.h index 6c75c8bd44a0b..1a14e239221f7 100644 --- a/include/linux/gpio/property.h +++ b/include/linux/gpio/property.h @@ -2,7 +2,6 @@ #ifndef __LINUX_GPIO_PROPERTY_H #define __LINUX_GPIO_PROPERTY_H -#include /* for GPIO_* flags */ #include #define PROPERTY_ENTRY_GPIO(_name_, _chip_node_, _idx_, _flags_) \ -- GitLab From 86a1471d7cde792941109b93b558b5dc078b9ee9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Apr 2024 17:43:21 +0200 Subject: [PATCH 956/989] netfilter: nf_tables: fix memleak in map from abort path The delete set command does not rely on the transaction object for element removal, therefore, a combination of delete element + delete set from the abort path could result in restoring twice the refcount of the mapping. Check for inactive element in the next generation for the delete element command in the abort path, skip restoring state if next generation bit has been already cleared. This is similar to the activate logic using the set walk iterator. [ 6170.286929] ------------[ cut here ]------------ [ 6170.286939] WARNING: CPU: 6 PID: 790302 at net/netfilter/nf_tables_api.c:2086 nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.287071] Modules linked in: [...] [ 6170.287633] CPU: 6 PID: 790302 Comm: kworker/6:2 Not tainted 6.9.0-rc3+ #365 [ 6170.287768] RIP: 0010:nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.287886] Code: df 48 8d 7d 58 e8 69 2e 3b df 48 8b 7d 58 e8 80 1b 37 df 48 8d 7d 68 e8 57 2e 3b df 48 8b 7d 68 e8 6e 1b 37 df 48 89 ef eb c4 <0f> 0b 48 83 c4 08 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 0f [ 6170.287895] RSP: 0018:ffff888134b8fd08 EFLAGS: 00010202 [ 6170.287904] RAX: 0000000000000001 RBX: ffff888125bffb28 RCX: dffffc0000000000 [ 6170.287912] RDX: 0000000000000003 RSI: ffffffffa20298ab RDI: ffff88811ebe4750 [ 6170.287919] RBP: ffff88811ebe4700 R08: ffff88838e812650 R09: fffffbfff0623a55 [ 6170.287926] R10: ffffffff8311d2af R11: 0000000000000001 R12: ffff888125bffb10 [ 6170.287933] R13: ffff888125bffb10 R14: dead000000000122 R15: dead000000000100 [ 6170.287940] FS: 0000000000000000(0000) GS:ffff888390b00000(0000) knlGS:0000000000000000 [ 6170.287948] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 6170.287955] CR2: 00007fd31fc00710 CR3: 0000000133f60004 CR4: 00000000001706f0 [ 6170.287962] Call Trace: [ 6170.287967] [ 6170.287973] ? __warn+0x9f/0x1a0 [ 6170.287986] ? nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.288092] ? report_bug+0x1b1/0x1e0 [ 6170.287986] ? nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.288092] ? report_bug+0x1b1/0x1e0 [ 6170.288104] ? handle_bug+0x3c/0x70 [ 6170.288112] ? exc_invalid_op+0x17/0x40 [ 6170.288120] ? asm_exc_invalid_op+0x1a/0x20 [ 6170.288132] ? nf_tables_chain_destroy+0x2b/0x220 [nf_tables] [ 6170.288243] ? nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.288366] ? nf_tables_chain_destroy+0x2b/0x220 [nf_tables] [ 6170.288483] nf_tables_trans_destroy_work+0x588/0x590 [nf_tables] Fixes: 591054469b3e ("netfilter: nf_tables: revisit chain/object refcounting from elements") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d0c09f899e801..167074283ea91 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7223,6 +7223,16 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) } } +static int nft_setelem_active_next(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + u8 genmask = nft_genmask_next(net); + + return nft_set_elem_active(ext, genmask); +} + static void nft_setelem_data_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) @@ -10644,8 +10654,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; - nft_setelem_data_activate(net, te->set, te->elem_priv); - nft_setelem_activate(net, te->set, te->elem_priv); + if (!nft_setelem_active_next(net, te->set, te->elem_priv)) { + nft_setelem_data_activate(net, te->set, te->elem_priv); + nft_setelem_activate(net, te->set, te->elem_priv); + } if (!nft_setelem_is_catchall(te->set, te->elem_priv)) te->set->ndeact--; -- GitLab From 0f022d32c3eca477fbf79a205243a6123ed0fe11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Apr 2024 18:07:28 -0300 Subject: [PATCH 957/989] net/sched: Fix mirred deadlock on device recursion When the mirred action is used on a classful egress qdisc and a packet is mirrored or redirected to self we hit a qdisc lock deadlock. See trace below. [..... other info removed for brevity....] [ 82.890906] [ 82.890906] ============================================ [ 82.890906] WARNING: possible recursive locking detected [ 82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G W [ 82.890906] -------------------------------------------- [ 82.890906] ping/418 is trying to acquire lock: [ 82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at: __dev_queue_xmit+0x1778/0x3550 [ 82.890906] [ 82.890906] but task is already holding lock: [ 82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at: __dev_queue_xmit+0x1778/0x3550 [ 82.890906] [ 82.890906] other info that might help us debug this: [ 82.890906] Possible unsafe locking scenario: [ 82.890906] [ 82.890906] CPU0 [ 82.890906] ---- [ 82.890906] lock(&sch->q.lock); [ 82.890906] lock(&sch->q.lock); [ 82.890906] [ 82.890906] *** DEADLOCK *** [ 82.890906] [..... other info removed for brevity....] Example setup (eth0->eth0) to recreate tc qdisc add dev eth0 root handle 1: htb default 30 tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth0 Another example(eth0->eth1->eth0) to recreate tc qdisc add dev eth0 root handle 1: htb default 30 tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth1 tc qdisc add dev eth1 root handle 1: htb default 30 tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth0 We fix this by adding an owner field (CPU id) to struct Qdisc set after root qdisc is entered. When the softirq enters it a second time, if the qdisc owner is the same CPU, the packet is dropped to break the loop. Reported-by: Mingshuai Ren Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/ Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()") Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops") Signed-off-by: Eric Dumazet Reviewed-by: Victor Nogueira Reviewed-by: Pedro Tammela Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240415210728.36949-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 1 + net/core/dev.c | 6 ++++++ net/sched/sch_generic.c | 1 + 3 files changed, 8 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cefe0c4bdae34..41ca14e81d55f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -117,6 +117,7 @@ struct Qdisc { struct qdisc_skb_head q; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; + int owner; unsigned long state; unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; diff --git a/net/core/dev.c b/net/core/dev.c index 984ff8b9d0e1a..331848eca7d31 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3775,6 +3775,10 @@ no_lock_out: return rc; } + if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { + kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); + return NET_XMIT_DROP; + } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -3814,7 +3818,9 @@ no_lock_out: qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { + WRITE_ONCE(q->owner, smp_processor_id()); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + WRITE_ONCE(q->owner, -1); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ff53364937775..4a2c763e2d116 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -974,6 +974,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; + sch->owner = -1; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); -- GitLab From caed8eba221533123192d39cc947f45cbb1e1db5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Apr 2024 08:10:48 -0700 Subject: [PATCH 958/989] selftests: kselftest_harness: fix Clang warning about zero-length format Apparently it's more legal to pass the format as NULL, than it is to use an empty string. Clang complains about empty formats: ./../kselftest_harness.h:1207:30: warning: format string is empty [-Wformat-zero-length] 1207 | diagnostic ? "%s" : "", diagnostic); | ^~ 1 warning generated. Reported-by: Sean Christopherson Link: https://lore.kernel.org/all/20240409224256.1581292-1-seanjc@google.com Fixes: 378193eff339 ("selftests: kselftest_harness: let PASS / FAIL provide diagnostic") Tested-by: Sean Christopherson Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240416151048.1682352-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/kselftest.h | 10 ++++++---- tools/testing/selftests/kselftest_harness.h | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 541bf192e30e6..4eca3fd1292cf 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -288,15 +288,17 @@ void ksft_test_result_code(int exit_code, const char *test_name, } /* Docs seem to call for double space if directive is absent */ - if (!directive[0] && msg[0]) + if (!directive[0] && msg) directive = " # "; - va_start(args, msg); printf("%s %u %s%s", tap_code, ksft_test_num(), test_name, directive); errno = saved_errno; - vprintf(msg, args); + if (msg) { + va_start(args, msg); + vprintf(msg, args); + va_end(args); + } printf("\n"); - va_end(args); } static inline int ksft_exit_pass(void) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 4fd735e48ee7e..adb15cae79abc 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -1202,7 +1202,7 @@ void __run_test(struct __fixture_metadata *f, diagnostic = "unknown"; ksft_test_result_code(t->exit_code, test_name, - diagnostic ? "%s" : "", diagnostic); + diagnostic ? "%s" : NULL, diagnostic); } static int test_harness_run(int argc, char **argv) -- GitLab From d362046021ea122309da8c8e0b6850c792ca97b5 Mon Sep 17 00:00:00 2001 From: Vanillan Wang Date: Tue, 16 Apr 2024 20:07:13 +0800 Subject: [PATCH 959/989] net:usb:qmi_wwan: support Rolling modules Update the qmi_wwan driver support for the Rolling LTE modules. - VID:PID 33f8:0104, RW101-GL for laptop debug M.2 cards(with RMNET interface for /Linux/Chrome OS) 0x0104: RMNET, diag, at, pipe Here are the outputs of usb-devices: T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=33f8 ProdID=0104 Rev=05.04 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=ba2eb033 C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms Signed-off-by: Vanillan Wang Link: https://lore.kernel.org/r/20240416120713.24777-1-vanillanwang@163.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index e2e181378f412..edc34402e787f 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1431,6 +1431,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x2692, 0x9025, 4)}, /* Cellient MPL200 (rebranded Qualcomm 05c6:9025) */ {QMI_QUIRK_SET_DTR(0x1546, 0x1312, 4)}, /* u-blox LARA-R6 01B */ {QMI_QUIRK_SET_DTR(0x1546, 0x1342, 4)}, /* u-blox LARA-L6 */ + {QMI_QUIRK_SET_DTR(0x33f8, 0x0104, 4)}, /* Rolling RW101 RMNET */ /* 4. Gobi 1000 devices */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9212)}, /* Acer Gobi Modem Device */ -- GitLab From 94667949ec3bbb2218c46ad0a0e7274c8832e494 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 16 Apr 2024 10:23:29 +0200 Subject: [PATCH 960/989] net: ethernet: mtk_eth_soc: fix WED + wifi reset The WLAN + WED reset sequence relies on being able to receive interrupts from the card, in order to synchronize individual steps with the firmware. When WED is stopped, leave interrupts running and rely on the driver turning off unwanted ones. WED DMA also needs to be disabled before resetting. Fixes: f78cd9c783e0 ("net: ethernet: mtk_wed: update mtk_wed_stop") Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20240416082330.82564-1-nbd@nbd.name Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_wed.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index c895e265ae0eb..61334a71058c7 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -1074,13 +1074,13 @@ mtk_wed_dma_disable(struct mtk_wed_device *dev) static void mtk_wed_stop(struct mtk_wed_device *dev) { + mtk_wed_dma_disable(dev); mtk_wed_set_ext_int(dev, false); wed_w32(dev, MTK_WED_WPDMA_INT_TRIGGER, 0); wed_w32(dev, MTK_WED_WDMA_INT_TRIGGER, 0); wdma_w32(dev, MTK_WDMA_INT_MASK, 0); wdma_w32(dev, MTK_WDMA_INT_GRP2, 0); - wed_w32(dev, MTK_WED_WPDMA_INT_MASK, 0); if (!mtk_wed_get_rx_capa(dev)) return; @@ -1093,7 +1093,6 @@ static void mtk_wed_deinit(struct mtk_wed_device *dev) { mtk_wed_stop(dev); - mtk_wed_dma_disable(dev); wed_clr(dev, MTK_WED_CTRL, MTK_WED_CTRL_WDMA_INT_AGENT_EN | @@ -2605,9 +2604,6 @@ mtk_wed_irq_get(struct mtk_wed_device *dev, u32 mask) static void mtk_wed_irq_set_mask(struct mtk_wed_device *dev, u32 mask) { - if (!dev->running) - return; - mtk_wed_set_ext_int(dev, !!mask); wed_w32(dev, MTK_WED_INT_MASK, mask); } -- GitLab From f74ab0c5e5947bcb3a400ab73d837974e76fad23 Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Thu, 11 Apr 2024 17:18:22 +0800 Subject: [PATCH 961/989] ALSA: hda/tas2781: Add new vendor_id and subsystem_id to support ThinkPad ICE-1 Add new vendor_id and subsystem_id to support new Lenovo laptop ThinkPad ICE-1 Signed-off-by: Shenghao Ding Cc: Message-ID: <20240411091823.1644-1-shenghao-ding@ti.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d6940bc4ec393..d4c2c3d6c76c6 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10335,6 +10335,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x222e, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x2231, "Thinkpad T560", ALC292_FIXUP_TPT460), SND_PCI_QUIRK(0x17aa, 0x2233, "Thinkpad", ALC292_FIXUP_TPT460), + SND_PCI_QUIRK(0x17aa, 0x2234, "Thinkpad ICE-1", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x2245, "Thinkpad T470", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x2246, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x2247, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), -- GitLab From 0672b017324b1444f12d008a3ba8bc0c6c9384fa Mon Sep 17 00:00:00 2001 From: Vitalii Torshyn Date: Thu, 11 Apr 2024 15:58:03 +0300 Subject: [PATCH 962/989] ALSA: hda/realtek: Fixes for Asus GU605M and GA403U sound Added the correct pin table for Asus GU605M and GA403U, enabling all speakers to be controlled with the master. Updated quirks for GU605M and GA403U by including the pin table patch in the chain. Co-developed-by: Luke D. Jones Signed-off-by: Luke D. Jones Signed-off-by: Vitalii Torshyn Message-ID: <20240411125803.18539-1-vitaly.torshyn@gmail.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 40 +++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d4c2c3d6c76c6..4cf05a5c3aba8 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7467,6 +7467,10 @@ enum { ALC285_FIXUP_CS35L56_I2C_2, ALC285_FIXUP_CS35L56_I2C_4, ALC285_FIXUP_ASUS_GA403U, + ALC285_FIXUP_ASUS_GA403U_HEADSET_MIC, + ALC285_FIXUP_ASUS_GA403U_I2C_SPEAKER2_TO_DAC1, + ALC285_FIXUP_ASUS_GU605_SPI_2_HEADSET_MIC, + ALC285_FIXUP_ASUS_GU605_SPI_SPEAKER2_TO_DAC1 }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -9690,6 +9694,38 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc285_fixup_asus_ga403u, }, + [ALC285_FIXUP_ASUS_GA403U_HEADSET_MIC] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x03a11050 }, + { 0x1b, 0x03a11c30 }, + { } + }, + .chained = true, + .chain_id = ALC285_FIXUP_ASUS_GA403U_I2C_SPEAKER2_TO_DAC1 + }, + [ALC285_FIXUP_ASUS_GU605_SPI_SPEAKER2_TO_DAC1] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_speaker2_to_dac1, + .chained = true, + .chain_id = ALC285_FIXUP_ASUS_GU605_SPI_2_HEADSET_MIC, + }, + [ALC285_FIXUP_ASUS_GU605_SPI_2_HEADSET_MIC] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x03a11050 }, + { 0x1b, 0x03a11c30 }, + { } + }, + .chained = true, + .chain_id = ALC285_FIXUP_CS35L56_SPI_2 + }, + [ALC285_FIXUP_ASUS_GA403U_I2C_SPEAKER2_TO_DAC1] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_speaker2_to_dac1, + .chained = true, + .chain_id = ALC285_FIXUP_ASUS_GA403U, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -10145,7 +10181,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1a83, "ASUS UM5302LA", ALC294_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1a8f, "ASUS UX582ZS", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1b11, "ASUS UX431DA", ALC294_FIXUP_ASUS_COEF_1B), - SND_PCI_QUIRK(0x1043, 0x1b13, "ASUS U41SV/GA403U", ALC285_FIXUP_ASUS_GA403U), + SND_PCI_QUIRK(0x1043, 0x1b13, "ASUS U41SV/GA403U", ALC285_FIXUP_ASUS_GA403U_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1b93, "ASUS G614JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1bbd, "ASUS Z550MA", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1c03, "ASUS UM3406HA", ALC287_FIXUP_CS35L41_I2C_2), @@ -10153,7 +10189,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1c33, "ASUS UX5304MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1c43, "ASUS UX8406MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1c62, "ASUS GU603", ALC289_FIXUP_ASUS_GA401), - SND_PCI_QUIRK(0x1043, 0x1c63, "ASUS GU605M", ALC285_FIXUP_CS35L56_SPI_2), + SND_PCI_QUIRK(0x1043, 0x1c63, "ASUS GU605M", ALC285_FIXUP_ASUS_GU605_SPI_SPEAKER2_TO_DAC1), SND_PCI_QUIRK(0x1043, 0x1c92, "ASUS ROG Strix G15", ALC285_FIXUP_ASUS_G533Z_PINS), SND_PCI_QUIRK(0x1043, 0x1c9f, "ASUS G614JU/JV/JI", ALC285_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1caf, "ASUS G634JY/JZ/JI/JG", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), -- GitLab From dca5f4dfa925b51becee65031869e917e6229620 Mon Sep 17 00:00:00 2001 From: Huayu Zhang Date: Sat, 13 Apr 2024 19:41:22 +0800 Subject: [PATCH 963/989] ALSA: hda/realtek: Fix volumn control of ThinkBook 16P Gen4 change HDA & AMP configuration from ALC287_FIXUP_CS35L41_I2C_2 to ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD for ThinkBook 16P Gen4 models to fix volumn control issue (cannot fully mute). Signed-off-by: Huayu Zhang Fixes: 6214e24cae9b ("ALSA: hda/realtek: Add quirks for Lenovo Thinkbook 16P laptops") Message-ID: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 4cf05a5c3aba8..afe1238ca51ba 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10433,8 +10433,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3886, "Y780 VECO DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38a7, "Y780P AMD YG dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38a8, "Y780P AMD VECO dual", ALC287_FIXUP_TAS2781_I2C), - SND_PCI_QUIRK(0x17aa, 0x38a9, "Thinkbook 16P", ALC287_FIXUP_CS35L41_I2C_2), - SND_PCI_QUIRK(0x17aa, 0x38ab, "Thinkbook 16P", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x38a9, "Thinkbook 16P", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), + SND_PCI_QUIRK(0x17aa, 0x38ab, "Thinkbook 16P", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), SND_PCI_QUIRK(0x17aa, 0x38b4, "Legion Slim 7 16IRH8", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x38b5, "Legion Slim 7 16IRH8", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x38b6, "Legion Slim 7 16APH8", ALC287_FIXUP_CS35L41_I2C_2), -- GitLab From cf16ffa17c398434a77b8a373e69287c95b60de2 Mon Sep 17 00:00:00 2001 From: Coia Prant Date: Mon, 15 Apr 2024 07:26:25 -0700 Subject: [PATCH 964/989] USB: serial: option: add Lonsung U8300/U9300 product Update the USB serial option driver to support Longsung U8300/U9300. For U8300 Interface 4 is used by for QMI interface in stock firmware of U8300, the router which uses U8300 modem. Interface 5 is used by for ADB interface in stock firmware of U8300, the router which uses U8300 modem. Interface mapping is: 0: unknown (Debug), 1: AT (Modem), 2: AT, 3: PPP (NDIS / Pipe), 4: QMI, 5: ADB T: Bus=05 Lev=01 Prnt=03 Port=02 Cnt=01 Dev#= 4 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1c9e ProdID=9b05 Rev=03.18 S: Manufacturer=Android S: Product=Android C: #Ifs= 6 Cfg#= 1 Atr=80 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms For U9300 Interface 1 is used by for ADB interface in stock firmware of U9300, the router which uses U9300 modem. Interface 4 is used by for QMI interface in stock firmware of U9300, the router which uses U9300 modem. Interface mapping is: 0: ADB, 1: AT (Modem), 2: AT, 3: PPP (NDIS / Pipe), 4: QMI Note: Interface 3 of some models of the U9300 series can send AT commands. T: Bus=05 Lev=01 Prnt=05 Port=04 Cnt=01 Dev#= 6 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1c9e ProdID=9b3c Rev=03.18 S: Manufacturer=Android S: Product=Android C: #Ifs= 5 Cfg#= 1 Atr=80 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=03(Int.) MxPS= 8 Ivl=32ms Tested successfully using Modem Manager on U9300. Tested successfully AT commands using If=1, If=2 and If=3 on U9300. Signed-off-by: Coia Prant Reviewed-by: Lars Melin [ johan: drop product defines, trim commit message ] Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index ad0fff1e889d0..8827ca3a65955 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2068,6 +2068,10 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(LONGCHEER_VENDOR_ID, 0x9803, 0xff), .driver_info = RSVD(4) }, + { USB_DEVICE(LONGCHEER_VENDOR_ID, 0x9b05), /* Longsung U8300 */ + .driver_info = RSVD(4) | RSVD(5) }, + { USB_DEVICE(LONGCHEER_VENDOR_ID, 0x9b3c), /* Longsung U9300 */ + .driver_info = RSVD(0) | RSVD(4) }, { USB_DEVICE(LONGCHEER_VENDOR_ID, ZOOM_PRODUCT_4597) }, { USB_DEVICE(LONGCHEER_VENDOR_ID, IBALL_3_5G_CONNECT) }, { USB_DEVICE(HAIER_VENDOR_ID, HAIER_PRODUCT_CE100) }, -- GitLab From 311f97a4c7c22a01f8897bddf00428dfd0668e79 Mon Sep 17 00:00:00 2001 From: Vanillan Wang Date: Tue, 16 Apr 2024 18:02:55 +0800 Subject: [PATCH 965/989] USB: serial: option: add Rolling RW101-GL and RW135-GL support Update the USB serial option driver support for the Rolling LTE modules. - VID:PID 33f8:01a2, RW101-GL for laptop debug M.2 cards(with MBIM interface for /Linux/Chrome OS) 0x01a2: mbim, diag, at, pipe - VID:PID 33f8:01a3, RW101-GL for laptop debug M.2 cards(with MBIM interface for /Linux/Chrome OS) 0x01a3: mbim, pipe - VID:PID 33f8:01a4, RW101-GL for laptop debug M.2 cards(with MBIM interface for /Linux/Chrome OS) 0x01a4: mbim, diag, at, pipe - VID:PID 33f8:0104, RW101-GL for laptop debug M.2 cards(with RMNET interface for /Linux/Chrome OS) 0x0104: RMNET, diag, at, pipe - VID:PID 33f8:0115, RW135-GL for laptop debug M.2 cards(with MBIM interface for /Linux/Chrome OS) 0x0115: MBIM, diag, at, pipe Here are the outputs of usb-devices: T: Bus=01 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 5 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=33f8 ProdID=01a2 Rev=05.15 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=12345678 C: #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms T: Bus=01 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=33f8 ProdID=01a3 Rev=05.15 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=12345678 C: #Ifs= 3 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms T: Bus=01 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 17 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=33f8 ProdID=01a4 Rev=05.15 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=12345678 C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=33f8 ProdID=0104 Rev=05.04 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=ba2eb033 C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms T: Bus=01 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 16 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=33f8 ProdID=0115 Rev=05.15 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=12345678 C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Vanillan Wang Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 8827ca3a65955..e22612017f666 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2307,6 +2307,14 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1404, 0xff) }, /* GosunCn GM500 RNDIS */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1405, 0xff) }, /* GosunCn GM500 MBIM */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1406, 0xff) }, /* GosunCn GM500 ECM/NCM */ + { USB_DEVICE(0x33f8, 0x0104), /* Rolling RW101-GL (laptop RMNET) */ + .driver_info = RSVD(4) | RSVD(5) }, + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a2, 0xff) }, /* Rolling RW101-GL (laptop MBIM) */ + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a3, 0xff) }, /* Rolling RW101-GL (laptop MBIM) */ + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a4, 0xff), /* Rolling RW101-GL (laptop MBIM) */ + .driver_info = RSVD(4) }, + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0115, 0xff), /* Rolling RW135-GL (laptop MBIM) */ + .driver_info = RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(OPPO_VENDOR_ID, OPPO_PRODUCT_R11, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x40) }, -- GitLab From 7caf3daaaf0436fe370834c72c667a97d3671d1a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 17 Apr 2024 17:16:33 +0100 Subject: [PATCH 966/989] ALSA: hda/realtek: Add quirks for Huawei Matebook D14 NBLB-WAX9N The headset mic requires a fixup to be properly detected/used. As a reference, this specific model from 2021 reports the following devices: https://alsa-project.org/db/?f=1a5ddeb0b151db8fe051407f5bb1c075b7dd3e4a Signed-off-by: Mauro Carvalho Chehab Cc: Message-ID: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index afe1238ca51ba..f714608474922 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10266,6 +10266,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1462, 0xb120, "MSI Cubi MS-B120", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1462, 0xb171, "Cubi N 8GL (MS-B171)", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x152d, 0x1082, "Quanta NL3", ALC269_FIXUP_LIFEBOOK), + SND_PCI_QUIRK(0x152d, 0x1262, "Huawei NBLB-WAX9N", ALC2XX_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1558, 0x0353, "Clevo V35[05]SN[CDE]Q", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x1323, "Clevo N130ZU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x1325, "Clevo N15[01][CW]U", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), -- GitLab From 32f5f73b79ffdef215e2e1bcb6ad74387c0f925c Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Wed, 17 Apr 2024 10:47:31 -0700 Subject: [PATCH 967/989] x86/fred: Fix INT80 emulation for FRED Add a FRED-specific INT80 handler and document why it differs from the current one. Eventually, the common bits will be unified once FRED hw is available and it turns out that no further changes are needed but for now, keep the handlers separate for everyone's sanity's sake. [ bp: Zap duplicated commit message, massage. ] Fixes: 55617fb991df ("x86/entry: Do not allow external 0x80 interrupts") Suggested-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li (Intel) Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240417174731.4189592-1-xin@zytor.com --- arch/x86/entry/common.c | 65 +++++++++++++++++++++++++++++++++++++ arch/x86/entry/entry_fred.c | 2 +- 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 6de50b80702e6..51cc9c7cb9bdc 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -255,6 +255,71 @@ __visible noinstr void do_int80_emulation(struct pt_regs *regs) instrumentation_end(); syscall_exit_to_user_mode(regs); } + +#ifdef CONFIG_X86_FRED +/* + * A FRED-specific INT80 handler is warranted for the follwing reasons: + * + * 1) As INT instructions and hardware interrupts are separate event + * types, FRED does not preclude the use of vector 0x80 for external + * interrupts. As a result, the FRED setup code does not reserve + * vector 0x80 and calling int80_is_external() is not merely + * suboptimal but actively incorrect: it could cause a system call + * to be incorrectly ignored. + * + * 2) It is called only for handling vector 0x80 of event type + * EVENT_TYPE_SWINT and will never be called to handle any external + * interrupt (event type EVENT_TYPE_EXTINT). + * + * 3) FRED has separate entry flows depending on if the event came from + * user space or kernel space, and because the kernel does not use + * INT insns, the FRED kernel entry handler fred_entry_from_kernel() + * falls through to fred_bad_type() if the event type is + * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling + * an INT insn, it can only be from a user level. + * + * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will + * likely take a different approach if it is ever needed: it + * probably belongs in either fred_intx()/ fred_other() or + * asm_fred_entrypoint_user(), depending on if this ought to be done + * for all entries from userspace or only system + * calls. + * + * 5) INT $0x80 is the fast path for 32-bit system calls under FRED. + */ +DEFINE_FREDENTRY_RAW(int80_emulation) +{ + int nr; + + enter_from_user_mode(regs); + + instrumentation_begin(); + add_random_kstack_offset(); + + /* + * FRED pushed 0 into regs::orig_ax and regs::ax contains the + * syscall number. + * + * User tracing code (ptrace or signal handlers) might assume + * that the regs::orig_ax contains a 32-bit number on invoking + * a 32-bit syscall. + * + * Establish the syscall convention by saving the 32bit truncated + * syscall number in regs::orig_ax and by invalidating regs::ax. + */ + regs->orig_ax = regs->ax & GENMASK(31, 0); + regs->ax = -ENOSYS; + + nr = syscall_32_enter(regs); + + local_irq_enable(); + nr = syscall_enter_from_user_mode_work(regs, nr); + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} +#endif #else /* CONFIG_IA32_EMULATION */ /* Handles int $0x80 on a 32bit kernel */ diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index ac120cbdaaf2b..9fa18b8c7f264 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -66,7 +66,7 @@ static noinstr void fred_intx(struct pt_regs *regs) /* INT80 */ case IA32_SYSCALL_VECTOR: if (ia32_enabled()) - return int80_emulation(regs); + return fred_int80_emulation(regs); fallthrough; #endif -- GitLab From a4b37f5033fa812f02f3b7bd1242393d347ba791 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Wed, 17 Apr 2024 19:34:25 +0800 Subject: [PATCH 968/989] x86/fred: Fix incorrect error code printout in fred_bad_type() regs->orig_ax has been set to -1 on entry so in the printout, fred_bad_type() should use the passed parameter error_code. Fixes: 14619d912b65 ("x86/fred: FRED entry/exit and dispatch code") Signed-off-by: Hou Wenlong Signed-off-by: Borislav Petkov (AMD) Acked-by: H. Peter Anvin (Intel) Link: https://lore.kernel.org/r/b2a8f0a41449d25240e314a2ddfbf6549511fb04.1713353612.git.houwenlong.hwl@antgroup.com --- arch/x86/entry/entry_fred.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 9fa18b8c7f264..89c1476fcdd9f 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -28,9 +28,9 @@ static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code if (regs->fred_cs.sl > 0) { pr_emerg("PANIC: invalid or fatal FRED event; event type %u " "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", - regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + regs->fred_ss.type, regs->fred_ss.vector, error_code, fred_event_data(regs), regs->cs, regs->ip); - die("invalid or fatal FRED event", regs, regs->orig_ax); + die("invalid or fatal FRED event", regs, error_code); panic("invalid or fatal FRED event"); } else { unsigned long flags = oops_begin(); @@ -38,10 +38,10 @@ static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code pr_alert("BUG: invalid or fatal FRED event; event type %u " "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", - regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + regs->fred_ss.type, regs->fred_ss.vector, error_code, fred_event_data(regs), regs->cs, regs->ip); - if (__die("Invalid or fatal FRED event", regs, regs->orig_ax)) + if (__die("Invalid or fatal FRED event", regs, error_code)) sig = 0; oops_end(flags, regs, sig); -- GitLab From def52db470df28d6f43cacbd21137f03b9502073 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:51 +0100 Subject: [PATCH 969/989] net: ravb: Count packets instead of descriptors in R-Car RX path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The units of "work done" in the RX path should be packets instead of descriptors. Descriptors which are used by the hardware to record error conditions or are empty in the case of a DMA mapping error should not count towards our RX work budget. Also make the limit variable unsigned as it can never be negative. Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Reviewed-by: Niklas Söderlund Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index ba01c8cc3c906..7d231d011bffa 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -892,29 +892,24 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) struct ravb_private *priv = netdev_priv(ndev); const struct ravb_hw_info *info = priv->info; int entry = priv->cur_rx[q] % priv->num_rx_ring[q]; - int boguscnt = (priv->dirty_rx[q] + priv->num_rx_ring[q]) - - priv->cur_rx[q]; struct net_device_stats *stats = &priv->stats[q]; struct ravb_ex_rx_desc *desc; + unsigned int limit, i; struct sk_buff *skb; dma_addr_t dma_addr; struct timespec64 ts; + int rx_packets = 0; u8 desc_status; u16 pkt_len; - int limit; - boguscnt = min(boguscnt, *quota); - limit = boguscnt; + limit = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q]; desc = &priv->rx_ring[q].ex_desc[entry]; - while (desc->die_dt != DT_FEMPTY) { + for (i = 0; i < limit && rx_packets < *quota && desc->die_dt != DT_FEMPTY; i++) { /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; pkt_len = le16_to_cpu(desc->ds_cc) & RX_DS; - if (--boguscnt < 0) - break; - /* We use 0-byte descriptors to mark the DMA mapping errors */ if (!pkt_len) continue; @@ -960,7 +955,7 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) if (ndev->features & NETIF_F_RXCSUM) ravb_rx_csum(skb); napi_gro_receive(&priv->napi[q], skb); - stats->rx_packets++; + rx_packets++; stats->rx_bytes += pkt_len; } @@ -995,9 +990,9 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) desc->die_dt = DT_FEMPTY; } - *quota -= limit - (++boguscnt); - - return boguscnt <= 0; + stats->rx_packets += rx_packets; + *quota -= rx_packets; + return *quota == 0; } /* Packet receive function for Ethernet AVB */ -- GitLab From a892493a343494bd6bab9d098593932077ff3c43 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:52 +0100 Subject: [PATCH 970/989] net: ravb: Allow RX loop to move past DMA mapping errors The RX loops in ravb_rx_gbeth() and ravb_rx_rcar() skip to the next loop iteration if a zero-length descriptor is seen (indicating a DMA mapping error). However, the current RX descriptor index `priv->cur_rx[q]` was incremented at the end of the loop and so would not be incremented when we skip to the next loop iteration. This would cause the loop to keep seeing the same zero-length descriptor instead of moving on to the next descriptor. As the loop counter `i` still increments, the loop would eventually terminate so there is no risk of being stuck here forever - but we should still fix this to avoid wasting cycles. To fix this, the RX descriptor index is incremented at the top of the loop, in the for statement itself. The assignments of `entry` and `desc` are brought into the loop to avoid the need for duplication. Fixes: d8b48911fd24 ("ravb: fix ring memory allocation") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 25 ++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 7d231d011bffa..3b870926af14e 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -775,12 +775,15 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) int limit; int i; - entry = priv->cur_rx[q] % priv->num_rx_ring[q]; limit = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q]; stats = &priv->stats[q]; - desc = &priv->rx_ring[q].desc[entry]; - for (i = 0; i < limit && rx_packets < *quota && desc->die_dt != DT_FEMPTY; i++) { + for (i = 0; i < limit; i++, priv->cur_rx[q]++) { + entry = priv->cur_rx[q] % priv->num_rx_ring[q]; + desc = &priv->rx_ring[q].desc[entry]; + if (rx_packets == *quota || desc->die_dt == DT_FEMPTY) + break; + /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; @@ -848,9 +851,6 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) break; } } - - entry = (++priv->cur_rx[q]) % priv->num_rx_ring[q]; - desc = &priv->rx_ring[q].desc[entry]; } /* Refill the RX ring buffers. */ @@ -891,7 +891,6 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) { struct ravb_private *priv = netdev_priv(ndev); const struct ravb_hw_info *info = priv->info; - int entry = priv->cur_rx[q] % priv->num_rx_ring[q]; struct net_device_stats *stats = &priv->stats[q]; struct ravb_ex_rx_desc *desc; unsigned int limit, i; @@ -901,10 +900,15 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) int rx_packets = 0; u8 desc_status; u16 pkt_len; + int entry; limit = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q]; - desc = &priv->rx_ring[q].ex_desc[entry]; - for (i = 0; i < limit && rx_packets < *quota && desc->die_dt != DT_FEMPTY; i++) { + for (i = 0; i < limit; i++, priv->cur_rx[q]++) { + entry = priv->cur_rx[q] % priv->num_rx_ring[q]; + desc = &priv->rx_ring[q].ex_desc[entry]; + if (rx_packets == *quota || desc->die_dt == DT_FEMPTY) + break; + /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; @@ -958,9 +962,6 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) rx_packets++; stats->rx_bytes += pkt_len; } - - entry = (++priv->cur_rx[q]) % priv->num_rx_ring[q]; - desc = &priv->rx_ring[q].ex_desc[entry]; } /* Refill the RX ring buffers. */ -- GitLab From c7c449502b51c5b5de79f97a42be750b28f6ecee Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:53 +0100 Subject: [PATCH 971/989] net: ravb: Fix GbEth jumbo packet RX checksum handling Sending a 7kB ping packet to the RZ/G2L in v6.9-rc2 causes the following backtrace: WARNING: CPU: 0 PID: 0 at include/linux/skbuff.h:3127 skb_trim+0x30/0x38 Hardware name: Renesas SMARC EVK based on r9a07g044l2 (DT) pc : skb_trim+0x30/0x38 lr : ravb_rx_csum_gbeth+0x40/0x90 Call trace: skb_trim+0x30/0x38 ravb_rx_gbeth+0x56c/0x5cc ravb_poll+0xa0/0x204 __napi_poll+0x38/0x17c This is caused by ravb_rx_gbeth() calling ravb_rx_csum_gbeth() with the wrong skb for a packet which spans multiple descriptors. To fix this, use the correct skb. Fixes: c2da9408579d ("ravb: Add Rx checksum offload support for GbEth") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 3b870926af14e..6969cdeeb67ab 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -843,7 +843,7 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) priv->rx_1st_skb->protocol = eth_type_trans(priv->rx_1st_skb, ndev); if (ndev->features & NETIF_F_RXCSUM) - ravb_rx_csum_gbeth(skb); + ravb_rx_csum_gbeth(priv->rx_1st_skb); napi_gro_receive(&priv->napi[q], priv->rx_1st_skb); rx_packets++; -- GitLab From 2e36c9fbc476f95a1b19e3fa0a2cdf408475ff56 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:54 +0100 Subject: [PATCH 972/989] net: ravb: Fix RX byte accounting for jumbo packets The RX byte accounting for jumbo packets was changed to fix a potential use-after-free bug. However, that fix used the wrong variable and so only accounted for the number of bytes in the final descriptor, not the number of bytes in the whole packet. To fix this, we can simply update our stats with the correct number of bytes before calling napi_gro_receive(). Also rename pkt_len to desc_len in ravb_rx_gbeth() to avoid any future confusion. The variable name pkt_len is correct in ravb_rx_rcar() as that function does not handle packets spanning multiple descriptors. Fixes: 5a5a3e564de6 ("ravb: Fix potential use-after-free in ravb_rx_gbeth()") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 6969cdeeb67ab..fcb756d77681c 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -769,7 +769,7 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) dma_addr_t dma_addr; int rx_packets = 0; u8 desc_status; - u16 pkt_len; + u16 desc_len; u8 die_dt; int entry; int limit; @@ -787,10 +787,10 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; - pkt_len = le16_to_cpu(desc->ds_cc) & RX_DS; + desc_len = le16_to_cpu(desc->ds_cc) & RX_DS; /* We use 0-byte descriptors to mark the DMA mapping errors */ - if (!pkt_len) + if (!desc_len) continue; if (desc_status & MSC_MC) @@ -811,25 +811,25 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) switch (die_dt) { case DT_FSINGLE: skb = ravb_get_skb_gbeth(ndev, entry, desc); - skb_put(skb, pkt_len); + skb_put(skb, desc_len); skb->protocol = eth_type_trans(skb, ndev); if (ndev->features & NETIF_F_RXCSUM) ravb_rx_csum_gbeth(skb); napi_gro_receive(&priv->napi[q], skb); rx_packets++; - stats->rx_bytes += pkt_len; + stats->rx_bytes += desc_len; break; case DT_FSTART: priv->rx_1st_skb = ravb_get_skb_gbeth(ndev, entry, desc); - skb_put(priv->rx_1st_skb, pkt_len); + skb_put(priv->rx_1st_skb, desc_len); break; case DT_FMID: skb = ravb_get_skb_gbeth(ndev, entry, desc); skb_copy_to_linear_data_offset(priv->rx_1st_skb, priv->rx_1st_skb->len, skb->data, - pkt_len); - skb_put(priv->rx_1st_skb, pkt_len); + desc_len); + skb_put(priv->rx_1st_skb, desc_len); dev_kfree_skb(skb); break; case DT_FEND: @@ -837,17 +837,17 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) skb_copy_to_linear_data_offset(priv->rx_1st_skb, priv->rx_1st_skb->len, skb->data, - pkt_len); - skb_put(priv->rx_1st_skb, pkt_len); + desc_len); + skb_put(priv->rx_1st_skb, desc_len); dev_kfree_skb(skb); priv->rx_1st_skb->protocol = eth_type_trans(priv->rx_1st_skb, ndev); if (ndev->features & NETIF_F_RXCSUM) ravb_rx_csum_gbeth(priv->rx_1st_skb); + stats->rx_bytes += priv->rx_1st_skb->len; napi_gro_receive(&priv->napi[q], priv->rx_1st_skb); rx_packets++; - stats->rx_bytes += pkt_len; break; } } -- GitLab From 3aadf100f93d80815685493d60cd8cab206403df Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 18 Apr 2024 13:45:17 +0200 Subject: [PATCH 973/989] Revert "vmgenid: emit uevent when VMGENID updates" This reverts commit ad6bcdad2b6724e113f191a12f859a9e8456b26d. I had nak'd it, and Greg said on the thread that it links that he wasn't going to take it either, especially since it's not his code or his tree, but then, seemingly accidentally, it got pushed up some months later, in what looks like a mistake, with no further discussion in the linked thread. So revert it, since it's clearly not intended. Fixes: ad6bcdad2b67 ("vmgenid: emit uevent when VMGENID updates") Cc: stable@vger.kernel.org Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230531095119.11202-2-bchalios@amazon.es Signed-off-by: Jason A. Donenfeld --- drivers/virt/vmgenid.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/virt/vmgenid.c b/drivers/virt/vmgenid.c index b67a28da47026..a1c467a0e9f71 100644 --- a/drivers/virt/vmgenid.c +++ b/drivers/virt/vmgenid.c @@ -68,7 +68,6 @@ out: static void vmgenid_notify(struct acpi_device *device, u32 event) { struct vmgenid_state *state = acpi_driver_data(device); - char *envp[] = { "NEW_VMGENID=1", NULL }; u8 old_id[VMGENID_SIZE]; memcpy(old_id, state->this_id, sizeof(old_id)); @@ -76,7 +75,6 @@ static void vmgenid_notify(struct acpi_device *device, u32 event) if (!memcmp(old_id, state->this_id, sizeof(old_id))) return; add_vmfork_randomness(state->this_id, sizeof(state->this_id)); - kobject_uevent_env(&device->dev.kobj, KOBJ_CHANGE, envp); } static const struct acpi_device_id vmgenid_ids[] = { -- GitLab From 1607830dadeefc407e4956336d9fcd9e9defd810 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 18 Apr 2024 16:33:28 +0200 Subject: [PATCH 974/989] Revert "usb: cdc-wdm: close race between read and workqueue" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 339f83612f3a569b194680768b22bf113c26a29d. It has been found to cause problems in a number of Chromebook devices, so revert the change until it can be brought back in a safe way. Link: https://lore.kernel.org/r/385a3519-b45d-48c5-a6fd-a3fdb6bec92f@chromium.org Reported-by:: Aleksander Morgado Fixes: 339f83612f3a ("usb: cdc-wdm: close race between read and workqueue") Cc: stable Cc: Oliver Neukum Cc: Bjørn Mork Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index c8262e2f29177..c553decb54610 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -485,7 +485,6 @@ out_free_mem: static int service_outstanding_interrupt(struct wdm_device *desc) { int rv = 0; - int used; /* submit read urb only if the device is waiting for it */ if (!desc->resp_count || !--desc->resp_count) @@ -500,10 +499,7 @@ static int service_outstanding_interrupt(struct wdm_device *desc) goto out; } - used = test_and_set_bit(WDM_RESPONDING, &desc->flags); - if (used) - goto out; - + set_bit(WDM_RESPONDING, &desc->flags); spin_unlock_irq(&desc->iuspin); rv = usb_submit_urb(desc->response, GFP_KERNEL); spin_lock_irq(&desc->iuspin); -- GitLab From f2e0eee4703869dc5edb5302a919861566ca7797 Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Tue, 16 Apr 2024 01:23:07 +0000 Subject: [PATCH 975/989] usb: dwc3: ep0: Don't reset resource alloc flag The DWC3_EP_RESOURCE_ALLOCATED flag ensures that the resource of an endpoint is only assigned once. Unless the endpoint is reset, don't clear this flag. Otherwise we may set endpoint resource again, which prevents the driver from initiate transfer after handling a STALL or endpoint halt to the control endpoint. Cc: stable@vger.kernel.org Fixes: b311048c174d ("usb: dwc3: gadget: Rewrite endpoint allocation flow") Signed-off-by: Thinh Nguyen Link: https://lore.kernel.org/r/00122b7cc5be06abef461776e7cc9f5ebc8bc1cb.1713229786.git.Thinh.Nguyen@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/ep0.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index 72bb722da2f25..d96ffbe520397 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -226,7 +226,8 @@ void dwc3_ep0_stall_and_restart(struct dwc3 *dwc) /* reinitialize physical ep1 */ dep = dwc->eps[1]; - dep->flags = DWC3_EP_ENABLED; + dep->flags &= DWC3_EP_RESOURCE_ALLOCATED; + dep->flags |= DWC3_EP_ENABLED; /* stall is always issued on EP0 */ dep = dwc->eps[0]; -- GitLab From 582ee2f9d268d302595db3e36b985e5cbb93284d Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Thu, 18 Apr 2024 13:34:30 +0200 Subject: [PATCH 976/989] USB: serial: option: add Telit FN920C04 rmnet compositions Add the following Telit FN920C04 compositions: 0x10a0: rmnet + tty (AT/NMEA) + tty (AT) + tty (diag) T: Bus=03 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 5 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10a0 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN920 S: SerialNumber=92c4c4d8 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10a4: rmnet + tty (AT) + tty (AT) + tty (diag) T: Bus=03 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10a4 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN920 S: SerialNumber=92c4c4d8 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10a9: rmnet + tty (AT) + tty (diag) + DPL (data packet logging) + adb T: Bus=03 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 9 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10a9 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN920 S: SerialNumber=92c4c4d8 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Daniele Palmas Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index e22612017f666..8a5846d4adf67 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1376,6 +1376,12 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1083, 0xff), /* Telit FE990 (ECM) */ .driver_info = NCTRL(0) | RSVD(1) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a0, 0xff), /* Telit FN20C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a4, 0xff), /* Telit FN20C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a9, 0xff), /* Telit FN20C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), -- GitLab From 9543f6e26634537997b6e909c20911b7bf4876de Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 16 Apr 2024 23:04:34 -0700 Subject: [PATCH 977/989] x86/cpufeatures: Fix dependencies for GFNI, VAES, and VPCLMULQDQ Fix cpuid_deps[] to list the correct dependencies for GFNI, VAES, and VPCLMULQDQ. These features don't depend on AVX512, and there exist CPUs that support these features but not AVX512. GFNI actually doesn't even depend on AVX. This prevents GFNI from being unnecessarily disabled if AVX is disabled to mitigate the GDS vulnerability. This also prevents all three features from being unnecessarily disabled if AVX512VL (or its dependency AVX512F) were to be disabled, but it looks like there isn't any case where this happens anyway. Fixes: c128dbfa0f87 ("x86/cpufeatures: Enable new SSE/AVX/AVX512 CPU features") Signed-off-by: Eric Biggers Signed-off-by: Borislav Petkov (AMD) Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20240417060434.47101-1-ebiggers@kernel.org --- arch/x86/kernel/cpu/cpuid-deps.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index b7174209d855c..946813d816bfc 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -44,7 +44,10 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, { X86_FEATURE_AES, X86_FEATURE_XMM2 }, { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_GFNI, X86_FEATURE_XMM2 }, { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_VAES, X86_FEATURE_AVX }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX }, { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, @@ -56,9 +59,6 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, - { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, -- GitLab From 752863bddacab6b5c5164b1df8c8b2e3a175ee28 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Apr 2024 16:47:43 +0200 Subject: [PATCH 978/989] block: propagate partition scanning errors to the BLKRRPART ioctl Commit 4601b4b130de ("block: reopen the device in blkdev_reread_part") lost the propagation of I/O errors from the low-level read of the partition table to the user space caller of the BLKRRPART. Apparently some user space relies on, so restore the propagation. This isn't exactly pretty as other block device open calls explicitly do not are about these errors, so add a new BLK_OPEN_STRICT_SCAN to opt into the error propagation. Fixes: 4601b4b130de ("block: reopen the device in blkdev_reread_part") Reported-by: Saranya Muruganandam Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Shin'ichiro Kawasaki Tested-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20240417144743.2277601-1-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 29 +++++++++++++++++++---------- block/ioctl.c | 3 ++- include/linux/blkdev.h | 2 ++ 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 7a5f611c3d2e3..cea51dca87531 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -652,6 +652,14 @@ static void blkdev_flush_mapping(struct block_device *bdev) bdev_write_inode(bdev); } +static void blkdev_put_whole(struct block_device *bdev) +{ + if (atomic_dec_and_test(&bdev->bd_openers)) + blkdev_flush_mapping(bdev); + if (bdev->bd_disk->fops->release) + bdev->bd_disk->fops->release(bdev->bd_disk); +} + static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) { struct gendisk *disk = bdev->bd_disk; @@ -670,20 +678,21 @@ static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) if (!atomic_read(&bdev->bd_openers)) set_init_blocksize(bdev); - if (test_bit(GD_NEED_PART_SCAN, &disk->state)) - bdev_disk_changed(disk, false); atomic_inc(&bdev->bd_openers); + if (test_bit(GD_NEED_PART_SCAN, &disk->state)) { + /* + * Only return scanning errors if we are called from contexts + * that explicitly want them, e.g. the BLKRRPART ioctl. + */ + ret = bdev_disk_changed(disk, false); + if (ret && (mode & BLK_OPEN_STRICT_SCAN)) { + blkdev_put_whole(bdev); + return ret; + } + } return 0; } -static void blkdev_put_whole(struct block_device *bdev) -{ - if (atomic_dec_and_test(&bdev->bd_openers)) - blkdev_flush_mapping(bdev); - if (bdev->bd_disk->fops->release) - bdev->bd_disk->fops->release(bdev->bd_disk); -} - static int blkdev_get_part(struct block_device *part, blk_mode_t mode) { struct gendisk *disk = part->bd_disk; diff --git a/block/ioctl.c b/block/ioctl.c index a9028a2c2db57..f505f9c341eb0 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -563,7 +563,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, return -EACCES; if (bdev_is_partition(bdev)) return -EINVAL; - return disk_scan_partitions(bdev->bd_disk, mode); + return disk_scan_partitions(bdev->bd_disk, + mode | BLK_OPEN_STRICT_SCAN); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c3e8f7cf96be9..d16320852c4ba 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -128,6 +128,8 @@ typedef unsigned int __bitwise blk_mode_t; #define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) /* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */ #define BLK_OPEN_RESTRICT_WRITES ((__force blk_mode_t)(1 << 5)) +/* return partition scanning errors */ +#define BLK_OPEN_STRICT_SCAN ((__force blk_mode_t)(1 << 6)) struct gendisk { /* -- GitLab From 56f78615bcb1c3ba58a5d9911bad3d9185cf141b Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Wed, 17 Apr 2024 10:55:13 +0200 Subject: [PATCH 979/989] net: usb: ax88179_178a: avoid writing the mac address before first reading After the commit d2689b6a86b9 ("net: usb: ax88179_178a: avoid two consecutive device resets"), reset operation, in which the default mac address from the device is read, is not executed from bind operation and the random address, that is pregenerated just in case, is direclty written the first time in the device, so the default one from the device is not even read. This writing is not dangerous because is volatile and the default mac address is not missed. In order to avoid this and keep the simplification to have only one reset and reduce the delays, restore the reset from bind operation and remove the reset that is commanded from open operation. The behavior is the same but everything is ready for usbnet_probe. Tested with ASIX AX88179 USB Gigabit Ethernet devices. Restore the old behavior for the rest of possible devices because I don't have the hardware to test. cc: stable@vger.kernel.org # 6.6+ Fixes: d2689b6a86b9 ("net: usb: ax88179_178a: avoid two consecutive device resets") Reported-by: Jarkko Palviainen Signed-off-by: Jose Ignacio Tornos Martinez Link: https://lore.kernel.org/r/20240417085524.219532-1-jtornosm@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/ax88179_178a.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index a9c418890a1ca..752f821a19901 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1317,6 +1317,8 @@ static int ax88179_bind(struct usbnet *dev, struct usb_interface *intf) netif_set_tso_max_size(dev->net, 16384); + ax88179_reset(dev); + return 0; } @@ -1695,7 +1697,6 @@ static const struct driver_info ax88179_info = { .unbind = ax88179_unbind, .status = ax88179_status, .link_reset = ax88179_link_reset, - .reset = ax88179_reset, .stop = ax88179_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX, .rx_fixup = ax88179_rx_fixup, @@ -1708,7 +1709,6 @@ static const struct driver_info ax88178a_info = { .unbind = ax88179_unbind, .status = ax88179_status, .link_reset = ax88179_link_reset, - .reset = ax88179_reset, .stop = ax88179_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX, .rx_fixup = ax88179_rx_fixup, -- GitLab From c24cd679b075b0e953ea167b0aa2b2d59e4eba7f Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Wed, 17 Apr 2024 15:24:25 +0530 Subject: [PATCH 980/989] net: ethernet: ti: am65-cpsw-nuss: cleanup DMA Channels before using them The TX and RX DMA Channels used by the driver to exchange data with CPSW are not guaranteed to be in a clean state during driver initialization. The Bootloader could have used the same DMA Channels without cleaning them up in the event of failure. Thus, reset and disable the DMA Channels to ensure that they are in a clean state before using them. Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Reported-by: Schuyler Patton Signed-off-by: Siddharth Vadapalli Reviewed-by: Roger Quadros Link: https://lore.kernel.org/r/20240417095425.2253876-1-s-vadapalli@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 2939a21ca74f3..1d00e21808c1c 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2793,6 +2793,8 @@ static void am65_cpsw_unregister_devlink(struct am65_cpsw_common *common) static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common) { + struct am65_cpsw_rx_chn *rx_chan = &common->rx_chns; + struct am65_cpsw_tx_chn *tx_chan = common->tx_chns; struct device *dev = common->dev; struct am65_cpsw_port *port; int ret = 0, i; @@ -2805,6 +2807,22 @@ static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common) if (ret) return ret; + /* The DMA Channels are not guaranteed to be in a clean state. + * Reset and disable them to ensure that they are back to the + * clean state and ready to be used. + */ + for (i = 0; i < common->tx_ch_num; i++) { + k3_udma_glue_reset_tx_chn(tx_chan[i].tx_chn, &tx_chan[i], + am65_cpsw_nuss_tx_cleanup); + k3_udma_glue_disable_tx_chn(tx_chan[i].tx_chn); + } + + for (i = 0; i < AM65_CPSW_MAX_RX_FLOWS; i++) + k3_udma_glue_reset_rx_chn(rx_chan->rx_chn, i, rx_chan, + am65_cpsw_nuss_rx_cleanup, !!i); + + k3_udma_glue_disable_rx_chn(rx_chan->rx_chn); + ret = am65_cpsw_nuss_register_devlink(common); if (ret) return ret; -- GitLab From 2b504e1620376052744ebee408a84394bdaef40a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 15 Apr 2024 09:54:14 +0200 Subject: [PATCH 981/989] arm64/head: Drop unnecessary pre-disable-MMU workaround The Falkor erratum that results in the need for an ISB before clearing the M bit in SCTLR_ELx only applies to execution at exception level x, and so the workaround is not needed when disabling the EL1 MMU while running at EL2. Signed-off-by: Ard Biesheuvel Acked-by: Marc Zyngier Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20240415075412.2347624-5-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/head.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 06234c3a15f3d..b8bbd72cb1944 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -323,13 +323,11 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) cbz x0, 2f /* Set a sane SCTLR_EL1, the VHE way */ - pre_disable_mmu_workaround msr_s SYS_SCTLR_EL12, x1 mov x2, #BOOT_CPU_FLAG_E2H b 3f 2: - pre_disable_mmu_workaround msr sctlr_el1, x1 mov x2, xzr 3: -- GitLab From 34e526cb7d46726b2ae5f83f2892d00ebb088509 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 15 Apr 2024 09:54:15 +0200 Subject: [PATCH 982/989] arm64/head: Disable MMU at EL2 before clearing HCR_EL2.E2H Even though the boot protocol stipulates otherwise, an exception has been made for the EFI stub, and entering the core kernel with the MMU enabled is permitted. This allows a substantial amount of cache maintenance to be elided, wich is significant when fast boot times are critical (e.g., for booting micro-VMs) Once the initial ID map has been populated, the MMU is disabled as part of the logic sequence that puts all system registers into a known state. Any code that needs to execute within the window where the MMU is off is cleaned to the PoC explicitly, which includes all of HYP text when entering at EL2. However, the current sequence of initializing the EL2 system registers is not safe: HCR_EL2 is set to its nVHE initial state before SCTLR_EL2 is reprogrammed, and this means that a VHE-to-nVHE switch may occur while the MMU is enabled. This switch causes some system registers as well as page table descriptors to be interpreted in a different way, potentially resulting in spurious exceptions relating to MMU translation. So disable the MMU explicitly first when entering in EL2 with the MMU and caches enabled. Fixes: 617861703830 ("efi: arm64: enter with MMU and caches enabled") Signed-off-by: Ard Biesheuvel Cc: # 6.3.x Acked-by: Mark Rutland Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240415075412.2347624-6-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/head.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index b8bbd72cb1944..cb68adcabe078 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -289,6 +289,11 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) adr_l x1, __hyp_text_end adr_l x2, dcache_clean_poc blr x2 + + mov_q x0, INIT_SCTLR_EL2_MMU_OFF + pre_disable_mmu_workaround + msr sctlr_el2, x0 + isb 0: mov_q x0, HCR_HOST_NVHE_FLAGS -- GitLab From 7ee5faad0f8c3ad86c8cfc2f6aac91d2ba29790f Mon Sep 17 00:00:00 2001 From: Ai Chao Date: Fri, 19 Apr 2024 16:21:59 +0800 Subject: [PATCH 983/989] ALSA: hda/realtek - Enable audio jacks of Haier Boyue G42 with ALC269VC The Haier Boyue G42 with ALC269VC cannot detect the MIC of headset, the line out and internal speaker until ALC269VC_FIXUP_ACER_VCOPPERBOX_PINS quirk applied. Signed-off-by: Ai Chao Cc: Message-ID: <20240419082159.476879-1-aichao@kylinos.cn> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index f714608474922..70d80b6af3fe3 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10497,6 +10497,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1d05, 0x115c, "TongFang GMxTGxx", ALC269_FIXUP_NO_SHUTUP), SND_PCI_QUIRK(0x1d05, 0x121b, "TongFang GMxAGxx", ALC269_FIXUP_NO_SHUTUP), SND_PCI_QUIRK(0x1d05, 0x1387, "TongFang GMxIXxx", ALC2XX_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x1d17, 0x3288, "Haier Boyue G42", ALC269VC_FIXUP_ACER_VCOPPERBOX_PINS), SND_PCI_QUIRK(0x1d72, 0x1602, "RedmiBook", ALC255_FIXUP_XIAOMI_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1701, "XiaomiNotebook Pro", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1d72, 0x1901, "RedmiBook 14", ALC256_FIXUP_ASUS_HEADSET_MIC), -- GitLab From f25f17dc5c6a5e3f2014d44635f0c0db45224efe Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 19 Apr 2024 12:04:39 +0200 Subject: [PATCH 984/989] ALSA: seq: ump: Fix conversion from MIDI2 to MIDI1 UMP messages The conversion from MIDI2 to MIDI1 UMP messages had a leftover artifact (superfluous bit shift), and this resulted in the bogus type check, leading to empty outputs. Let's fix it. Fixes: e9e02819a98a ("ALSA: seq: Automatic conversion of UMP events") Cc: Link: https://github.com/alsa-project/alsa-utils/issues/262 Message-ID: <20240419100442.14806-1-tiwai@suse.de> Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index b141024830ecc..ee6ac649df836 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -428,7 +428,7 @@ static int cvt_ump_midi2_to_midi1(struct snd_seq_client *dest, midi1->note.group = midi2->note.group; midi1->note.status = midi2->note.status; midi1->note.channel = midi2->note.channel; - switch (midi2->note.status << 4) { + switch (midi2->note.status) { case UMP_MSG_STATUS_NOTE_ON: case UMP_MSG_STATUS_NOTE_OFF: midi1->note.note = midi2->note.note; -- GitLab From b552f63cd43735048bbe9bfbb7a9dcfce166fbdd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Apr 2024 21:02:12 +0200 Subject: [PATCH 985/989] thermal/debugfs: Add missing count increment to thermal_debug_tz_trip_up() The count field in struct trip_stats, representing the number of times the zone temperature was above the trip point, needs to be incremented in thermal_debug_tz_trip_up(), for two reasons. First, if a trip point is crossed on the way up for the first time, thermal_debug_update_temp() called from update_temperature() does not see it because it has not been added to trips_crossed[] array in the thermal zone's struct tz_debugfs object yet. Therefore, when thermal_debug_tz_trip_up() is called after that, the trip point's count value is 0, and the attempt to divide by it during the average temperature computation leads to a divide error which causes the kernel to crash. Setting the count to 1 before the division by incrementing it fixes this problem. Second, if a trip point is crossed on the way up, but it has been crossed on the way up already before, its count value needs to be incremented to make a record of the fact that the zone temperature is above the trip now. Without doing that, if the mitigations applied after crossing the trip cause the zone temperature to drop below its threshold, the count will not be updated for this episode at all and the average temperature in the trip statistics record will be somewhat higher than it should be. Fixes: 7ef01f228c9f ("thermal/debugfs: Add thermal debugfs information for mitigation episodes") Cc :6.8+ # 6.8+ Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_debugfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/thermal/thermal_debugfs.c b/drivers/thermal/thermal_debugfs.c index c617e8b9f0ddf..d78d54ae2605e 100644 --- a/drivers/thermal/thermal_debugfs.c +++ b/drivers/thermal/thermal_debugfs.c @@ -616,6 +616,7 @@ void thermal_debug_tz_trip_up(struct thermal_zone_device *tz, tze->trip_stats[trip_id].timestamp = now; tze->trip_stats[trip_id].max = max(tze->trip_stats[trip_id].max, temperature); tze->trip_stats[trip_id].min = min(tze->trip_stats[trip_id].min, temperature); + tze->trip_stats[trip_id].count++; tze->trip_stats[trip_id].avg = tze->trip_stats[trip_id].avg + (temperature - tze->trip_stats[trip_id].avg) / tze->trip_stats[trip_id].count; -- GitLab From 01bc4fda9ea0a6b52f12326486f07a4910666cf6 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 19 Apr 2024 17:32:57 +0800 Subject: [PATCH 986/989] blk-iocost: do not WARN if iocg was already offlined In iocg_pay_debt(), warn is triggered if 'active_list' is empty, which is intended to confirm iocg is active when it has debt. However, warn can be triggered during a blkcg or disk removal, if iocg_waitq_timer_fn() is run at that time: WARNING: CPU: 0 PID: 2344971 at block/blk-iocost.c:1402 iocg_pay_debt+0x14c/0x190 Call trace: iocg_pay_debt+0x14c/0x190 iocg_kick_waitq+0x438/0x4c0 iocg_waitq_timer_fn+0xd8/0x130 __run_hrtimer+0x144/0x45c __hrtimer_run_queues+0x16c/0x244 hrtimer_interrupt+0x2cc/0x7b0 The warn in this situation is meaningless. Since this iocg is being removed, the state of the 'active_list' is irrelevant, and 'waitq_timer' is canceled after removing 'active_list' in ioc_pd_free(), which ensures iocg is freed after iocg_waitq_timer_fn() returns. Therefore, add the check if iocg was already offlined to avoid warn when removing a blkcg or disk. Signed-off-by: Li Nan Reviewed-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20240419093257.3004211-1-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index baa20c85799d5..690ca99dfaca6 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1439,8 +1439,11 @@ static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, lockdep_assert_held(&iocg->ioc->lock); lockdep_assert_held(&iocg->waitq.lock); - /* make sure that nobody messed with @iocg */ - WARN_ON_ONCE(list_empty(&iocg->active_list)); + /* + * make sure that nobody messed with @iocg. Check iocg->pd.online + * to avoid warn when removing blkcg or disk. + */ + WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online); WARN_ON_ONCE(iocg->inuse > 1); iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); -- GitLab From 50449ca66cc5a8cbc64749cf4b9f3d3fc5f4b457 Mon Sep 17 00:00:00 2001 From: Yaxiong Tian Date: Wed, 17 Apr 2024 10:52:48 +0800 Subject: [PATCH 987/989] arm64: hibernate: Fix level3 translation fault in swsusp_save() On arm64 machines, swsusp_save() faults if it attempts to access MEMBLOCK_NOMAP memory ranges. This can be reproduced in QEMU using UEFI when booting with rodata=off debug_pagealloc=off and CONFIG_KFENCE=n: Unable to handle kernel paging request at virtual address ffffff8000000000 Mem abort info: ESR = 0x0000000096000007 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x07: level 3 translation fault Data abort info: ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 swapper pgtable: 4k pages, 39-bit VAs, pgdp=00000000eeb0b000 [ffffff8000000000] pgd=180000217fff9803, p4d=180000217fff9803, pud=180000217fff9803, pmd=180000217fff8803, pte=0000000000000000 Internal error: Oops: 0000000096000007 [#1] SMP Internal error: Oops: 0000000096000007 [#1] SMP Modules linked in: xt_multiport ipt_REJECT nf_reject_ipv4 xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c iptable_filter bpfilter rfkill at803x snd_hda_codec_hdmi snd_hda_intel snd_intel_dspcfg dwmac_generic stmmac_platform snd_hda_codec stmmac joydev pcs_xpcs snd_hda_core phylink ppdev lp parport ramoops reed_solomon ip_tables x_tables nls_iso8859_1 vfat multipath linear amdgpu amdxcp drm_exec gpu_sched drm_buddy hid_generic usbhid hid radeon video drm_suballoc_helper drm_ttm_helper ttm i2c_algo_bit drm_display_helper cec drm_kms_helper drm CPU: 0 PID: 3663 Comm: systemd-sleep Not tainted 6.6.2+ #76 Source Version: 4e22ed63a0a48e7a7cff9b98b7806d8d4add7dc0 Hardware name: Greatwall GW-XXXXXX-XXX/GW-XXXXXX-XXX, BIOS KunLun BIOS V4.0 01/19/2021 pstate: 600003c5 (nZCv DAIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : swsusp_save+0x280/0x538 lr : swsusp_save+0x280/0x538 sp : ffffffa034a3fa40 x29: ffffffa034a3fa40 x28: ffffff8000001000 x27: 0000000000000000 x26: ffffff8001400000 x25: ffffffc08113e248 x24: 0000000000000000 x23: 0000000000080000 x22: ffffffc08113e280 x21: 00000000000c69f2 x20: ffffff8000000000 x19: ffffffc081ae2500 x18: 0000000000000000 x17: 6666662074736420 x16: 3030303030303030 x15: 3038666666666666 x14: 0000000000000b69 x13: ffffff9f89088530 x12: 00000000ffffffea x11: 00000000ffff7fff x10: 00000000ffff7fff x9 : ffffffc08193f0d0 x8 : 00000000000bffe8 x7 : c0000000ffff7fff x6 : 0000000000000001 x5 : ffffffa0fff09dc8 x4 : 0000000000000000 x3 : 0000000000000027 x2 : 0000000000000000 x1 : 0000000000000000 x0 : 000000000000004e Call trace: swsusp_save+0x280/0x538 swsusp_arch_suspend+0x148/0x190 hibernation_snapshot+0x240/0x39c hibernate+0xc4/0x378 state_store+0xf0/0x10c kobj_attr_store+0x14/0x24 The reason is swsusp_save() -> copy_data_pages() -> page_is_saveable() -> kernel_page_present() assuming that a page is always present when can_set_direct_map() is false (all of rodata_full, debug_pagealloc_enabled() and arm64_kfence_can_set_direct_map() false), irrespective of the MEMBLOCK_NOMAP ranges. Such MEMBLOCK_NOMAP regions should not be saved during hibernation. This problem was introduced by changes to the pfn_valid() logic in commit a7d9f306ba70 ("arm64: drop pfn_valid_within() and simplify pfn_valid()"). Similar to other architectures, drop the !can_set_direct_map() check in kernel_page_present() so that page_is_savable() skips such pages. Fixes: a7d9f306ba70 ("arm64: drop pfn_valid_within() and simplify pfn_valid()") Cc: # 5.14.x Suggested-by: Mike Rapoport Suggested-by: Catalin Marinas Co-developed-by: xiongxin Signed-off-by: xiongxin Signed-off-by: Yaxiong Tian Acked-by: Mike Rapoport (IBM) Link: https://lore.kernel.org/r/20240417025248.386622-1-tianyaxiong@kylinos.cn [catalin.marinas@arm.com: rework commit message] Signed-off-by: Catalin Marinas --- arch/arm64/mm/pageattr.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 0c4e3ecf989d4..0e270a1c51e64 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -219,9 +219,6 @@ bool kernel_page_present(struct page *page) pte_t *ptep; unsigned long addr = (unsigned long)page_address(page); - if (!can_set_direct_map()) - return true; - pgdp = pgd_offset_k(addr); if (pgd_none(READ_ONCE(*pgdp))) return false; -- GitLab From 366c5cec9ce473f68925d703a07cac56e1d16956 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sat, 20 Apr 2024 08:34:09 -0400 Subject: [PATCH 988/989] MAINTAINERS: update to working email address jejb@linux.ibm.com no longer works. Signed-off-by: James Bottomley --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index c23fda1aa1f09..e5121ff5e60ef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11995,7 +11995,7 @@ F: include/keys/encrypted-type.h F: security/keys/encrypted-keys/ KEYS-TRUSTED -M: James Bottomley +M: James Bottomley M: Jarkko Sakkinen M: Mimi Zohar L: linux-integrity@vger.kernel.org @@ -19669,7 +19669,7 @@ F: drivers/scsi/sg.c F: include/scsi/sg.h SCSI SUBSYSTEM -M: "James E.J. Bottomley" +M: "James E.J. Bottomley" M: "Martin K. Petersen" L: linux-scsi@vger.kernel.org S: Maintained -- GitLab From ed30a4a51bb196781c8058073ea720133a65596f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 21 Apr 2024 12:35:54 -0700 Subject: [PATCH 989/989] Linux 6.9-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 59d8a7f95d0a8..43b10f3d438cf 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 9 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- GitLab