diff --git a/.mailmap b/.mailmap index 08f28f2999f0dc5d64cb5d04a77c7c3eab78130a..bd9f1025ac44e0e289a6843de2c4497be2b76118 100644 --- a/.mailmap +++ b/.mailmap @@ -325,6 +325,7 @@ Kenneth W Chen Kenneth Westfield Kiran Gunda Kirill Tkhai +Kishon Vijay Abraham I Konstantin Khlebnikov Konstantin Khlebnikov Koushik @@ -609,6 +610,11 @@ TripleX Chung TripleX Chung Tsuneo Yoshioka Tudor Ambarus +Tvrtko Ursulin +Tvrtko Ursulin +Tvrtko Ursulin +Tvrtko Ursulin +Tvrtko Ursulin Tycho Andersen Tzung-Bi Shih Uwe Kleine-König diff --git a/CREDITS b/CREDITS index df8d6946739f68655a8b077f0ebcc4bf4612944b..3c2bb55847c607f027ddc6c50c259545327c0fe8 100644 --- a/CREDITS +++ b/CREDITS @@ -63,6 +63,11 @@ D: dosfs, LILO, some fd features, ATM, various other hacks here and there S: Buenos Aires S: Argentina +NTFS FILESYSTEM +N: Anton Altaparmakov +E: anton@tuxera.com +D: NTFS filesystem + N: Tim Alpaerts E: tim_alpaerts@toyota-motor-europe.com D: 802.2 class II logical link control layer, diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index 2d42998a89a6378a94521d49785c4f1632b25a34..3e6407de231c998ac7f0539c3d856458097d8971 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -68,7 +68,8 @@ over a rather long period of time, but improvements are always welcome! rcu_read_lock_sched(), or by the appropriate update-side lock. Explicit disabling of preemption (preempt_disable(), for example) can serve as rcu_read_lock_sched(), but is less readable and - prevents lockdep from detecting locking issues. + prevents lockdep from detecting locking issues. Acquiring a + spinlock also enters an RCU read-side critical section. Please note that you *cannot* rely on code known to be built only in non-preemptible kernels. Such code can and will break, @@ -382,16 +383,17 @@ over a rather long period of time, but improvements are always welcome! 
must use whatever locking or other synchronization is required to safely access and/or modify that data structure. - Do not assume that RCU callbacks will be executed on the same - CPU that executed the corresponding call_rcu() or call_srcu(). - For example, if a given CPU goes offline while having an RCU - callback pending, then that RCU callback will execute on some - surviving CPU. (If this was not the case, a self-spawning RCU - callback would prevent the victim CPU from ever going offline.) - Furthermore, CPUs designated by rcu_nocbs= might well *always* - have their RCU callbacks executed on some other CPUs, in fact, - for some real-time workloads, this is the whole point of using - the rcu_nocbs= kernel boot parameter. + Do not assume that RCU callbacks will be executed on + the same CPU that executed the corresponding call_rcu(), + call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(), or + call_rcu_tasks_trace(). For example, if a given CPU goes offline + while having an RCU callback pending, then that RCU callback + will execute on some surviving CPU. (If this was not the case, + a self-spawning RCU callback would prevent the victim CPU from + ever going offline.) Furthermore, CPUs designated by rcu_nocbs= + might well *always* have their RCU callbacks executed on some + other CPUs, in fact, for some real-time workloads, this is the + whole point of using the rcu_nocbs= kernel boot parameter. In addition, do not assume that callbacks queued in a given order will be invoked in that order, even if they all are queued on the @@ -444,7 +446,7 @@ over a rather long period of time, but improvements are always welcome! real-time workloads than is synchronize_rcu_expedited(). It is also permissible to sleep in RCU Tasks Trace read-side - critical, which are delimited by rcu_read_lock_trace() and + critical sections, which are delimited by rcu_read_lock_trace() and rcu_read_unlock_trace(). 
However, this is a specialized flavor of RCU, and you should not use it without first checking with its current users. In most cases, you should instead use SRCU. @@ -490,6 +492,12 @@ over a rather long period of time, but improvements are always welcome! since the last time that you passed that same object to call_rcu() (or friends). + CONFIG_RCU_STRICT_GRACE_PERIOD: + combine with KASAN to check for pointers leaked out + of RCU read-side critical sections. This Kconfig + option is tough on both performance and scalability, + and so is limited to four-CPU systems. + __rcu sparse checks: tag the pointer to the RCU-protected data structure with __rcu, and sparse will warn you if you access that diff --git a/Documentation/RCU/rcu_dereference.rst b/Documentation/RCU/rcu_dereference.rst index 659d5913784d0d9e2d196a04bdfb7fadb57d5eed..2524dcdadde2b801b33a4ce0a93f31948ea7aefb 100644 --- a/Documentation/RCU/rcu_dereference.rst +++ b/Documentation/RCU/rcu_dereference.rst @@ -408,7 +408,10 @@ member of the rcu_dereference() to use in various situations: RCU flavors, an RCU read-side critical section is entered using rcu_read_lock(), anything that disables bottom halves, anything that disables interrupts, or anything that disables - preemption. + preemption. Please note that spinlock critical sections + are also implied RCU read-side critical sections, even when + they are preemptible, as they are in kernels built with + CONFIG_PREEMPT_RT=y. 2. If the access might be within an RCU read-side critical section on the one hand, or protected by (say) my_lock on the other, diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index 60ce02475142d881b0238f67eee5b9306830745e..872ac665223fbd51f7e06fd6fcf9eddd0de5a65f 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -172,14 +172,25 @@ rcu_read_lock() critical section. Reference counts may be used in conjunction with RCU to maintain longer-term references to data structures. 
+ Note that anything that disables bottom halves, preemption, + or interrupts also enters an RCU read-side critical section. + Acquiring a spinlock also enters an RCU read-side critical + section, even for spinlocks that do not disable preemption, + as is the case in kernels built with CONFIG_PREEMPT_RT=y. + Sleeplocks do *not* enter RCU read-side critical sections. + rcu_read_unlock() ^^^^^^^^^^^^^^^^^ void rcu_read_unlock(void); This temporal primitives is used by a reader to inform the reclaimer that the reader is exiting an RCU read-side critical - section. Note that RCU read-side critical sections may be nested - and/or overlapping. + section. Anything that enables bottom halves, preemption, + or interrupts also exits an RCU read-side critical section. + Releasing a spinlock also exits an RCU read-side critical section. + + Note that RCU read-side critical sections may be nested and/or + overlapping. synchronize_rcu() ^^^^^^^^^^^^^^^^^ @@ -952,8 +963,8 @@ unfortunately any spinlock in a ``SLAB_TYPESAFE_BY_RCU`` object must be initialized after each and every call to kmem_cache_alloc(), which renders reference-free spinlock acquisition completely unsafe. Therefore, when using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter. -(Those willing to use a kmem_cache constructor may also use locking, -including cache-friendly sequence locking.) +(Those willing to initialize their locks in a kmem_cache constructor +may also use locking, including cache-friendly sequence locking.) With traditional reference counting -- such as that implemented by the kref library in Linux -- there is typically code that runs when the last diff --git a/Documentation/admin-guide/RAS/address-translation.rst b/Documentation/admin-guide/RAS/address-translation.rst new file mode 100644 index 0000000000000000000000000000000000000000..f0ca17b43cd3de7699349104b0219b3b883f1117 --- /dev/null +++ b/Documentation/admin-guide/RAS/address-translation.rst @@ -0,0 +1,24 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +Address translation +=================== + +x86 AMD +------- + +Zen-based AMD systems include a Data Fabric that manages the layout of +physical memory. Devices attached to the Fabric, like memory controllers, +I/O, etc., may not have a complete view of the system physical memory map. +These devices may provide a "normalized", i.e. device physical, address +when reporting memory errors. Normalized addresses must be translated to +a system physical address for the kernel to act on the memory. + +AMD Address Translation Library (CONFIG_AMD_ATL) provides translation for +this case. + +Glossary of acronyms used in address translation for Zen-based systems + +* CCM = Cache Coherent Moderator +* COD = Cluster-on-Die +* COH_ST = Coherent Station +* DF = Data Fabric diff --git a/Documentation/RAS/ras.rst b/Documentation/admin-guide/RAS/error-decoding.rst similarity index 73% rename from Documentation/RAS/ras.rst rename to Documentation/admin-guide/RAS/error-decoding.rst index 2556b397cd271fc536daf727a9f7ea178fac6e02..26a72f3fe5de83230db5c51403fc8c462649b490 100644 --- a/Documentation/RAS/ras.rst +++ b/Documentation/admin-guide/RAS/error-decoding.rst @@ -1,15 +1,10 @@ .. SPDX-License-Identifier: GPL-2.0 -Reliability, Availability and Serviceability features -===================================================== - -This documents different aspects of the RAS functionality present in the -kernel. - Error decoding ---------------- +============== -* x86 +x86 +--- Error decoding on AMD systems should be done using the rasdaemon tool: https://github.com/mchehab/rasdaemon/ diff --git a/Documentation/admin-guide/RAS/index.rst b/Documentation/admin-guide/RAS/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..f4087040a7c054ae06f0cdd436101740420839dd --- /dev/null +++ b/Documentation/admin-guide/RAS/index.rst @@ -0,0 +1,7 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. 
toctree:: + :maxdepth: 2 + + main + error-decoding + address-translation diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/RAS/main.rst similarity index 99% rename from Documentation/admin-guide/ras.rst rename to Documentation/admin-guide/RAS/main.rst index 8e03751d126d01f6bed53b5f694f2c0a95c59893..7ac1d4ccc5099391470fc41c690187b6f3c820b5 100644 --- a/Documentation/admin-guide/ras.rst +++ b/Documentation/admin-guide/RAS/main.rst @@ -1,8 +1,12 @@ +.. SPDX-License-Identifier: GPL-2.0 .. include:: <isonum.txt> -============================================ -Reliability, Availability and Serviceability -============================================ +================================================== +Reliability, Availability and Serviceability (RAS) +================================================== + +This documents different aspects of the RAS functionality present in the +kernel. RAS concepts ************ diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst index ae646d621a8ab6d208585bf0fdcd53e583ff76b7..7d3415eea05d027572241bd8150762e159577ed8 100644 --- a/Documentation/admin-guide/cgroup-v1/cpusets.rst +++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst @@ -179,7 +179,7 @@ files describing that cpuset: - cpuset.mem_hardwall flag: is memory allocation hardwalled - cpuset.memory_pressure: measure of how much paging pressure in cpuset - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes - - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes + - cpuset.memory_spread_slab flag: OBSOLETE. Doesn't have any function. 
- cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset - cpuset.sched_relax_domain_level: the searching range when migrating tasks diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst index 0fa724d82abb60821fb3bce63e6ec859d0b629d2..493a8e386700ae3c19035159eab62ee7be48a954 100644 --- a/Documentation/admin-guide/cgroup-v1/hugetlb.rst +++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst @@ -65,10 +65,12 @@ files include:: 1. Page fault accounting -hugetlb.<hugepagesize>.limit_in_bytes -hugetlb.<hugepagesize>.max_usage_in_bytes -hugetlb.<hugepagesize>.usage_in_bytes -hugetlb.<hugepagesize>.failcnt +:: + + hugetlb.<hugepagesize>.limit_in_bytes + hugetlb.<hugepagesize>.max_usage_in_bytes + hugetlb.<hugepagesize>.usage_in_bytes + hugetlb.<hugepagesize>.failcnt The HugeTLB controller allows users to limit the HugeTLB usage (page fault) per control group and enforces the limit during page fault. Since HugeTLB @@ -82,10 +84,12 @@ getting SIGBUS. 2. Reservation accounting -hugetlb.<hugepagesize>.rsvd.limit_in_bytes -hugetlb.<hugepagesize>.rsvd.max_usage_in_bytes -hugetlb.<hugepagesize>.rsvd.usage_in_bytes -hugetlb.<hugepagesize>.rsvd.failcnt +:: + + hugetlb.<hugepagesize>.rsvd.limit_in_bytes + hugetlb.<hugepagesize>.rsvd.max_usage_in_bytes + hugetlb.<hugepagesize>.rsvd.usage_in_bytes + hugetlb.<hugepagesize>.rsvd.failcnt The HugeTLB controller allows to limit the HugeTLB reservations per control group and enforces the controller limit at reservation time and at the fault of diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 32a8893e5617764e1a9ca1ae4e106150decf2952..cce768afec6bed11a961643dcdc2d1ae97848684 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -473,8 +473,8 @@ Spectre variant 2 -mindirect-branch=thunk-extern -mindirect-branch-register options. If the kernel is compiled with a Clang compiler, the compiler needs to support -mretpoline-external-thunk option. The kernel config - CONFIG_RETPOLINE needs to be turned on, and the CPU needs to run with - the latest updated microcode. 
+ CONFIG_MITIGATION_RETPOLINE needs to be turned on, and the CPU needs + to run with the latest updated microcode. On Intel Skylake-era systems the mitigation covers most, but not all, cases. See :ref:`[3] ` for more details. @@ -609,8 +609,8 @@ kernel command line. Selecting 'on' will, and 'auto' may, choose a mitigation method at run time according to the CPU, the available microcode, the setting of the - CONFIG_RETPOLINE configuration option, and the - compiler with which the kernel was built. + CONFIG_MITIGATION_RETPOLINE configuration option, + and the compiler with which the kernel was built. Selecting 'on' will also enable the mitigation against user space to user space task attacks. diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index fb40a1f6f79e18d85ddb1ff254c9a04fed9ade2b..dfc06fab94322581e732405f3d722da3bba529bb 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -122,7 +122,7 @@ configure specific aspects of kernel behavior to your liking. pmf pnp rapidio - ras + RAS/index rtc serial-console svga diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index 5762e7477a0c8edb28d59e0a2176a23fb627016b..0302a93b1d40b7b0e6265306e580bfe32c9afd1b 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -191,9 +191,7 @@ Dump-capture kernel config options (Arch Dependent, i386 and x86_64) CPU is enough for kdump kernel to dump vmcore on most of systems. However, you can also specify nr_cpus=X to enable multiple processors - in kdump kernel. In this case, "disable_cpu_apicid=" is needed to - tell kdump kernel which cpu is 1st kernel's BSP. Please refer to - admin-guide/kernel-parameters.txt for more details. + in kdump kernel. With CONFIG_SMP=n, the above things are not related. 
@@ -454,8 +452,7 @@ Notes on loading the dump-capture kernel: to use multi-thread programs with it, such as parallel dump feature of makedumpfile. Otherwise, the multi-thread program may have a great performance degradation. To enable multi-cpu support, you should bring up an - SMP dump-capture kernel and specify maxcpus/nr_cpus, disable_cpu_apicid=[X] - options while loading it. + SMP dump-capture kernel and specify maxcpus/nr_cpus options while loading it. * For s390x there are two kdump modes: If a ELF header is specified with the elfcorehdr= kernel parameter, it is used by the kdump kernel as it diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index 4410384596a90b0ab26b4cf43bac54aaf78193fe..e8bdf5e86a9ba15b9d52858e66b7478307b6660f 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -108,6 +108,7 @@ is applicable:: CMA Contiguous Memory Area support is enabled. DRM Direct Rendering Management support is enabled. DYNAMIC_DEBUG Build in debug messages and enable them at runtime + EARLY Parameter processed too early to be embedded in initrd. EDD BIOS Enhanced Disk Drive Services (EDD) is enabled EFI EFI Partitioning (GPT) is enabled EVM Extended Verification Module diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2cb70a384af8a890b26aa21efcaef5992669d24c..77c3d1a7f116f36284c4d27c5462f3c13abf85d2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -9,7 +9,7 @@ accept_memory=eager can be used to accept all memory at once during boot. 
- acpi= [HW,ACPI,X86,ARM64,RISCV64] + acpi= [HW,ACPI,X86,ARM64,RISCV64,EARLY] Advanced Configuration and Power Interface Format: { force | on | off | strict | noirq | rsdt | copy_dsdt } @@ -26,7 +26,7 @@ See also Documentation/power/runtime_pm.rst, pci=noacpi - acpi_apic_instance= [ACPI, IOAPIC] + acpi_apic_instance= [ACPI,IOAPIC,EARLY] Format: 2: use 2nd APIC table, if available 1,0: use 1st APIC table @@ -41,7 +41,7 @@ If set to native, use the device's native backlight mode. If set to none, disable the ACPI backlight interface. - acpi_force_32bit_fadt_addr + acpi_force_32bit_fadt_addr [ACPI,EARLY] force FADT to use 32 bit addresses rather than the 64 bit X_* addresses. Some firmware have broken 64 bit addresses for force ACPI ignore these and use @@ -97,7 +97,7 @@ no: ACPI OperationRegions are not marked as reserved, no further checks are performed. - acpi_force_table_verification [HW,ACPI] + acpi_force_table_verification [HW,ACPI,EARLY] Enable table checksum verification during early stage. By default, this is disabled due to x86 early mapping size limitation. @@ -137,7 +137,7 @@ acpi_no_memhotplug [ACPI] Disable memory hotplug. Useful for kdump kernels. - acpi_no_static_ssdt [HW,ACPI] + acpi_no_static_ssdt [HW,ACPI,EARLY] Disable installation of static SSDTs at early boot time By default, SSDTs contained in the RSDT/XSDT will be installed automatically and they will appear under @@ -151,7 +151,7 @@ Ignore the ACPI-based watchdog interface (WDAT) and let a native driver control the watchdog device instead. - acpi_rsdp= [ACPI,EFI,KEXEC] + acpi_rsdp= [ACPI,EFI,KEXEC,EARLY] Pass the RSDP address to the kernel, mostly used on machines running EFI runtime service to boot the second kernel for kdump. @@ -228,10 +228,10 @@ to assume that this machine's pmtimer latches its value and always returns good values. 
- acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode + acpi_sci= [HW,ACPI,EARLY] ACPI System Control Interrupt trigger mode Format: { level | edge | high | low } - acpi_skip_timer_override [HW,ACPI] + acpi_skip_timer_override [HW,ACPI,EARLY] Recognize and ignore IRQ0/pin2 Interrupt Override. For broken nForce2 BIOS resulting in XT-PIC timer. @@ -266,11 +266,11 @@ behave incorrectly in some ways with respect to system suspend and resume to be ignored (use wisely). - acpi_use_timer_override [HW,ACPI] + acpi_use_timer_override [HW,ACPI,EARLY] Use timer override. For some broken Nvidia NF5 boards that require a timer override, but don't have HPET - add_efi_memmap [EFI; X86] Include EFI memory map in + add_efi_memmap [EFI,X86,EARLY] Include EFI memory map in kernel's map of available physical RAM. agp= [AGP] @@ -307,7 +307,7 @@ do not want to use tracing_snapshot_alloc() as it needs to be done where GFP_KERNEL allocations are allowed. - allow_mismatched_32bit_el0 [ARM64] + allow_mismatched_32bit_el0 [ARM64,EARLY] Allow execve() of 32-bit applications and setting of the PER_LINUX32 personality on systems where only a strict subset of the CPUs support 32-bit EL0. When this @@ -351,7 +351,7 @@ This mode requires kvm-amd.avic=1. (Default when IOMMU HW support is present.) - amd_pstate= [X86] + amd_pstate= [X86,EARLY] disable Do not enable amd_pstate as the default scaling driver for the supported processors @@ -391,7 +391,7 @@ not play well with APC CPU idle - disable it if you have APC and your system crashes randomly. 
- apic= [APIC,X86] Advanced Programmable Interrupt Controller + apic= [APIC,X86,EARLY] Advanced Programmable Interrupt Controller Change the output verbosity while booting Format: { quiet (default) | verbose | debug } Change the amount of debugging information output @@ -401,7 +401,7 @@ Format: apic=driver_name Examples: apic=bigsmp - apic_extnmi= [APIC,X86] External NMI delivery setting + apic_extnmi= [APIC,X86,EARLY] External NMI delivery setting Format: { bsp (default) | all | none } bsp: External NMI is delivered only to CPU 0 all: External NMIs are broadcast to all CPUs as a @@ -508,21 +508,22 @@ bert_disable [ACPI] Disable BERT OS support on buggy BIOSes. - bgrt_disable [ACPI][X86] + bgrt_disable [ACPI,X86,EARLY] Disable BGRT to avoid flickering OEM logo. blkdevparts= Manual partition parsing of block device(s) for embedded devices based on command line input. See Documentation/block/cmdline-partition.rst - boot_delay= Milliseconds to delay each printk during boot. + boot_delay= [KNL,EARLY] + Milliseconds to delay each printk during boot. Only works if CONFIG_BOOT_PRINTK_DELAY is enabled, and you may also have to specify "lpj=". Boot_delay values larger than 10 seconds (10000) are assumed erroneous and ignored. Format: integer - bootconfig [KNL] + bootconfig [KNL,EARLY] Extended command line options can be added to an initrd and this will cause the kernel to look for it. @@ -557,7 +558,7 @@ trust validation. format: { id: | builtin } - cca= [MIPS] Override the kernel pages' cache coherency + cca= [MIPS,EARLY] Override the kernel pages' cache coherency algorithm. Accepted values range from 0 to 7 inclusive. 
See arch/mips/include/asm/pgtable-bits.h for platform specific values (SB1, Loongson3 and @@ -672,19 +673,13 @@ [X86-64] hpet,tsc clocksource.arm_arch_timer.evtstrm= - [ARM,ARM64] + [ARM,ARM64,EARLY] Format: Enable/disable the eventstream feature of the ARM architected timer so that code using WFE-based polling loops can be debugged more effectively on production systems. - clocksource.max_cswd_read_retries= [KNL] - Number of clocksource_watchdog() retries due to - external delays before the clock will be marked - unstable. Defaults to two retries, that is, - three attempts to read the clock under test. - clocksource.verify_n_cpus= [KNL] Limit the number of CPUs checked for clocksources marked with CLOCK_SOURCE_VERIFY_PERCPU that @@ -702,7 +697,7 @@ 10 seconds when built into the kernel. cma=nn[MG]@[start[MG][-end[MG]]] - [KNL,CMA] + [KNL,CMA,EARLY] Sets the size of kernel global memory area for contiguous memory allocations and optionally the placement constraint by the physical address range of @@ -711,7 +706,7 @@ kernel/dma/contiguous.c cma_pernuma=nn[MG] - [KNL,CMA] + [KNL,CMA,EARLY] Sets the size of kernel per-numa memory area for contiguous memory allocations. A value of 0 disables per-numa CMA altogether. And If this option is not @@ -722,7 +717,7 @@ they will fallback to the global default memory area. numa_cma=:nn[MG][,:nn[MG]] - [KNL,CMA] + [KNL,CMA,EARLY] Sets the size of kernel numa memory area for contiguous memory allocations. It will reserve CMA area for the specified node. @@ -739,7 +734,7 @@ a hypervisor. Default: yes - coherent_pool=nn[KMG] [ARM,KNL] + coherent_pool=nn[KMG] [ARM,KNL,EARLY] Sets the size of memory pool for coherent, atomic dma allocations, by default set to 256K. @@ -757,7 +752,7 @@ condev= [HW,S390] console device conmode= - con3215_drop= [S390] 3215 console drop mode. + con3215_drop= [S390,EARLY] 3215 console drop mode. Format: y|n|Y|N|1|0 When set to true, drop data on the 3215 console when the console buffer is full. 
In this case the @@ -863,7 +858,7 @@ kernel before the cpufreq driver probes. cpu_init_udelay=N - [X86] Delay for N microsec between assert and de-assert + [X86,EARLY] Delay for N microsec between assert and de-assert of APIC INIT to start processors. This delay occurs on every CPU online, such as boot, and resume from suspend. Default: 10000 @@ -883,7 +878,7 @@ kernel more unstable. crashkernel=size[KMG][@offset[KMG]] - [KNL] Using kexec, Linux can switch to a 'crash kernel' + [KNL,EARLY] Using kexec, Linux can switch to a 'crash kernel' upon panic. This parameter reserves the physical memory region [offset, offset + size] for that kernel image. If '@offset' is omitted, then a suitable offset @@ -954,10 +949,10 @@ Format: , See also Documentation/input/devices/joystick-parport.rst - debug [KNL] Enable kernel debugging (events log level). + debug [KNL,EARLY] Enable kernel debugging (events log level). debug_boot_weak_hash - [KNL] Enable printing [hashed] pointers early in the + [KNL,EARLY] Enable printing [hashed] pointers early in the boot sequence. If enabled, we use a weak hash instead of siphash to hash pointers. Use this option if you are seeing instances of '(___ptrval___)') and need to see a @@ -974,10 +969,10 @@ will print _a_lot_ more information - normally only useful to lockdep developers. - debug_objects [KNL] Enable object debugging + debug_objects [KNL,EARLY] Enable object debugging debug_guardpage_minorder= - [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this + [KNL,EARLY] When CONFIG_DEBUG_PAGEALLOC is set, this parameter allows control of the order of pages that will be intentionally kept free (and hence protected) by the buddy allocator. Bigger value increase the probability @@ -996,7 +991,7 @@ help tracking down these problems. debug_pagealloc= - [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter + [KNL,EARLY] When CONFIG_DEBUG_PAGEALLOC is set, this parameter enables the feature at boot time. 
By default, it is disabled and the system will work mostly the same as a kernel built without CONFIG_DEBUG_PAGEALLOC. @@ -1004,8 +999,8 @@ useful to also enable the page_owner functionality. on: enable the feature - debugfs= [KNL] This parameter enables what is exposed to userspace - and debugfs internal clients. + debugfs= [KNL,EARLY] This parameter enables what is exposed to + userspace and debugfs internal clients. Format: { on, no-mount, off } on: All functions are enabled. no-mount: @@ -1084,7 +1079,7 @@ dhash_entries= [KNL] Set number of hash buckets for dentry cache. - disable_1tb_segments [PPC] + disable_1tb_segments [PPC,EARLY] Disables the use of 1TB hash page table segments. This causes the kernel to fall back to 256MB segments which can be useful when debugging issues that require an SLB @@ -1093,41 +1088,32 @@ disable= [IPV6] See Documentation/networking/ipv6.rst. - disable_radix [PPC] + disable_radix [PPC,EARLY] Disable RADIX MMU mode on POWER9 disable_tlbie [PPC] Disable TLBIE instruction. Currently does not work with KVM, with HASH MMU, or with coherent accelerators. - disable_cpu_apicid= [X86,APIC,SMP] - Format: - The number of initial APIC ID for the - corresponding CPU to be disabled at boot, - mostly used for the kdump 2nd kernel to - disable BSP to wake up multiple CPUs without - causing system reset or hang due to sending - INIT from AP to BSP. - - disable_ddw [PPC/PSERIES] + disable_ddw [PPC/PSERIES,EARLY] Disable Dynamic DMA Window support. Use this to workaround buggy firmware. disable_ipv6= [IPV6] See Documentation/networking/ipv6.rst. - disable_mtrr_cleanup [X86] + disable_mtrr_cleanup [X86,EARLY] The kernel tries to adjust MTRR layout from continuous to discrete, to make X server driver able to add WB entry later. This parameter disables that. 
- disable_mtrr_trim [X86, Intel and AMD only] + disable_mtrr_trim [X86, Intel and AMD only,EARLY] By default the kernel will trim any uncacheable memory out of your available memory pool based on MTRR settings. This parameter disables that behavior, possibly causing your machine to run very slowly. - disable_timer_pin_1 [X86] + disable_timer_pin_1 [X86,EARLY] Disable PIN 1 of APIC timer Can be useful to work around chipset bugs. @@ -1177,7 +1163,7 @@ dscc4.setup= [NET] - dt_cpu_ftrs= [PPC] + dt_cpu_ftrs= [PPC,EARLY] Format: {"off" | "known"} Control how the dt_cpu_ftrs device-tree binding is used for CPU feature discovery and setup (if it @@ -1197,12 +1183,12 @@ Documentation/admin-guide/dynamic-debug-howto.rst for details. - early_ioremap_debug [KNL] + early_ioremap_debug [KNL,EARLY] Enable debug messages in early_ioremap support. This is useful for tracking down temporary early mappings which are not unmapped. - earlycon= [KNL] Output early console device and options. + earlycon= [KNL,EARLY] Output early console device and options. When used with no options, the early console is determined by stdout-path property in device tree's @@ -1338,7 +1324,7 @@ address must be provided, and the serial port must already be setup and configured. - earlyprintk= [X86,SH,ARM,M68k,S390] + earlyprintk= [X86,SH,ARM,M68k,S390,UM,EARLY] earlyprintk=vga earlyprintk=sclp earlyprintk=xen @@ -1396,7 +1382,7 @@ edd= [EDD] Format: {"off" | "on" | "skip[mbr]"} - efi= [EFI] + efi= [EFI,EARLY] Format: { "debug", "disable_early_pci_dma", "nochunk", "noruntime", "nosoftreserve", "novamap", "no_disable_early_pci_dma" } @@ -1417,13 +1403,13 @@ no_disable_early_pci_dma: Leave the busmaster bit set on all PCI bridges while in the EFI boot stub - efi_no_storage_paranoia [EFI; X86] + efi_no_storage_paranoia [EFI,X86,EARLY] Using this parameter you can use more than 50% of your efi variable storage. 
Use this parameter only if you are really sure that your UEFI does sane gc and fulfills the spec otherwise your board may brick. - efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86] + efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI,X86,EARLY] Add arbitrary attribute to specific memory range by updating original EFI memory map. Region of memory which aa attribute is added to is @@ -1454,7 +1440,7 @@ eisa_irq_edge= [PARISC,HW] See header of drivers/parisc/eisa.c. - ekgdboc= [X86,KGDB] Allow early kernel console debugging + ekgdboc= [X86,KGDB,EARLY] Allow early kernel console debugging Format: ekgdboc=kbd This is designed to be used in conjunction with @@ -1469,13 +1455,13 @@ See comment before function elanfreq_setup() in arch/x86/kernel/cpu/cpufreq/elanfreq.c. - elfcorehdr=[size[KMG]@]offset[KMG] [PPC,SH,X86,S390] + elfcorehdr=[size[KMG]@]offset[KMG] [PPC,SH,X86,S390,EARLY] Specifies physical address of start of kernel core image elf header and optionally the size. Generally kexec loader will pass this option to capture kernel. See Documentation/admin-guide/kdump/kdump.rst for details. - enable_mtrr_cleanup [X86] + enable_mtrr_cleanup [X86,EARLY] The kernel tries to adjust MTRR layout from continuous to discrete, to make X server driver able to add WB entry later. This parameter enables that. @@ -1508,7 +1494,7 @@ Permit 'security.evm' to be updated regardless of current integrity status. - early_page_ext [KNL] Enforces page_ext initialization to earlier + early_page_ext [KNL,EARLY] Enforces page_ext initialization to earlier stages so cover more early boot allocations. Please note that as side effect some optimizations might be disabled to achieve that (e.g. parallelized @@ -1539,6 +1525,12 @@ Warning: use of this parameter will taint the kernel and may cause unknown problems. + fred= [X86-64] + Enable/disable Flexible Return and Event Delivery. + Format: { on | off } + on: enable FRED when it's present. 
+ off: disable FRED, the default setting. + ftrace=[tracer] [FTRACE] will set and start the specified tracer as early as possible in order to facilitate early @@ -1600,7 +1592,7 @@ can be changed at run time by the max_graph_depth file in the tracefs tracing directory. default: 0 (no limit) - fw_devlink= [KNL] Create device links between consumer and supplier + fw_devlink= [KNL,EARLY] Create device links between consumer and supplier devices by scanning the firmware to infer the consumer/supplier relationships. This feature is especially useful when drivers are loaded as modules as @@ -1619,12 +1611,12 @@ rpm -- Like "on", but also use to order runtime PM. fw_devlink.strict= - [KNL] Treat all inferred dependencies as mandatory + [KNL,EARLY] Treat all inferred dependencies as mandatory dependencies. This only applies for fw_devlink=on|rpm. Format: fw_devlink.sync_state = - [KNL] When all devices that could probe have finished + [KNL,EARLY] When all devices that could probe have finished probing, this parameter controls what to do with devices that haven't yet received their sync_state() calls. @@ -1645,12 +1637,12 @@ gamma= [HW,DRM] - gart_fix_e820= [X86-64] disable the fix e820 for K8 GART + gart_fix_e820= [X86-64,EARLY] disable the fix e820 for K8 GART Format: off | on default: on gather_data_sampling= - [X86,INTEL] Control the Gather Data Sampling (GDS) + [X86,INTEL,EARLY] Control the Gather Data Sampling (GDS) mitigation. Gather Data Sampling is a hardware vulnerability which @@ -1748,7 +1740,7 @@ (that will set all pages holding image data during restoration read-only). - highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact + highmem=nn[KMG] [KNL,BOOT,EARLY] forces the highmem zone to have an exact size of . This works even on boxes that have no highmem otherwise. This also works to reduce highmem size on bigger boxes. @@ -1759,7 +1751,7 @@ hlt [BUGS=ARM,SH] - hostname= [KNL] Set the hostname (aka UTS nodename). 
+ hostname= [KNL,EARLY] Set the hostname (aka UTS nodename). Format: This allows setting the system's hostname during early startup. This sets the name returned by gethostname. @@ -1804,7 +1796,7 @@ Documentation/admin-guide/mm/hugetlbpage.rst. Format: size[KMG] - hugetlb_cma= [HW,CMA] The size of a CMA area used for allocation + hugetlb_cma= [HW,CMA,EARLY] The size of a CMA area used for allocation of gigantic hugepages. Or using node format, the size of a CMA area per node can be specified. Format: nn[KMGTPE] or (node format) @@ -1850,9 +1842,10 @@ If specified, z/VM IUCV HVC accepts connections from listed z/VM user IDs only. - hv_nopvspin [X86,HYPER_V] Disables the paravirt spinlock optimizations - which allow the hypervisor to 'idle' the - guest on lock contention. + hv_nopvspin [X86,HYPER_V,EARLY] + Disables the paravirt spinlock optimizations + which allow the hypervisor to 'idle' the guest + on lock contention. i2c_bus= [HW] Override the default board specific I2C bus speed or register an additional I2C bus that is not @@ -1917,7 +1910,7 @@ Format: [,[,[,]]] - idle= [X86] + idle= [X86,EARLY] Format: idle=poll, idle=halt, idle=nomwait Poll forces a polling idle loop that can slightly improve the performance of waking up a idle CPU, but @@ -1973,7 +1966,7 @@ mode generally follows that for the NaN encoding, except where unsupported by hardware. - ignore_loglevel [KNL] + ignore_loglevel [KNL,EARLY] Ignore loglevel setting - this will print /all/ kernel messages to the console. Useful for debugging. We also add it as printk module parameter, so users @@ -2091,21 +2084,21 @@ unpacking being completed before device_ and late_ initcalls. - initrd= [BOOT] Specify the location of the initial ramdisk + initrd= [BOOT,EARLY] Specify the location of the initial ramdisk - initrdmem= [KNL] Specify a physical address and size from which to + initrdmem= [KNL,EARLY] Specify a physical address and size from which to load the initrd. 
If an initrd is compiled in or specified in the bootparams, it takes priority over this setting. Format: ss[KMG],nn[KMG] Default is 0, 0 - init_on_alloc= [MM] Fill newly allocated pages and heap objects with + init_on_alloc= [MM,EARLY] Fill newly allocated pages and heap objects with zeroes. Format: 0 | 1 Default set by CONFIG_INIT_ON_ALLOC_DEFAULT_ON. - init_on_free= [MM] Fill freed pages and heap objects with zeroes. + init_on_free= [MM,EARLY] Fill freed pages and heap objects with zeroes. Format: 0 | 1 Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON. @@ -2161,7 +2154,7 @@ 0 disables intel_idle and fall back on acpi_idle. 1 to 9 specify maximum depth of C-state. - intel_pstate= [X86] + intel_pstate= [X86,EARLY] disable Do not enable intel_pstate as the default scaling driver for the supported processors @@ -2205,7 +2198,7 @@ Allow per-logical-CPU P-State performance control limits using cpufreq sysfs interface - intremap= [X86-64, Intel-IOMMU] + intremap= [X86-64,Intel-IOMMU,EARLY] on enable Interrupt Remapping (default) off disable Interrupt Remapping nosid disable Source ID checking @@ -2217,7 +2210,7 @@ strict regions from userspace. relaxed - iommu= [X86] + iommu= [X86,EARLY] off force noforce @@ -2232,7 +2225,7 @@ nobypass [PPC/POWERNV] Disable IOMMU bypass, using IOMMU for PCI devices. - iommu.forcedac= [ARM64, X86] Control IOVA allocation for PCI devices. + iommu.forcedac= [ARM64,X86,EARLY] Control IOVA allocation for PCI devices. Format: { "0" | "1" } 0 - Try to allocate a 32-bit DMA address first, before falling back to the full range if needed. @@ -2240,7 +2233,7 @@ forcing Dual Address Cycle for PCI cards supporting greater than 32-bit addressing. - iommu.strict= [ARM64, X86, S390] Configure TLB invalidation behaviour + iommu.strict= [ARM64,X86,S390,EARLY] Configure TLB invalidation behaviour Format: { "0" | "1" } 0 - Lazy mode. Request that DMA unmap operations use deferred @@ -2256,7 +2249,7 @@ legacy driver-specific options takes precedence. 
iommu.passthrough= - [ARM64, X86] Configure DMA to bypass the IOMMU by default. + [ARM64,X86,EARLY] Configure DMA to bypass the IOMMU by default. Format: { "0" | "1" } 0 - Use IOMMU translation for DMA. 1 - Bypass the IOMMU for DMA. @@ -2266,7 +2259,7 @@ See comment before marvel_specify_io7 in arch/alpha/kernel/core_marvel.c. - io_delay= [X86] I/O delay method + io_delay= [X86,EARLY] I/O delay method 0x80 Standard port 0x80 based delay 0xed @@ -2279,28 +2272,28 @@ ip= [IP_PNP] See Documentation/admin-guide/nfs/nfsroot.rst. - ipcmni_extend [KNL] Extend the maximum number of unique System V + ipcmni_extend [KNL,EARLY] Extend the maximum number of unique System V IPC identifiers from 32,768 to 16,777,216. irqaffinity= [SMP] Set the default irq affinity mask The argument is a cpu list, as described above. irqchip.gicv2_force_probe= - [ARM, ARM64] + [ARM,ARM64,EARLY] Format: Force the kernel to look for the second 4kB page of a GICv2 controller even if the memory range exposed by the device tree is too small. irqchip.gicv3_nolpi= - [ARM, ARM64] + [ARM,ARM64,EARLY] Force the kernel to ignore the availability of LPIs (and by consequence ITSs). Intended for system that use the kernel as a bootloader, and thus want to let secondary kernels in charge of setting up LPIs. - irqchip.gicv3_pseudo_nmi= [ARM64] + irqchip.gicv3_pseudo_nmi= [ARM64,EARLY] Enables support for pseudo-NMIs in the kernel. This requires the kernel to be built with CONFIG_ARM64_PSEUDO_NMI. @@ -2445,7 +2438,7 @@ parameter KASAN will print report only for the first invalid access. - keep_bootcon [KNL] + keep_bootcon [KNL,EARLY] Do not unregister boot console at start. This is only useful for debugging when something happens in the window between unregistering the boot console and initializing @@ -2453,7 +2446,7 @@ keepinitrd [HW,ARM] See retain_initrd. 
- kernelcore= [KNL,X86,IA-64,PPC] + kernelcore= [KNL,X86,IA-64,PPC,EARLY] Format: nn[KMGTPE] | nn% | "mirror" This parameter specifies the amount of memory usable by the kernel for non-movable allocations. The requested @@ -2478,7 +2471,7 @@ for Movable pages. "nn[KMGTPE]", "nn%", and "mirror" are exclusive, so you cannot specify multiple forms. - kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. + kgdbdbgp= [KGDB,HW,EARLY] kgdb over EHCI usb debug port. Format: [,poll interval] The controller # is the number of the ehci usb debug port as it is probed via PCI. The poll interval is @@ -2499,7 +2492,7 @@ kms, kbd format: kms,kbd kms, kbd and serial format: kms,kbd,[,baud] - kgdboc_earlycon= [KGDB,HW] + kgdboc_earlycon= [KGDB,HW,EARLY] If the boot console provides the ability to read characters and can work in polling mode, you can use this parameter to tell kgdb to use it as a backend @@ -2514,14 +2507,14 @@ blank and the first boot console that implements read() will be picked. - kgdbwait [KGDB] Stop kernel execution and enter the + kgdbwait [KGDB,EARLY] Stop kernel execution and enter the kernel debugger at the earliest opportunity. kmac= [MIPS] Korina ethernet MAC address. Configure the RouterBoard 532 series on-chip Ethernet adapter MAC address. - kmemleak= [KNL] Boot-time kmemleak enable/disable + kmemleak= [KNL,EARLY] Boot-time kmemleak enable/disable Valid arguments: on, off Default: on Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, @@ -2540,8 +2533,8 @@ See also Documentation/trace/kprobetrace.rst "Kernel Boot Parameter" section. - kpti= [ARM64] Control page table isolation of user - and kernel address spaces. + kpti= [ARM64,EARLY] Control page table isolation of + user and kernel address spaces. Default: enabled on cores which need mitigation. 0: force disabled 1: force enabled @@ -2618,7 +2611,8 @@ for NPT. kvm-arm.mode= - [KVM,ARM] Select one of KVM/arm64's modes of operation. + [KVM,ARM,EARLY] Select one of KVM/arm64's modes of + operation. 
none: Forcefully disable KVM. @@ -2638,22 +2632,22 @@ used with extreme caution. kvm-arm.vgic_v3_group0_trap= - [KVM,ARM] Trap guest accesses to GICv3 group-0 + [KVM,ARM,EARLY] Trap guest accesses to GICv3 group-0 system registers kvm-arm.vgic_v3_group1_trap= - [KVM,ARM] Trap guest accesses to GICv3 group-1 + [KVM,ARM,EARLY] Trap guest accesses to GICv3 group-1 system registers kvm-arm.vgic_v3_common_trap= - [KVM,ARM] Trap guest accesses to GICv3 common + [KVM,ARM,EARLY] Trap guest accesses to GICv3 common system registers kvm-arm.vgic_v4_enable= - [KVM,ARM] Allow use of GICv4 for direct injection of - LPIs. + [KVM,ARM,EARLY] Allow use of GICv4 for direct + injection of LPIs. - kvm_cma_resv_ratio=n [PPC] + kvm_cma_resv_ratio=n [PPC,EARLY] Reserves given percentage from system memory area for contiguous memory allocation for KVM hash pagetable allocation. @@ -2706,7 +2700,7 @@ (enabled). Disable by KVM if hardware lacks support for it. - l1d_flush= [X86,INTEL] + l1d_flush= [X86,INTEL,EARLY] Control mitigation for L1D based snooping vulnerability. Certain CPUs are vulnerable to an exploit against CPU @@ -2723,7 +2717,7 @@ on - enable the interface for the mitigation - l1tf= [X86] Control mitigation of the L1TF vulnerability on + l1tf= [X86,EARLY] Control mitigation of the L1TF vulnerability on affected CPUs The kernel PTE inversion protection is unconditionally @@ -2792,7 +2786,7 @@ l3cr= [PPC] - lapic [X86-32,APIC] Enable the local APIC even if BIOS + lapic [X86-32,APIC,EARLY] Enable the local APIC even if BIOS disabled it. lapic= [X86,APIC] Do not use TSC deadline @@ -2800,7 +2794,7 @@ back to the programmable timer unit in the LAPIC. Format: notscdeadline - lapic_timer_c2_ok [X86,APIC] trust the local apic timer + lapic_timer_c2_ok [X86,APIC,EARLY] trust the local apic timer in C2 power state. libata.dma= [LIBATA] DMA control @@ -2924,7 +2918,7 @@ lockd.nlm_udpport=M [NFS] Assign UDP port. 
Format: - lockdown= [SECURITY] + lockdown= [SECURITY,EARLY] { integrity | confidentiality } Enable the kernel lockdown feature. If set to integrity, kernel features that allow userland to @@ -3031,7 +3025,8 @@ logibm.irq= [HW,MOUSE] Logitech Bus Mouse Driver Format: - loglevel= All Kernel Messages with a loglevel smaller than the + loglevel= [KNL,EARLY] + All Kernel Messages with a loglevel smaller than the console loglevel will be printed to the console. It can also be changed with klogd or other programs. The loglevels are defined as follows: @@ -3045,13 +3040,15 @@ 6 (KERN_INFO) informational 7 (KERN_DEBUG) debug-level messages - log_buf_len=n[KMG] Sets the size of the printk ring buffer, - in bytes. n must be a power of two and greater - than the minimal size. The minimal size is defined - by LOG_BUF_SHIFT kernel config parameter. There is - also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter - that allows to increase the default size depending on - the number of CPUs. See init/Kconfig for more details. + log_buf_len=n[KMG] [KNL,EARLY] + Sets the size of the printk ring buffer, in bytes. + n must be a power of two and greater than the + minimal size. The minimal size is defined by + LOG_BUF_SHIFT kernel config parameter. There + is also CONFIG_LOG_CPU_MAX_BUF_SHIFT config + parameter that allows to increase the default size + depending on the number of CPUs. See init/Kconfig + for more details. logo.nologo [FB] Disables display of the built-in Linux logo. This may be used to provide more screen space for @@ -3109,7 +3106,7 @@ max_addr=nn[KMG] [KNL,BOOT,IA-64] All physical memory greater than or equal to this physical address is ignored. - maxcpus= [SMP] Maximum number of processors that an SMP kernel + maxcpus= [SMP,EARLY] Maximum number of processors that an SMP kernel will bring up during bootup. maxcpus=n : n >= 0 limits the kernel to bring up 'n' processors. 
Surely after bootup you can bring up the other plugged cpu by executing @@ -3136,7 +3133,7 @@ Format: , Specifies range of consoles to be captured by the MDA. - mds= [X86,INTEL] + mds= [X86,INTEL,EARLY] Control mitigation for the Micro-architectural Data Sampling (MDS) vulnerability. @@ -3168,11 +3165,12 @@ For details see: Documentation/admin-guide/hw-vuln/mds.rst - mem=nn[KMG] [HEXAGON] Set the memory size. + mem=nn[KMG] [HEXAGON,EARLY] Set the memory size. Must be specified, otherwise memory size will be 0. - mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory - Amount of memory to be used in cases as follows: + mem=nn[KMG] [KNL,BOOT,EARLY] Force usage of a specific amount + of memory. Amount of memory to be used in cases + as follows: 1 for test; 2 when the kernel is not able to see the whole system memory; @@ -3196,8 +3194,8 @@ if system memory of hypervisor is not sufficient. mem=nn[KMG]@ss[KMG] - [ARM,MIPS] - override the memory layout reported by - firmware. + [ARM,MIPS,EARLY] - override the memory layout + reported by firmware. Define a memory region of size nn[KMG] starting at ss[KMG]. Multiple different regions can be specified with @@ -3206,7 +3204,7 @@ mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel memory. - memblock=debug [KNL] Enable memblock debug messages. + memblock=debug [KNL,EARLY] Enable memblock debug messages. memchunk=nn[KMG] [KNL,SH] Allow user to override the default size for @@ -3220,14 +3218,14 @@ option. See Documentation/admin-guide/mm/memory-hotplug.rst. - memmap=exactmap [KNL,X86] Enable setting of an exact + memmap=exactmap [KNL,X86,EARLY] Enable setting of an exact E820 memory map, as specified by the user. Such memmap=exactmap lines can be constructed based on BIOS output or other requirements. See the memmap=nn@ss option description. memmap=nn[KMG]@ss[KMG] - [KNL, X86, MIPS, XTENSA] Force usage of a specific region of memory.
+ [KNL,X86,MIPS,XTENSA,EARLY] Force usage of a specific region of memory. Region of memory to be used is from ss to ss+nn. If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG], which limits max address to nn[KMG]. @@ -3237,11 +3235,11 @@ memmap=100M@2G,100M#3G,1G!1024G memmap=nn[KMG]#ss[KMG] - [KNL,ACPI] Mark specific memory as ACPI data. + [KNL,ACPI,EARLY] Mark specific memory as ACPI data. Region of memory to be marked is from ss to ss+nn. memmap=nn[KMG]$ss[KMG] - [KNL,ACPI] Mark specific memory as reserved. + [KNL,ACPI,EARLY] Mark specific memory as reserved. Region of memory to be reserved is from ss to ss+nn. Example: Exclude memory from 0x18690000-0x1869ffff memmap=64K$0x18690000 @@ -3251,14 +3249,14 @@ like Grub2, otherwise '$' and the following number will be eaten. memmap=nn[KMG]!ss[KMG] - [KNL,X86] Mark specific memory as protected. + [KNL,X86,EARLY] Mark specific memory as protected. Region of memory to be used, from ss to ss+nn. The memory region may be marked as e820 type 12 (0xc) and is NVDIMM or ADR memory. memmap=%-+ - [KNL,ACPI] Convert memory within the specified region + [KNL,ACPI,EARLY] Convert memory within the specified region from to . If "-" is left out, the whole region will be marked as , even if previously unavailable. If "+" is left @@ -3266,7 +3264,7 @@ specified as e820 types, e.g., 1 = RAM, 2 = reserved, 3 = ACPI, 12 = PRAM. - memory_corruption_check=0/1 [X86] + memory_corruption_check=0/1 [X86,EARLY] Some BIOSes seem to corrupt the first 64k of memory when doing things like suspend/resume. Setting this option will scan the memory @@ -3278,13 +3276,13 @@ affects the same memory, you can use memmap= to prevent the kernel from using that memory. - memory_corruption_check_size=size [X86] + memory_corruption_check_size=size [X86,EARLY] By default it checks for corruption in the low 64k, making this memory unavailable for normal use. Use this parameter to scan for corruption in more or less memory.
- memory_corruption_check_period=seconds [X86] + memory_corruption_check_period=seconds [X86,EARLY] By default it checks for corruption every 60 seconds. Use this parameter to check at some other rate. 0 disables periodic checking. @@ -3308,7 +3306,7 @@ Note that even when enabled, there are a few cases where the feature is not effective. - memtest= [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest + memtest= [KNL,X86,ARM,M68K,PPC,RISCV,EARLY] Enable memtest Format: default : 0 Specifies the number of memtest passes to be @@ -3374,7 +3372,7 @@ https://repo.or.cz/w/linux-2.6/mini2440.git mitigations= - [X86,PPC,S390,ARM64] Control optional mitigations for + [X86,PPC,S390,ARM64,EARLY] Control optional mitigations for CPU vulnerabilities. This is a set of curated, arch-independent options, each of which is an aggregation of existing arch-specific options. @@ -3397,6 +3395,7 @@ nospectre_v1 [X86,PPC] nospectre_v2 [X86,PPC,S390,ARM64] retbleed=off [X86] + spec_rstack_overflow=off [X86] spec_store_bypass_disable=off [X86,PPC] spectre_v2_user=off [X86] srbds=off [X86,INTEL] @@ -3427,7 +3426,7 @@ retbleed=auto,nosmt [X86] mminit_loglevel= - [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this + [KNL,EARLY] When CONFIG_DEBUG_MEMORY_INIT is set, this parameter allows control of the logging verbosity for the additional memory initialisation checks. A value of 0 disables mminit logging and a level of 4 will @@ -3435,7 +3434,7 @@ so loglevel=8 may also need to be specified. mmio_stale_data= - [X86,INTEL] Control mitigation for the Processor + [X86,INTEL,EARLY] Control mitigation for the Processor MMIO Stale Data vulnerabilities. 
Processor MMIO Stale Data is a class of @@ -3510,7 +3509,7 @@ mousedev.yres= [MOUSE] Vertical screen resolution, used for devices reporting absolute coordinates, such as tablets - movablecore= [KNL,X86,IA-64,PPC] + movablecore= [KNL,X86,IA-64,PPC,EARLY] Format: nn[KMGTPE] | nn% This parameter is the complement to kernelcore=, it specifies the amount of memory used for migratable @@ -3521,7 +3520,7 @@ that the amount of memory usable for all allocations is not too small. - movable_node [KNL] Boot-time switch to make hotplugable memory + movable_node [KNL,EARLY] Boot-time switch to make hotplugable memory NUMA nodes to be movable. This means that the memory of such nodes will be usable only for movable allocations which rules out almost all kernel @@ -3545,21 +3544,21 @@ [HW] Make the MicroTouch USB driver use raw coordinates ('y', default) or cooked coordinates ('n') - mtrr=debug [X86] + mtrr=debug [X86,EARLY] Enable printing debug information related to MTRR registers at boot time. - mtrr_chunk_size=nn[KMG] [X86] + mtrr_chunk_size=nn[KMG] [X86,EARLY] used for mtrr cleanup. It is largest continuous chunk that could hold holes aka. UC entries. - mtrr_gran_size=nn[KMG] [X86] + mtrr_gran_size=nn[KMG] [X86,EARLY] Used for mtrr cleanup. It is granularity of mtrr block. Default is 1. Large value could prevent small alignment from using up MTRRs. - mtrr_spare_reg_nr=n [X86] + mtrr_spare_reg_nr=n [X86,EARLY] Format: Range: 0,7 : spare reg number Default : 1 @@ -3745,10 +3744,10 @@ emulation library even if a 387 maths coprocessor is present. - no4lvl [RISCV] Disable 4-level and 5-level paging modes. Forces - kernel to use 3-level paging instead. + no4lvl [RISCV,EARLY] Disable 4-level and 5-level paging modes. + Forces kernel to use 3-level paging instead. - no5lvl [X86-64,RISCV] Disable 5-level paging mode. Forces + no5lvl [X86-64,RISCV,EARLY] Disable 5-level paging mode. Forces kernel to use 4-level paging instead.
noaliencache [MM, NUMA, SLAB] Disables the allocation of alien @@ -3757,15 +3756,15 @@ noalign [KNL,ARM] - noaltinstr [S390] Disables alternative instructions patching - (CPU alternatives feature). + noaltinstr [S390,EARLY] Disables alternative instructions + patching (CPU alternatives feature). - noapic [SMP,APIC] Tells the kernel to not make use of any + noapic [SMP,APIC,EARLY] Tells the kernel to not make use of any IOAPICs that may be present in the system. noautogroup Disable scheduler automatic task group creation. - nocache [ARM] + nocache [ARM,EARLY] no_console_suspend [HW] Never suspend the console @@ -3783,13 +3782,13 @@ turn on/off it dynamically. no_debug_objects - [KNL] Disable object debugging + [KNL,EARLY] Disable object debugging nodsp [SH] Disable hardware DSP at boot time. - noefi Disable EFI runtime services support. + noefi [EFI,EARLY] Disable EFI runtime services support. - no_entry_flush [PPC] Don't flush the L1-D cache when entering the kernel. + no_entry_flush [PPC,EARLY] Don't flush the L1-D cache when entering the kernel. noexec [IA-64] @@ -3820,6 +3819,7 @@ real-time systems. no_hash_pointers + [KNL,EARLY] Force pointers printed to the console or buffers to be unhashed. By default, when a pointer is printed via %p format string, that pointer is "hashed", i.e. obscured @@ -3844,9 +3844,9 @@ the impact of the sleep instructions. This is also useful when using JTAG debugger. - nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings. + nohugeiomap [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge I/O mappings. - nohugevmalloc [KNL,X86,PPC,ARM64] Disable kernel huge vmalloc mappings. + nohugevmalloc [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge vmalloc mappings. nohz= [KNL] Boottime enable/disable dynamic ticks Valid arguments: on, off @@ -3868,13 +3868,13 @@ noinitrd [RAM] Tells the kernel not to load any configured initial RAM disk. 
- nointremap [X86-64, Intel-IOMMU] Do not enable interrupt + nointremap [X86-64,Intel-IOMMU,EARLY] Do not enable interrupt remapping. [Deprecated - use intremap=off] nointroute [IA-64] - noinvpcid [X86] Disable the INVPCID cpu feature. + noinvpcid [X86,EARLY] Disable the INVPCID cpu feature. noiotrap [SH] Disables trapped I/O port accesses. @@ -3885,19 +3885,19 @@ nojitter [IA-64] Disables jitter checking for ITC timers. - nokaslr [KNL] + nokaslr [KNL,EARLY] When CONFIG_RANDOMIZE_BASE is set, this disables kernel and module base offset ASLR (Address Space Layout Randomization). - no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page + no-kvmapf [X86,KVM,EARLY] Disable paravirtualized asynchronous page fault handling. - no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver + no-kvmclock [X86,KVM,EARLY] Disable paravirtualized KVM clock driver - nolapic [X86-32,APIC] Do not enable or use the local APIC. + nolapic [X86-32,APIC,EARLY] Do not enable or use the local APIC. - nolapic_timer [X86-32,APIC] Do not use the local APIC timer. + nolapic_timer [X86-32,APIC,EARLY] Do not use the local APIC timer. nomca [IA-64] Disable machine check abort handling @@ -3922,23 +3922,23 @@ shutdown the other cpus. Instead use the REBOOT_VECTOR irq. - nopat [X86] Disable PAT (page attribute table extension of + nopat [X86,EARLY] Disable PAT (page attribute table extension of pagetables) support. - nopcid [X86-64] Disable the PCID cpu feature. + nopcid [X86-64,EARLY] Disable the PCID cpu feature. nopku [X86] Disable Memory Protection Keys CPU feature found in some Intel CPUs. - nopti [X86-64] + nopti [X86-64,EARLY] Equivalent to pti=off - nopv= [X86,XEN,KVM,HYPER_V,VMWARE] + nopv= [X86,XEN,KVM,HYPER_V,VMWARE,EARLY] Disables the PV optimizations forcing the guest to run as generic guest with no PV drivers. Currently support XEN HVM, KVM, HYPER_V and VMWARE guest. 
- nopvspin [X86,XEN,KVM] + nopvspin [X86,XEN,KVM,EARLY] Disables the qspinlock slow path using PV optimizations which allow the hypervisor to 'idle' the guest on lock contention. @@ -3958,20 +3958,20 @@ This is required for the Braillex ib80-piezo Braille reader made by F.H. Papenmeier (Germany). - nosgx [X86-64,SGX] Disables Intel SGX kernel support. + nosgx [X86-64,SGX,EARLY] Disables Intel SGX kernel support. - nosmap [PPC] + nosmap [PPC,EARLY] Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. - nosmep [PPC64s] + nosmep [PPC64s,EARLY] Disable SMEP (Supervisor Mode Execution Prevention) even if it is supported by processor. - nosmp [SMP] Tells an SMP kernel to act as a UP kernel, + nosmp [SMP,EARLY] Tells an SMP kernel to act as a UP kernel, and disable the IO APIC. legacy for "maxcpus=0". - nosmt [KNL,MIPS,PPC,S390] Disable symmetric multithreading (SMT). + nosmt [KNL,MIPS,PPC,S390,EARLY] Disable symmetric multithreading (SMT). Equivalent to smt=1. [KNL,X86,PPC] Disable symmetric multithreading (SMT). @@ -3981,22 +3981,23 @@ nosoftlockup [KNL] Disable the soft-lockup detector. nospec_store_bypass_disable - [HW] Disable all mitigations for the Speculative Store Bypass vulnerability + [HW,EARLY] Disable all mitigations for the Speculative + Store Bypass vulnerability - nospectre_bhb [ARM64] Disable all mitigations for Spectre-BHB (branch + nospectre_bhb [ARM64,EARLY] Disable all mitigations for Spectre-BHB (branch history injection) vulnerability. System may allow data leaks with this option. - nospectre_v1 [X86,PPC] Disable mitigations for Spectre Variant 1 + nospectre_v1 [X86,PPC,EARLY] Disable mitigations for Spectre Variant 1 (bounds check bypass). With this option data leaks are possible in the system. - nospectre_v2 [X86,PPC_E500,ARM64] Disable all mitigations for - the Spectre variant 2 (indirect branch prediction) - vulnerability. System may allow data leaks with this - option. 
+ nospectre_v2 [X86,PPC_E500,ARM64,EARLY] Disable all mitigations + for the Spectre variant 2 (indirect branch + prediction) vulnerability. System may allow data + leaks with this option. - no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV] Disable + no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV,EARLY] Disable paravirtualized steal time accounting. steal time is computed, but won't influence scheduler behaviour @@ -4006,7 +4007,7 @@ broken timer IRQ sources. no_uaccess_flush - [PPC] Don't flush the L1-D cache after accessing user data. + [PPC,EARLY] Don't flush the L1-D cache after accessing user data. novmcoredd [KNL,KDUMP] Disable device dump. Device dump allows drivers to @@ -4020,15 +4021,15 @@ is set. no-vmw-sched-clock - [X86,PV_OPS] Disable paravirtualized VMware scheduler - clock and use the default one. + [X86,PV_OPS,EARLY] Disable paravirtualized VMware + scheduler clock and use the default one. nowatchdog [KNL] Disable both lockup detectors, i.e. soft-lockup and NMI watchdog (hard-lockup). - nowb [ARM] + nowb [ARM,EARLY] - nox2apic [X86-64,APIC] Do not enable x2APIC mode. + nox2apic [X86-64,APIC,EARLY] Do not enable x2APIC mode. NOTE: this parameter will be ignored on systems with the LEGACY_XAPIC_DISABLED bit set in the @@ -4066,7 +4067,7 @@ purges which is reported from either PAL_VM_SUMMARY or SAL PALO. - nr_cpus= [SMP] Maximum number of processors that an SMP kernel + nr_cpus= [SMP,EARLY] Maximum number of processors that an SMP kernel could support. nr_cpus=n : n >= 1 limits the kernel to support 'n' processors. It could be larger than the number of already plugged CPU during bootup, later in @@ -4077,8 +4078,9 @@ nr_uarts= [SERIAL] maximum number of UARTs to be registered. - numa=off [KNL, ARM64, PPC, RISCV, SPARC, X86] Disable NUMA, Only - set up a single NUMA node spanning all memory. + numa=off [KNL, ARM64, PPC, RISCV, SPARC, X86, EARLY] + Disable NUMA, Only set up a single NUMA node + spanning all memory. 
numa_balancing= [KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic NUMA balancing. @@ -4089,7 +4091,7 @@ This can be set from sysctl after boot. See Documentation/admin-guide/sysctl/vm.rst for details. - ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. + ohci1394_dma=early [HW,EARLY] enable debugging via the ohci1394 driver. See Documentation/core-api/debugging-via-ohci1394.rst for more info. @@ -4115,7 +4117,8 @@ Once locked, the boundary cannot be changed. 1 indicates lock status, 0 indicates unlock status. - oops=panic Always panic on oopses. Default is to just kill the + oops=panic [KNL,EARLY] + Always panic on oopses. Default is to just kill the process, but there is a small probability of deadlocking the machine. This will also cause panics on machine check exceptions. @@ -4131,13 +4134,13 @@ can be read from sysfs at: /sys/module/page_alloc/parameters/shuffle. - page_owner= [KNL] Boot-time page_owner enabling option. + page_owner= [KNL,EARLY] Boot-time page_owner enabling option. Storage of the information about who allocated each page is disabled in default. With this switch, we can turn it on. on: enable the feature - page_poison= [KNL] Boot-time parameter changing the state of + page_poison= [KNL,EARLY] Boot-time parameter changing the state of poisoning on the buddy allocator, available with CONFIG_PAGE_POISONING=y. off: turn off poisoning (default) @@ -4155,7 +4158,8 @@ timeout < 0: reboot immediately Format: - panic_on_taint= Bitmask for conditionally calling panic() in add_taint() + panic_on_taint= [KNL,EARLY] + Bitmask for conditionally calling panic() in add_taint() Format: [,nousertaint] Hexadecimal bitmask representing the set of TAINT flags that will cause the kernel to panic when add_taint() is @@ -4311,7 +4315,7 @@ pcbit= [HW,ISDN] - pci=option[,option...] [PCI] various PCI subsystem options. + pci=option[,option...] [PCI,EARLY] various PCI subsystem options. 
Some options herein operate on a specific device or a set of devices (). These are @@ -4580,7 +4584,8 @@ Format: { 0 | 1 } See arch/parisc/kernel/pdc_chassis.c - percpu_alloc= Select which percpu first chunk allocator to use. + percpu_alloc= [MM,EARLY] + Select which percpu first chunk allocator to use. Currently supported values are "embed" and "page". Archs may support subset or none of the selections. See comments in mm/percpu.c for details on each @@ -4649,12 +4654,12 @@ execution priority. ppc_strict_facility_enable - [PPC] This option catches any kernel floating point, + [PPC,EARLY] This option catches any kernel floating point, Altivec, VSX and SPE outside of regions specifically allowed (eg kernel_enable_fpu()/kernel_disable_fpu()). There is some performance impact when enabling this. - ppc_tm= [PPC] + ppc_tm= [PPC,EARLY] Format: {"off"} Disable Hardware Transactional Memory @@ -4764,7 +4769,7 @@ [KNL] Number of legacy pty's. Overwrites compiled-in default number. - quiet [KNL] Disable most log messages + quiet [KNL,EARLY] Disable most log messages r128= [HW,DRM] @@ -4781,17 +4786,17 @@ ramdisk_start= [RAM] RAM disk image start address random.trust_cpu=off - [KNL] Disable trusting the use of the CPU's + [KNL,EARLY] Disable trusting the use of the CPU's random number generator (if available) to initialize the kernel's RNG. random.trust_bootloader=off - [KNL] Disable trusting the use of the a seed + [KNL,EARLY] Disable trusting the use of a seed passed by the bootloader (if available) to initialize the kernel's RNG. randomize_kstack_offset= - [KNL] Enable or disable kernel stack offset + [KNL,EARLY] Enable or disable kernel stack offset randomization, which provides roughly 5 bits of entropy, frustrating memory corruption attacks that depend on stack address determinism or @@ -5032,6 +5037,11 @@ this kernel boot parameter, forcibly setting it to zero.
+ rcutree.enable_rcu_lazy= [KNL] + To save power, batch RCU callbacks and flush after + delay, memory pressure or callback list growing too + big. + rcuscale.gp_async= [KNL] Measure performance of asynchronous grace-period primitives such as call_rcu(). @@ -5482,7 +5492,7 @@ Run specified binary instead of /init from the ramdisk, used for early userspace startup. See initrd. - rdrand= [X86] + rdrand= [X86,EARLY] force - Override the decision by the kernel to hide the advertisement of RDRAND support (this affects certain AMD processors because of buggy BIOS @@ -5578,7 +5588,7 @@ them. If is less than 0x10000, the region is assumed to be I/O ports; otherwise it is memory. - reservetop= [X86-32] + reservetop= [X86-32,EARLY] Format: nn[KMG] Reserves a hole at the top of the kernel virtual address space. @@ -5663,7 +5673,7 @@ [KNL] Disable ring 3 MONITOR/MWAIT feature on supported CPUs. - riscv_isa_fallback [RISCV] + riscv_isa_fallback [RISCV,EARLY] When CONFIG_RISCV_ISA_FALLBACK is not enabled, permit falling back to detecting extension support by parsing "riscv,isa" property on devicetree systems when the @@ -5672,13 +5682,14 @@ ro [KNL] Mount root device read-only on boot - rodata= [KNL] + rodata= [KNL,EARLY] on Mark read-only kernel memory as read-only (default). off Leave read-only kernel memory writable for debugging. full Mark read-only kernel memory and aliases as read-only [arm64] rockchip.usb_uart + [EARLY] Enable the uart passthrough on the designated usb port on Rockchip SoCs. When active, the signals of the debug-uart get routed to the D+ and D- pins of the usb @@ -5739,7 +5750,7 @@ sa1100ir [NET] See drivers/net/irda/sa1100_ir.c. - sched_verbose [KNL] Enables verbose scheduler debug messages. + sched_verbose [KNL,EARLY] Enables verbose scheduler debug messages. schedstats= [KNL,X86] Enable or disable scheduled statistics. Allowed values are enable and disable. This feature @@ -5854,7 +5865,7 @@ non-zero "wait" parameter. See weight_single and weight_many. 
- skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate + skew_tick= [KNL,EARLY] Offset the periodic timer tick per cpu to mitigate xtime_lock contention on larger systems, and/or RCU lock contention on all systems with CONFIG_MAXSMP set. Format: { "0" | "1" } @@ -5985,10 +5996,10 @@ 1: Fast pin select (default) 2: ATC IRMode - smt= [KNL,MIPS,S390] Set the maximum number of threads (logical - CPUs) to use per physical CPU on systems capable of - symmetric multithreading (SMT). Will be capped to the - actual hardware limit. + smt= [KNL,MIPS,S390,EARLY] Set the maximum number of threads + (logical CPUs) to use per physical CPU on systems + capable of symmetric multithreading (SMT). Will + be capped to the actual hardware limit. Format: Default: -1 (no limit) @@ -6010,7 +6021,7 @@ sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/admin-guide/laptops/sonypi.rst - spectre_v2= [X86] Control mitigation of Spectre variant 2 + spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. The default operation protects the kernel from user space attacks. @@ -6025,8 +6036,8 @@ Selecting 'on' will, and 'auto' may, choose a mitigation method at run time according to the CPU, the available microcode, the setting of the - CONFIG_RETPOLINE configuration option, and the - compiler with which the kernel was built. + CONFIG_MITIGATION_RETPOLINE configuration option, + and the compiler with which the kernel was built. Selecting 'on' will also enable the mitigation against user space to user space task attacks. @@ -6090,7 +6101,7 @@ spectre_v2_user=auto. 
spec_rstack_overflow= - [X86] Control RAS overflow mitigation on AMD Zen CPUs + [X86,EARLY] Control RAS overflow mitigation on AMD Zen CPUs off - Disable mitigation microcode - Enable microcode mitigation only @@ -6101,7 +6112,7 @@ (cloud-specific mitigation) spec_store_bypass_disable= - [HW] Control Speculative Store Bypass (SSB) Disable mitigation + [HW,EARLY] Control Speculative Store Bypass (SSB) Disable mitigation (Speculative Store Bypass vulnerability) Certain CPUs are vulnerable to an exploit against a @@ -6197,7 +6208,7 @@ #DB exception for bus lock is triggered only when CPL > 0. - srbds= [X86,INTEL] + srbds= [X86,INTEL,EARLY] Control the Special Register Buffer Data Sampling (SRBDS) mitigation. @@ -6284,7 +6295,7 @@ srcutree.convert_to_big must have the 0x10 bit set for contention-based conversions to occur. - ssbd= [ARM64,HW] + ssbd= [ARM64,HW,EARLY] Speculative Store Bypass Disable control On CPUs that are vulnerable to the Speculative @@ -6308,7 +6319,7 @@ growing up) the main stack are reserved for no other mapping. Default value is 256 pages. - stack_depot_disable= [KNL] + stack_depot_disable= [KNL,EARLY] Setting this to true through kernel command line will disable the stack depot thereby saving the static memory consumed by the stack hash table. By default this is set @@ -6347,12 +6358,12 @@ be used to filter out binaries which have not yet been made aware of AT_MINSIGSTKSZ. - stress_hpt [PPC] + stress_hpt [PPC,EARLY] Limits the number of kernel HPT entries in the hash page table to increase the rate of hash page table faults on kernel addresses. - stress_slb [PPC] + stress_slb [PPC,EARLY] Limits the number of kernel SLB entries, and flushes them frequently to increase the rate of SLB faults on kernel addresses. @@ -6412,7 +6423,7 @@ This parameter controls use of the Protected Execution Facility on pSeries. 
- swiotlb= [ARM,IA-64,PPC,MIPS,X86] + swiotlb= [ARM,IA-64,PPC,MIPS,X86,EARLY] Format: { [,] | force | noforce } -- Number of I/O TLB slabs -- Second integer after comma. Number of swiotlb @@ -6422,7 +6433,7 @@ wouldn't be automatically used by the kernel noforce -- Never use bounce buffers (for debugging) - switches= [HW,M68k] + switches= [HW,M68k,EARLY] sysctl.*= [KNL] Set a sysctl parameter, right before loading the init @@ -6481,11 +6492,11 @@ : poll all this frequency 0: no polling (default) - threadirqs [KNL] + threadirqs [KNL,EARLY] Force threading of all interrupt handlers except those marked explicitly IRQF_NO_THREAD. - topology= [S390] + topology= [S390,EARLY] Format: {off | on} Specify if the kernel should make use of the cpu topology information if the hardware supports this. @@ -6726,7 +6737,7 @@ can be overridden by a later tsc=nowatchdog. A console message will flag any such suppression or overriding. - tsc_early_khz= [X86] Skip early TSC calibration and use the given + tsc_early_khz= [X86,EARLY] Skip early TSC calibration and use the given value instead. Useful when the early TSC frequency discovery procedure is not reliable, such as on overclocked systems with CPUID.16h support and partial CPUID.15h support. @@ -6761,7 +6772,7 @@ See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst for more details. - tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async + tsx_async_abort= [X86,INTEL,EARLY] Control mitigation for the TSX Async Abort (TAA) vulnerability. Similar to Micro-architectural Data Sampling (MDS) @@ -6827,7 +6838,7 @@ unknown_nmi_panic [X86] Cause panic on unknown NMI. - unwind_debug [X86-64] + unwind_debug [X86-64,EARLY] Enable unwinder debug output. This can be useful for debugging certain unwinder error conditions, including corrupt stacks and @@ -7017,7 +7028,7 @@ Example: user_debug=31 userpte= - [X86] Flags controlling user PTE allocations. + [X86,EARLY] Flags controlling user PTE allocations. 
nohigh = do not allocate PTE pages in HIGHMEM regardless of setting @@ -7046,7 +7057,7 @@ vector= [IA-64,SMP] vector=percpu: enable percpu vector domain - video= [FB] Frame buffer configuration + video= [FB,EARLY] Frame buffer configuration See Documentation/fb/modedb.rst. video.brightness_switch_enabled= [ACPI] @@ -7094,13 +7105,13 @@ P Enable page structure init time poisoning - Disable all of the above options - vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact - size of . This can be used to increase the - minimum size (128MB on x86). It can also be used to - decrease the size and leave more room for directly - mapped kernel RAM. + vmalloc=nn[KMG] [KNL,BOOT,EARLY] Forces the vmalloc area to have an + exact size of . This can be used to increase + the minimum size (128MB on x86). It can also be + used to decrease the size and leave more room + for directly mapped kernel RAM. - vmcp_cma=nn[MG] [KNL,S390] + vmcp_cma=nn[MG] [KNL,S390,EARLY] Sets the memory size reserved for contiguous memory allocations for the vmcp device driver. @@ -7113,7 +7124,7 @@ vmpoff= [KNL,S390] Perform z/VM CP command after power off. Format: - vsyscall= [X86-64] + vsyscall= [X86-64,EARLY] Controls the behavior of vsyscalls (i.e. calls to fixed addresses of 0xffffffffff600x00 from legacy code). Most statically-linked binaries and older @@ -7223,6 +7234,15 @@ threshold repeatedly. They are likely good candidates for using WQ_UNBOUND workqueues instead. + workqueue.cpu_intensive_warning_thresh= + If CONFIG_WQ_CPU_INTENSIVE_REPORT is set, the kernel + will report the work functions which violate the + intensive_threshold_us repeatedly. In order to prevent + spurious warnings, start printing only after a work + function has violated this threshold number of times. + + The default is 4 times. 0 disables the warning. 
+ workqueue.power_efficient Per-cpu workqueues are generally preferred because they show better performance thanks to cache @@ -7261,13 +7281,13 @@ When enabled, memory and cache locality will be impacted. - writecombine= [LOONGARCH] Control the MAT (Memory Access Type) of - ioremap_wc(). + writecombine= [LOONGARCH,EARLY] Control the MAT (Memory Access + Type) of ioremap_wc(). on - Enable writecombine, use WUC for ioremap_wc() off - Disable writecombine, use SUC for ioremap_wc() - x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of + x2apic_phys [X86-64,APIC,EARLY] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. @@ -7278,7 +7298,7 @@ save/restore/migration must be enabled to handle larger domains. - xen_emul_unplug= [HW,X86,XEN] + xen_emul_unplug= [HW,X86,XEN,EARLY] Unplug Xen emulated devices Format: [unplug0,][unplug1] ide-disks -- unplug primary master IDE devices @@ -7290,17 +7310,17 @@ the unplug protocol never -- do not unplug even if version check succeeds - xen_legacy_crash [X86,XEN] + xen_legacy_crash [X86,XEN,EARLY] Crash from Xen panic notifier, without executing late panic() code such as dumping handler. - xen_msr_safe= [X86,XEN] + xen_msr_safe= [X86,XEN,EARLY] Format: Select whether to always use non-faulting (safe) MSR access functions when running as Xen PV guest. The default value is controlled by CONFIG_XEN_PV_MSR_SAFE. - xen_nopvspin [X86,XEN] + xen_nopvspin [X86,XEN,EARLY] Disables the qspinlock slowpath using Xen PV optimizations. This parameter is obsoleted by "nopvspin" parameter, which has equivalent effect for XEN platform. @@ -7312,7 +7332,7 @@ has equivalent effect for XEN platform. xen_no_vector_callback - [KNL,X86,XEN] Disable the vector callback for Xen + [KNL,X86,XEN,EARLY] Disable the vector callback for Xen event channel interrupts. xen_scrub_pages= [XEN] @@ -7321,7 +7341,7 @@ with /sys/devices/system/xen_memory/xen_memory0/scrub_pages. 
Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT. - xen_timer_slop= [X86-64,XEN] + xen_timer_slop= [X86-64,XEN,EARLY] Set the timer slop (in nanoseconds) for the virtual Xen timers (default is 100000). This adjusts the minimum delta of virtualized Xen timers, where lower values @@ -7374,7 +7394,7 @@ host controller quirks. Meaning of each bit can be consulted in header drivers/usb/host/xhci.h. - xmon [PPC] + xmon [PPC,EARLY] Format: { early | on | rw | ro | off } Controls if xmon debugger is enabled. Default is off. Passing only "xmon" is equivalent to "xmon=early". diff --git a/Documentation/arch/x86/pti.rst b/Documentation/arch/x86/pti.rst index e08d35177bc028b4878207e3feadcbce71e9811e..57e8392f61d35460927904f0436164d129a6ee50 100644 --- a/Documentation/arch/x86/pti.rst +++ b/Documentation/arch/x86/pti.rst @@ -26,9 +26,9 @@ comments in pti.c). This approach helps to ensure that side-channel attacks leveraging the paging structures do not function when PTI is enabled. It can be -enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time. -Once enabled at compile-time, it can be disabled at boot with the -'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt). +enabled by setting CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=y at compile +time. Once enabled at compile-time, it can be disabled at boot with +the 'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt). Page Table Management ===================== diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst index 08ebf9edbfc1ed4982e550c283a03bc00e6af345..7352ab89a55ae4fc79dfe7cee997e2345576d2fe 100644 --- a/Documentation/arch/x86/topology.rst +++ b/Documentation/arch/x86/topology.rst @@ -47,17 +47,21 @@ AMD nomenclature for package is 'Node'. Package-related topology information in the kernel: - - cpuinfo_x86.x86_max_cores: + - topology_num_threads_per_package() - The number of cores in a package. This information is retrieved via CPUID. 
+ The number of threads in a package. - - cpuinfo_x86.x86_max_dies: + - topology_num_cores_per_package() - The number of dies in a package. This information is retrieved via CPUID. + The number of cores in a package. + + - topology_max_dies_per_package() + + The maximum number of dies in a package. - cpuinfo_x86.topo.die_id: - The physical ID of the die. This information is retrieved via CPUID. + The physical ID of the die. - cpuinfo_x86.topo.pkg_id: @@ -96,16 +100,6 @@ are SMT- or CMT-type threads. AMDs nomenclature for a CMT core is "Compute Unit". The kernel always uses "core". -Core-related topology information in the kernel: - - - smp_num_siblings: - - The number of threads in a core. The number of threads in a package can be - calculated by:: - - threads_per_package = cpuinfo_x86.x86_max_cores * smp_num_siblings - - Threads ======= A thread is a single scheduling unit. It's the equivalent to a logical Linux diff --git a/Documentation/arch/x86/x86_64/fred.rst b/Documentation/arch/x86/x86_64/fred.rst new file mode 100644 index 0000000000000000000000000000000000000000..9f57e7b91f7e7af97e6d0a99296989c912dad39b --- /dev/null +++ b/Documentation/arch/x86/x86_64/fred.rst @@ -0,0 +1,96 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +Flexible Return and Event Delivery (FRED) +========================================= + +Overview +======== + +The FRED architecture defines simple new transitions that change +privilege level (ring transitions). The FRED architecture was +designed with the following goals: + +1) Improve overall performance and response time by replacing event + delivery through the interrupt descriptor table (IDT event + delivery) and event return by the IRET instruction with lower + latency transitions. + +2) Improve software robustness by ensuring that event delivery + establishes the full supervisor context and that event return + establishes the full user context. 
+ +The new transitions defined by the FRED architecture are FRED event +delivery and, for returning from events, two FRED return instructions. +FRED event delivery can effect a transition from ring 3 to ring 0, but +it is used also to deliver events incident to ring 0. One FRED +instruction (ERETU) effects a return from ring 0 to ring 3, while the +other (ERETS) returns while remaining in ring 0. Collectively, FRED +event delivery and the FRED return instructions are FRED transitions. + +In addition to these transitions, the FRED architecture defines a new +instruction (LKGS) for managing the state of the GS segment register. +The LKGS instruction can be used by 64-bit operating systems that do +not use the new FRED transitions. + +Furthermore, the FRED architecture is easy to extend for future CPU +architectures. + +Software based event dispatching +================================ + +FRED operates differently from IDT in terms of event handling. Instead +of directly dispatching an event to its handler based on the event +vector, FRED requires the software to dispatch an event to its handler +based on both the event's type and vector. Therefore, an event dispatch +framework must be implemented to facilitate the event-to-handler +dispatch process. The FRED event dispatch framework takes control +once an event is delivered, and employs a two-level dispatch. + +The first level dispatching is event type based, and the second level +dispatching is event vector based. + +Full supervisor/user context +============================ + +FRED event delivery atomically saves and restores the full supervisor/user +context upon event delivery and return. Thus it avoids the problem of +transient states due to %cr2 and/or %dr6, and it is no longer necessary +to handle all the ugly corner cases caused by half-baked entry states.
+ +FRED allows explicit unblock of NMI with new event return instructions +ERETS/ERETU, avoiding the mess caused by IRET which unconditionally +unblocks NMI, e.g., when an exception happens during NMI handling. + +FRED always restores the full value of %rsp, thus ESPFIX is no longer +needed when FRED is enabled. + +LKGS +==== + +LKGS behaves like the MOV to GS instruction except that it loads the +base address into the IA32_KERNEL_GS_BASE MSR instead of the GS +segment’s descriptor cache. Using LKGS thus avoids +mucking with the kernel GS, i.e., an operating system can always operate +with its own GS base address. + +Because FRED event delivery from ring 3 and ERETU both swap the value +of the GS base address and that of the IA32_KERNEL_GS_BASE MSR, plus +the introduction of the LKGS instruction, the SWAPGS instruction is no +longer needed when FRED is enabled, and is thus disallowed (#UD). + +Stack levels +============ + +4 stack levels 0~3 are introduced to replace the nonreentrant IST for +event handling, and each stack level should be configured to use a +dedicated stack. + +The current stack level could be unchanged or go higher upon FRED +event delivery. If unchanged, the CPU keeps using the current event +stack. If higher, the CPU switches to a new event stack specified by +the MSR of the new stack level, i.e., MSR_IA32_FRED_RSP[123]. + +Only execution of a FRED return instruction, ERET[US], can lower the +current stack level, causing the CPU to switch back to the stack it was +on before a previous event delivery that promoted the stack level.
diff --git a/Documentation/arch/x86/x86_64/index.rst b/Documentation/arch/x86/x86_64/index.rst index a56070fc8e77a9555ff093002705906a99d4d5bc..ad15e9bd623f684e537f794b9e970485a7391dfd 100644 --- a/Documentation/arch/x86/x86_64/index.rst +++ b/Documentation/arch/x86/x86_64/index.rst @@ -15,3 +15,4 @@ x86_64 Support cpu-hotplug-spec machinecheck fsgs + fred diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index 3599cf9267b4766c4628dde7d27d98fb95e325a9..ed73c612174d4c99d3329cc1443b553537e049db 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -77,10 +77,12 @@ wants a function to be executed asynchronously it has to set up a work item pointing to that function and queue that work item on a workqueue. -Special purpose threads, called worker threads, execute the functions -off of the queue, one after the other. If no work is queued, the -worker threads become idle. These worker threads are managed in so -called worker-pools. +A work item can be executed in either a thread or the BH (softirq) context. + +For threaded workqueues, special purpose threads, called [k]workers, execute +the functions off of the queue, one after the other. If no work is queued, +the worker threads become idle. These worker threads are managed in +worker-pools. The cmwq design differentiates between the user-facing workqueues that subsystems and drivers queue work items on and the backend mechanism @@ -91,6 +93,12 @@ for high priority ones, for each possible CPU and some extra worker-pools to serve work items queued on unbound workqueues - the number of these backing pools is dynamic. +BH workqueues use the same framework. However, as there can only be one +concurrent execution context, there's no need to worry about concurrency. +Each per-CPU BH worker pool contains only one pseudo worker which represents +the BH execution context. A BH workqueue can be considered a convenience +interface to softirq. 
+ Subsystems and drivers can create and queue work items through special workqueue API functions as they see fit. They can influence some aspects of the way the work items are executed by setting flags on the @@ -106,7 +114,7 @@ unless specifically overridden, a work item of a bound workqueue will be queued on the worklist of either normal or highpri worker-pool that is associated to the CPU the issuer is running on. -For any worker pool implementation, managing the concurrency level +For any thread pool implementation, managing the concurrency level (how many execution contexts are active) is an important issue. cmwq tries to keep the concurrency at a minimal but sufficient level. Minimal to save resources and sufficient in that the system is used at @@ -164,6 +172,17 @@ resources, scheduled and executed. ``flags`` --------- +``WQ_BH`` + BH workqueues can be considered a convenience interface to softirq. BH + workqueues are always per-CPU and all BH work items are executed in the + queueing CPU's softirq context in the queueing order. + + All BH workqueues must have 0 ``max_active`` and ``WQ_HIGHPRI`` is the + only allowed additional flag. + + BH work items cannot sleep. All other features such as delayed queueing, + flushing and canceling are supported. + ``WQ_UNBOUND`` Work items queued to an unbound wq are served by the special worker-pools which host workers which are not bound to any @@ -237,15 +256,11 @@ may queue at the same time. Unless there is a specific need for throttling the number of active work items, specifying '0' is recommended. -Some users depend on the strict execution ordering of ST wq. The -combination of ``@max_active`` of 1 and ``WQ_UNBOUND`` used to -achieve this behavior. Work items on such wq were always queued to the -unbound worker-pools and only one work item could be active at any given -time thus achieving the same ordering property as ST wq. 
- -In the current implementation the above configuration only guarantees -ST behavior within a given NUMA node. Instead ``alloc_ordered_workqueue()`` should -be used to achieve system-wide ST behavior. +Some users depend on strict execution ordering where only one work item +is in flight at any given time and the work items are processed in +queueing order. While the combination of ``@max_active`` of 1 and +``WQ_UNBOUND`` used to achieve this behavior, this is no longer the +case. Use ``alloc_ordered_workqueue()`` instead. Example Execution Scenarios diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index ab376b316c36d6e3bebd0fe050ff28314d5b60b2..7f3582a67318bebe42dffd5a666a8babdca0fb49 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -245,6 +245,10 @@ Contributing new tests (details) TEST_PROGS, TEST_GEN_PROGS mean it is the executable tested by default. + TEST_GEN_MODS_DIR should be used by tests that require modules to be built + before the test starts. The variable will contain the name of the directory + containing the modules. + TEST_CUSTOM_PROGS should be used by tests that require custom build rules and prevent common build rule use.
diff --git a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml index 3d06db98e978000a6db760bb87d35d9d36621a70..a93744763787d0b901e530d7e13eea8682ae8c2b 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml @@ -36,6 +36,7 @@ properties: - amlogic,meson-a1-gpio-intc - amlogic,meson-s4-gpio-intc - amlogic,c3-gpio-intc + - amlogic,t7-gpio-intc - const: amlogic,meson-gpio-intc reg: diff --git a/Documentation/devicetree/bindings/interrupt-controller/starfive,jh8100-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/starfive,jh8100-intc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ada5788602d65ebac93b97c9314bef21cb3b2f50 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/starfive,jh8100-intc.yaml @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/interrupt-controller/starfive,jh8100-intc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: StarFive External Interrupt Controller + +description: + StarFive SoC JH8100 contains an external interrupt controller. It can be used + to handle high-level input interrupt signals. It also sends the output + interrupt signal to the RISC-V PLIC.
+ +maintainers: + - Changhuang Liang + +properties: + compatible: + const: starfive,jh8100-intc + + reg: + maxItems: 1 + + clocks: + description: APB clock for the interrupt controller + maxItems: 1 + + resets: + description: APB reset for the interrupt controller + maxItems: 1 + + interrupts: + maxItems: 1 + + interrupt-controller: true + + "#interrupt-cells": + const: 1 + +required: + - compatible + - reg + - clocks + - resets + - interrupts + - interrupt-controller + - "#interrupt-cells" + +additionalProperties: false + +examples: + - | + interrupt-controller@12260000 { + compatible = "starfive,jh8100-intc"; + reg = <0x12260000 0x10000>; + clocks = <&syscrg_ne 76>; + resets = <&syscrg_ne 13>; + interrupts = <45>; + interrupt-controller; + #interrupt-cells = <1>; + }; diff --git a/Documentation/devicetree/bindings/net/renesas,ethertsn.yaml b/Documentation/devicetree/bindings/net/renesas,ethertsn.yaml index 475aff7714d6419a9cb7266c65bffffce733b29d..ea35d19be829a37a657f6a3fb45153981ea16fb9 100644 --- a/Documentation/devicetree/bindings/net/renesas,ethertsn.yaml +++ b/Documentation/devicetree/bindings/net/renesas,ethertsn.yaml @@ -65,9 +65,11 @@ properties: rx-internal-delay-ps: enum: [0, 1800] + default: 0 tx-internal-delay-ps: enum: [0, 2000] + default: 0 '#address-cells': const: 1 diff --git a/Documentation/devicetree/bindings/sound/nvidia,tegra-audio-max9808x.yaml b/Documentation/devicetree/bindings/sound/nvidia,tegra-audio-max9808x.yaml index c29d7942915cccbaf58679f702c642632f5495b8..241d20f3aad08a845c57e3dead8fba25c4856e6a 100644 --- a/Documentation/devicetree/bindings/sound/nvidia,tegra-audio-max9808x.yaml +++ b/Documentation/devicetree/bindings/sound/nvidia,tegra-audio-max9808x.yaml @@ -64,7 +64,7 @@ examples: #include #include sound { - compatible = "lge,tegra-audio-max98089-p895", + compatible = "lg,tegra-audio-max98089-p895", "nvidia,tegra-audio-max98089"; nvidia,model = "LG Optimus Vu MAX98089"; diff --git a/Documentation/driver-api/dpll.rst 
b/Documentation/driver-api/dpll.rst index e3d593841aa7ddd96237283b27470a9fba97ec89..ea8d16600e16a8530b7e633368bb53b75e878c15 100644 --- a/Documentation/driver-api/dpll.rst +++ b/Documentation/driver-api/dpll.rst @@ -545,7 +545,7 @@ In such scenario, dpll device input signal shall be also configurable to drive dpll with signal recovered from the PHY netdevice. This is done by exposing a pin to the netdevice - attaching pin to the netdevice itself with -``netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)``. +``dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)``. Exposed pin id handle ``DPLL_A_PIN_ID`` is then identifiable by the user as it is attached to rtnetlink respond to get ``RTM_NEWLINK`` command in nested attribute ``IFLA_DPLL_PIN``. diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst index 9e38e4c221ca5dc1be598bc1ab45bdd890f1419e..eb770f891b275f3e8b905f1bb794494a61dde54d 100644 --- a/Documentation/filesystems/files.rst +++ b/Documentation/filesystems/files.rst @@ -116,7 +116,7 @@ before and after the reference count increment. This pattern can be seen in get_file_rcu() and __files_get_rcu(). In addition, it isn't possible to access or check fields in struct file -without first aqcuiring a reference on it under rcu lookup. Not doing +without first acquiring a reference on it under rcu lookup. Not doing that was always very dodgy and it was only usable for non-pointer data in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers either first acquire a reference or they must hold the files_lock of the diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index e18bc5ae3b35f89ecc61fc6266d5151c53dbd34d..0ea1e44fa02823ffd51f4739a3a9aab635a35bbe 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -98,7 +98,6 @@ Documentation for filesystem implementations. 
isofs nilfs2 nfs/index - ntfs ntfs3 ocfs2 ocfs2-online-filecheck diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index d5bf4b6b7509b01c9a2d5225a6bb5b2e1ef327b2..e664061ed55dc1bdc6d7d16c086f3050c32909d6 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -29,7 +29,7 @@ prototypes:: char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); struct vfsmount *(*d_automount)(struct path *path); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *); + struct dentry *(*d_real)(struct dentry *, enum d_real_type type); locking rules: diff --git a/Documentation/filesystems/ntfs.rst b/Documentation/filesystems/ntfs.rst deleted file mode 100644 index 5bb093a26485e048ef1a01833129d82f6c903aed..0000000000000000000000000000000000000000 --- a/Documentation/filesystems/ntfs.rst +++ /dev/null @@ -1,466 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -================================ -The Linux NTFS filesystem driver -================================ - - -.. Table of contents - - - Overview - - Web site - - Features - - Supported mount options - - Known bugs and (mis-)features - - Using NTFS volume and stripe sets - - The Device-Mapper driver - - The Software RAID / MD driver - - Limitations when using the MD driver - - -Overview -======== - -Linux-NTFS comes with a number of user-space programs known as ntfsprogs. -These include mkntfs, a full-featured ntfs filesystem format utility, -ntfsundelete used for recovering files that were unintentionally deleted -from an NTFS volume and ntfsresize which is used to resize an NTFS partition. -See the web site for more information. - -To mount an NTFS 1.2/3.x (Windows NT4/2000/XP/2003) volume, use the file -system type 'ntfs'. The driver currently supports read-only mode (with no -fault-tolerance, encryption or journalling) and very limited, but safe, write -support. 
- -For fault tolerance and raid support (i.e. volume and stripe sets), you can -use the kernel's Software RAID / MD driver. See section "Using Software RAID -with NTFS" for details. - - -Web site -======== - -There is plenty of additional information on the linux-ntfs web site -at http://www.linux-ntfs.org/ - -The web site has a lot of additional information, such as a comprehensive -FAQ, documentation on the NTFS on-disk format, information on the Linux-NTFS -userspace utilities, etc. - - -Features -======== - -- This is a complete rewrite of the NTFS driver that used to be in the 2.4 and - earlier kernels. This new driver implements NTFS read support and is - functionally equivalent to the old ntfs driver and it also implements limited - write support. The biggest limitation at present is that files/directories - cannot be created or deleted. See below for the list of write features that - are so far supported. Another limitation is that writing to compressed files - is not implemented at all. Also, neither read nor write access to encrypted - files is so far implemented. -- The new driver has full support for sparse files on NTFS 3.x volumes which - the old driver isn't happy with. -- The new driver supports execution of binaries due to mmap() now being - supported. -- The new driver supports loopback mounting of files on NTFS which is used by - some Linux distributions to enable the user to run Linux from an NTFS - partition by creating a large file while in Windows and then loopback - mounting the file while in Linux and creating a Linux filesystem on it that - is used to install Linux on it. -- A comparison of the two drivers using:: - - time find . -type f -exec md5sum "{}" \; - - run three times in sequence with each driver (after a reboot) on a 1.4GiB - NTFS partition, showed the new driver to be 20% faster in total time elapsed - (from 9:43 minutes on average down to 7:53). 
The time spent in user space - was unchanged but the time spent in the kernel was decreased by a factor of - 2.5 (from 85 CPU seconds down to 33). -- The driver does not support short file names in general. For backwards - compatibility, we implement access to files using their short file names if - they exist. The driver will not create short file names however, and a - rename will discard any existing short file name. -- The new driver supports exporting of mounted NTFS volumes via NFS. -- The new driver supports async io (aio). -- The new driver supports fsync(2), fdatasync(2), and msync(2). -- The new driver supports readv(2) and writev(2). -- The new driver supports access time updates (including mtime and ctime). -- The new driver supports truncate(2) and open(2) with O_TRUNC. But at present - only very limited support for highly fragmented files, i.e. ones which have - their data attribute split across multiple extents, is included. Another - limitation is that at present truncate(2) will never create sparse files, - since to mark a file sparse we need to modify the directory entry for the - file and we do not implement directory modifications yet. -- The new driver supports write(2) which can both overwrite existing data and - extend the file size so that you can write beyond the existing data. Also, - writing into sparse regions is supported and the holes are filled in with - clusters. But at present only limited support for highly fragmented files, - i.e. ones which have their data attribute split across multiple extents, is - included. Another limitation is that write(2) will never create sparse - files, since to mark a file sparse we need to modify the directory entry for - the file and we do not implement directory modifications yet. 
- -Supported mount options -======================= - -In addition to the generic mount options described by the manual page for the -mount command (man 8 mount, also see man 5 fstab), the NTFS driver supports the -following mount options: - -======================= ======================================================= -iocharset=name Deprecated option. Still supported but please use - nls=name in the future. See description for nls=name. - -nls=name Character set to use when returning file names. - Unlike VFAT, NTFS suppresses names that contain - unconvertible characters. Note that most character - sets contain insufficient characters to represent all - possible Unicode characters that can exist on NTFS. - To be sure you are not missing any files, you are - advised to use nls=utf8 which is capable of - representing all Unicode characters. - -utf8= Option no longer supported. Currently mapped to - nls=utf8 but please use nls=utf8 in the future and - make sure utf8 is compiled either as module or into - the kernel. See description for nls=name. - -uid= -gid= -umask= Provide default owner, group, and access mode mask. - These options work as documented in mount(8). By - default, the files/directories are owned by root and - he/she has read and write permissions, as well as - browse permission for directories. No one else has any - access permissions. I.e. the mode on all files is by - default rw------- and for directories rwx------, a - consequence of the default fmask=0177 and dmask=0077. - Using a umask of zero will grant all permissions to - everyone, i.e. all files and directories will have mode - rwxrwxrwx. - -fmask= -dmask= Instead of specifying umask which applies both to - files and directories, fmask applies only to files and - dmask only to directories. - -sloppy= If sloppy is specified, ignore unknown mount options. - Otherwise the default behaviour is to abort mount if - any unknown options are found. 
- -show_sys_files= If show_sys_files is specified, show the system files - in directory listings. Otherwise the default behaviour - is to hide the system files. - Note that even when show_sys_files is specified, "$MFT" - will not be visible due to bugs/mis-features in glibc. - Further, note that irrespective of show_sys_files, all - files are accessible by name, i.e. you can always do - "ls -l \$UpCase" for example to specifically show the - system file containing the Unicode upcase table. - -case_sensitive= If case_sensitive is specified, treat all file names as - case sensitive and create file names in the POSIX - namespace. Otherwise the default behaviour is to treat - file names as case insensitive and to create file names - in the WIN32/LONG name space. Note, the Linux NTFS - driver will never create short file names and will - remove them on rename/delete of the corresponding long - file name. - Note that files remain accessible via their short file - name, if it exists. If case_sensitive, you will need - to provide the correct case of the short file name. - -disable_sparse= If disable_sparse is specified, creation of sparse - regions, i.e. holes, inside files is disabled for the - volume (for the duration of this mount only). By - default, creation of sparse regions is enabled, which - is consistent with the behaviour of traditional Unix - filesystems. - -errors=opt What to do when critical filesystem errors are found. - Following values can be used for "opt": - - ======== ========================================= - continue DEFAULT, try to clean-up as much as - possible, e.g. marking a corrupt inode as - bad so it is no longer accessed, and then - continue. - recover At present only supported is recovery of - the boot sector from the backup copy. - If read-only mount, the recovery is done - in memory only and not written to disk. - ======== ========================================= - - Note that the options are additive, i.e. 
specifying:: - - errors=continue,errors=recover - - means the driver will attempt to recover and if that - fails it will clean-up as much as possible and - continue. - -mft_zone_multiplier= Set the MFT zone multiplier for the volume (this - setting is not persistent across mounts and can be - changed from mount to mount but cannot be changed on - remount). Values of 1 to 4 are allowed, 1 being the - default. The MFT zone multiplier determines how much - space is reserved for the MFT on the volume. If all - other space is used up, then the MFT zone will be - shrunk dynamically, so this has no impact on the - amount of free space. However, it can have an impact - on performance by affecting fragmentation of the MFT. - In general use the default. If you have a lot of small - files then use a higher value. The values have the - following meaning: - - ===== ================================= - Value MFT zone size (% of volume size) - ===== ================================= - 1 12.5% - 2 25% - 3 37.5% - 4 50% - ===== ================================= - - Note this option is irrelevant for read-only mounts. -======================= ======================================================= - - -Known bugs and (mis-)features -============================= - -- The link count on each directory inode entry is set to 1, due to Linux not - supporting directory hard links. This may well confuse some user space - applications, since the directory names will have the same inode numbers. - This also speeds up ntfs_read_inode() immensely. And we haven't found any - problems with this approach so far. If you find a problem with this, please - let us know. 
- - -Please send bug reports/comments/feedback/abuse to the Linux-NTFS development -list at sourceforge: linux-ntfs-dev@lists.sourceforge.net - - -Using NTFS volume and stripe sets -================================= - -For support of volume and stripe sets, you can either use the kernel's -Device-Mapper driver or the kernel's Software RAID / MD driver. The former is -the recommended one to use for linear raid. But the latter is required for -raid level 5. For striping and mirroring, either driver should work fine. - - -The Device-Mapper driver ------------------------- - -You will need to create a table of the components of the volume/stripe set and -how they fit together and load this into the kernel using the dmsetup utility -(see man 8 dmsetup). - -Linear volume sets, i.e. linear raid, has been tested and works fine. Even -though untested, there is no reason why stripe sets, i.e. raid level 0, and -mirrors, i.e. raid level 1 should not work, too. Stripes with parity, i.e. -raid level 5, unfortunately cannot work yet because the current version of the -Device-Mapper driver does not support raid level 5. You may be able to use the -Software RAID / MD driver for raid level 5, see the next section for details. - -To create the table describing your volume you will need to know each of its -components and their sizes in sectors, i.e. multiples of 512-byte blocks. - -For NT4 fault tolerant volumes you can obtain the sizes using fdisk. So for -example if one of your partitions is /dev/hda2 you would do:: - - $ fdisk -ul /dev/hda - - Disk /dev/hda: 81.9 GB, 81964302336 bytes - 255 heads, 63 sectors/track, 9964 cylinders, total 160086528 sectors - Units = sectors of 1 * 512 = 512 bytes - - Device Boot Start End Blocks Id System - /dev/hda1 * 63 4209029 2104483+ 83 Linux - /dev/hda2 4209030 37768814 16779892+ 86 NTFS - /dev/hda3 37768815 46170809 4200997+ 83 Linux - -And you would know that /dev/hda2 has a size of 37768814 - 4209030 + 1 = -33559785 sectors. 
- -For Win2k and later dynamic disks, you can for example use the ldminfo utility -which is part of the Linux LDM tools (the latest version at the time of -writing is linux-ldm-0.0.8.tar.bz2). You can download it from: - - http://www.linux-ntfs.org/ - -Simply extract the downloaded archive (tar xvjf linux-ldm-0.0.8.tar.bz2), go -into it (cd linux-ldm-0.0.8) and change to the test directory (cd test). You -will find the precompiled (i386) ldminfo utility there. NOTE: You will not be -able to compile this yourself easily so use the binary version! - -Then you would use ldminfo in dump mode to obtain the necessary information:: - - $ ./ldminfo --dump /dev/hda - -This would dump the LDM database found on /dev/hda which describes all of your -dynamic disks and all the volumes on them. At the bottom you will see the -VOLUME DEFINITIONS section which is all you really need. You may need to look -further above to determine which of the disks in the volume definitions is -which device in Linux. Hint: Run ldminfo on each of your dynamic disks and -look at the Disk Id close to the top of the output for each (the PRIVATE HEADER -section). You can then find these Disk Ids in the VBLK DATABASE section in the - components where you will get the LDM Name for the disk that is found in -the VOLUME DEFINITIONS section. - -Note you will also need to enable the LDM driver in the Linux kernel. If your -distribution did not enable it, you will need to recompile the kernel with it -enabled. This will create the LDM partitions on each device at boot time. You -would then use those devices (for /dev/hda they would be /dev/hda1, 2, 3, etc) -in the Device-Mapper table. - -You can also bypass using the LDM driver by using the main device (e.g. -/dev/hda) and then using the offsets of the LDM partitions into this device as -the "Start sector of device" when creating the table. Once again ldminfo would -give you the correct information to do this. 
- -Assuming you know all your devices and their sizes things are easy. - -For a linear raid the table would look like this (note all values are in -512-byte sectors):: - - # Offset into Size of this Raid type Device Start sector - # volume device of device - 0 1028161 linear /dev/hda1 0 - 1028161 3903762 linear /dev/hdb2 0 - 4931923 2103211 linear /dev/hdc1 0 - -For a striped volume, i.e. raid level 0, you will need to know the chunk size -you used when creating the volume. Windows uses 64kiB as the default, so it -will probably be this unless you changes the defaults when creating the array. - -For a raid level 0 the table would look like this (note all values are in -512-byte sectors):: - - # Offset Size Raid Number Chunk 1st Start 2nd Start - # into of the type of size Device in Device in - # volume volume stripes device device - 0 2056320 striped 2 128 /dev/hda1 0 /dev/hdb1 0 - -If there are more than two devices, just add each of them to the end of the -line. - -Finally, for a mirrored volume, i.e. raid level 1, the table would look like -this (note all values are in 512-byte sectors):: - - # Ofs Size Raid Log Number Region Should Number Source Start Target Start - # in of the type type of log size sync? of Device in Device in - # vol volume params mirrors Device Device - 0 2056320 mirror core 2 16 nosync 2 /dev/hda1 0 /dev/hdb1 0 - -If you are mirroring to multiple devices you can specify further targets at the -end of the line. - -Note the "Should sync?" parameter "nosync" means that the two mirrors are -already in sync which will be the case on a clean shutdown of Windows. If the -mirrors are not clean, you can specify the "sync" option instead of "nosync" -and the Device-Mapper driver will then copy the entirety of the "Source Device" -to the "Target Device" or if you specified multiple target devices to all of -them. - -Once you have your table, save it in a file somewhere (e.g. 
/etc/ntfsvolume1), -and hand it over to dmsetup to work with, like so:: - - $ dmsetup create myvolume1 /etc/ntfsvolume1 - -You can obviously replace "myvolume1" with whatever name you like. - -If it all worked, you will now have the device /dev/device-mapper/myvolume1 -which you can then just use as an argument to the mount command as usual to -mount the ntfs volume. For example:: - - $ mount -t ntfs -o ro /dev/device-mapper/myvolume1 /mnt/myvol1 - -(You need to create the directory /mnt/myvol1 first and of course you can use -anything you like instead of /mnt/myvol1 as long as it is an existing -directory.) - -It is advisable to do the mount read-only to see if the volume has been setup -correctly to avoid the possibility of causing damage to the data on the ntfs -volume. - - -The Software RAID / MD driver ------------------------------ - -An alternative to using the Device-Mapper driver is to use the kernel's -Software RAID / MD driver. For which you need to set up your /etc/raidtab -appropriately (see man 5 raidtab). - -Linear volume sets, i.e. linear raid, as well as stripe sets, i.e. raid level -0, have been tested and work fine (though see section "Limitations when using -the MD driver with NTFS volumes" especially if you want to use linear raid). -Even though untested, there is no reason why mirrors, i.e. raid level 1, and -stripes with parity, i.e. raid level 5, should not work, too. - -You have to use the "persistent-superblock 0" option for each raid-disk in the -NTFS volume/stripe you are configuring in /etc/raidtab as the persistent -superblock used by the MD driver would damage the NTFS volume. - -Windows by default uses a stripe chunk size of 64k, so you probably want the -"chunk-size 64k" option for each raid-disk, too. 
- -For example, if you have a stripe set consisting of two partitions /dev/hda5 -and /dev/hdb1 your /etc/raidtab would look like this:: - - raiddev /dev/md0 - raid-level 0 - nr-raid-disks 2 - nr-spare-disks 0 - persistent-superblock 0 - chunk-size 64k - device /dev/hda5 - raid-disk 0 - device /dev/hdb1 - raid-disk 1 - -For linear raid, just change the raid-level above to "raid-level linear", for -mirrors, change it to "raid-level 1", and for stripe sets with parity, change -it to "raid-level 5". - -Note for stripe sets with parity you will also need to tell the MD driver -which parity algorithm to use by specifying the option "parity-algorithm -which", where you need to replace "which" with the name of the algorithm to -use (see man 5 raidtab for available algorithms) and you will have to try the -different available algorithms until you find one that works. Make sure you -are working read-only when playing with this as you may damage your data -otherwise. If you find which algorithm works please let us know (email the -linux-ntfs developers list linux-ntfs-dev@lists.sourceforge.net or drop in on -IRC in channel #ntfs on the irc.freenode.net network) so we can update this -documentation. - -Once the raidtab is setup, run for example raid0run -a to start all devices or -raid0run /dev/md0 to start a particular md device, in this case /dev/md0. - -Then just use the mount command as usual to mount the ntfs volume using for -example:: - - mount -t ntfs -o ro /dev/md0 /mnt/myntfsvolume - -It is advisable to do the mount read-only to see if the md volume has been -setup correctly to avoid the possibility of causing damage to the data on the -ntfs volume. - - -Limitations when using the Software RAID / MD driver ------------------------------------------------------ - -Using the md driver will not work properly if any of your NTFS partitions have -an odd number of sectors. 
This is especially important for linear raid as all -data after the first partition with an odd number of sectors will be offset by -one or more sectors so if you mount such a partition with write support you -will cause massive damage to the data on the volume which will only become -apparent when you try to use the volume again under Windows. - -So when using linear raid, make sure that all your partitions have an even -number of sectors BEFORE attempting to use it. You have been warned! - -Even better is to simply use the Device-Mapper for linear raid and then you do -not have this problem with odd numbers of sectors. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index eebcc0f9e2bcd1f3eecc99c00ec656fe45cbbd8e..6e903a903f8f691d55af7f3ae200f959fc7978a2 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -1264,7 +1264,7 @@ defined: char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *); + struct dentry *(*d_real)(struct dentry *, enum d_real_type type); }; ``d_revalidate`` @@ -1419,16 +1419,14 @@ defined: the dentry being transited from. ``d_real`` - overlay/union type filesystems implement this method to return - one of the underlying dentries hidden by the overlay. It is - used in two different modes: + overlay/union type filesystems implement this method to return one + of the underlying dentries of a regular file hidden by the overlay. - Called from file_dentry() it returns the real dentry matching - the inode argument. The real dentry may be from a lower layer - already copied up, but still referenced from the file. This - mode is selected with a non-NULL inode argument. 
+ The 'type' argument takes the values D_REAL_DATA or D_REAL_METADATA + for returning the real underlying dentry that refers to the inode + hosting the file's data or metadata respectively. - With NULL inode the topmost real underlying dentry is returned. + For non-regular files, the 'dentry' argument is returned. Each dentry has a pointer to its parent dentry, as well as a hash list of child dentries. Child dentries are basically like files in a diff --git a/Documentation/index.rst b/Documentation/index.rst index 36e61783437c1034cfbeb19aa662be64fea80857..9dfdc826618c08f5f62f81f1cea543dea3223967 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -113,7 +113,6 @@ to ReStructured Text format, or are simply too old. :maxdepth: 1 staging/index - RAS/ras Translations diff --git a/Documentation/networking/net_cachelines/inet_sock.rst b/Documentation/networking/net_cachelines/inet_sock.rst index a2babd0d7954e6729ed8533518dbef039f5fdeac..595d7ef5fc8b090788e7a3439843c060951d1098 100644 --- a/Documentation/networking/net_cachelines/inet_sock.rst +++ b/Documentation/networking/net_cachelines/inet_sock.rst @@ -1,9 +1,9 @@ .. SPDX-License-Identifier: GPL-2.0 .. Copyright (C) 2023 Google LLC -===================================================== -inet_connection_sock struct fast path usage breakdown -===================================================== +========================================== +inet_sock struct fast path usage breakdown +========================================== Type Name fastpath_tx_access fastpath_rx_access comment ..struct ..inet_sock diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 50b3d1cb11159b5fad1082d521598984a881594f..c78ecc1e176fc03cad1e35606b9d9c8d7572b4eb 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -31,7 +31,7 @@ you probably needn't concern yourself with pcmciautils. 
====================== =============== ======================================== GNU C 5.1 gcc --version Clang/LLVM (optional) 11.0.0 clang --version -Rust (optional) 1.74.1 rustc --version +Rust (optional) 1.76.0 rustc --version bindgen (optional) 0.65.1 bindgen --version GNU make 3.82 make --version bash 4.2 bash --version diff --git a/Documentation/process/maintainer-tip.rst b/Documentation/process/maintainer-tip.rst index 08dd0f804410b6dde4635c77ed930b70e57c8cac..497bb39727c8b05808720a5b41fbeca68ab81681 100644 --- a/Documentation/process/maintainer-tip.rst +++ b/Documentation/process/maintainer-tip.rst @@ -304,13 +304,15 @@ following tag ordering scheme: - Reported-by: ``Reporter `` + - Closes: ``URL or Message-ID of the bug report this is fixing`` + - Originally-by: ``Original author `` - Suggested-by: ``Suggester `` - Co-developed-by: ``Co-author `` - Signed-off: ``Co-author `` + Signed-off-by: ``Co-author `` Note, that Co-developed-by and Signed-off-by of the co-author(s) must come in pairs. @@ -478,7 +480,7 @@ Multi-line comments:: * Larger multi-line comments should be split into paragraphs. */ -No tail comments: +No tail comments (see below): Please refrain from using tail comments. Tail comments disturb the reading flow in almost all contexts, but especially in code:: @@ -499,6 +501,34 @@ No tail comments: /* This magic initialization needs a comment. Maybe not? 
*/ seed = MAGIC_CONSTANT; + Use C++ style, tail comments when documenting structs in headers to + achieve a more compact layout and better readability:: + + // eax + u32 x2apic_shift : 5, // Number of bits to shift APIC ID right + // for the topology ID at the next level + : 27; // Reserved + // ebx + u32 num_processors : 16, // Number of processors at current level + : 16; // Reserved + + versus:: + + /* eax */ + /* + * Number of bits to shift APIC ID right for the topology ID + * at the next level + */ + u32 x2apic_shift : 5, + /* Reserved */ + : 27; + + /* ebx */ + /* Number of processors at current level */ + u32 num_processors : 16, + /* Reserved */ + : 16; + Comment the important things: Comments should be added where the operation is not obvious. Documenting diff --git a/Documentation/rust/general-information.rst b/Documentation/rust/general-information.rst index 236c6dd3c647f815b5096df2d23ae5ac38cb364a..081397827a7eab66bb45648bf0e9e9f67497125e 100644 --- a/Documentation/rust/general-information.rst +++ b/Documentation/rust/general-information.rst @@ -77,27 +77,3 @@ configuration: #[cfg(CONFIG_X="y")] // Enabled as a built-in (`y`) #[cfg(CONFIG_X="m")] // Enabled as a module (`m`) #[cfg(not(CONFIG_X))] // Disabled - - -Testing -------- - -There are the tests that come from the examples in the Rust documentation -and get transformed into KUnit tests. These can be run via KUnit. For example -via ``kunit_tool`` (``kunit.py``) on the command line:: - - ./tools/testing/kunit/kunit.py run --make_options LLVM=1 --arch x86_64 --kconfig_add CONFIG_RUST=y - -Alternatively, KUnit can run them as kernel built-in at boot. Refer to -Documentation/dev-tools/kunit/index.rst for the general KUnit documentation -and Documentation/dev-tools/kunit/architecture.rst for the details of kernel -built-in vs. command line testing. - -Additionally, there are the ``#[test]`` tests. 
These can be run using -the ``rusttest`` Make target:: - - make LLVM=1 rusttest - -This requires the kernel ``.config`` and downloads external repositories. -It runs the ``#[test]`` tests on the host (currently) and thus is fairly -limited in what these tests can test. diff --git a/Documentation/rust/index.rst b/Documentation/rust/index.rst index 965f2db529e0ff9b02990e870345e7be5cf1d1cb..46d35bd395cf5c4a8cca021a9c204e218c5e672f 100644 --- a/Documentation/rust/index.rst +++ b/Documentation/rust/index.rst @@ -40,6 +40,7 @@ configurations. general-information coding-guidelines arch-support + testing .. only:: subproject and html diff --git a/Documentation/rust/testing.rst b/Documentation/rust/testing.rst new file mode 100644 index 0000000000000000000000000000000000000000..6658998d1b6c4b9b062b6220e12b6e450c3ed52a --- /dev/null +++ b/Documentation/rust/testing.rst @@ -0,0 +1,135 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Testing +======= + +This document contains useful information on how to test the Rust code in the +kernel. + +There are two sorts of tests: + +- The KUnit tests. +- The ``#[test]`` tests. + +The KUnit tests +--------------- + +These are the tests that come from the examples in the Rust documentation. They +get transformed into KUnit tests. + +Usage +***** + +These tests can be run via KUnit. For example via ``kunit_tool`` (``kunit.py``) +on the command line:: + + ./tools/testing/kunit/kunit.py run --make_options LLVM=1 --arch x86_64 --kconfig_add CONFIG_RUST=y + +Alternatively, KUnit can run them as kernel built-in at boot. Refer to +Documentation/dev-tools/kunit/index.rst for the general KUnit documentation +and Documentation/dev-tools/kunit/architecture.rst for the details of kernel +built-in vs. command line testing.
+ +To use these KUnit doctests, the following must be enabled:: + + CONFIG_KUNIT + Kernel hacking -> Kernel Testing and Coverage -> KUnit - Enable support for unit tests + CONFIG_RUST_KERNEL_DOCTESTS + Kernel hacking -> Rust hacking -> Doctests for the `kernel` crate + +in the kernel config system. + +KUnit tests are documentation tests +*********************************** + +These documentation tests are typically examples of usage of any item (e.g. +function, struct, module...). + +They are very convenient because they are just written alongside the +documentation. For instance: + +.. code-block:: rust + + /// Sums two numbers. + /// + /// ``` + /// assert_eq!(mymod::f(10, 20), 30); + /// ``` + pub fn f(a: i32, b: i32) -> i32 { + a + b + } + +In userspace, the tests are collected and run via ``rustdoc``. Using the tool +as-is would be useful already, since it allows verifying that examples compile +(thus enforcing they are kept in sync with the code they document) as well +as running those that do not depend on in-kernel APIs. + +For the kernel, however, these tests get transformed into KUnit test suites. +This means that doctests get compiled as Rust kernel objects, allowing them to +run against a built kernel. + +A benefit of this KUnit integration is that Rust doctests get to reuse existing +testing facilities. For instance, the kernel log would look like:: + + KTAP version 1 + 1..1 + KTAP version 1 + # Subtest: rust_doctests_kernel + 1..59 + # rust_doctest_kernel_build_assert_rs_0.location: rust/kernel/build_assert.rs:13 + ok 1 rust_doctest_kernel_build_assert_rs_0 + # rust_doctest_kernel_build_assert_rs_1.location: rust/kernel/build_assert.rs:56 + ok 2 rust_doctest_kernel_build_assert_rs_1 + # rust_doctest_kernel_init_rs_0.location: rust/kernel/init.rs:122 + ok 3 rust_doctest_kernel_init_rs_0 + ...
+ # rust_doctest_kernel_types_rs_2.location: rust/kernel/types.rs:150 + ok 59 rust_doctest_kernel_types_rs_2 + # rust_doctests_kernel: pass:59 fail:0 skip:0 total:59 + # Totals: pass:59 fail:0 skip:0 total:59 + ok 1 rust_doctests_kernel + +Tests using the `? <https://doc.rust-lang.org/reference/expressions/operator-expr.html#the-question-mark-operator>`_ +operator are also supported as usual, e.g.: + +.. code-block:: rust + + /// ``` + /// # use kernel::{spawn_work_item, workqueue}; + /// spawn_work_item!(workqueue::system(), || pr_info!("x"))?; + /// # Ok::<(), Error>(()) + /// ``` + +The tests are also compiled with Clippy under ``CLIPPY=1``, just like normal +code, thus also benefitting from extra linting. + +In order for developers to easily see which line of doctest code caused a +failure, a KTAP diagnostic line is printed to the log. This contains the +location (file and line) of the original test (i.e. instead of the location in +the generated Rust file):: + + # rust_doctest_kernel_types_rs_2.location: rust/kernel/types.rs:150 + +Rust tests appear to assert using the usual ``assert!`` and ``assert_eq!`` +macros from the Rust standard library (``core``). We provide a custom version +that forwards the call to KUnit instead. Importantly, these macros do not +require passing context, unlike those for KUnit testing (i.e. +``struct kunit *``). This makes them easier to use, and readers of the +documentation do not need to care about which testing framework is used. In +addition, it may allow us to test third-party code more easily in the future. + +A current limitation is that KUnit does not support assertions in other tasks. +Thus, we presently simply print an error to the kernel log if an assertion +actually failed. Additionally, doctests are not run for nonpublic functions. + +The ``#[test]`` tests +--------------------- + +Additionally, there are the ``#[test]`` tests. These can be run using the +``rusttest`` Make target:: + + make LLVM=1 rusttest + +This requires the kernel ``.config`` and downloads external repositories.
It +runs the ``#[test]`` tests on the host (currently) and thus is fairly limited in +what these tests can test. diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 457e16f06e04defe9f998ba74ff6577b7ff13490..3731ecf1e4370df533430bb2debdc046acdffb88 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -82,8 +82,9 @@ Code Seq# Include File Comments 0x10 00-0F drivers/char/s390/vmcp.h 0x10 10-1F arch/s390/include/uapi/sclp_ctl.h 0x10 20-2F arch/s390/include/uapi/asm/hypfs.h -0x12 all linux/fs.h +0x12 all linux/fs.h BLK* ioctls linux/blkpg.h +0x15 all linux/fs.h FS_IOC_* ioctls 0x1b all InfiniBand Subsystem 0x20 all drivers/cdrom/cm206.h diff --git a/Documentation/virt/hyperv/index.rst b/Documentation/virt/hyperv/index.rst index 4a7a1b738bbead3563cbc70a67c038725d0aadfd..de447e11b4a5c3b9a0948712e59d1d065130a1f7 100644 --- a/Documentation/virt/hyperv/index.rst +++ b/Documentation/virt/hyperv/index.rst @@ -10,3 +10,4 @@ Hyper-V Enlightenments overview vmbus clocks + vpci diff --git a/Documentation/virt/hyperv/vpci.rst b/Documentation/virt/hyperv/vpci.rst new file mode 100644 index 0000000000000000000000000000000000000000..b65b2126ede3e38017eedcd6c7ffea41c69415a4 --- /dev/null +++ b/Documentation/virt/hyperv/vpci.rst @@ -0,0 +1,316 @@ +.. SPDX-License-Identifier: GPL-2.0 + +PCI pass-thru devices +========================= +In a Hyper-V guest VM, PCI pass-thru devices (also called +virtual PCI devices, or vPCI devices) are physical PCI devices +that are mapped directly into the VM's physical address space. +Guest device drivers can interact directly with the hardware +without intermediation by the host hypervisor. This approach +provides higher bandwidth access to the device with lower +latency, compared with devices that are virtualized by the +hypervisor. 
The device should appear to the guest just as it +would when running on bare metal, so no changes are required +to the Linux device drivers for the device. + +Hyper-V terminology for vPCI devices is "Discrete Device +Assignment" (DDA). Public documentation for Hyper-V DDA is +available here: `DDA`_ + +.. _DDA: https://learn.microsoft.com/en-us/windows-server/virtualization/hyper-v/plan/plan-for-deploying-devices-using-discrete-device-assignment + +DDA is typically used for storage controllers, such as NVMe, +and for GPUs. A similar mechanism for NICs is called SR-IOV +and produces the same benefits by allowing a guest device +driver to interact directly with the hardware. See Hyper-V +public documentation here: `SR-IOV`_ + +.. _SR-IOV: https://learn.microsoft.com/en-us/windows-hardware/drivers/network/overview-of-single-root-i-o-virtualization--sr-iov- + +This discussion of vPCI devices includes DDA and SR-IOV +devices. + +Device Presentation +------------------- +Hyper-V provides full PCI functionality for a vPCI device when +it is operating, so the Linux device driver for the device can +be used unchanged, provided it uses the correct Linux kernel +APIs for accessing PCI config space and for other integration +with Linux. But the initial detection of the PCI device and +its integration with the Linux PCI subsystem must use Hyper-V +specific mechanisms. Consequently, vPCI devices on Hyper-V +have a dual identity. They are initially presented to Linux +guests as VMBus devices via the standard VMBus "offer" +mechanism, so they have a VMBus identity and appear under +/sys/bus/vmbus/devices. The VMBus vPCI driver in Linux at +drivers/pci/controller/pci-hyperv.c handles a newly introduced +vPCI device by fabricating a PCI bus topology and creating all +the normal PCI device data structures in Linux that would +exist if the PCI device were discovered via ACPI on a bare- +metal system. 
Once those data structures are set up, the +device also has a normal PCI identity in Linux, and the normal +Linux device driver for the vPCI device can function as if it +were running in Linux on bare-metal. Because vPCI devices are +presented dynamically through the VMBus offer mechanism, they +do not appear in the Linux guest's ACPI tables. vPCI devices +may be added to a VM or removed from a VM at any time during +the life of the VM, and not just during initial boot. + +With this approach, the vPCI device is a VMBus device and a +PCI device at the same time. In response to the VMBus offer +message, the hv_pci_probe() function runs and establishes a +VMBus connection to the vPCI VSP on the Hyper-V host. That +connection has a single VMBus channel. The channel is used to +exchange messages with the vPCI VSP for the purpose of setting +up and configuring the vPCI device in Linux. Once the device +is fully configured in Linux as a PCI device, the VMBus +channel is used only if Linux changes the vCPU to be interrupted +in the guest, or if the vPCI device is removed from +the VM while the VM is running. The ongoing operation of the +device happens directly between the Linux device driver for +the device and the hardware, with VMBus and the VMBus channel +playing no role. + +PCI Device Setup +---------------- +PCI device setup follows a sequence that Hyper-V originally +created for Windows guests, and that can be ill-suited for +Linux guests due to differences in the overall structure of +the Linux PCI subsystem compared with Windows. Nonetheless, +with a bit of hackery in the Hyper-V virtual PCI driver for +Linux, the virtual PCI device is set up in Linux so that +generic Linux PCI subsystem code and the Linux driver for the +device "just work". + +Each vPCI device is set up in Linux to be in its own PCI +domain with a host bridge. The PCI domainID is derived from +bytes 4 and 5 of the instance GUID assigned to the VMBus vPCI +device.
The Hyper-V host does not guarantee that these bytes +are unique, so hv_pci_probe() has an algorithm to resolve +collisions. The collision resolution is intended to be stable +across reboots of the same VM so that the PCI domainIDs don't +change, as the domainID appears in the user space +configuration of some devices. + +hv_pci_probe() allocates a guest MMIO range to be used as PCI +config space for the device. This MMIO range is communicated +to the Hyper-V host over the VMBus channel as part of telling +the host that the device is ready to enter d0. See +hv_pci_enter_d0(). When the guest subsequently accesses this +MMIO range, the Hyper-V host intercepts the accesses and maps +them to the physical device PCI config space. + +hv_pci_probe() also gets BAR information for the device from +the Hyper-V host, and uses this information to allocate MMIO +space for the BARs. That MMIO space is then setup to be +associated with the host bridge so that it works when generic +PCI subsystem code in Linux processes the BARs. + +Finally, hv_pci_probe() creates the root PCI bus. At this +point the Hyper-V virtual PCI driver hackery is done, and the +normal Linux PCI machinery for scanning the root bus works to +detect the device, to perform driver matching, and to +initialize the driver and device. + +PCI Device Removal +------------------ +A Hyper-V host may initiate removal of a vPCI device from a +guest VM at any time during the life of the VM. The removal +is instigated by an admin action taken on the Hyper-V host and +is not under the control of the guest OS. + +A guest VM is notified of the removal by an unsolicited +"Eject" message sent from the host to the guest over the VMBus +channel associated with the vPCI device. Upon receipt of such +a message, the Hyper-V virtual PCI driver in Linux +asynchronously invokes Linux kernel PCI subsystem calls to +shutdown and remove the device. 
When those calls are +complete, an "Ejection Complete" message is sent back to +Hyper-V over the VMBus channel indicating that the device has +been removed. At this point, Hyper-V sends a VMBus rescind +message to the Linux guest, which the VMBus driver in Linux +processes by removing the VMBus identity for the device. Once +that processing is complete, all vestiges of the device having +been present are gone from the Linux kernel. The rescind +message also indicates to the guest that Hyper-V has stopped +providing support for the vPCI device in the guest. If the +guest were to attempt to access that device's MMIO space, it +would be an invalid reference. Hypercalls affecting the device +return errors, and any further messages sent in the VMBus +channel are ignored. + +After sending the Eject message, Hyper-V allows the guest VM +60 seconds to cleanly shut down the device and respond with +Ejection Complete before sending the VMBus rescind +message. If for any reason the Eject steps don't complete +within the allowed 60 seconds, the Hyper-V host forcibly +performs the rescind steps, which will likely result in +cascading errors in the guest because the device is now no +longer present from the guest standpoint and accessing the +device MMIO space will fail. + +Because ejection is asynchronous and can happen at any point +during the guest VM lifecycle, proper synchronization in the +Hyper-V virtual PCI driver is very tricky. Ejection has been +observed even before a newly offered vPCI device has been +fully set up. The Hyper-V virtual PCI driver has been updated +several times over the years to fix race conditions when +ejections happen at inopportune times. Care must be taken when +modifying this code to prevent re-introducing such problems. +See comments in the code. + +Interrupt Assignment +-------------------- +The Hyper-V virtual PCI driver supports vPCI devices using +MSI, multi-MSI, or MSI-X. 
Assigning the guest vCPU that will +receive the interrupt for a particular MSI or MSI-X message is +complex because of the way the Linux setup of IRQs maps onto +the Hyper-V interfaces. For the single-MSI and MSI-X cases, +Linux calls hv_compose_msi_msg() twice, with the first call +containing a dummy vCPU and the second call containing the +real vCPU. Furthermore, hv_irq_unmask() is finally called +(on x86) or the GICD registers are set (on arm64) to specify +the real vCPU again. Each of these three calls interacts +with Hyper-V, which must decide which physical CPU should +receive the interrupt before it is forwarded to the guest VM. +Unfortunately, the Hyper-V decision-making process is a bit +limited, and can result in concentrating the physical +interrupts on a single CPU, causing a performance bottleneck. +See details about how this is resolved in the extensive +comment above the function hv_compose_msi_req_get_cpu(). + +The Hyper-V virtual PCI driver implements the +irq_chip.irq_compose_msi_msg function as hv_compose_msi_msg(). +Unfortunately, on Hyper-V the implementation requires sending +a VMBus message to the Hyper-V host and awaiting an interrupt +indicating receipt of a reply message. Since +irq_chip.irq_compose_msi_msg can be called with IRQ locks +held, it doesn't work to do the normal sleep until awakened by +the interrupt. Instead, hv_compose_msi_msg() must send the +VMBus message, and then poll for the completion message. As +further complexity, the vPCI device could be ejected/rescinded +while the polling is in progress, so this scenario must be +detected as well. See comments in the code regarding this +very tricky area. + +Most of the code in the Hyper-V virtual PCI driver (pci- +hyperv.c) applies to Hyper-V and Linux guests running on x86 +and on arm64 architectures. But there are differences in how +interrupt assignments are managed. 
On x86, the Hyper-V +virtual PCI driver in the guest must make a hypercall to tell +Hyper-V which guest vCPU should be interrupted by each +MSI/MSI-X interrupt, and the x86 interrupt vector number that +the x86_vector IRQ domain has picked for the interrupt. This +hypercall is made by hv_arch_irq_unmask(). On arm64, the +Hyper-V virtual PCI driver manages the allocation of an SPI +for each MSI/MSI-X interrupt. The Hyper-V virtual PCI driver +stores the allocated SPI in the architectural GICD registers, +which Hyper-V emulates, so no hypercall is necessary as with +x86. Hyper-V does not support using LPIs for vPCI devices in +arm64 guest VMs because it does not emulate a GICv3 ITS. + +The Hyper-V virtual PCI driver in Linux supports vPCI devices +whose drivers create managed or unmanaged Linux IRQs. If the +smp_affinity for an unmanaged IRQ is updated via the /proc/irq +interface, the Hyper-V virtual PCI driver is called to tell +the Hyper-V host to change the interrupt targeting and +everything works properly. However, on x86 if the x86_vector +IRQ domain needs to reassign an interrupt vector due to +running out of vectors on a CPU, there's no path to inform the +Hyper-V host of the change, and things break. Fortunately, +guest VMs operate in a constrained device environment where +using all the vectors on a CPU doesn't happen. Since such a +problem is only a theoretical concern rather than a practical +concern, it has been left unaddressed. + +DMA +--- +By default, Hyper-V pins all guest VM memory in the host +when the VM is created, and programs the physical IOMMU to +allow the VM to have DMA access to all its memory. Hence +it is safe to assign PCI devices to the VM, and allow the +guest operating system to program the DMA transfers. The +physical IOMMU prevents a malicious guest from initiating +DMA to memory belonging to the host or to other VMs on the +host. 
From the Linux guest standpoint, such DMA transfers +are in "direct" mode since Hyper-V does not provide a virtual +IOMMU in the guest. + +Hyper-V assumes that physical PCI devices always perform +cache-coherent DMA. When running on x86, this behavior is +required by the architecture. When running on arm64, the +architecture allows for both cache-coherent and +non-cache-coherent devices, with the behavior of each device +specified in the ACPI DSDT. But when a PCI device is assigned +to a guest VM, that device does not appear in the DSDT, so the +Hyper-V VMBus driver propagates cache-coherency information +from the VMBus node in the ACPI DSDT to all VMBus devices, +including vPCI devices (since they have a dual identity as a VMBus +device and as a PCI device). See vmbus_dma_configure(). +Current Hyper-V versions always indicate that the VMBus is +cache coherent, so vPCI devices on arm64 always get marked as +cache coherent and the CPU does not perform any sync +operations as part of dma_map/unmap_*() calls. + +vPCI protocol versions +---------------------- +As previously described, during vPCI device setup and teardown, +messages are passed over a VMBus channel between the Hyper-V +host and the Hyper-V vPCI driver in the Linux guest. Some +messages have been revised in newer versions of Hyper-V, so +the guest and host must agree on the vPCI protocol version to +be used. The version is negotiated when communication over +the VMBus channel is first established. See +hv_pci_protocol_negotiation(). Newer versions of the protocol +extend support to VMs with more than 64 vCPUs, and provide +additional information about the vPCI device, such as the +guest virtual NUMA node to which it is most closely affined in +the underlying hardware. + +Guest NUMA node affinity +------------------------ +When the vPCI protocol version provides it, the guest NUMA +node affinity of the vPCI device is stored as part of the Linux +device information for subsequent use by the Linux driver. 
See +hv_pci_assign_numa_node(). If the negotiated protocol version +does not support the host providing NUMA affinity information, +the Linux guest defaults the device NUMA node to 0. But even +when the negotiated protocol version includes NUMA affinity +information, the ability of the host to provide such +information depends on certain host configuration options. If +the guest receives NUMA node value "0", it could mean NUMA +node 0, or it could mean "no information is available". +Unfortunately it is not possible to distinguish the two cases +from the guest side. + +PCI config space access in a CoCo VM +------------------------------------ +Linux PCI device drivers access PCI config space using a +standard set of functions provided by the Linux PCI subsystem. +In Hyper-V guests these standard functions map to functions +hv_pcifront_read_config() and hv_pcifront_write_config() +in the Hyper-V virtual PCI driver. In normal VMs, +these hv_pcifront_*() functions directly access the PCI config +space, and the accesses trap to Hyper-V to be handled. +But in CoCo VMs, memory encryption prevents Hyper-V +from reading the guest instruction stream to emulate the +access, so the hv_pcifront_*() functions must invoke +hypercalls with explicit arguments describing the access to be +made. + +Config Block back-channel +------------------------- +The Hyper-V host and Hyper-V virtual PCI driver in Linux +together implement a non-standard back-channel communication +path between the host and guest. The back-channel path uses +messages sent over the VMBus channel associated with the vPCI +device. The functions hyperv_read_cfg_blk() and +hyperv_write_cfg_blk() are the primary interfaces provided to +other parts of the Linux kernel. As of this writing, these +interfaces are used only by the Mellanox mlx5 driver to pass +diagnostic data to a Hyper-V host running in the Azure public +cloud. 
The functions hyperv_read_cfg_blk() and +hyperv_write_cfg_blk() are implemented in a separate module +(pci-hyperv-intf.c, under CONFIG_PCI_HYPERV_INTERFACE) that +effectively stubs them out when running in non-Hyper-V +environments. diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 3ec0b7a455a0cf489b93683a49b5362cded0b570..09c7e585ff5800da5a72a1f9dbd8b719f0b6d595 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8791,6 +8791,11 @@ means the VM type with value @n is supported. Possible values of @n are:: #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 +Note, KVM_X86_SW_PROTECTED_VM is currently only for development and testing. +Do not use KVM_X86_SW_PROTECTED_VM for "real" VMs, and especially not in +production. The behavior and effective ABI for software-protected VMs is +unstable. + 9. Known KVM API problems ========================= diff --git a/MAINTAINERS b/MAINTAINERS index 2ecaaec6a6bf40b1cd3cccaef0f8db81c2e627b5..a3c7accd387747e89cf367907337ee7c6b61ea13 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -897,6 +897,12 @@ Q: https://patchwork.kernel.org/project/linux-rdma/list/ F: drivers/infiniband/hw/efa/ F: include/uapi/rdma/efa-abi.h +AMD ADDRESS TRANSLATION LIBRARY (ATL) +M: Yazen Ghannam +L: linux-edac@vger.kernel.org +S: Supported +F: drivers/ras/amd/atl/* + AMD AXI W1 DRIVER M: Kris Chaplin R: Thomas Delev @@ -1395,6 +1401,7 @@ F: drivers/hwmon/max31760.c ANALOGBITS PLL LIBRARIES M: Paul Walmsley +M: Samuel Holland S: Supported F: drivers/clk/analogbits/* F: include/linux/clk/analogbits* @@ -2156,7 +2163,7 @@ M: Shawn Guo M: Sascha Hauer R: Pengutronix Kernel Team R: Fabio Estevam -R: NXP Linux Team +L: imx@lists.linux.dev L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git @@ -7582,7 +7589,6 @@ R: Robert Richter L: linux-edac@vger.kernel.org S: Supported T: git 
git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next -F: Documentation/admin-guide/ras.rst F: Documentation/driver-api/edac.rst F: drivers/edac/ F: include/linux/edac.h @@ -8495,7 +8501,7 @@ FREESCALE IMX / MXC FEC DRIVER M: Wei Fang R: Shenwei Wang R: Clark Wang -R: NXP Linux Team +L: imx@lists.linux.dev L: netdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/fsl,fec.yaml @@ -8530,7 +8536,7 @@ F: drivers/i2c/busses/i2c-imx.c FREESCALE IMX LPI2C DRIVER M: Dong Aisheng L: linux-i2c@vger.kernel.org -L: linux-imx@nxp.com +L: imx@lists.linux.dev S: Maintained F: Documentation/devicetree/bindings/i2c/i2c-imx-lpi2c.yaml F: drivers/i2c/busses/i2c-imx-lpi2c.c @@ -10734,7 +10740,7 @@ INTEL DRM I915 DRIVER (Meteor Lake, DG2 and older excluding Poulsbo, Moorestown M: Jani Nikula M: Joonas Lahtinen M: Rodrigo Vivi -M: Tvrtko Ursulin +M: Tvrtko Ursulin L: intel-gfx@lists.freedesktop.org S: Supported W: https://drm.pages.freedesktop.org/intel-docs/ @@ -11156,6 +11162,16 @@ L: netdev@vger.kernel.org S: Maintained F: drivers/net/wwan/iosm/ +INTEL(R) FLEXIBLE RETURN AND EVENT DELIVERY +M: Xin Li +M: "H. Peter Anvin" +S: Supported +F: Documentation/arch/x86/x86_64/fred.rst +F: arch/x86/entry/entry_64_fred.S +F: arch/x86/entry/entry_fred.c +F: arch/x86/include/asm/fred.h +F: arch/x86/kernel/fred.c + INTEL(R) TRACE HUB M: Alexander Shishkin S: Supported @@ -12516,7 +12532,6 @@ F: arch/powerpc/include/asm/livepatch.h F: include/linux/livepatch.h F: kernel/livepatch/ F: kernel/module/livepatch.c -F: lib/livepatch/ F: samples/livepatch/ F: tools/testing/selftests/livepatch/ @@ -14111,6 +14126,17 @@ F: mm/ F: tools/mm/ F: tools/testing/selftests/mm/ +MEMORY MAPPING +M: Andrew Morton +R: Liam R. 
Howlett +R: Vlastimil Babka +R: Lorenzo Stoakes +L: linux-mm@kvack.org +S: Maintained +W: http://www.linux-mm.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: mm/mmap.c + MEMORY TECHNOLOGY DEVICES (MTD) M: Miquel Raynal M: Richard Weinberger @@ -14369,7 +14395,7 @@ MICROCHIP MCP16502 PMIC DRIVER M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported -F: Documentation/devicetree/bindings/regulator/mcp16502-regulator.txt +F: Documentation/devicetree/bindings/regulator/microchip,mcp16502.yaml F: drivers/regulator/mcp16502.c MICROCHIP MCP3564 ADC DRIVER @@ -15578,16 +15604,6 @@ W: https://github.com/davejiang/linux/wiki T: git https://github.com/davejiang/linux.git F: drivers/ntb/hw/intel/ -NTFS FILESYSTEM -M: Anton Altaparmakov -R: Namjae Jeon -L: linux-ntfs-dev@lists.sourceforge.net -S: Supported -W: http://www.tuxera.com/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/aia21/ntfs.git -F: Documentation/filesystems/ntfs.rst -F: fs/ntfs/ - NTFS3 FILESYSTEM M: Konstantin Komarov L: ntfs3@lists.linux.dev @@ -15716,7 +15732,7 @@ F: drivers/iio/gyro/fxas21002c_spi.c NXP i.MX 7D/6SX/6UL/93 AND VF610 ADC DRIVER M: Haibo Chen L: linux-iio@vger.kernel.org -L: linux-imx@nxp.com +L: imx@lists.linux.dev S: Maintained F: Documentation/devicetree/bindings/iio/adc/fsl,imx7d-adc.yaml F: Documentation/devicetree/bindings/iio/adc/fsl,vf610-adc.yaml @@ -15753,7 +15769,7 @@ F: drivers/gpu/drm/imx/dcss/ NXP i.MX 8QXP ADC DRIVER M: Cai Huoqing M: Haibo Chen -L: linux-imx@nxp.com +L: imx@lists.linux.dev L: linux-iio@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/iio/adc/nxp,imx8qxp-adc.yaml @@ -15761,7 +15777,7 @@ F: drivers/iio/adc/imx8qxp-adc.c NXP i.MX 8QXP/8QM JPEG V4L2 DRIVER M: Mirela Rabulea -R: NXP Linux Team +L: imx@lists.linux.dev L: linux-media@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/media/nxp,imx8-jpeg.yaml @@ -15771,7 +15787,7 @@ NXP i.MX CLOCK DRIVERS M: 
Abel Vesa R: Peng Fan L: linux-clk@vger.kernel.org -L: linux-imx@nxp.com +L: imx@lists.linux.dev S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/abelvesa/linux.git clk/imx F: Documentation/devicetree/bindings/clock/imx* @@ -16732,6 +16748,7 @@ F: drivers/pci/controller/dwc/*layerscape* PCI DRIVER FOR FU740 M: Paul Walmsley M: Greentime Hu +M: Samuel Holland L: linux-pci@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml @@ -17501,6 +17518,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core F: fs/timerfd.c F: include/linux/time_namespace.h F: include/linux/timer* +F: include/trace/events/timer* F: kernel/time/*timer* F: kernel/time/namespace.c @@ -17537,6 +17555,7 @@ F: Documentation/devicetree/bindings/power/supply/ F: drivers/power/supply/ F: include/linux/power/ F: include/linux/power_supply.h +F: tools/testing/selftests/power_supply/ POWERNV OPERATOR PANEL LCD DISPLAY DRIVER M: Suraj Jitindar Singh @@ -17984,33 +18003,34 @@ F: drivers/media/tuners/qt1010* QUALCOMM ATH12K WIRELESS DRIVER M: Kalle Valo -M: Jeff Johnson +M: Jeff Johnson L: ath12k@lists.infradead.org S: Supported W: https://wireless.wiki.kernel.org/en/users/Drivers/ath12k T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git F: drivers/net/wireless/ath/ath12k/ +N: ath12k QUALCOMM ATHEROS ATH10K WIRELESS DRIVER M: Kalle Valo -M: Jeff Johnson +M: Jeff Johnson L: ath10k@lists.infradead.org S: Supported W: https://wireless.wiki.kernel.org/en/users/Drivers/ath10k T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git -F: Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml F: drivers/net/wireless/ath/ath10k/ +N: ath10k QUALCOMM ATHEROS ATH11K WIRELESS DRIVER M: Kalle Valo -M: Jeff Johnson +M: Jeff Johnson L: ath11k@lists.infradead.org S: Supported W: https://wireless.wiki.kernel.org/en/users/Drivers/ath11k B: https://wireless.wiki.kernel.org/en/users/Drivers/ath11k/bugreport T: 
git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git -F: Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml F: drivers/net/wireless/ath/ath11k/ +N: ath11k QUALCOMM ATHEROS ATH9K WIRELESS DRIVER M: Toke Høiland-Jørgensen @@ -18364,11 +18384,17 @@ M: Tony Luck M: Borislav Petkov L: linux-edac@vger.kernel.org S: Maintained -F: Documentation/admin-guide/ras.rst +F: Documentation/admin-guide/RAS F: drivers/ras/ F: include/linux/ras.h F: include/ras/ras_event.h +RAS FRU MEMORY POISON MANAGER (FMPM) +M: Yazen Ghannam +L: linux-edac@vger.kernel.org +S: Maintained +F: drivers/ras/amd/fmpm.c + RC-CORE / LIRC FRAMEWORK M: Sean Young L: linux-media@vger.kernel.org @@ -19106,6 +19132,7 @@ F: Documentation/rust/ F: rust/ F: samples/rust/ F: scripts/*rust* +F: tools/testing/selftests/rust/ K: \b(?i:rust)\b RXRPC SOCKETS (AF_RXRPC) @@ -19641,7 +19668,7 @@ F: drivers/mmc/host/sdhci-of-at91.c SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) NXP i.MX DRIVER M: Haibo Chen -L: linux-imx@nxp.com +L: imx@lists.linux.dev L: linux-mmc@vger.kernel.org S: Maintained F: drivers/mmc/host/sdhci-esdhc-imx.c @@ -19976,36 +20003,15 @@ S: Maintained F: drivers/watchdog/simatic-ipc-wdt.c SIFIVE DRIVERS -M: Palmer Dabbelt M: Paul Walmsley +M: Samuel Holland L: linux-riscv@lists.infradead.org S: Supported +F: drivers/dma/sf-pdma/ N: sifive +K: fu[57]40 K: [^@]sifive -SIFIVE CACHE DRIVER -M: Conor Dooley -L: linux-riscv@lists.infradead.org -S: Maintained -F: Documentation/devicetree/bindings/cache/sifive,ccache0.yaml -F: drivers/cache/sifive_ccache.c - -SIFIVE FU540 SYSTEM-ON-CHIP -M: Paul Walmsley -M: Palmer Dabbelt -L: linux-riscv@lists.infradead.org -S: Supported -T: git git://git.kernel.org/pub/scm/linux/kernel/git/pjw/sifive.git -N: fu540 -K: fu540 - -SIFIVE PDMA DRIVER -M: Green Wan -S: Maintained -F: Documentation/devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml -F: drivers/dma/sf-pdma/ - - SILEAD TOUCHSCREEN DRIVER M: Hans de Goede L: linux-input@vger.kernel.org @@ 
-20214,8 +20220,8 @@ F: Documentation/devicetree/bindings/net/socionext,uniphier-ave4.yaml F: drivers/net/ethernet/socionext/sni_ave.c SOCIONEXT (SNI) NETSEC NETWORK DRIVER -M: Jassi Brar M: Ilias Apalodimas +M: Masahisa Kojima L: netdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/socionext,synquacer-netsec.yaml @@ -20968,6 +20974,12 @@ F: Documentation/devicetree/bindings/phy/starfive,jh7110-usb-phy.yaml F: drivers/phy/starfive/phy-jh7110-pcie.c F: drivers/phy/starfive/phy-jh7110-usb.c +STARFIVE JH8100 EXTERNAL INTERRUPT CONTROLLER DRIVER +M: Changhuang Liang +S: Supported +F: Documentation/devicetree/bindings/interrupt-controller/starfive,jh8100-intc.yaml +F: drivers/irqchip/irq-starfive-jh8100-intc.c + STATIC BRANCH/CALL M: Peter Zijlstra M: Josh Poimboeuf diff --git a/Makefile b/Makefile index 6cdb5717bfe05a42bc97f33bb56aa3c1dfccf24d..d18fa2a6240ddeee632215061d10ba3c7fb6ce24 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 8 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* @@ -1201,7 +1201,7 @@ prepare0: archprepare # All the preparing.. prepare: prepare0 ifdef CONFIG_RUST - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh + +$(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh $(Q)$(MAKE) $(build)=rust endif @@ -1711,7 +1711,7 @@ $(DOC_TARGETS): # "Is Rust available?" target PHONY += rustavailable rustavailable: - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh && echo "Rust is available!" + +$(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh && echo "Rust is available!" 
# Documentation target # diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 7439b2377df5799e371669faa7cc96229dadca38..8e9dd63b220c68f1f9cbb4cfa470192900ea10f2 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -467,11 +467,6 @@ smp_prepare_cpus(unsigned int max_cpus) smp_num_cpus = smp_num_probed; } -void -smp_prepare_boot_cpu(void) -{ -} - int __cpu_up(unsigned int cpu, struct task_struct *tidle) { diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index 8d9b188caa27bcd965cd3f94a6a4a4739657c97d..b2f2c59279a6799ad89daf3ce709b8d010915920 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -39,11 +39,6 @@ struct plat_smp_ops __weak plat_smp_ops; /* XXX: per cpu ? Only needed once in early secondary boot */ struct task_struct *secondary_idle_tsk; -/* Called from start_kernel */ -void __init smp_prepare_boot_cpu(void) -{ -} - static int __init arc_get_cpu_map(const char *name, struct cpumask *cpumask) { unsigned long dt_root = of_get_flat_dt_root(); diff --git a/arch/arm/boot/dts/nxp/imx/imx7s.dtsi b/arch/arm/boot/dts/nxp/imx/imx7s.dtsi index ebf7befcc11e3e8cd5985d72c384ae2248635bcc..9c81c6baa2d39ae7cd73a34144598d513423c343 100644 --- a/arch/arm/boot/dts/nxp/imx/imx7s.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx7s.dtsi @@ -834,16 +834,6 @@ lcdif: lcdif@30730000 { <&clks IMX7D_LCDIF_PIXEL_ROOT_CLK>; clock-names = "pix", "axi"; status = "disabled"; - - port { - #address-cells = <1>; - #size-cells = <0>; - - lcdif_out_mipi_dsi: endpoint@0 { - reg = <0>; - remote-endpoint = <&mipi_dsi_in_lcdif>; - }; - }; }; mipi_csi: mipi-csi@30750000 { @@ -895,22 +885,6 @@ mipi_dsi: dsi@30760000 { samsung,esc-clock-frequency = <20000000>; samsung,pll-clock-frequency = <24000000>; status = "disabled"; - - ports { - #address-cells = <1>; - #size-cells = <0>; - - port@0 { - reg = <0>; - #address-cells = <1>; - #size-cells = <0>; - - mipi_dsi_in_lcdif: endpoint@0 { - reg = <0>; - remote-endpoint = <&lcdif_out_mipi_dsi>; - }; - }; - }; }; 
}; diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index 0a90583f9f017ed2f88cd20cb6f731440909e830..8f9dbe8d90291ef33f42498d29f477cf54337b2a 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -297,6 +297,7 @@ CONFIG_FB_MODE_HELPERS=y CONFIG_LCD_CLASS_DEVICE=y CONFIG_LCD_L4F00242T03=y CONFIG_LCD_PLATFORM=y +CONFIG_BACKLIGHT_CLASS_DEVICE=y CONFIG_BACKLIGHT_PWM=y CONFIG_BACKLIGHT_GPIO=y CONFIG_FRAMEBUFFER_CONSOLE=y diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index d68101655b74ef0cca647d2ff6e3c59d55fc4a11..9f21e170320fc57f1dc21a33d637bcfd3d1d3917 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -4,7 +4,6 @@ #include #include -#include /* * ELF register definitions.. diff --git a/arch/arm/include/asm/vdso_datapage.h b/arch/arm/include/asm/vdso_datapage.h deleted file mode 100644 index bef68f59928d692f46e4128ce2d12b51fe6ab280..0000000000000000000000000000000000000000 --- a/arch/arm/include/asm/vdso_datapage.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Adapted from arm64 version. 
- * - * Copyright (C) 2012 ARM Limited - */ -#ifndef __ASM_VDSO_DATAPAGE_H -#define __ASM_VDSO_DATAPAGE_H - -#ifdef __KERNEL__ - -#ifndef __ASSEMBLY__ - -#include -#include - -union vdso_data_store { - struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; -}; - -#endif /* !__ASSEMBLY__ */ - -#endif /* __KERNEL__ */ - -#endif /* __ASM_VDSO_DATAPAGE_H */ diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 219cbc7e5d134b5bb0638d158266f809968977b6..4915662842ff1df4041ed504a456db581c063570 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -21,10 +21,12 @@ #include #include #include -#include #include #include #include + +#include + #include "signal.h" /* diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index f297d66a8a7624daca23ad2ff052c504fb859b34..d499ad461b004b05e1f0f13cbedad71b587f8478 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -35,9 +34,6 @@ extern char vdso_start[], vdso_end[]; /* Total number of pages needed for the data and text portions of the VDSO. */ unsigned int vdso_total_pages __ro_after_init; -/* - * The VDSO data page. 
- */ static union vdso_data_store vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; diff --git a/arch/arm64/boot/dts/allwinner/Makefile b/arch/arm64/boot/dts/allwinner/Makefile index 91d505b385de5a55f66b125586158c75720672a6..1f1f8d865d0e52a2a872d677504a125e06f57746 100644 --- a/arch/arm64/boot/dts/allwinner/Makefile +++ b/arch/arm64/boot/dts/allwinner/Makefile @@ -42,5 +42,6 @@ dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h616-bigtreetech-cb1-manta.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h616-bigtreetech-pi.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h616-orangepi-zero2.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h616-x96-mate.dtb +dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h618-orangepi-zero2w.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h618-orangepi-zero3.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h618-transpeed-8k618-t.dtb diff --git a/arch/arm64/boot/dts/amlogic/amlogic-t7.dtsi b/arch/arm64/boot/dts/amlogic/amlogic-t7.dtsi index a03c7667d2b636b35abd68b26548e04f4a411f59..2bfe2c4316117a9d0ddcebf42f4073bce0ce1be5 100644 --- a/arch/arm64/boot/dts/amlogic/amlogic-t7.dtsi +++ b/arch/arm64/boot/dts/amlogic/amlogic-t7.dtsi @@ -171,6 +171,16 @@ gpio: bank@4000 { }; }; + gpio_intc: interrupt-controller@4080 { + compatible = "amlogic,t7-gpio-intc", + "amlogic,meson-gpio-intc"; + reg = <0x0 0x4080 0x0 0x20>; + interrupt-controller; + #interrupt-cells = <2>; + amlogic,channel-interrupts = + <10 11 12 13 14 15 16 17 18 19 20 21>; + }; + uart_a: serial@78000 { compatible = "amlogic,t7-uart", "amlogic,meson-s4-uart"; reg = <0x0 0x78000 0x0 0x18>; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi index 4ae4fdab461e008d4816816eedb90f91e7d32561..43f1d45ccc96f01686534d228de9b69630db3ebb 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi @@ -255,7 +255,7 @@ tc_bridge: bridge@f { <&clk IMX8MP_AUDIO_PLL2_OUT>; assigned-clock-parents = <&clk 
IMX8MP_AUDIO_PLL2_OUT>; assigned-clock-rates = <13000000>, <13000000>, <156000000>; - reset-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; + reset-gpios = <&gpio4 1 GPIO_ACTIVE_HIGH>; status = "disabled"; ports { diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi index 76c73daf546bd0f64bc22e5e1176d814ad677e18..39a550c1cd261dd516da26757bfa8eccd908b92a 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi @@ -1820,7 +1820,7 @@ lvds_bridge: bridge@5c { compatible = "fsl,imx8mp-ldb"; reg = <0x5c 0x4>, <0x128 0x4>; reg-names = "ldb", "lvds"; - clocks = <&clk IMX8MP_CLK_MEDIA_LDB>; + clocks = <&clk IMX8MP_CLK_MEDIA_LDB_ROOT>; clock-names = "ldb"; assigned-clocks = <&clk IMX8MP_CLK_MEDIA_LDB>; assigned-clock-parents = <&clk IMX8MP_VIDEO_PLL1_OUT>; diff --git a/arch/arm64/boot/dts/nvidia/tegra234-p3737-0000+p3701-0000.dts b/arch/arm64/boot/dts/nvidia/tegra234-p3737-0000+p3701-0000.dts index ea13c4a7027c46ba5f5151947537b5376bcbad20..81a82933e35004e7df51383ed22d291e40874dd9 100644 --- a/arch/arm64/boot/dts/nvidia/tegra234-p3737-0000+p3701-0000.dts +++ b/arch/arm64/boot/dts/nvidia/tegra234-p3737-0000+p3701-0000.dts @@ -175,7 +175,7 @@ ethernet@6800000 { status = "okay"; phy-handle = <&mgbe0_phy>; - phy-mode = "usxgmii"; + phy-mode = "10gbase-r"; mdio { #address-cells = <1>; diff --git a/arch/arm64/boot/dts/nvidia/tegra234.dtsi b/arch/arm64/boot/dts/nvidia/tegra234.dtsi index 3f16595d099c5620b0d2dde77f0e2c6491c4a576..d1bd328892afa2c319750b20c5b8b979283e6481 100644 --- a/arch/arm64/boot/dts/nvidia/tegra234.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra234.dtsi @@ -1459,7 +1459,7 @@ ethernet@6800000 { <&mc TEGRA234_MEMORY_CLIENT_MGBEAWR &emc>; interconnect-names = "dma-mem", "write"; iommus = <&smmu_niso0 TEGRA234_SID_MGBE>; - power-domains = <&bpmp TEGRA234_POWER_DOMAIN_MGBEA>; + power-domains = <&bpmp TEGRA234_POWER_DOMAIN_MGBEB>; status = "disabled"; }; @@ -1493,7 +1493,7 @@ ethernet@6900000 
{ <&mc TEGRA234_MEMORY_CLIENT_MGBEBWR &emc>; interconnect-names = "dma-mem", "write"; iommus = <&smmu_niso0 TEGRA234_SID_MGBE_VF1>; - power-domains = <&bpmp TEGRA234_POWER_DOMAIN_MGBEB>; + power-domains = <&bpmp TEGRA234_POWER_DOMAIN_MGBEC>; status = "disabled"; }; @@ -1527,7 +1527,7 @@ ethernet@6a00000 { <&mc TEGRA234_MEMORY_CLIENT_MGBECWR &emc>; interconnect-names = "dma-mem", "write"; iommus = <&smmu_niso0 TEGRA234_SID_MGBE_VF2>; - power-domains = <&bpmp TEGRA234_POWER_DOMAIN_MGBEC>; + power-domains = <&bpmp TEGRA234_POWER_DOMAIN_MGBED>; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi b/arch/arm64/boot/dts/qcom/msm8996.dtsi index 8d41ed261adfbfc99e15c07755f54d8f4cf5cc80..ee6f87c828aefab76ff58c1ba1f59ae023068381 100644 --- a/arch/arm64/boot/dts/qcom/msm8996.dtsi +++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi @@ -457,25 +457,6 @@ modem_etm_out_funnel_in2: endpoint { }; }; - mpm: interrupt-controller { - compatible = "qcom,mpm"; - qcom,rpm-msg-ram = <&apss_mpm>; - interrupts = ; - mboxes = <&apcs_glb 1>; - interrupt-controller; - #interrupt-cells = <2>; - #power-domain-cells = <0>; - interrupt-parent = <&intc>; - qcom,mpm-pin-count = <96>; - qcom,mpm-pin-map = <2 184>, /* TSENS1 upper_lower_int */ - <52 243>, /* DWC3_PRI ss_phy_irq */ - <79 347>, /* DWC3_PRI hs_phy_irq */ - <80 352>, /* DWC3_SEC hs_phy_irq */ - <81 347>, /* QUSB2_PHY_PRI DP+DM */ - <82 352>, /* QUSB2_PHY_SEC DP+DM */ - <87 326>; /* SPMI */ - }; - psci { compatible = "arm,psci-1.0"; method = "smc"; @@ -765,15 +746,8 @@ pciephy_2: phy@3000 { }; rpm_msg_ram: sram@68000 { - compatible = "qcom,rpm-msg-ram", "mmio-sram"; + compatible = "qcom,rpm-msg-ram"; reg = <0x00068000 0x6000>; - #address-cells = <1>; - #size-cells = <1>; - ranges = <0 0x00068000 0x7000>; - - apss_mpm: sram@1b8 { - reg = <0x1b8 0x48>; - }; }; qfprom@74000 { @@ -856,8 +830,8 @@ tsens1: thermal-sensor@4ad000 { reg = <0x004ad000 0x1000>, /* TM */ <0x004ac000 0x1000>; /* SROT */ #qcom,sensors = <8>; - 
interrupts-extended = <&mpm 2 IRQ_TYPE_LEVEL_HIGH>, - <&intc GIC_SPI 430 IRQ_TYPE_LEVEL_HIGH>; + interrupts = , + ; interrupt-names = "uplow", "critical"; #thermal-sensor-cells = <1>; }; @@ -1363,7 +1337,6 @@ tlmm: pinctrl@1010000 { interrupts = ; gpio-controller; gpio-ranges = <&tlmm 0 0 150>; - wakeup-parent = <&mpm>; #gpio-cells = <2>; interrupt-controller; #interrupt-cells = <2>; @@ -1891,7 +1864,7 @@ spmi_bus: spmi@400f000 { <0x0400a000 0x002100>; reg-names = "core", "chnls", "obsrvr", "intr", "cnfg"; interrupt-names = "periph_irq"; - interrupts-extended = <&mpm 87 IRQ_TYPE_LEVEL_HIGH>; + interrupts = ; qcom,ee = <0>; qcom,channel = <0>; #address-cells = <2>; @@ -3052,8 +3025,8 @@ usb3: usb@6af8800 { #size-cells = <1>; ranges; - interrupts-extended = <&mpm 79 IRQ_TYPE_LEVEL_HIGH>, - <&mpm 52 IRQ_TYPE_LEVEL_HIGH>; + interrupts = , + ; interrupt-names = "hs_phy_irq", "ss_phy_irq"; clocks = <&gcc GCC_SYS_NOC_USB3_AXI_CLK>, diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts b/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts index ffc4406422ae2f82c9636e0fb521f34a1d28c1eb..41215567b3aed7d4211a8a4c5ab94042d205b422 100644 --- a/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts +++ b/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts @@ -563,6 +563,8 @@ &pcie3a_phy { }; &pcie4 { + max-link-speed = <2>; + perst-gpios = <&tlmm 141 GPIO_ACTIVE_LOW>; wake-gpios = <&tlmm 139 GPIO_ACTIVE_LOW>; diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts index def3976bd5bb154d27228831de14e9463239bdf8..eb657e544961d7c2ac60e0f505767c1427893a14 100644 --- a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts +++ b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts @@ -722,6 +722,8 @@ &pcie3a_phy { }; &pcie4 { + max-link-speed = <2>; + perst-gpios = <&tlmm 141 GPIO_ACTIVE_LOW>; wake-gpios = <&tlmm 139 GPIO_ACTIVE_LOW>; diff --git a/arch/arm64/boot/dts/qcom/sm6115.dtsi b/arch/arm64/boot/dts/qcom/sm6115.dtsi index 
160e098f10757e5f4e9c68e82ecc45f1ce27aa14..f9849b8befbf24b54992d49af812eaa94288c3fb 100644 --- a/arch/arm64/boot/dts/qcom/sm6115.dtsi +++ b/arch/arm64/boot/dts/qcom/sm6115.dtsi @@ -1304,6 +1304,9 @@ &clk_virt SLAVE_QUP_CORE_0 RPM_ALWAYS_TAG>, &config_noc SLAVE_QUP_0 RPM_ALWAYS_TAG>, <&system_noc MASTER_QUP_0 RPM_ALWAYS_TAG &bimc SLAVE_EBI_CH0 RPM_ALWAYS_TAG>; + interconnect-names = "qup-core", + "qup-config", + "qup-memory"; #address-cells = <1>; #size-cells = <0>; status = "disabled"; diff --git a/arch/arm64/boot/dts/qcom/sm8650-mtp.dts b/arch/arm64/boot/dts/qcom/sm8650-mtp.dts index 9d916edb1c73c10ef5e4fde52e33b75ff902a957..be133a3d5cbe0cb073c0fe8d4f253740da584992 100644 --- a/arch/arm64/boot/dts/qcom/sm8650-mtp.dts +++ b/arch/arm64/boot/dts/qcom/sm8650-mtp.dts @@ -622,7 +622,7 @@ right_spkr: speaker@0,1 { &tlmm { /* Reserved I/Os for NFC */ - gpio-reserved-ranges = <32 8>; + gpio-reserved-ranges = <32 8>, <74 1>; disp0_reset_n_active: disp0-reset-n-active-state { pins = "gpio133"; diff --git a/arch/arm64/boot/dts/qcom/sm8650-qrd.dts b/arch/arm64/boot/dts/qcom/sm8650-qrd.dts index 592a67a47c782f667cd48d8d8bcad7d457a84ee4..b9151c2ddf2e5ce7944bed07aa6864ccdc75f2a5 100644 --- a/arch/arm64/boot/dts/qcom/sm8650-qrd.dts +++ b/arch/arm64/boot/dts/qcom/sm8650-qrd.dts @@ -659,7 +659,7 @@ touchscreen@0 { &tlmm { /* Reserved I/Os for NFC */ - gpio-reserved-ranges = <32 8>; + gpio-reserved-ranges = <32 8>, <74 1>; bt_default: bt-default-state { bt-en-pins { diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index bac4cabef6073e5b0c652d0ed031ea7cce97c72f..467ac2f768ac2bb423b92eb797dce8bde697f259 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -227,8 +227,19 @@ static int ctr_encrypt(struct skcipher_request *req) src += blocks * AES_BLOCK_SIZE; } if (nbytes && walk.nbytes == walk.total) { + u8 buf[AES_BLOCK_SIZE]; + u8 *d = dst; + + if (unlikely(nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(buf + 
sizeof(buf) - nbytes, + src, nbytes); + neon_aes_ctr_encrypt(dst, src, ctx->enc, ctx->key.rounds, nbytes, walk.iv); + + if (unlikely(nbytes < AES_BLOCK_SIZE)) + memcpy(d, dst, nbytes); + nbytes = 0; } kernel_neon_end(); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 7f88028a00c02c0e176af5ae7674ae606f5afd3a..b2a60e0bcfd21d28a60db750590732bf1115551d 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -247,7 +247,7 @@ struct kunwind_consume_entry_data { void *cookie; }; -static bool +static __always_inline bool arch_kunwind_consume_entry(const struct kunwind_state *state, void *cookie) { struct kunwind_consume_entry_data *data = cookie; diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 5562daf38a22f59478a935d73dd3375117e7d759..89b6e78400023d0cfb02d8e4ff059e243a7e3f59 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -69,10 +69,7 @@ static struct vdso_abi_info vdso_info[] __ro_after_init = { /* * The vDSO data page. */ -static union { - struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; +static union vdso_data_store vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; static int vdso_mremap(const struct vm_special_mapping *sm, diff --git a/arch/csky/include/asm/vdso.h b/arch/csky/include/asm/vdso.h index bdce581b5fcb8113129221acab65436dfbf8de9a..181a15edafe87ef54cc7380bfac9e66d1484a815 100644 --- a/arch/csky/include/asm/vdso.h +++ b/arch/csky/include/asm/vdso.h @@ -5,11 +5,6 @@ #include -#ifndef GENERIC_TIME_VSYSCALL -struct vdso_data { -}; -#endif - /* * The VDSO symbols are mapped into Linux so we can just use regular symbol * addressing to get their offsets in userspace. 
The symbols are mapped at an diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c index 8e42352cbf123f4bc771dfefda013bf12d818cfa..92dbbf3e0205b61bb9b89697750e501a22747390 100644 --- a/arch/csky/kernel/smp.c +++ b/arch/csky/kernel/smp.c @@ -152,10 +152,6 @@ void arch_irq_work_raise(void) } #endif -void __init smp_prepare_boot_cpu(void) -{ -} - void __init smp_prepare_cpus(unsigned int max_cpus) { } diff --git a/arch/csky/kernel/vdso.c b/arch/csky/kernel/vdso.c index 16c20d64d16532b1d1498a007193c7916a6bb15a..2ca886e4a458b36e38014a8ca7b30dd0cc55a6e3 100644 --- a/arch/csky/kernel/vdso.c +++ b/arch/csky/kernel/vdso.c @@ -8,25 +8,15 @@ #include #include -#ifdef GENERIC_TIME_VSYSCALL #include -#else -#include -#endif extern char vdso_start[], vdso_end[]; static unsigned int vdso_pages; static struct page **vdso_pagelist; -/* - * The vDSO data page. - */ -static union { - struct vdso_data data; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = &vdso_data_store.data; +static union vdso_data_store vdso_data_store __page_aligned_data; +struct vdso_data *vdso_data = vdso_data_store.data; static int __init vdso_init(void) { diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index 608884bc3396763eefcc4dc44896322a89da6763..65e1fdf9fdb21d730c36675d0ffdac287db8252f 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -114,10 +114,6 @@ void send_ipi(const struct cpumask *cpumask, enum ipi_message_type msg) local_irq_restore(flags); } -void __init smp_prepare_boot_cpu(void) -{ -} - /* * interrupts should already be disabled from the VM * SP should already be correct; need to set THREADINFO_REG diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c index 14941e4be66d8273adaebb7de388e8782e07a892..90dfccb41c14a0036f03e97ee886612a3c7cee97 100644 --- a/arch/loongarch/kernel/vdso.c +++ b/arch/loongarch/kernel/vdso.c @@ -21,15 +21,13 @@ #include #include #include +#include #include 
extern char vdso_start[], vdso_end[]; /* Kernel-provided data used by the VDSO. */ -static union { - u8 page[PAGE_SIZE]; - struct vdso_data data[CS_BASES]; -} generic_vdso_data __page_aligned_data; +static union vdso_data_store generic_vdso_data __page_aligned_data; static union { u8 page[LOONGARCH_VDSO_DATA_SIZE]; diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index a708fbd5a844f8a2c6a60cea6eb2e5a126e8dbdb..642fb80c5c4e31f6c595e1663a19d7760c68e1e1 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -96,6 +96,9 @@ static const struct block_device_operations nfhd_ops = { static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) { + struct queue_limits lim = { + .logical_block_size = bsize, + }; struct nfhd_device *dev; int dev_id = id - NFHD_DEV_OFFSET; int err = -ENOMEM; @@ -117,9 +120,11 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) dev->bsize = bsize; dev->bshift = ffs(bsize) - 10; - dev->disk = blk_alloc_disk(NUMA_NO_NODE); - if (!dev->disk) + dev->disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(dev->disk)) { + err = PTR_ERR(dev->disk); goto free_dev; + } dev->disk->major = major_num; dev->disk->first_minor = dev_id * 16; @@ -128,7 +133,6 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) dev->disk->private_data = dev; sprintf(dev->disk->disk_name, "nfhd%u", dev_id); set_capacity(dev->disk, (sector_t)blocks * (bsize / 512)); - blk_queue_logical_block_size(dev->disk->queue, bsize); err = add_disk(dev->disk); if (err) goto out_cleanup_disk; diff --git a/arch/mips/include/asm/vdso.h b/arch/mips/include/asm/vdso.h index cc7b516129a8fbc43bf532a85fac26801a2ba753..afb03d45bcd0d83716adf2d867c814f5f0fb0d1f 100644 --- a/arch/mips/include/asm/vdso.h +++ b/arch/mips/include/asm/vdso.h @@ -50,9 +50,4 @@ extern struct mips_vdso_image vdso_image_o32; extern struct mips_vdso_image vdso_image_n32; #endif -union mips_vdso_data { - struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; -}; - #endif 
/* __ASM_VDSO_H */ diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index f6d40e43f10851205b03a84dbe0a82b227038f2c..dda36fa26307e27d3de414c811450ed912294a0e 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -24,7 +24,7 @@ #include /* Kernel-provided data used by the VDSO. */ -static union mips_vdso_data mips_vdso_data __page_aligned_data; +static union vdso_data_store mips_vdso_data __page_aligned_data; struct vdso_data *vdso_data = mips_vdso_data.data; /* diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c index 1c5a2d71d6753e8a02a8903712c163802e56891c..86da4bc5ee0bbeb7fbc2a36a3275a271624feffe 100644 --- a/arch/openrisc/kernel/smp.c +++ b/arch/openrisc/kernel/smp.c @@ -57,10 +57,6 @@ static void boot_secondary(unsigned int cpu, struct task_struct *idle) spin_unlock(&boot_lock); } -void __init smp_prepare_boot_cpu(void) -{ -} - void __init smp_init_cpus(void) { struct device_node *cpu; diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 9bb2210c8d4417a4262aab81d68d851e175b77b4..065ffd1b2f8adaef8369846531bf4e6f78159b57 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -69,7 +69,7 @@ enum rtas_function_index { RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE, RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2, RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW, - RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOWS, + RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOW, RTAS_FNIDX__IBM_SCAN_LOG_DUMP, RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR, RTAS_FNIDX__IBM_SET_EEH_OPTION, @@ -164,7 +164,7 @@ typedef struct { #define RTAS_FN_IBM_READ_SLOT_RESET_STATE rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE) #define RTAS_FN_IBM_READ_SLOT_RESET_STATE2 rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2) #define RTAS_FN_IBM_REMOVE_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW) -#define RTAS_FN_IBM_RESET_PE_DMA_WINDOWS rtas_fn_handle(RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOWS) +#define 
RTAS_FN_IBM_RESET_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOW) #define RTAS_FN_IBM_SCAN_LOG_DUMP rtas_fn_handle(RTAS_FNIDX__IBM_SCAN_LOG_DUMP) #define RTAS_FN_IBM_SET_DYNAMIC_INDICATOR rtas_fn_handle(RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR) #define RTAS_FN_IBM_SET_EEH_OPTION rtas_fn_handle(RTAS_FNIDX__IBM_SET_EEH_OPTION) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 7e793b503e29f1ff878e7289c8703e7c4cf20edc..8064d9c3de8620d27d9c87f829676ef048aeed40 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -375,8 +375,13 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { [RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW] = { .name = "ibm,remove-pe-dma-window", }, - [RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOWS] = { - .name = "ibm,reset-pe-dma-windows", + [RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOW] = { + /* + * Note: PAPR+ v2.13 7.3.31.4.1 spells this as + * "ibm,reset-pe-dma-windows" (plural), but RTAS + * implementations use the singular form in practice. 
+ */ + .name = "ibm,reset-pe-dma-window", }, [RTAS_FNIDX__IBM_SCAN_LOG_DUMP] = { .name = "ibm,scan-log-dump", diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 693334c20d07db70e46224333c565abc457be11a..a60e4139214be58384edca3e282f8df936b8c4ea 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -984,7 +984,7 @@ static bool shared_caches __ro_after_init; /* cpumask of CPUs with asymmetric SMT dependency */ static int powerpc_smt_flags(void) { - int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); @@ -1010,9 +1010,9 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack); static int powerpc_shared_cache_flags(void) { if (static_branch_unlikely(&splpar_asym_pack)) - return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING; + return SD_SHARE_LLC | SD_ASYM_PACKING; - return SD_SHARE_PKG_RESOURCES; + return SD_SHARE_LLC; } static int powerpc_shared_proc_flags(void) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 496e16c588aaa8edcd0294825862312471928506..e8c4129697b142ba48490481ee38793086e8425a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -574,29 +574,6 @@ static void iommu_table_setparms(struct pci_controller *phb, struct iommu_table_ops iommu_table_lpar_multi_ops; -/* - * iommu_table_setparms_lpar - * - * Function: On pSeries LPAR systems, return TCE table info, given a pci bus. 
- */ -static void iommu_table_setparms_lpar(struct pci_controller *phb, - struct device_node *dn, - struct iommu_table *tbl, - struct iommu_table_group *table_group, - const __be32 *dma_window) -{ - unsigned long offset, size, liobn; - - of_parse_dma_window(dn, dma_window, &liobn, &offset, &size); - - iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL, - &iommu_table_lpar_multi_ops); - - - table_group->tce32_start = offset; - table_group->tce32_size = size; -} - struct iommu_table_ops iommu_table_pseries_ops = { .set = tce_build_pSeries, .clear = tce_free_pSeries, @@ -724,26 +701,71 @@ struct iommu_table_ops iommu_table_lpar_multi_ops = { * dynamic 64bit DMA window, walking up the device tree. */ static struct device_node *pci_dma_find(struct device_node *dn, - const __be32 **dma_window) + struct dynamic_dma_window_prop *prop) { - const __be32 *dw = NULL; + const __be32 *default_prop = NULL; + const __be32 *ddw_prop = NULL; + struct device_node *rdn = NULL; + bool default_win = false, ddw_win = false; for ( ; dn && PCI_DN(dn); dn = dn->parent) { - dw = of_get_property(dn, "ibm,dma-window", NULL); - if (dw) { - if (dma_window) - *dma_window = dw; - return dn; + default_prop = of_get_property(dn, "ibm,dma-window", NULL); + if (default_prop) { + rdn = dn; + default_win = true; + } + ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL); + if (ddw_prop) { + rdn = dn; + ddw_win = true; + break; + } + ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL); + if (ddw_prop) { + rdn = dn; + ddw_win = true; + break; } - dw = of_get_property(dn, DIRECT64_PROPNAME, NULL); - if (dw) - return dn; - dw = of_get_property(dn, DMA64_PROPNAME, NULL); - if (dw) - return dn; + + /* At least found default window, which is the case for normal boot */ + if (default_win) + break; } - return NULL; + /* For PCI devices there will always be a DMA window, either on the device + * or parent bus + */ + WARN_ON(!(default_win | ddw_win)); + + /* 
caller doesn't want to get DMA window property */ + if (!prop) + return rdn; + + /* parse DMA window property. During normal system boot, only default + * DMA window is passed in OF. But, for kdump, a dedicated adapter might + * have both default and DDW in FDT. In this scenario, DDW takes precedence + * over default window. + */ + if (ddw_win) { + struct dynamic_dma_window_prop *p; + + p = (struct dynamic_dma_window_prop *)ddw_prop; + prop->liobn = p->liobn; + prop->dma_base = p->dma_base; + prop->tce_shift = p->tce_shift; + prop->window_shift = p->window_shift; + } else if (default_win) { + unsigned long offset, size, liobn; + + of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size); + + prop->liobn = cpu_to_be32((u32)liobn); + prop->dma_base = cpu_to_be64(offset); + prop->tce_shift = cpu_to_be32(IOMMU_PAGE_SHIFT_4K); + prop->window_shift = cpu_to_be32(order_base_2(size)); + } + + return rdn; } static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) @@ -751,17 +773,20 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) struct iommu_table *tbl; struct device_node *dn, *pdn; struct pci_dn *ppci; - const __be32 *dma_window = NULL; + struct dynamic_dma_window_prop prop; dn = pci_bus_to_OF_node(bus); pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n", dn); - pdn = pci_dma_find(dn, &dma_window); + pdn = pci_dma_find(dn, &prop); - if (dma_window == NULL) - pr_debug(" no ibm,dma-window property !\n"); + /* In PPC architecture, there will always be DMA window on bus or one of the + * parent bus. During reboot, there will be ibm,dma-window property to + * define DMA window. For kdump, there will at least be default window or DDW + * or both. 
+ */ ppci = PCI_DN(pdn); @@ -771,13 +796,24 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) if (!ppci->table_group) { ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); tbl = ppci->table_group->tables[0]; - if (dma_window) { - iommu_table_setparms_lpar(ppci->phb, pdn, tbl, - ppci->table_group, dma_window); - if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) - panic("Failed to initialize iommu table"); - } + iommu_table_setparms_common(tbl, ppci->phb->bus->number, + be32_to_cpu(prop.liobn), + be64_to_cpu(prop.dma_base), + 1ULL << be32_to_cpu(prop.window_shift), + be32_to_cpu(prop.tce_shift), NULL, + &iommu_table_lpar_multi_ops); + + /* Only for normal boot with default window. Doesn't matter even + * if we set these with DDW which is 64bit during kdump, since + * these will not be used during kdump. + */ + ppci->table_group->tce32_start = be64_to_cpu(prop.dma_base); + ppci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift); + + if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) + panic("Failed to initialize iommu table"); + iommu_register_group(ppci->table_group, pci_domain_nr(bus), 0); pr_debug(" created table: %p\n", ppci->table_group); @@ -968,6 +1004,12 @@ static void find_existing_ddw_windows_named(const char *name) continue; } + /* If at the time of system initialization, there are DDWs in OF, + * it means this is during kexec. DDW could be direct or dynamic. + * We will just mark DDWs as "dynamic" since this is kdump path, + * no need to worry about performance. ddw_list_new_entry() will + * set window->direct = false. 
+ */ window = ddw_list_new_entry(pdn, dma64); if (!window) { of_node_put(pdn); @@ -1524,8 +1566,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) { struct device_node *pdn, *dn; struct iommu_table *tbl; - const __be32 *dma_window = NULL; struct pci_dn *pci; + struct dynamic_dma_window_prop prop; pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev)); @@ -1538,7 +1580,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) dn = pci_device_to_OF_node(dev); pr_debug(" node is %pOF\n", dn); - pdn = pci_dma_find(dn, &dma_window); + pdn = pci_dma_find(dn, &prop); if (!pdn || !PCI_DN(pdn)) { printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: " "no DMA window found for pci dev=%s dn=%pOF\n", @@ -1551,8 +1593,20 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) if (!pci->table_group) { pci->table_group = iommu_pseries_alloc_group(pci->phb->node); tbl = pci->table_group->tables[0]; - iommu_table_setparms_lpar(pci->phb, pdn, tbl, - pci->table_group, dma_window); + + iommu_table_setparms_common(tbl, pci->phb->bus->number, + be32_to_cpu(prop.liobn), + be64_to_cpu(prop.dma_base), + 1ULL << be32_to_cpu(prop.window_shift), + be32_to_cpu(prop.tce_shift), NULL, + &iommu_table_lpar_multi_ops); + + /* Only for normal boot with default window. Doesn't matter even + * if we set these with DDW which is 64bit during kdump, since + * these will not be used during kdump. 
+ */ + pci->table_group->tce32_start = be64_to_cpu(prop.dma_base); + pci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift); iommu_init_table(tbl, pci->phb->node, 0, 0); iommu_register_group(pci->table_group, diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bffbd869a0682842883591788da784648acf1626..e3142ce531a097b8cf0e39251ba88ae143d6594c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -315,7 +315,6 @@ config AS_HAS_OPTION_ARCH # https://reviews.llvm.org/D123515 def_bool y depends on $(as-instr, .option arch$(comma) +m) - depends on !$(as-instr, .option arch$(comma) -i) source "arch/riscv/Kconfig.socs" source "arch/riscv/Kconfig.errata" diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 510014051f5dbb1aa61098e4974e7e7ac02145ee..2468c55933cd0d5d55d71d83a52226172bd5121c 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -424,6 +424,7 @@ # define CSR_STATUS CSR_MSTATUS # define CSR_IE CSR_MIE # define CSR_TVEC CSR_MTVEC +# define CSR_ENVCFG CSR_MENVCFG # define CSR_SCRATCH CSR_MSCRATCH # define CSR_EPC CSR_MEPC # define CSR_CAUSE CSR_MCAUSE @@ -448,6 +449,7 @@ # define CSR_STATUS CSR_SSTATUS # define CSR_IE CSR_SIE # define CSR_TVEC CSR_STVEC +# define CSR_ENVCFG CSR_SENVCFG # define CSR_SCRATCH CSR_SSCRATCH # define CSR_EPC CSR_SEPC # define CSR_CAUSE CSR_SCAUSE diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 3291721229523456247532009bc2ed2ddc444540..15055f9df4daa1e4250c8a37c64193bf5c943ee3 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -25,6 +25,11 @@ #define ARCH_SUPPORTS_FTRACE_OPS 1 #ifndef __ASSEMBLY__ + +extern void *return_address(unsigned int level); + +#define ftrace_return_address(n) return_address(n) + void MCOUNT_NAME(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index 
20f9c3ba2341412812ba003caf86f546c162bd34..22deb7a2a6ec4e4daba8322c7c6c28137b49f5f8 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -11,8 +11,10 @@ static inline void arch_clear_hugepage_flags(struct page *page) } #define arch_clear_hugepage_flags arch_clear_hugepage_flags +#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION bool arch_hugetlb_migration_supported(struct hstate *h); #define arch_hugetlb_migration_supported arch_hugetlb_migration_supported +#endif #ifdef CONFIG_RISCV_ISA_SVNAPOT #define __HAVE_ARCH_HUGE_PTE_CLEAR diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 5340f818746b71a805319eb6f941fa311c9b36a2..1f2d2599c655d20be6df7516382e20a7e3956301 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -81,6 +81,8 @@ #define RISCV_ISA_EXT_ZTSO 72 #define RISCV_ISA_EXT_ZACAS 73 +#define RISCV_ISA_EXT_XLINUXENVCFG 127 + #define RISCV_ISA_EXT_MAX 128 #define RISCV_ISA_EXT_INVALID U32_MAX diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index d169a4f41a2e728276a97898e1270c7b4763f9ed..c80bb9990d32ef706452d7d4fcc1c049cd7436d9 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -95,7 +95,13 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) __pud_free(mm, pud); } -#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud) +#define __pud_free_tlb(tlb, pud, addr) \ +do { \ + if (pgtable_l4_enabled) { \ + pagetable_pud_dtor(virt_to_ptdesc(pud)); \ + tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pud)); \ + } \ +} while (0) #define p4d_alloc_one p4d_alloc_one static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) @@ -124,7 +130,11 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) __p4d_free(mm, p4d); } -#define __p4d_free_tlb(tlb, p4d, addr) p4d_free((tlb)->mm, p4d) +#define __p4d_free_tlb(tlb, p4d, addr) \ +do { \ + if (pgtable_l5_enabled) \ + 
tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(p4d)); \ +} while (0) #endif /* __PAGETABLE_PMD_FOLDED */ static inline void sync_kernel_mappings(pgd_t *pgd) @@ -149,7 +159,11 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #ifndef __PAGETABLE_PMD_FOLDED -#define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd) +#define __pmd_free_tlb(tlb, pmd, addr) \ +do { \ + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); \ + tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \ +} while (0) #endif /* __PAGETABLE_PMD_FOLDED */ diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index b42017d76924f74386bc712719280af21781bb5d..b99bd66107a69038c835ead6b77725aaeaf882c3 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -136,7 +136,7 @@ enum napot_cont_order { * 10010 - IO Strongly-ordered, Non-cacheable, Non-bufferable, Shareable, Non-trustable */ #define _PAGE_PMA_THEAD ((1UL << 62) | (1UL << 61) | (1UL << 60)) -#define _PAGE_NOCACHE_THEAD ((1UL < 61) | (1UL << 60)) +#define _PAGE_NOCACHE_THEAD ((1UL << 61) | (1UL << 60)) #define _PAGE_IO_THEAD ((1UL << 63) | (1UL << 60)) #define _PAGE_MTMASK_THEAD (_PAGE_PMA_THEAD | _PAGE_IO_THEAD | (1UL << 59)) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 0c94260b5d0c126f6302f39a59507f19eed48dac..6066822e7396fa5078a546356a3a6f6605470712 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -84,7 +84,7 @@ * Define vmemmap for pfn_to_page & page_to_pfn calls. Needed if kernel * is configured with CONFIG_SPARSEMEM_VMEMMAP enabled. */ -#define vmemmap ((struct page *)VMEMMAP_START) +#define vmemmap ((struct page *)VMEMMAP_START - (phys_ram_base >> PAGE_SHIFT)) #define PCI_IO_SIZE SZ_16M #define PCI_IO_END VMEMMAP_START @@ -439,6 +439,10 @@ static inline pte_t pte_mkhuge(pte_t pte) return pte; } +#define pte_leaf_size(pte) (pte_napot(pte) ? 
\ + napot_cont_size(napot_cont_order(pte)) :\ + PAGE_SIZE) + #ifdef CONFIG_NUMA_BALANCING /* * See the comment in include/asm-generic/pgtable.h diff --git a/arch/riscv/include/asm/suspend.h b/arch/riscv/include/asm/suspend.h index 02f87867389a9e660f91b64c7ca818a6b61637dc..491296a335d0ce6cd9c8f242646c3c60c762bc87 100644 --- a/arch/riscv/include/asm/suspend.h +++ b/arch/riscv/include/asm/suspend.h @@ -14,6 +14,7 @@ struct suspend_context { struct pt_regs regs; /* Saved and restored by high-level functions */ unsigned long scratch; + unsigned long envcfg; unsigned long tvec; unsigned long ie; #ifdef CONFIG_MMU diff --git a/arch/riscv/include/asm/vmalloc.h b/arch/riscv/include/asm/vmalloc.h index 924d01b56c9a1eb1eacd53a923fc55591cda654f..51f6dfe19745aa486bd73d7de472faa538cf0486 100644 --- a/arch/riscv/include/asm/vmalloc.h +++ b/arch/riscv/include/asm/vmalloc.h @@ -19,65 +19,6 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot) return true; } -#ifdef CONFIG_RISCV_ISA_SVNAPOT -#include +#endif -#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size -static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, unsigned long end, - u64 pfn, unsigned int max_page_shift) -{ - unsigned long map_size = PAGE_SIZE; - unsigned long size, order; - - if (!has_svnapot()) - return map_size; - - for_each_napot_order_rev(order) { - if (napot_cont_shift(order) > max_page_shift) - continue; - - size = napot_cont_size(order); - if (end - addr < size) - continue; - - if (!IS_ALIGNED(addr, size)) - continue; - - if (!IS_ALIGNED(PFN_PHYS(pfn), size)) - continue; - - map_size = size; - break; - } - - return map_size; -} - -#define arch_vmap_pte_supported_shift arch_vmap_pte_supported_shift -static inline int arch_vmap_pte_supported_shift(unsigned long size) -{ - int shift = PAGE_SHIFT; - unsigned long order; - - if (!has_svnapot()) - return shift; - - WARN_ON_ONCE(size >= PMD_SIZE); - - for_each_napot_order_rev(order) { - if (napot_cont_size(order) > size) 
- continue; - - if (!IS_ALIGNED(size, napot_cont_size(order))) - continue; - - shift = napot_cont_shift(order); - break; - } - - return shift; -} - -#endif /* CONFIG_RISCV_ISA_SVNAPOT */ -#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #endif /* _ASM_RISCV_VMALLOC_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index f71910718053d841a361fd97e7d62da4f86bebcf..604d6bf7e47672e9b01902f6fa497aeb4e102ee5 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -7,6 +7,7 @@ ifdef CONFIG_FTRACE CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_patch.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_sbi.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) endif CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,) CFLAGS_compat_syscall_table.o += $(call cc-option,-Wno-override-init,) @@ -46,6 +47,7 @@ obj-y += irq.o obj-y += process.o obj-y += ptrace.o obj-y += reset.o +obj-y += return_address.o obj-y += setup.o obj-y += signal.o obj-y += syscall_table.o diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 89920f84d0a34385471e9afbf9c26d287cbbd838..79a5a35fab964d3b54db97b5504f45f68dface11 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "copy-unaligned.h" @@ -201,6 +202,16 @@ static const unsigned int riscv_zvbb_exts[] = { RISCV_ISA_EXT_ZVKB }; +/* + * While the [ms]envcfg CSRs were not defined until version 1.12 of the RISC-V + * privileged ISA, the existence of the CSRs is implied by any extension which + * specifies [ms]envcfg bit(s). Hence, we define a custom ISA extension for the + * existence of the CSR, and treat it as a subset of those other extensions. 
+ */ +static const unsigned int riscv_xlinuxenvcfg_exts[] = { + RISCV_ISA_EXT_XLINUXENVCFG +}; + /* * The canonical order of ISA extension names in the ISA string is defined in * chapter 27 of the unprivileged specification. @@ -250,8 +261,8 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_DATA(c, RISCV_ISA_EXT_c), __RISCV_ISA_EXT_DATA(v, RISCV_ISA_EXT_v), __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h), - __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM), - __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ), + __RISCV_ISA_EXT_SUPERSET(zicbom, RISCV_ISA_EXT_ZICBOM, riscv_xlinuxenvcfg_exts), + __RISCV_ISA_EXT_SUPERSET(zicboz, RISCV_ISA_EXT_ZICBOZ, riscv_xlinuxenvcfg_exts), __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR), __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND), __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR), @@ -538,6 +549,20 @@ static void __init riscv_fill_hwcap_from_isa_string(unsigned long *isa2hwcap) set_bit(RISCV_ISA_EXT_ZIHPM, isainfo->isa); } + /* + * "V" in ISA strings is ambiguous in practice: it should mean + * just the standard V-1.0 but vendors aren't well behaved. + * Many vendors with T-Head CPU cores which implement the 0.7.1 + * version of the vector specification put "v" into their DTs. + * CPU cores with the ratified spec will contain non-zero + * marchid. + */ + if (acpi_disabled && riscv_cached_mvendorid(cpu) == THEAD_VENDOR_ID && + riscv_cached_marchid(cpu) == 0x0) { + this_hwcap &= ~isa2hwcap[RISCV_ISA_EXT_v]; + clear_bit(RISCV_ISA_EXT_v, isainfo->isa); + } + /* * All "okay" hart should have same isa. 
Set HWCAP based on * common capabilities of every "okay" hart, in case they don't @@ -950,7 +975,7 @@ arch_initcall(check_unaligned_access_all_cpus); void riscv_user_isa_enable(void) { if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICBOZ)) - csr_set(CSR_SENVCFG, ENVCFG_CBZE); + csr_set(CSR_ENVCFG, ENVCFG_CBZE); } #ifdef CONFIG_RISCV_ALTERNATIVE diff --git a/arch/riscv/kernel/return_address.c b/arch/riscv/kernel/return_address.c new file mode 100644 index 0000000000000000000000000000000000000000..c8115ec8fb304bb5b44fdbfc38c5d55605f3c5c2 --- /dev/null +++ b/arch/riscv/kernel/return_address.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This code come from arch/arm64/kernel/return_address.c + * + * Copyright (C) 2023 SiFive. + */ + +#include +#include +#include + +struct return_address_data { + unsigned int level; + void *addr; +}; + +static bool save_return_addr(void *d, unsigned long pc) +{ + struct return_address_data *data = d; + + if (!data->level) { + data->addr = (void *)pc; + return false; + } + + --data->level; + + return true; +} +NOKPROBE_SYMBOL(save_return_addr); + +noinline void *return_address(unsigned int level) +{ + struct return_address_data data; + + data.level = level + 3; + data.addr = NULL; + + arch_stack_walk(save_return_addr, &data, current, NULL); + + if (!data.level) + return data.addr; + else + return NULL; + +} +EXPORT_SYMBOL_GPL(return_address); +NOKPROBE_SYMBOL(return_address); diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 519b6bd946e5d1b69edf3379e31b345e38a03deb..c4ed7d977f57b2c305e5ecbf2b767fd9b0c175df 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -42,10 +42,6 @@ static DECLARE_COMPLETION(cpu_running); -void __init smp_prepare_boot_cpu(void) -{ -} - void __init smp_prepare_cpus(unsigned int max_cpus) { int cpuid; diff --git a/arch/riscv/kernel/suspend.c b/arch/riscv/kernel/suspend.c index 
239509367e4233336806c19da964a06537d5a9b5..299795341e8a2207dc922373511e31118bbd0f8b 100644 --- a/arch/riscv/kernel/suspend.c +++ b/arch/riscv/kernel/suspend.c @@ -15,6 +15,8 @@ void suspend_save_csrs(struct suspend_context *context) { context->scratch = csr_read(CSR_SCRATCH); + if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_XLINUXENVCFG)) + context->envcfg = csr_read(CSR_ENVCFG); context->tvec = csr_read(CSR_TVEC); context->ie = csr_read(CSR_IE); @@ -36,6 +38,8 @@ void suspend_save_csrs(struct suspend_context *context) void suspend_restore_csrs(struct suspend_context *context) { csr_write(CSR_SCRATCH, context->scratch); + if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_XLINUXENVCFG)) + csr_write(CSR_ENVCFG, context->envcfg); csr_write(CSR_TVEC, context->tvec); csr_write(CSR_IE, context->ie); diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 2cf76218a5bd02c8f148318f78abd648dab276bf..98315b98256df412d48479c0114b5a5b49a7b583 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -30,14 +30,8 @@ enum rv_vdso_map { #define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT) -/* - * The vDSO data page. 
- */ -static union { - struct vdso_data data; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = &vdso_data_store.data; +static union vdso_data_store vdso_data_store __page_aligned_data; +struct vdso_data *vdso_data = vdso_data_store.data; struct __vdso_info { const char *name; diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 29c7606414d276d1c3639e2a80e10037ea899cfc..5ef2a6891158a6d59de8f36b4f4d98cf3ad6eb2a 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -426,10 +426,12 @@ bool __init arch_hugetlb_valid_size(unsigned long size) return __hugetlb_valid_size(size); } +#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION bool arch_hugetlb_migration_supported(struct hstate *h) { return __hugetlb_valid_size(huge_page_size(h)); } +#endif #ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index c924be0d7ed873b2ab9b82a7ab789598f497b016..06756bad5e30ffbe8e4d0153517f180cbd82a1cd 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -880,4 +880,3 @@ CONFIG_ATOMIC64_SELFTEST=y CONFIG_STRING_SELFTEST=y CONFIG_TEST_BITOPS=m CONFIG_TEST_BPF=m -CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index c8f0c9fe40d708e9b082df3ac0fd5fb901883584..d33f814f78b2c115f31bdbb9bfbe6417258501f0 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -808,4 +808,3 @@ CONFIG_KPROBES_SANITY_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BPF=m -CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/include/asm/vdso/data.h b/arch/s390/include/asm/vdso/data.h index 73ee8914266629559ad11053f79984ac49b4b97e..0e2b40ef69b049c5e79ab2e31811e1e6e6ef2475 100644 --- a/arch/s390/include/asm/vdso/data.h +++ b/arch/s390/include/asm/vdso/data.h @@ -3,7 +3,6 @@ #define __S390_ASM_VDSO_DATA_H #include -#include 
struct arch_vdso_data { __s64 tod_steering_delta; diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index bbaefd84f15e4f37efeee0123d0be9ef122f8886..a45b3a4c91db0f46a9518c47dd9356cf74907b71 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -25,10 +25,7 @@ extern char vdso32_start[], vdso32_end[]; static struct vm_special_mapping vvar_mapping; -static union { - struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; +static union vdso_data_store vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index f3969a3600dbfe4931a18d95da4e2ee8fb53503f..a0cc9bb41a921cb3acd74c50a40f663bc042f2e3 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1206,10 +1206,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) { } -void smp_prepare_boot_cpu(void) -{ -} - void __init smp_setup_processor_id(void) { if (tlb_type == spitfire) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 92ee2697ff398458c004ce4bc7515abfc0def5d4..63fc062add708cf8e09f5f23185478ef8b85a88a 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -108,8 +108,6 @@ static inline void ubd_set_bit(__u64 bit, unsigned char *data) static DEFINE_MUTEX(ubd_lock); static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */ -static int ubd_open(struct gendisk *disk, blk_mode_t mode); -static void ubd_release(struct gendisk *disk); static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); @@ -118,16 +116,11 @@ static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); static const struct block_device_operations ubd_blops = { .owner = THIS_MODULE, - .open = ubd_open, - .release = ubd_release, .ioctl = ubd_ioctl, .compat_ioctl = 
blkdev_compat_ptr_ioctl, .getgeo = ubd_getgeo, }; -/* Protected by ubd_lock */ -static struct gendisk *ubd_gendisk[MAX_DEV]; - #ifdef CONFIG_BLK_DEV_UBD_SYNC #define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \ .cl = 1 }) @@ -155,7 +148,6 @@ struct ubd { * backing or the cow file. */ char *file; char *serial; - int count; int fd; __u64 size; struct openflags boot_openflags; @@ -165,7 +157,7 @@ struct ubd { unsigned no_trim:1; struct cow cow; struct platform_device pdev; - struct request_queue *queue; + struct gendisk *disk; struct blk_mq_tag_set tag_set; spinlock_t lock; }; @@ -181,7 +173,6 @@ struct ubd { #define DEFAULT_UBD { \ .file = NULL, \ .serial = NULL, \ - .count = 0, \ .fd = -1, \ .size = -1, \ .boot_openflags = OPEN_FLAGS, \ @@ -774,8 +765,6 @@ static int ubd_open_dev(struct ubd *ubd_dev) ubd_dev->fd = fd; if(ubd_dev->cow.file != NULL){ - blk_queue_max_hw_sectors(ubd_dev->queue, 8 * sizeof(long)); - err = -ENOMEM; ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len); if(ubd_dev->cow.bitmap == NULL){ @@ -797,11 +786,6 @@ static int ubd_open_dev(struct ubd *ubd_dev) if(err < 0) goto error; ubd_dev->cow.fd = err; } - if (ubd_dev->no_trim == 0) { - blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST); - blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST); - } - blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue); return 0; error: os_close_file(ubd_dev->fd); @@ -851,27 +835,6 @@ static const struct attribute_group *ubd_attr_groups[] = { NULL, }; -static int ubd_disk_register(int major, u64 size, int unit, - struct gendisk *disk) -{ - disk->major = major; - disk->first_minor = unit << UBD_SHIFT; - disk->minors = 1 << UBD_SHIFT; - disk->fops = &ubd_blops; - set_capacity(disk, size / 512); - sprintf(disk->disk_name, "ubd%c", 'a' + unit); - - ubd_devs[unit].pdev.id = unit; - ubd_devs[unit].pdev.name = DRIVER_NAME; - ubd_devs[unit].pdev.dev.release = ubd_device_release; - dev_set_drvdata(&ubd_devs[unit].pdev.dev, 
&ubd_devs[unit]); - platform_device_register(&ubd_devs[unit].pdev); - - disk->private_data = &ubd_devs[unit]; - disk->queue = ubd_devs[unit].queue; - return device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups); -} - #define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE)) static const struct blk_mq_ops ubd_mq_ops = { @@ -881,18 +844,36 @@ static const struct blk_mq_ops ubd_mq_ops = { static int ubd_add(int n, char **error_out) { struct ubd *ubd_dev = &ubd_devs[n]; + struct queue_limits lim = { + .max_segments = MAX_SG, + .seg_boundary_mask = PAGE_SIZE - 1, + }; struct gendisk *disk; int err = 0; if(ubd_dev->file == NULL) goto out; + if (ubd_dev->cow.file) + lim.max_hw_sectors = 8 * sizeof(long); + if (!ubd_dev->no_trim) { + lim.max_hw_discard_sectors = UBD_MAX_REQUEST; + lim.max_write_zeroes_sectors = UBD_MAX_REQUEST; + } + err = ubd_file_size(ubd_dev, &ubd_dev->size); if(err < 0){ *error_out = "Couldn't determine size of device's file"; goto out; } + err = ubd_open_dev(ubd_dev); + if (err) { + pr_err("ubd%c: Can't open \"%s\": errno = %d\n", + 'a' + n, ubd_dev->file, -err); + goto out; + } + ubd_dev->size = ROUND_BLOCK(ubd_dev->size); ubd_dev->tag_set.ops = &ubd_mq_ops; @@ -904,29 +885,43 @@ static int ubd_add(int n, char **error_out) err = blk_mq_alloc_tag_set(&ubd_dev->tag_set); if (err) - goto out; + goto out_close; - disk = blk_mq_alloc_disk(&ubd_dev->tag_set, ubd_dev); + disk = blk_mq_alloc_disk(&ubd_dev->tag_set, &lim, ubd_dev); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; } - ubd_dev->queue = disk->queue; - blk_queue_write_cache(ubd_dev->queue, true, false); - blk_queue_max_segments(ubd_dev->queue, MAX_SG); - blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1); - err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk); + blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); + blk_queue_write_cache(disk->queue, true, false); + disk->major = UBD_MAJOR; + disk->first_minor = n << UBD_SHIFT; + disk->minors = 1 << 
UBD_SHIFT; + disk->fops = &ubd_blops; + set_capacity(disk, ubd_dev->size / 512); + sprintf(disk->disk_name, "ubd%c", 'a' + n); + disk->private_data = ubd_dev; + set_disk_ro(disk, !ubd_dev->openflags.w); + + ubd_dev->pdev.id = n; + ubd_dev->pdev.name = DRIVER_NAME; + ubd_dev->pdev.dev.release = ubd_device_release; + dev_set_drvdata(&ubd_dev->pdev.dev, ubd_dev); + platform_device_register(&ubd_dev->pdev); + + err = device_add_disk(&ubd_dev->pdev.dev, disk, ubd_attr_groups); if (err) goto out_cleanup_disk; - ubd_gendisk[n] = disk; return 0; out_cleanup_disk: put_disk(disk); out_cleanup_tags: blk_mq_free_tag_set(&ubd_dev->tag_set); +out_close: + ubd_close_dev(ubd_dev); out: return err; } @@ -1012,7 +1007,6 @@ static int ubd_id(char **str, int *start_out, int *end_out) static int ubd_remove(int n, char **error_out) { - struct gendisk *disk = ubd_gendisk[n]; struct ubd *ubd_dev; int err = -ENODEV; @@ -1023,15 +1017,15 @@ static int ubd_remove(int n, char **error_out) if(ubd_dev->file == NULL) goto out; - /* you cannot remove a open disk */ - err = -EBUSY; - if(ubd_dev->count > 0) - goto out; + if (ubd_dev->disk) { + /* you cannot remove a open disk */ + err = -EBUSY; + if (disk_openers(ubd_dev->disk)) + goto out; - ubd_gendisk[n] = NULL; - if(disk != NULL){ - del_gendisk(disk); - put_disk(disk); + del_gendisk(ubd_dev->disk); + ubd_close_dev(ubd_dev); + put_disk(ubd_dev->disk); } err = 0; @@ -1153,37 +1147,6 @@ static int __init ubd_driver_init(void){ device_initcall(ubd_driver_init); -static int ubd_open(struct gendisk *disk, blk_mode_t mode) -{ - struct ubd *ubd_dev = disk->private_data; - int err = 0; - - mutex_lock(&ubd_mutex); - if(ubd_dev->count == 0){ - err = ubd_open_dev(ubd_dev); - if(err){ - printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n", - disk->disk_name, ubd_dev->file, -err); - goto out; - } - } - ubd_dev->count++; - set_disk_ro(disk, !ubd_dev->openflags.w); -out: - mutex_unlock(&ubd_mutex); - return err; -} - -static void ubd_release(struct gendisk 
*disk) -{ - struct ubd *ubd_dev = disk->private_data; - - mutex_lock(&ubd_mutex); - if(--ubd_dev->count == 0) - ubd_close_dev(ubd_dev); - mutex_unlock(&ubd_mutex); -} - static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask, __u64 *cow_offset, unsigned long *bitmap, __u64 bitmap_offset, unsigned long *bitmap_words, diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 63535c8c8b2a7aecaf818cec3a2f26b864debfd1..720b963881914498e31db02c335634781744dccf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -147,6 +147,7 @@ config X86 select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) + select GENERIC_CLOCKEVENTS_BROADCAST_IDLE if GENERIC_CLOCKEVENTS_BROADCAST select GENERIC_CLOCKEVENTS_MIN_ADJUST select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE @@ -496,6 +497,15 @@ config X86_CPU_RESCTRL Say N if unsure. +config X86_FRED + bool "Flexible Return and Event Delivery" + depends on X86_64 + help + When enabled, try to use Flexible Return and Event Delivery + instead of the legacy SYSCALL/SYSENTER/IDT architecture for + ring transitions and exception/interrupt handling if the + system supports. + if X86_32 config X86_BIGSMP bool "Support for big SMP systems with more than 8 CPUs" @@ -2421,6 +2431,18 @@ source "kernel/livepatch/Kconfig" endmenu +config CC_HAS_NAMED_AS + def_bool CC_IS_GCC && GCC_VERSION >= 120100 + +config USE_X86_SEG_SUPPORT + def_bool y + depends on CC_HAS_NAMED_AS + # + # -fsanitize=kernel-address (KASAN) is at the moment incompatible + # with named address spaces - see GCC PR sanitizer/111736. 
+ # + depends on !KASAN + config CC_HAS_SLS def_bool $(cc-option,-mharden-sls=all) @@ -2452,12 +2474,12 @@ config CALL_PADDING config FINEIBT def_bool y - depends on X86_KERNEL_IBT && CFI_CLANG && RETPOLINE + depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE select CALL_PADDING config HAVE_CALL_THUNKS def_bool y - depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL + depends on CC_HAS_ENTRY_PADDING && MITIGATION_RETHUNK && OBJTOOL config CALL_THUNKS def_bool n @@ -2479,7 +2501,7 @@ menuconfig SPECULATION_MITIGATIONS if SPECULATION_MITIGATIONS -config PAGE_TABLE_ISOLATION +config MITIGATION_PAGE_TABLE_ISOLATION bool "Remove the kernel mapping in user mode" default y depends on (X86_64 || X86_PAE) @@ -2490,7 +2512,7 @@ config PAGE_TABLE_ISOLATION See Documentation/arch/x86/pti.rst for more details. -config RETPOLINE +config MITIGATION_RETPOLINE bool "Avoid speculative indirect branches in kernel" select OBJTOOL if HAVE_OBJTOOL default y @@ -2500,9 +2522,9 @@ config RETPOLINE branches. Requires a compiler with -mindirect-branch=thunk-extern support for full protection. The kernel may run slower. -config RETHUNK +config MITIGATION_RETHUNK bool "Enable return-thunks" - depends on RETPOLINE && CC_HAS_RETURN_THUNK + depends on MITIGATION_RETPOLINE && CC_HAS_RETURN_THUNK select OBJTOOL if HAVE_OBJTOOL default y if X86_64 help @@ -2511,14 +2533,14 @@ config RETHUNK Requires a compiler with -mfunction-return=thunk-extern support for full protection. The kernel may run slower. -config CPU_UNRET_ENTRY +config MITIGATION_UNRET_ENTRY bool "Enable UNRET on kernel entry" - depends on CPU_SUP_AMD && RETHUNK && X86_64 + depends on CPU_SUP_AMD && MITIGATION_RETHUNK && X86_64 default y help Compile the kernel with support for the retbleed=unret mitigation. 
-config CALL_DEPTH_TRACKING +config MITIGATION_CALL_DEPTH_TRACKING bool "Mitigate RSB underflow with call depth tracking" depends on CPU_SUP_INTEL && HAVE_CALL_THUNKS select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE @@ -2538,7 +2560,7 @@ config CALL_DEPTH_TRACKING config CALL_THUNKS_DEBUG bool "Enable call thunks and call depth tracking debugging" - depends on CALL_DEPTH_TRACKING + depends on MITIGATION_CALL_DEPTH_TRACKING select FUNCTION_ALIGNMENT_32B default n help @@ -2549,14 +2571,14 @@ config CALL_THUNKS_DEBUG Only enable this when you are debugging call thunks as this creates a noticeable runtime overhead. If unsure say N. -config CPU_IBPB_ENTRY +config MITIGATION_IBPB_ENTRY bool "Enable IBPB on kernel entry" depends on CPU_SUP_AMD && X86_64 default y help Compile the kernel with support for the retbleed=ibpb mitigation. -config CPU_IBRS_ENTRY +config MITIGATION_IBRS_ENTRY bool "Enable IBRS on kernel entry" depends on CPU_SUP_INTEL && X86_64 default y @@ -2565,14 +2587,14 @@ config CPU_IBRS_ENTRY This mitigates both spectre_v2 and retbleed at great cost to performance. -config CPU_SRSO +config MITIGATION_SRSO bool "Mitigate speculative RAS overflow on AMD" - depends on CPU_SUP_AMD && X86_64 && RETHUNK + depends on CPU_SUP_AMD && X86_64 && MITIGATION_RETHUNK default y help Enable the SRSO mitigation needed on AMD Zen1-4 machines. -config SLS +config MITIGATION_SLS bool "Mitigate Straight-Line-Speculation" depends on CC_HAS_SLS && X86_64 select OBJTOOL if HAVE_OBJTOOL @@ -2582,7 +2604,7 @@ config SLS against straight line speculation. The kernel image might be slightly larger. 
-config GDS_FORCE_MITIGATION +config MITIGATION_GDS_FORCE bool "Force GDS Mitigation" depends on CPU_SUP_INTEL default n diff --git a/arch/x86/Makefile b/arch/x86/Makefile index da8f3caf27815e39592443c7c8c09674fe9e2362..1eccc2ee45fb7d4a7ad8f07b410339f3decbde13 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -22,7 +22,7 @@ RETPOLINE_VDSO_CFLAGS := -mretpoline endif RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix) -ifdef CONFIG_RETHUNK +ifdef CONFIG_MITIGATION_RETHUNK RETHUNK_CFLAGS := -mfunction-return=thunk-extern RETPOLINE_CFLAGS += $(RETHUNK_CFLAGS) endif @@ -53,6 +53,9 @@ REALMODE_CFLAGS += -fno-stack-protector REALMODE_CFLAGS += -Wno-address-of-packed-member REALMODE_CFLAGS += $(cc_stack_align4) REALMODE_CFLAGS += $(CLANG_FLAGS) +ifdef CONFIG_CC_IS_CLANG +REALMODE_CFLAGS += -Wno-gnu +endif export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -192,7 +195,7 @@ KBUILD_CFLAGS += -Wno-sign-compare KBUILD_CFLAGS += -fno-asynchronous-unwind-tables # Avoid indirect branches in kernel to deal with Spectre -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) # Additionally, avoid generating expensive indirect jumps which # are subject to retpolines for small number of switch cases. 
@@ -205,7 +208,7 @@ ifdef CONFIG_RETPOLINE endif endif -ifdef CONFIG_SLS +ifdef CONFIG_MITIGATION_SLS KBUILD_CFLAGS += -mharden-sls=all endif @@ -296,12 +299,11 @@ install: vdso-install-$(CONFIG_X86_64) += arch/x86/entry/vdso/vdso64.so.dbg vdso-install-$(CONFIG_X86_X32_ABI) += arch/x86/entry/vdso/vdsox32.so.dbg -vdso-install-$(CONFIG_X86_32) += arch/x86/entry/vdso/vdso32.so.dbg -vdso-install-$(CONFIG_IA32_EMULATION) += arch/x86/entry/vdso/vdso32.so.dbg +vdso-install-$(CONFIG_COMPAT_32) += arch/x86/entry/vdso/vdso32.so.dbg archprepare: checkbin checkbin: -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE ifeq ($(RETPOLINE_CFLAGS),) @echo "You are building kernel with non-retpoline compiler." >&2 @echo "Please update your compiler." >&2 diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 18d15d1ce87d5993946c31579b006594e9c4e9da..f196b1d1ddf867b62161cb0528872526c82a693f 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -5,6 +5,8 @@ #include "../string.h" #include "efi.h" +#include + #include /* diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index c1bb180973ea2a74eaf7b1788faeeba366643daa..e162d7f59cc5bdabfc129f68cb9e4ac79a29bb1e 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" +#include + static unsigned long fs; static inline void set_fs(unsigned long seg) { diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c index 6edd034b0b30cb2df90937e06d30e125cf88268f..f2e50f9758e6cccebc1f380e6e1ee6e6c484e2a4 100644 --- a/arch/x86/boot/compressed/efi.c +++ b/arch/x86/boot/compressed/efi.c @@ -7,6 +7,8 @@ #include "misc.h" +#include + /** * efi_get_type - Given a pointer to boot_params, determine the type of EFI environment. 
* diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h index 866c0af8b5b9e2a6ae8b7de0a28e3c77f47f933d..b22300970f97daf8cf051a337c1ae96f6ebd3488 100644 --- a/arch/x86/boot/compressed/efi.h +++ b/arch/x86/boot/compressed/efi.h @@ -97,15 +97,6 @@ typedef struct { u32 tables; } efi_system_table_32_t; -/* kexec external ABI */ -struct efi_setup_data { - u64 fw_vendor; - u64 __unused; - u64 tables; - u64 smbios; - u64 reserved[8]; -}; - struct efi_unaccepted_memory { u32 version; u32 unit_size; diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index 4a029baa5147748e19cd3699e9a6d67778396b29..909f2a35b60c5da423868732ee9b2c3aa2e45ccd 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -8,8 +8,8 @@ * Copyright (C) 2016 Kees Cook */ -/* No PAGE_TABLE_ISOLATION support needed either: */ -#undef CONFIG_PAGE_TABLE_ISOLATION +/* No MITIGATION_PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION #include "error.h" #include "misc.h" diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 51f957b24ba7a2babdd7c89d3b6cf0b070519691..c882e1f67af01c50a20bfe00a32138dc771ee88c 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" +#include #include #include #include "pgtable.h" diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index bea0719d70f2d2ea08e4e2a4e80d9c684481a2cf..ec71846d28c9ed1230fc910954974aca73e51fcf 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -12,6 +12,7 @@ */ #include "misc.h" +#include #include #include #include @@ -372,7 +373,7 @@ static void enforce_vmpl0(void) MSR_AMD64_SNP_VMPL_SSS | \ MSR_AMD64_SNP_SECURE_TSC | \ MSR_AMD64_SNP_VMGEXIT_PARAM | \ - MSR_AMD64_SNP_VMSA_REG_PROTECTION | \ + 
MSR_AMD64_SNP_VMSA_REG_PROT | \ MSR_AMD64_SNP_RESERVED_BIT13 | \ MSR_AMD64_SNP_RESERVED_BIT15 | \ MSR_AMD64_SNP_RESERVED_MASK) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 73abbbdd26f87d73ea85b962f686ef178f06bad3..91801138b10bbffa44d6702dbcecdab5275fd034 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -42,7 +42,7 @@ CONFIG_EFI_STUB=y CONFIG_HZ_1000=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y -# CONFIG_RETHUNK is not set +# CONFIG_MITIGATION_RETHUNK is not set CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y CONFIG_PM_TRACE_RTC=y diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index ca2fe186994b0a653a525583e18b90b0715f3fa7..c93e7f5c2a065233a04637702d5df401553ed6cc 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -18,6 +18,9 @@ obj-y += vdso/ obj-y += vsyscall/ obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o +CFLAGS_entry_fred.o += -fno-stack-protector +CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE) +obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o + obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o - diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 9f1d94790a54912cc431e9e39fc0a4a7e6069398..ea81770629eea62532f8f6429bc08cf6e3be7949 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -65,7 +65,7 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 +.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 unwind_hint=1 .if \save_ret pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ @@ -87,14 +87,17 @@ For 32-bit we have the following conventions - kernel is built with pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + + .if \unwind_hint UNWIND_HINT_REGS + .endif .if \save_ret pushq %rsi 
/* return address on top of stack */ .endif .endm -.macro CLEAR_REGS +.macro CLEAR_REGS clear_bp=1 /* * Sanitize registers of values that a speculation attack might * otherwise want to exploit. The lower registers are likely clobbered @@ -109,7 +112,9 @@ For 32-bit we have the following conventions - kernel is built with xorl %r10d, %r10d /* nospec r10 */ xorl %r11d, %r11d /* nospec r11 */ xorl %ebx, %ebx /* nospec rbx */ + .if \clear_bp xorl %ebp, %ebp /* nospec rbp */ + .endif xorl %r12d, %r12d /* nospec r12 */ xorl %r13d, %r13d /* nospec r13 */ xorl %r14d, %r14d /* nospec r14 */ @@ -117,9 +122,9 @@ For 32-bit we have the following conventions - kernel is built with .endm -.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 - PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret - CLEAR_REGS +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_bp=1 unwind_hint=1 + PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret unwind_hint=\unwind_hint + CLEAR_REGS clear_bp=\clear_bp .endm .macro POP_REGS pop_rdi=1 @@ -142,10 +147,10 @@ For 32-bit we have the following conventions - kernel is built with .endif .endm -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two + * MITIGATION_PAGE_TABLE_ISOLATION PGDs are 8k. 
Flip bit 12 to switch between the two * halves: */ #define PTI_USER_PGTABLE_BIT PAGE_SHIFT @@ -160,7 +165,7 @@ For 32-bit we have the following conventions - kernel is built with .macro ADJUST_KERNEL_CR3 reg:req ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID - /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ + /* Clear PCID and "MITIGATION_PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg .endm @@ -173,7 +178,7 @@ For 32-bit we have the following conventions - kernel is built with .endm #define THIS_CPU_user_pcid_flush_mask \ - PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask + PER_CPU_VAR(cpu_tlbstate + TLB_STATE_user_pcid_flush_mask) .macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req mov %cr3, \scratch_reg @@ -239,17 +244,19 @@ For 32-bit we have the following conventions - kernel is built with .Ldone_\@: .endm -.macro RESTORE_CR3 scratch_reg:req save_reg:req +/* Restore CR3 from a kernel context. May restore a user CR3 value. */ +.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI - ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID - /* - * KERNEL pages can always resume with NOFLUSH as we do - * explicit flushes. + * If CR3 contained the kernel page tables at the paranoid exception + * entry, then there is nothing to restore as CR3 is not modified while + * handling the exception. 
*/ bt $PTI_USER_PGTABLE_BIT, \save_reg - jnc .Lnoflush_\@ + jnc .Lend_\@ + + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID /* * Check if there's a pending flush for the user ASID we're @@ -257,25 +264,17 @@ For 32-bit we have the following conventions - kernel is built with */ movq \save_reg, \scratch_reg andq $(0x7FF), \scratch_reg - bt \scratch_reg, THIS_CPU_user_pcid_flush_mask - jnc .Lnoflush_\@ - btr \scratch_reg, THIS_CPU_user_pcid_flush_mask - jmp .Lwrcr3_\@ + jc .Lwrcr3_\@ -.Lnoflush_\@: SET_NOFLUSH_BIT \save_reg .Lwrcr3_\@: - /* - * The CR3 write could be avoided when not changing its value, - * but would require a CR3 read *and* a scratch register. - */ movq \save_reg, %cr3 .Lend_\@: .endm -#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ +#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=n: */ .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req .endm @@ -285,7 +284,7 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req .endm -.macro RESTORE_CR3 scratch_reg:req save_reg:req +.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req .endm #endif @@ -303,7 +302,7 @@ For 32-bit we have the following conventions - kernel is built with * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set. */ .macro IBRS_ENTER save_reg -#ifdef CONFIG_CPU_IBRS_ENTRY +#ifdef CONFIG_MITIGATION_IBRS_ENTRY ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS movl $MSR_IA32_SPEC_CTRL, %ecx @@ -332,7 +331,7 @@ For 32-bit we have the following conventions - kernel is built with * regs. Must be called after the last RET. */ .macro IBRS_EXIT save_reg -#ifdef CONFIG_CPU_IBRS_ENTRY +#ifdef CONFIG_MITIGATION_IBRS_ENTRY ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS movl $MSR_IA32_SPEC_CTRL, %ecx @@ -426,3 +425,63 @@ For 32-bit we have the following conventions - kernel is built with .endm #endif /* CONFIG_SMP */ + +#ifdef CONFIG_X86_64 + +/* rdi: arg1 ... normal C conventions. 
rax is saved/restored. */ +.macro THUNK name, func +SYM_FUNC_START(\name) + pushq %rbp + movq %rsp, %rbp + + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 + + call \func + + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + popq %rbp + RET +SYM_FUNC_END(\name) + _ASM_NOKPROBE(\name) +.endm + +#else /* CONFIG_X86_32 */ + +/* put return address in eax (arg1) */ +.macro THUNK name, func, put_ret_addr_in_eax=0 +SYM_CODE_START_NOALIGN(\name) + pushl %eax + pushl %ecx + pushl %edx + + .if \put_ret_addr_in_eax + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + .endif + + call \func + popl %edx + popl %ecx + popl %eax + RET + _ASM_NOKPROBE(\name) +SYM_CODE_END(\name) + .endm + +#endif diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 0033790499245e3df5f10496986badbe0150aac2..d9feadffa972dadf0e2dcf5804ac50f9827d6c07 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -10,6 +10,8 @@ #include #include +#include "calling.h" + .pushsection .noinstr.text, "ax" SYM_FUNC_START(entry_ibpb) @@ -43,3 +45,4 @@ EXPORT_SYMBOL_GPL(mds_verw_sel); .popsection +THUNK warn_thunk_thunk, __warn_thunk diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index fba427646805d55221664538be2285c3ae188ca1..d3a814efbff66318b8f2f750aa1d3a0906036aca 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -305,7 +305,7 @@ .macro CHECK_AND_APPLY_ESPFIX #ifdef CONFIG_X86_ESPFIX32 #define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8) -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page + GDT_ESPFIX_OFFSET) ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX @@ -649,10 +649,6 @@ SYM_CODE_START_LOCAL(asm_\cfunc) SYM_CODE_END(asm_\cfunc) .endm -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /* * Include the defines which 
emit the idt entries which are shared * shared between 32 and 64 bit and emit the __irqentry_text_* markers diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9bb4859776291593249b9998416505aeec505011..8af2a26b24f6a9783f9bb348cd67c15e1c3799c8 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -191,7 +191,7 @@ SYM_FUNC_START(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary + movq %rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary) #endif /* @@ -248,7 +248,13 @@ SYM_CODE_START(ret_from_fork_asm) * and unwind should work normally. */ UNWIND_HINT_REGS + +#ifdef CONFIG_X86_FRED + ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \ + "jmp asm_fred_exit_user", X86_FEATURE_FRED +#else jmp swapgs_restore_regs_and_return_to_usermode +#endif SYM_CODE_END(ret_from_fork_asm) .popsection @@ -371,14 +377,6 @@ SYM_CODE_END(\asmsym) idtentry \vector asm_\cfunc \cfunc has_error_code=1 .endm -/* - * System vectors which invoke their handlers directly and are not - * going through the regular common device interrupt handling code. 
- */ -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /** * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB * @vector: Vector number @@ -563,7 +561,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) #ifdef CONFIG_XEN_PV ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV #endif -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI #endif @@ -580,7 +578,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) jnz .Lnative_iret ud2 -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION .Lpti_restore_regs_and_return_to_usermode: POP_REGS pop_rdi=0 @@ -972,14 +970,14 @@ SYM_CODE_START_LOCAL(paranoid_exit) IBRS_EXIT save_reg=%r15 /* - * The order of operations is important. RESTORE_CR3 requires + * The order of operations is important. PARANOID_RESTORE_CR3 requires * kernel GSBASE. * * NB to anyone to try to optimize this code: this code does * not execute at all for exceptions from user mode. Those * exceptions go through error_return instead. */ - RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14 /* Handle the three GSBASE cases */ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE @@ -1100,7 +1098,7 @@ SYM_CODE_END(error_return) * * Registers: * %r14: Used to save/restore the CR3 of the interrupted context - * when PAGE_TABLE_ISOLATION is in use. Do not clobber. + * when MITIGATION_PAGE_TABLE_ISOLATION is in use. Do not clobber. 
*/ SYM_CODE_START(asm_exc_nmi) UNWIND_HINT_IRET_ENTRY @@ -1408,8 +1406,7 @@ end_repeat_nmi: /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ IBRS_EXIT save_reg=%r15 - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 /* * The above invocation of paranoid_entry stored the GSBASE diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S new file mode 100644 index 0000000000000000000000000000000000000000..a02bc6f3d2e6a43d4e00c4657986382029065aa0 --- /dev/null +++ b/arch/x86/entry/entry_64_fred.S @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The actual FRED entry points. + */ + +#include + +#include +#include +#include + +#include "calling.h" + + .code64 + .section .noinstr.text, "ax" + +.macro FRED_ENTER + UNWIND_HINT_END_OF_STACK + ENDBR + PUSH_AND_CLEAR_REGS + movq %rsp, %rdi /* %rdi -> pt_regs */ +.endm + +.macro FRED_EXIT + UNWIND_HINT_REGS + POP_REGS +.endm + +/* + * The new RIP value that FRED event delivery establishes is + * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3. + * Thus the FRED ring 3 entry point must be 4K page aligned. + */ + .align 4096 + +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user) + FRED_ENTER + call fred_entry_from_user +SYM_INNER_LABEL(asm_fred_exit_user, SYM_L_GLOBAL) + FRED_EXIT +1: ERETU + + _ASM_EXTABLE_TYPE(1b, asm_fred_entrypoint_user, EX_TYPE_ERETU) +SYM_CODE_END(asm_fred_entrypoint_user) + +/* + * The new RIP value that FRED event delivery establishes is + * (IA32_FRED_CONFIG & ~FFFH) + 256 for events that occur in + * ring 0, i.e., asm_fred_entrypoint_user + 256. 
+ */ + .org asm_fred_entrypoint_user + 256, 0xcc +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel) + FRED_ENTER + call fred_entry_from_kernel + FRED_EXIT + ERETS +SYM_CODE_END(asm_fred_entrypoint_kernel) + +#if IS_ENABLED(CONFIG_KVM_INTEL) +SYM_FUNC_START(asm_fred_entry_from_kvm) + push %rbp + mov %rsp, %rbp + + UNWIND_HINT_SAVE + + /* + * Both IRQ and NMI from VMX can be handled on current task stack + * because there is no need to protect from reentrancy and the call + * stack leading to this helper is effectively constant and shallow + * (relatively speaking). Do the same when FRED is active, i.e., no + * need to check current stack level for a stack switch. + * + * Emulate the FRED-defined redzone and stack alignment. + */ + sub $(FRED_CONFIG_REDZONE_AMOUNT << 6), %rsp + and $FRED_STACK_FRAME_RSP_MASK, %rsp + + /* + * Start to push a FRED stack frame, which is always 64 bytes: + * + * +--------+-----------------+ + * | Bytes | Usage | + * +--------+-----------------+ + * | 63:56 | Reserved | + * | 55:48 | Event Data | + * | 47:40 | SS + Event Info | + * | 39:32 | RSP | + * | 31:24 | RFLAGS | + * | 23:16 | CS + Aux Info | + * | 15:8 | RIP | + * | 7:0 | Error Code | + * +--------+-----------------+ + */ + push $0 /* Reserved, must be 0 */ + push $0 /* Event data, 0 for IRQ/NMI */ + push %rdi /* fred_ss handed in by the caller */ + push %rbp + pushf + mov $__KERNEL_CS, %rax + push %rax + + /* + * Unlike the IDT event delivery, FRED _always_ pushes an error code + * after pushing the return RIP, thus the CALL instruction CANNOT be + * used here to push the return RIP, otherwise there is no chance to + * push an error code before invoking the IRQ/NMI handler. + * + * Use LEA to get the return RIP and push it, then push an error code. 
+ */ + lea 1f(%rip), %rax + push %rax /* Return RIP */ + push $0 /* Error code, 0 for IRQ/NMI */ + + PUSH_AND_CLEAR_REGS clear_bp=0 unwind_hint=0 + movq %rsp, %rdi /* %rdi -> pt_regs */ + call __fred_entry_from_kvm /* Call the C entry point */ + POP_REGS + ERETS +1: + /* + * Objtool doesn't understand what ERETS does, this hint tells it that + * yes, we'll reach here and with what stack state. A save/restore pair + * isn't strictly needed, but it's the simplest form. + */ + UNWIND_HINT_RESTORE + pop %rbp + RET + +SYM_FUNC_END(asm_fred_entry_from_kvm) +EXPORT_SYMBOL_GPL(asm_fred_entry_from_kvm); +#endif diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c new file mode 100644 index 0000000000000000000000000000000000000000..ac120cbdaaf2b4c474954c9a9f148222a370a72a --- /dev/null +++ b/arch/x86/entry/entry_fred.c @@ -0,0 +1,294 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The FRED specific kernel/user entry functions which are invoked from + * assembly code and dispatch to the associated handlers. 
+ */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* FRED EVENT_TYPE_OTHER vector numbers */ +#define FRED_SYSCALL 1 +#define FRED_SYSENTER 2 + +static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code) +{ + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + + /* Panic on events from a high stack level */ + if (regs->fred_cs.sl > 0) { + pr_emerg("PANIC: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + fred_event_data(regs), regs->cs, regs->ip); + die("invalid or fatal FRED event", regs, regs->orig_ax); + panic("invalid or fatal FRED event"); + } else { + unsigned long flags = oops_begin(); + int sig = SIGKILL; + + pr_alert("BUG: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + fred_event_data(regs), regs->cs, regs->ip); + + if (__die("Invalid or fatal FRED event", regs, regs->orig_ax)) + sig = 0; + + oops_end(flags, regs, sig); + } + + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); +} + +static noinstr void fred_intx(struct pt_regs *regs) +{ + switch (regs->fred_ss.vector) { + /* Opcode 0xcd, 0x3, NOT INT3 (opcode 0xcc) */ + case X86_TRAP_BP: + return exc_int3(regs); + + /* Opcode 0xcd, 0x4, NOT INTO (opcode 0xce) */ + case X86_TRAP_OF: + return exc_overflow(regs); + +#ifdef CONFIG_IA32_EMULATION + /* INT80 */ + case IA32_SYSCALL_VECTOR: + if (ia32_enabled()) + return int80_emulation(regs); + fallthrough; +#endif + + default: + return exc_general_protection(regs, 0); + } +} + +static __always_inline void fred_other(struct pt_regs *regs) +{ + /* The compiler can fold these conditions into a single test */ + if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.lm)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; 
+ do_syscall_64(regs, regs->orig_ax); + return; + } else if (ia32_enabled() && + likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.lm)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; + do_fast_syscall_32(regs); + return; + } else { + exc_invalid_op(regs); + return; + } +} + +#define SYSVEC(_vector, _function) [_vector - FIRST_SYSTEM_VECTOR] = fred_sysvec_##_function + +static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { + SYSVEC(ERROR_APIC_VECTOR, error_interrupt), + SYSVEC(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt), + SYSVEC(LOCAL_TIMER_VECTOR, apic_timer_interrupt), + SYSVEC(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), + + SYSVEC(RESCHEDULE_VECTOR, reschedule_ipi), + SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, call_function_single), + SYSVEC(CALL_FUNCTION_VECTOR, call_function), + SYSVEC(REBOOT_VECTOR, reboot), + + SYSVEC(THRESHOLD_APIC_VECTOR, threshold), + SYSVEC(DEFERRED_ERROR_VECTOR, deferred_error), + SYSVEC(THERMAL_APIC_VECTOR, thermal), + + SYSVEC(IRQ_WORK_VECTOR, irq_work), + + SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), +}; + +static bool fred_setup_done __initdata; + +void __init fred_install_sysvec(unsigned int sysvec, idtentry_t handler) +{ + if (WARN_ON_ONCE(sysvec < FIRST_SYSTEM_VECTOR)) + return; + + if (WARN_ON_ONCE(fred_setup_done)) + return; + + if (!WARN_ON_ONCE(sysvec_table[sysvec - FIRST_SYSTEM_VECTOR])) + sysvec_table[sysvec - FIRST_SYSTEM_VECTOR] = handler; +} + +static noinstr void fred_handle_spurious_interrupt(struct pt_regs *regs) +{ + spurious_interrupt(regs, regs->fred_ss.vector); +} + +void __init fred_complete_exception_setup(void) +{ + unsigned int vector; + + for (vector = 0; vector < FIRST_EXTERNAL_VECTOR; vector++) + set_bit(vector, system_vectors); + + for (vector = 0; vector < NR_SYSTEM_VECTORS; vector++) { + if (sysvec_table[vector]) + set_bit(vector + 
FIRST_SYSTEM_VECTOR, system_vectors); + else + sysvec_table[vector] = fred_handle_spurious_interrupt; + } + fred_setup_done = true; +} + +static noinstr void fred_extint(struct pt_regs *regs) +{ + unsigned int vector = regs->fred_ss.vector; + unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR, + NR_SYSTEM_VECTORS); + + if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR)) + return; + + if (likely(vector >= FIRST_SYSTEM_VECTOR)) { + irqentry_state_t state = irqentry_enter(regs); + + instrumentation_begin(); + sysvec_table[index](regs); + instrumentation_end(); + irqentry_exit(regs, state); + } else { + common_interrupt(regs, vector); + } +} + +static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code) +{ + /* Optimize for #PF. That's the only exception which matters performance wise */ + if (likely(regs->fred_ss.vector == X86_TRAP_PF)) + return exc_page_fault(regs, error_code); + + switch (regs->fred_ss.vector) { + case X86_TRAP_DE: return exc_divide_error(regs); + case X86_TRAP_DB: return fred_exc_debug(regs); + case X86_TRAP_BR: return exc_bounds(regs); + case X86_TRAP_UD: return exc_invalid_op(regs); + case X86_TRAP_NM: return exc_device_not_available(regs); + case X86_TRAP_DF: return exc_double_fault(regs, error_code); + case X86_TRAP_TS: return exc_invalid_tss(regs, error_code); + case X86_TRAP_NP: return exc_segment_not_present(regs, error_code); + case X86_TRAP_SS: return exc_stack_segment(regs, error_code); + case X86_TRAP_GP: return exc_general_protection(regs, error_code); + case X86_TRAP_MF: return exc_coprocessor_error(regs); + case X86_TRAP_AC: return exc_alignment_check(regs, error_code); + case X86_TRAP_XF: return exc_simd_coprocessor_error(regs); + +#ifdef CONFIG_X86_MCE + case X86_TRAP_MC: return fred_exc_machine_check(regs); +#endif +#ifdef CONFIG_INTEL_TDX_GUEST + case X86_TRAP_VE: return exc_virtualization_exception(regs); +#endif +#ifdef CONFIG_X86_CET + case X86_TRAP_CP: return exc_control_protection(regs, 
error_code); +#endif + default: return fred_bad_type(regs, error_code); + } + +} + +static noinstr void fred_swexc(struct pt_regs *regs, unsigned long error_code) +{ + switch (regs->fred_ss.vector) { + case X86_TRAP_BP: return exc_int3(regs); + case X86_TRAP_OF: return exc_overflow(regs); + default: return fred_bad_type(regs, error_code); + } +} + +__visible noinstr void fred_entry_from_user(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_SWINT: + return fred_intx(regs); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + case EVENT_TYPE_OTHER: + return fred_other(regs); + default: break; + } + + return fred_bad_type(regs, error_code); +} + +__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + default: break; + } + + return fred_bad_type(regs, error_code); +} + +#if IS_ENABLED(CONFIG_KVM_INTEL) +__visible noinstr void __fred_entry_from_kvm(struct 
pt_regs *regs) +{ + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + return fred_exc_nmi(regs); + default: + WARN_ON_ONCE(1); + } +} +#endif diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S index 0103e103a6573adab040c4e41b588edcdf7a7ecd..da37f42f45498d8282f2ea805d04eec1bfbfaa44 100644 --- a/arch/x86/entry/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -4,33 +4,15 @@ * Copyright 2008 by Steven Rostedt, Red Hat, Inc * (inspired by Andi Kleen's thunk_64.S) */ - #include - #include - #include - /* put return address in eax (arg1) */ - .macro THUNK name, func, put_ret_addr_in_eax=0 -SYM_CODE_START_NOALIGN(\name) - pushl %eax - pushl %ecx - pushl %edx +#include +#include +#include - .if \put_ret_addr_in_eax - /* Place EIP in the arg1 */ - movl 3*4(%esp), %eax - .endif +#include "calling.h" - call \func - popl %edx - popl %ecx - popl %eax - RET - _ASM_NOKPROBE(\name) -SYM_CODE_END(\name) - .endm - - THUNK preempt_schedule_thunk, preempt_schedule - THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace - EXPORT_SYMBOL(preempt_schedule_thunk) - EXPORT_SYMBOL(preempt_schedule_notrace_thunk) +THUNK preempt_schedule_thunk, preempt_schedule +THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace +EXPORT_SYMBOL(preempt_schedule_thunk) +EXPORT_SYMBOL(preempt_schedule_notrace_thunk) diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index 416b400f39dbb6b16694a6e76b22bb03e8834bc5..119ebdc3d362398092736e7132f16c0c89e97c5a 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -9,39 +9,6 @@ #include "calling.h" #include - /* rdi: arg1 ... normal C conventions. rax is saved/restored. 
*/ - .macro THUNK name, func -SYM_FUNC_START(\name) - pushq %rbp - movq %rsp, %rbp - - pushq %rdi - pushq %rsi - pushq %rdx - pushq %rcx - pushq %rax - pushq %r8 - pushq %r9 - pushq %r10 - pushq %r11 - - call \func - - popq %r11 - popq %r10 - popq %r9 - popq %r8 - popq %rax - popq %rcx - popq %rdx - popq %rsi - popq %rdi - popq %rbp - RET -SYM_FUNC_END(\name) - _ASM_NOKPROBE(\name) - .endm - THUNK preempt_schedule_thunk, preempt_schedule THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace EXPORT_SYMBOL(preempt_schedule_thunk) diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index b1b8dd1608f7ebd73510cb94e2866c09aac825b6..620f6257bbe9358ce918bbf35aff659b683a2149 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -3,7 +3,7 @@ # Building vDSO images for x86. # -# Include the generic Makefile to check the built vdso. +# Include the generic Makefile to check the built vDSO: include $(srctree)/lib/vdso/Makefile # Sanitizer runtimes are unavailable and cannot be linked here. @@ -18,48 +18,39 @@ OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. 
KCOV_INSTRUMENT := n -VDSO64-$(CONFIG_X86_64) := y -VDSOX32-$(CONFIG_X86_X32_ABI) := y -VDSO32-$(CONFIG_X86_32) := y -VDSO32-$(CONFIG_IA32_EMULATION) := y - -# files to link into the vdso +# Files to link into the vDSO: vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o vobjs-$(CONFIG_X86_SGX) += vsgx.o -# files to link into kernel -obj-y += vma.o extable.o -KASAN_SANITIZE_vma.o := y -UBSAN_SANITIZE_vma.o := y -KCSAN_SANITIZE_vma.o := y -OBJECT_FILES_NON_STANDARD_vma.o := n -OBJECT_FILES_NON_STANDARD_extable.o := n +# Files to link into the kernel: +obj-y += vma.o extable.o +KASAN_SANITIZE_vma.o := y +UBSAN_SANITIZE_vma.o := y +KCSAN_SANITIZE_vma.o := y + +OBJECT_FILES_NON_STANDARD_vma.o := n +OBJECT_FILES_NON_STANDARD_extable.o := n -# vDSO images to build -vdso_img-$(VDSO64-y) += 64 -vdso_img-$(VDSOX32-y) += x32 -vdso_img-$(VDSO32-y) += 32 +# vDSO images to build: +obj-$(CONFIG_X86_64) += vdso-image-64.o +obj-$(CONFIG_X86_X32_ABI) += vdso-image-x32.o +obj-$(CONFIG_COMPAT_32) += vdso-image-32.o vdso32-setup.o -obj-$(VDSO32-y) += vdso32-setup.o -OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n +OBJECT_FILES_NON_STANDARD_vdso-image-32.o := n +OBJECT_FILES_NON_STANDARD_vdso-image-64.o := n +OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n -vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) -vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F) +vobjs := $(addprefix $(obj)/, $(vobjs-y)) +vobjs32 := $(addprefix $(obj)/, $(vobjs32-y)) $(obj)/vdso.o: $(obj)/vdso.so targets += vdso.lds $(vobjs-y) targets += vdso32/vdso32.lds $(vobjs32-y) -# Build the vDSO image C files and link them in. 
-vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) -vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) -vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) -obj-y += $(vdso_img_objs) -targets += $(vdso_img_cfiles) -targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so) +targets += $(foreach x, 64 x32 32, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg) CPPFLAGS_vdso.lds += -P -C @@ -87,7 +78,7 @@ CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ -fno-omit-frame-pointer -foptimize-sibling-calls \ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE ifneq ($(RETPOLINE_VDSO_CFLAGS),) CFL += $(RETPOLINE_VDSO_CFLAGS) endif @@ -123,7 +114,7 @@ VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \ vobjx32s-y := $(vobjs-y:.o=-x32.o) # same thing, but in the output directory -vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F) +vobjx32s := $(addprefix $(obj)/, $(vobjx32s-y)) # Convert 64bit object file to x32 for x32 vDSO. quiet_cmd_x32 = X32 $@ @@ -164,7 +155,7 @@ KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE ifneq ($(RETPOLINE_VDSO_CFLAGS),) KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) endif @@ -190,5 +181,3 @@ GCOV_PROFILE := n quiet_cmd_vdso_and_check = VDSO $@ cmd_vdso_and_check = $(cmd_vdso); $(cmd_vdso_check) - -clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 7645730dc228f960e69373f860ccfca811097a84..6d83ceb7f1badac96649b4397d3313d8a1dbaab9 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -274,59 +274,6 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) return ret; } -#ifdef CONFIG_X86_64 -/* - * Put the vdso above the (randomized) stack with another randomized - * offset. 
This way there is no hole in the middle of address space. - * To save memory make sure it is still in the same PTE as the stack - * top. This doesn't give that many random bits. - * - * Note that this algorithm is imperfect: the distribution of the vdso - * start address within a PMD is biased toward the end. - * - * Only used for the 64-bit and x32 vdsos. - */ -static unsigned long vdso_addr(unsigned long start, unsigned len) -{ - unsigned long addr, end; - unsigned offset; - - /* - * Round up the start address. It can start out unaligned as a result - * of stack start randomization. - */ - start = PAGE_ALIGN(start); - - /* Round the lowest possible end address up to a PMD boundary. */ - end = (start + len + PMD_SIZE - 1) & PMD_MASK; - if (end >= DEFAULT_MAP_WINDOW) - end = DEFAULT_MAP_WINDOW; - end -= len; - - if (end > start) { - offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1); - addr = start + (offset << PAGE_SHIFT); - } else { - addr = start; - } - - /* - * Forcibly align the final address in case we have a hardware - * issue that requires alignment for performance reasons. 
- */ - addr = align_vdso_addr(addr); - - return addr; -} - -static int map_vdso_randomized(const struct vdso_image *image) -{ - unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); - - return map_vdso(image, addr); -} -#endif - int map_vdso_once(const struct vdso_image *image, unsigned long addr) { struct mm_struct *mm = current->mm; @@ -369,7 +316,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (!vdso64_enabled) return 0; - return map_vdso_randomized(&vdso_image_64); + return map_vdso(&vdso_image_64, 0); } #ifdef CONFIG_COMPAT @@ -380,7 +327,7 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm, if (x32) { if (!vdso64_enabled) return 0; - return map_vdso_randomized(&vdso_image_x32); + return map_vdso(&vdso_image_x32, 0); } #endif #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index e0ca8120aea87631200df447bcdf756abb317c5d..a3c0df11d0e6d8c36db77b59077fef711724b2db 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -76,7 +76,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, if (!show_unhandled_signals) return; - printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n", level, current->comm, task_pid_nr(current), message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 81f6d8275b6bf2aab39ff40eac3fcf706192654b..69a3b02e50bb0cfbe2688769ad3a5ea72fd8e11f 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -579,7 +579,7 @@ static void amd_pmu_cpu_starting(int cpu) if (!x86_pmu.amd_nb_constraints) return; - nb_id = topology_die_id(cpu); + nb_id = topology_amd_node_id(cpu); WARN_ON_ONCE(nb_id == BAD_APICID); for_each_online_cpu(i) { 
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 5bf03c5758129569afdc394203033f95f0757c41..4ccb8fa483e613af8bade68e3718bda0e91acd38 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -71,7 +71,7 @@ union amd_uncore_info { }; struct amd_uncore { - union amd_uncore_info * __percpu info; + union amd_uncore_info __percpu *info; struct amd_uncore_pmu *pmus; unsigned int num_pmus; bool init_done; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 3804f21ab0494f90effb7399210524ba17c73b43..768d1414897fb1de72a9cb9abad354add11828a4 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 4b50a3a9818aec676d44da4db29fc5d4448757e1..326c8cd5aa2d2e587277207ee3f225ef7b0512a6 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -834,7 +834,7 @@ static int __init cstate_init(void) } if (has_cstate_pkg) { - if (topology_max_die_per_package() > 1) { + if (topology_max_dies_per_package() > 1) { err = perf_pmu_register(&cstate_pkg_pmu, "cstate_die", -1); } else { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index d49d661ec0a7d1b75d17d05b8c75a44e8d5122e7..2641ba620f12a51d4c5d71ceba3bd28557926bfb 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7927c0b832faa4eec31f53d3330c98b858cba249..258e2cdf28fadc8d84a0574b38c000de09df802d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1893,7 +1893,7 @@ static int __init intel_uncore_init(void) return -ENODEV; __uncore_max_dies = - topology_max_packages() * topology_max_die_per_package(); + topology_max_packages() * 
topology_max_dies_per_package(); id = x86_match_cpu(intel_uncore_match); if (!id) { diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index 56eea2c66cfb8cc7d9cc395e58c24a7319afc944..92da8aaa59660e2097423eee0f018bb1f195efd7 100644 --- a/arch/x86/events/intel/uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -1221,8 +1221,8 @@ void nhmex_uncore_cpu_init(void) uncore_nhmex = true; else nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; - if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (nhmex_uncore_cbox.num_boxes > topology_num_cores_per_package()) + nhmex_uncore_cbox.num_boxes = topology_num_cores_per_package(); uncore_msr_uncores = nhmex_msr_uncores; } /* end of Nehalem-EX uncore support */ diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 7fd4334e12a172d495bdf9353ac678f2a19f86f6..9462fd9f3b7abc2263c203651a89f01230bd3b51 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -364,8 +364,8 @@ static struct intel_uncore_type *snb_msr_uncores[] = { void snb_uncore_cpu_init(void) { uncore_msr_uncores = snb_msr_uncores; - if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (snb_uncore_cbox.num_boxes > topology_num_cores_per_package()) + snb_uncore_cbox.num_boxes = topology_num_cores_per_package(); } static void skl_uncore_msr_init_box(struct intel_uncore_box *box) @@ -428,8 +428,8 @@ static struct intel_uncore_type *skl_msr_uncores[] = { void skl_uncore_cpu_init(void) { uncore_msr_uncores = skl_msr_uncores; - if (skl_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - skl_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (skl_uncore_cbox.num_boxes > topology_num_cores_per_package()) + skl_uncore_cbox.num_boxes = topology_num_cores_per_package(); snb_uncore_arb.ops = 
&skl_uncore_msr_ops; } diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index a96496bef678bb5cc562092811e4a919340c3bf3..2eaf0f339849b66d0d8cc7800ed9e63e85ce4bbc 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1172,8 +1172,8 @@ static struct intel_uncore_type *snbep_msr_uncores[] = { void snbep_uncore_cpu_init(void) { - if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (snbep_uncore_cbox.num_boxes > topology_num_cores_per_package()) + snbep_uncore_cbox.num_boxes = topology_num_cores_per_package(); uncore_msr_uncores = snbep_msr_uncores; } @@ -1406,7 +1406,7 @@ static int topology_gidnid_map(int nodeid, u32 gidnid) */ for (i = 0; i < 8; i++) { if (nodeid == GIDNIDMAP(gidnid, i)) { - if (topology_max_die_per_package() > 1) + if (topology_max_dies_per_package() > 1) die_id = i; else die_id = topology_phys_to_logical_pkg(i); @@ -1845,8 +1845,8 @@ static struct intel_uncore_type *ivbep_msr_uncores[] = { void ivbep_uncore_cpu_init(void) { - if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (ivbep_uncore_cbox.num_boxes > topology_num_cores_per_package()) + ivbep_uncore_cbox.num_boxes = topology_num_cores_per_package(); uncore_msr_uncores = ivbep_msr_uncores; } @@ -2917,8 +2917,8 @@ static bool hswep_has_limit_sbox(unsigned int device) void hswep_uncore_cpu_init(void) { - if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (hswep_uncore_cbox.num_boxes > topology_num_cores_per_package()) + hswep_uncore_cbox.num_boxes = topology_num_cores_per_package(); /* Detect 6-8 core systems with only two SBOXes */ if (hswep_has_limit_sbox(HSWEP_PCU_DID)) @@ -3280,8 +3280,8 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = { void bdx_uncore_cpu_init(void) 
{ - if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (bdx_uncore_cbox.num_boxes > topology_num_cores_per_package()) + bdx_uncore_cbox.num_boxes = topology_num_cores_per_package(); uncore_msr_uncores = bdx_msr_uncores; /* Detect systems with no SBOXes */ diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 8d98d468b97618b2646f43dd2a922db7677eb4b1..fb2b1961e5a33a190ab21ce6ea7d27026ffbc18e 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -674,7 +674,7 @@ static const struct attribute_group *rapl_attr_update[] = { static int __init init_rapl_pmus(void) { - int maxdie = topology_max_packages() * topology_max_die_per_package(); + int maxdie = topology_max_packages() * topology_max_dies_per_package(); size_t size; size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *); diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 96e6c51515f50467efbf7cb77082c6b9d18cb8f6..edd2f35b2a5e100ce9662ceead612bc04c213f29 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -16,6 +16,11 @@ extern struct boot_params boot_params; static struct real_mode_header hv_vtl_real_mode_header; +static bool __init hv_vtl_msi_ext_dest_id(void) +{ + return true; +} + void __init hv_vtl_init_platform(void) { pr_info("Linux runs in Hyper-V Virtual Trust Level\n"); @@ -26,8 +31,9 @@ void __init hv_vtl_init_platform(void) x86_init.timers.timer_init = x86_init_noop; /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; + x86_init.mpparse.parse_smp_cfg = x86_init_noop; x86_platform.get_wallclock = get_rtc_noop; x86_platform.set_wallclock = set_rtc_noop; @@ -38,6 +44,8 @@ void __init hv_vtl_init_platform(void) x86_platform.legacy.warm_reset = 0; 
x86_platform.legacy.reserve_bios_regions = 0; x86_platform.legacy.devices.pnpbios = 0; + + x86_init.hyper.msi_ext_dest_id = hv_vtl_msi_ext_dest_id; } static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc) diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 7dcbf153ad7257c3fda712df91b3195efe522ab2..768d73de0d098afc5d7a9b6ea1e7c2747aab4feb 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -502,6 +503,31 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], return -EFAULT; } +/* + * When transitioning memory between encrypted and decrypted, the caller + * of set_memory_encrypted() or set_memory_decrypted() is responsible for + * ensuring that the memory isn't in use and isn't referenced while the + * transition is in progress. The transition has multiple steps, and the + * memory is in an inconsistent state until all steps are complete. A + * reference while the state is inconsistent could result in an exception + * that can't be cleanly fixed up. + * + * But the Linux kernel load_unaligned_zeropad() mechanism could cause a + * stray reference that can't be prevented by the caller, so Linux has + * specific code to handle this case. But when the #VC and #VE exceptions + * routed to a paravisor, the specific code doesn't work. To avoid this + * problem, mark the pages as "not present" while the transition is in + * progress. If load_unaligned_zeropad() causes a stray reference, a normal + * page fault is generated instead of #VC or #VE, and the page-fault-based + * handlers for load_unaligned_zeropad() resolve the reference. When the + * transition is complete, hv_vtom_set_host_visibility() marks the pages + * as "present" again. 
+ */ +static bool hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc) +{ + return !set_memory_np(kbuffer, pagecount); +} + /* * hv_vtom_set_host_visibility - Set specified memory visible to host. * @@ -515,16 +541,28 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo enum hv_mem_host_visibility visibility = enc ? VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE; u64 *pfn_array; + phys_addr_t paddr; + void *vaddr; int ret = 0; bool result = true; int i, pfn; pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); - if (!pfn_array) - return false; + if (!pfn_array) { + result = false; + goto err_set_memory_p; + } for (i = 0, pfn = 0; i < pagecount; i++) { - pfn_array[pfn] = virt_to_hvpfn((void *)kbuffer + i * HV_HYP_PAGE_SIZE); + /* + * Use slow_virt_to_phys() because the PRESENT bit has been + * temporarily cleared in the PTEs. slow_virt_to_phys() works + * without the PRESENT bit while virt_to_hvpfn() or similar + * does not. + */ + vaddr = (void *)kbuffer + (i * HV_HYP_PAGE_SIZE); + paddr = slow_virt_to_phys(vaddr); + pfn_array[pfn] = paddr >> HV_HYP_PAGE_SHIFT; pfn++; if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) { @@ -538,14 +576,30 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo } } - err_free_pfn_array: +err_free_pfn_array: kfree(pfn_array); + +err_set_memory_p: + /* + * Set the PTE PRESENT bits again to revert what hv_vtom_clear_present() + * did. Do this even if there is an error earlier in this function in + * order to avoid leaving the memory range in a "broken" state. Setting + * the PRESENT bits shouldn't fail, but return an error if it does. + */ + if (set_memory_p(kbuffer, pagecount)) + result = false; + return result; } static bool hv_vtom_tlb_flush_required(bool private) { - return true; + /* + * Since hv_vtom_clear_present() marks the PTEs as "not present" + * and flushes the TLB, they can't be in the TLB. 
That makes the + * flush controlled by this function redundant, so return "false". + */ + return false; } static bool hv_vtom_cache_flush_required(void) @@ -608,6 +662,7 @@ void __init hv_vtom_init(void) x86_platform.hyper.is_private_mmio = hv_is_private_mmio; x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required; x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required; + x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present; x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility; /* Set WB as the default cache mode. */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 9d159b771dc814a123e77f85ca0793891d16a04d..94ce0f7c9d3a26cd2b766a60042a0b941b3fe0d2 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -46,6 +46,10 @@ extern void x86_32_probe_apic(void); static inline void x86_32_probe_apic(void) { } #endif +extern u32 cpuid_to_apicid[]; + +#define CPU_ACPIID_INVALID U32_MAX + #ifdef CONFIG_X86_LOCAL_APIC extern int apic_verbosity; @@ -54,8 +58,6 @@ extern int local_apic_timer_c2_ok; extern bool apic_is_disabled; extern unsigned int lapic_timer_period; -extern u32 cpuid_to_apicid[]; - extern enum apic_intr_mode_id apic_intr_mode; enum apic_intr_mode_id { APIC_PIC, @@ -169,6 +171,14 @@ extern bool apic_needs_pit(void); extern void apic_send_IPI_allbutself(unsigned int vector); +extern void topology_register_apic(u32 apic_id, u32 acpi_id, bool present); +extern void topology_register_boot_apic(u32 apic_id); +extern int topology_hotplug_apic(u32 apic_id, u32 acpi_id); +extern void topology_hotunplug_apic(unsigned int cpu); +extern void topology_apply_cmdline_limits_early(void); +extern void topology_init_possible_cpus(void); +extern void topology_reset_possible_cpus_up(void); + #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 @@ -183,6 +193,8 @@ static inline void 
apic_intr_mode_init(void) { } static inline void lapic_assign_system_vectors(void) { } static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } static inline bool apic_needs_pit(void) { return true; } +static inline void topology_apply_cmdline_limits_early(void) { } +static inline void topology_init_possible_cpus(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC @@ -289,16 +301,11 @@ struct apic { /* Probe, setup and smpboot functions */ int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); - bool (*apic_id_registered)(void); - bool (*check_apicid_used)(physid_mask_t *map, u32 apicid); void (*init_apic_ldr)(void); - void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); u32 (*cpu_present_to_apicid)(int mps_cpu); - u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb); u32 (*get_apic_id)(u32 id); - u32 (*set_apic_id)(u32 apicid); /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip); @@ -527,7 +534,6 @@ extern int default_apic_id_valid(u32 apicid); extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); -extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap); extern u32 default_cpu_present_to_apicid(int mps_cpu); void apic_send_nmi_to_offline_cpu(unsigned int cpu); diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index b1a98fa38828e2589416fe4445ed001831009bb0..076bf8dee70264f63d2a4842bca4be9c5f587f7e 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #ifndef CONFIG_X86_CMPXCHG64 diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 0216f63a366b54db7239f5689e3fa77b54d7f7a8..fe1e7e3cc844a84e08908e44094d020d2fa2107a 100644 --- a/arch/x86/include/asm/barrier.h +++ 
b/arch/x86/include/asm/barrier.h @@ -33,7 +33,7 @@ * Returns: * 0 - (index < size) */ -static inline unsigned long array_index_mask_nospec(unsigned long index, +static __always_inline unsigned long array_index_mask_nospec(unsigned long index, unsigned long size) { unsigned long mask; diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index 8a1cd359248f993fad129b6e379ca476f87d743d..fb7388bbc212f9b1435b47206ae586cf84505846 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -11,9 +11,10 @@ enum cc_vendor { CC_VENDOR_INTEL, }; -extern u64 cc_mask; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM extern enum cc_vendor cc_vendor; +extern u64 cc_mask; + static inline void cc_set_mask(u64 mask) { RIP_REL_REF(cc_mask) = mask; diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index f8f9a9b7939587b2b8f6e00794e94ea69e6a338a..aa30fd8cad7f5285534c9e4b24e2846d1429f480 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -9,18 +9,10 @@ #include #include -#ifdef CONFIG_SMP - -extern void prefill_possible_map(void); - -#else /* CONFIG_SMP */ - -static inline void prefill_possible_map(void) {} - +#ifndef CONFIG_SMP #define cpu_physical_id(cpu) boot_cpu_physical_apicid #define cpu_acpi_id(cpu) 0 #define safe_smp_processor_id() 0 - #endif /* CONFIG_SMP */ #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index e35842f7aed0900290fc601787eb3a9f616097f2..0343caa016a9a6e0c8c8a40a2483fe6fdbc2135a 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -324,7 +324,9 @@ #define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ #define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ +#define X86_FEATURE_FRED (12*32+17) /* Flexible Return and Event Delivery */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS 
*/ +#define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ #define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 9bee3e7bf97363e1d9b76438e94b902c8e1baf92..6b122a31da065a22ed660b04f41160d66c705c18 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -127,6 +127,42 @@ static inline unsigned int cpuid_edx(unsigned int op) return edx; } +static inline void __cpuid_read(unsigned int leaf, unsigned int subleaf, u32 *regs) +{ + regs[CPUID_EAX] = leaf; + regs[CPUID_ECX] = subleaf; + __cpuid(regs + CPUID_EAX, regs + CPUID_EBX, regs + CPUID_ECX, regs + CPUID_EDX); +} + +#define cpuid_subleaf(leaf, subleaf, regs) { \ + static_assert(sizeof(*(regs)) == 16); \ + __cpuid_read(leaf, subleaf, (u32 *)(regs)); \ +} + +#define cpuid_leaf(leaf, regs) { \ + static_assert(sizeof(*(regs)) == 16); \ + __cpuid_read(leaf, 0, (u32 *)(regs)); \ +} + +static inline void __cpuid_read_reg(unsigned int leaf, unsigned int subleaf, + enum cpuid_regs_idx regidx, u32 *reg) +{ + u32 regs[4]; + + __cpuid_read(leaf, subleaf, regs); + *reg = regs[regidx]; +} + +#define cpuid_subleaf_reg(leaf, subleaf, regidx, reg) { \ + static_assert(sizeof(*(reg)) == 4); \ + __cpuid_read_reg(leaf, subleaf, regidx, (u32 *)(reg)); \ +} + +#define cpuid_leaf_reg(leaf, regidx, reg) { \ + static_assert(sizeof(*(reg)) == 4); \ + __cpuid_read_reg(leaf, 0, regidx, (u32 *)(reg)); \ +} + static __always_inline bool cpuid_function_is_indexed(u32 function) { switch (function) { diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index dd4b67101bb7ed40013584f66cc80e805ea0ece1..bf5953883ec365377fec5979f6d5c34418ebba32 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -18,7 +18,7 @@ struct pcpu_hot { struct 
task_struct *current_task; int preempt_count; int cpu_number; -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING u64 call_depth; #endif unsigned long top_of_stack; @@ -37,8 +37,15 @@ static_assert(sizeof(struct pcpu_hot) == 64); DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); +/* const-qualified alias to pcpu_hot, aliased by linker. */ +DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override, + const_pcpu_hot); + static __always_inline struct task_struct *get_current(void) { + if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) + return this_cpu_read_const(const_pcpu_hot.current_task); + return this_cpu_read_stable(pcpu_hot.current_task); } diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 0cec92c430cc9d2fc24307c3e230a06e115c6fa8..fdbbbfec745aa58855641ae59e17a3a50fc2696a 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -5,7 +5,9 @@ #include #include #include + #include +#include DECLARE_PER_CPU(unsigned long, cpu_dr7); @@ -159,4 +161,26 @@ static inline unsigned long amd_get_dr_addr_mask(unsigned int dr) } #endif +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + +static inline void update_debugctlmsr(unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); +} + #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 52c0150179907f8b506ed9f9230301bdae31d2bc..62dc9f59ea768c2022cd4673d5b0e74d2c3d4ce6 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -403,8 +403,6 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit1 = (limit >> 16) & 0xf; } -void 
alloc_intr_gate(unsigned int n, const void *addr); - static inline void init_idt_data(struct idt_data *data, unsigned int n, const void *addr) { diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 3332d2940020615229f2dd54863705af58a48c6d..da4054fbf533e9d5884066b5fcbbd766822e3a1e 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -44,32 +44,32 @@ # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) #endif -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION # define DISABLE_PTI 0 #else # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE # define DISABLE_RETPOLINE 0 #else # define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) #endif -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK # define DISABLE_RETHUNK 0 #else # define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) #endif -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY # define DISABLE_UNRET 0 #else # define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) #endif -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING # define DISABLE_CALL_DEPTH_TRACKING 0 #else # define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) @@ -117,6 +117,12 @@ #define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) #endif +#ifdef CONFIG_X86_FRED +# define DISABLE_FRED 0 +#else +# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) +#endif + #ifdef CONFIG_KVM_AMD_SEV #define DISABLE_SEV_SNP 0 #else @@ -139,7 +145,7 @@ #define DISABLED_MASK10 0 #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) -#define DISABLED_MASK12 (DISABLE_LAM) +#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define 
DISABLED_MASK15 0 diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index c4555b269a1b247430f50dcf5934855823e1eba6..1dc600fa3ba536e904027df726d4d33360f0fe77 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -143,15 +143,6 @@ extern void efi_free_boot_services(void); void arch_efi_call_virt_setup(void); void arch_efi_call_virt_teardown(void); -/* kexec external ABI */ -struct efi_setup_data { - u64 fw_vendor; - u64 __unused; - u64 tables; - u64 smbios; - u64 reserved[8]; -}; - extern u64 efi_setup; #ifdef CONFIG_EFI @@ -418,8 +409,9 @@ extern int __init efi_memmap_split_count(efi_memory_desc_t *md, extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap, void *buf, struct efi_mem_range *mem); -#define arch_ima_efi_boot_mode \ - ({ extern struct boot_params boot_params; boot_params.secure_boot; }) +extern enum efi_secureboot_mode __x86_ima_efi_boot_mode(void); + +#define arch_ima_efi_boot_mode __x86_ima_efi_boot_mode() #ifdef CONFIG_EFI_RUNTIME_MAP int efi_get_runtime_map_size(void); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1e16bd5ac781d8ad734110278d6a47517b685f87..1fb83d47711f97da6f24d14f0bb3b1d9cab55240 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -392,5 +392,4 @@ struct va_alignment { } ____cacheline_aligned; extern struct va_alignment va_align; -extern unsigned long align_vdso_addr(unsigned long); #endif /* _ASM_X86_ELF_H */ diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index fe6312045042f8ae605e18d2b87b1f25c1dfc510..7acf0383be8022351c9dd1166b3b578a5626954c 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -64,6 +64,8 @@ #define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4)) #define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8)) -#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ +#define 
EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ + +#define EX_TYPE_ERETU 21 #endif diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index ca6e5e5f16b2eca0e02222956951c628b556ec50..c485f1944c5f86a0ab0ecad38ce0e5bb65569bb8 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -37,10 +37,12 @@ extern void fpu_flush_thread(void); * The FPU context is only stored/restored for a user task and * PF_KTHREAD is used to distinguish between kernel and user threads. */ -static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) +static inline void switch_fpu_prepare(struct task_struct *old, int cpu) { if (cpu_feature_enabled(X86_FEATURE_FPU) && - !(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + !(old->flags & (PF_KTHREAD | PF_USER_WORKER))) { + struct fpu *old_fpu = &old->thread.fpu; + save_fpregs_to_fpstate(old_fpu); /* * The save operation preserved register state, so the @@ -60,10 +62,10 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) * Delay loading of the complete FPU state until the return to userland. * PKRU is handled separately. */ -static inline void switch_fpu_finish(void) +static inline void switch_fpu_finish(struct task_struct *new) { if (cpu_feature_enabled(X86_FEATURE_FPU)) - set_thread_flag(TIF_NEED_FPU_LOAD); + set_tsk_thread_flag(new, TIF_NEED_FPU_LOAD); } #endif /* _ASM_X86_FPU_SCHED_H */ diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h new file mode 100644 index 0000000000000000000000000000000000000000..e86c7ba32435f5a6956ebab09656c639a21ec53f --- /dev/null +++ b/arch/x86/include/asm/fred.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Macros for Flexible Return and Event Delivery (FRED) + */ + +#ifndef ASM_X86_FRED_H +#define ASM_X86_FRED_H + +#include + +#include +#include + +/* + * FRED event return instruction opcodes for ERET{S,U}; supported in + * binutils >= 2.41. 
+ */ +#define ERETS _ASM_BYTES(0xf2,0x0f,0x01,0xca) +#define ERETU _ASM_BYTES(0xf3,0x0f,0x01,0xca) + +/* + * RSP is aligned to a 64-byte boundary before used to push a new stack frame + */ +#define FRED_STACK_FRAME_RSP_MASK _AT(unsigned long, (~0x3f)) + +/* + * Used for the return address for call emulation during code patching, + * and measured in 64-byte cache lines. + */ +#define FRED_CONFIG_REDZONE_AMOUNT 1 +#define FRED_CONFIG_REDZONE (_AT(unsigned long, FRED_CONFIG_REDZONE_AMOUNT) << 6) +#define FRED_CONFIG_INT_STKLVL(l) (_AT(unsigned long, l) << 9) +#define FRED_CONFIG_ENTRYPOINT(p) _AT(unsigned long, (p)) + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_X86_FRED +#include + +#include + +struct fred_info { + /* Event data: CR2, DR6, ... */ + unsigned long edata; + unsigned long resv; +}; + +/* Full format of the FRED stack frame */ +struct fred_frame { + struct pt_regs regs; + struct fred_info info; +}; + +static __always_inline struct fred_info *fred_info(struct pt_regs *regs) +{ + return &container_of(regs, struct fred_frame, regs)->info; +} + +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) +{ + return fred_info(regs)->edata; +} + +void asm_fred_entrypoint_user(void); +void asm_fred_entrypoint_kernel(void); +void asm_fred_entry_from_kvm(struct fred_ss); + +__visible void fred_entry_from_user(struct pt_regs *regs); +__visible void fred_entry_from_kernel(struct pt_regs *regs); +__visible void __fred_entry_from_kvm(struct pt_regs *regs); + +/* Can be called from noinstr code, thus __always_inline */ +static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) +{ + struct fred_ss ss = { + .ss =__KERNEL_DS, + .type = type, + .vector = vector, + .nmi = type == EVENT_TYPE_NMI, + .lm = 1, + }; + + asm_fred_entry_from_kvm(ss); +} + +void cpu_init_fred_exceptions(void); +void fred_complete_exception_setup(void); + +#else /* CONFIG_X86_FRED */ +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { 
return 0; } +static inline void cpu_init_fred_exceptions(void) { } +static inline void fred_complete_exception_setup(void) { } +static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } +#endif /* CONFIG_X86_FRED */ +#endif /* !__ASSEMBLY__ */ + +#endif /* ASM_X86_FRED_H */ diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index 35cff5f2becf6bcbbdc34a890f29d3ba197e100d..9e7e8ca8e2997727d733cc5db3b64ac330e30bc8 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -6,7 +6,7 @@ #ifdef CONFIG_X86_64 -#include +#include /* * Read/write a task's FSBASE or GSBASE. This returns the value that diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b02c3cd3c0f6564ad374602995c08d9082942744..edebf1020e0497ff0d424fc7b35c450cbd34e164 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -16,8 +16,6 @@ #include -#define IRQ_MATRIX_BITS NR_VECTORS - #ifndef __ASSEMBLY__ #include diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index c7ef6ea2fa993c78864f460dace8ef86ad246b16..4212c00c9708d49253d8ebe3699578f14476408a 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -69,7 +69,7 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm); extern bool __ia32_enabled; -static inline bool ia32_enabled(void) +static __always_inline bool ia32_enabled(void) { return __ia32_enabled; } @@ -81,7 +81,7 @@ static inline void ia32_disable(void) #else /* !CONFIG_IA32_EMULATION */ -static inline bool ia32_enabled(void) +static __always_inline bool ia32_enabled(void) { return IS_ENABLED(CONFIG_X86_32); } diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 13639e57e1f8af4c24c0c656a9f0801516bf25f4..47d4c04d103df4eb824fef8297f0b77c39dee1c1 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -13,15 +13,18 @@ #include +typedef void 
(*idtentry_t)(struct pt_regs *regs); + /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware * @vector: Vector number (ignored for C) * @func: Function name of the entry point * - * Declares three functions: + * Declares four functions: * - The ASM entry point: asm_##func * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the FRED event dispatcher (maybe unused) * - The C handler called from the ASM entry point * * Note: This is the C variant of DECLARE_IDTENTRY(). As the name says it @@ -31,6 +34,7 @@ #define DECLARE_IDTENTRY(vector, func) \ asmlinkage void asm_##func(void); \ asmlinkage void xen_asm_##func(void); \ + void fred_##func(struct pt_regs *regs); \ __visible void func(struct pt_regs *regs) /** @@ -137,6 +141,17 @@ static __always_inline void __##func(struct pt_regs *regs, \ #define DEFINE_IDTENTRY_RAW(func) \ __visible noinstr void func(struct pt_regs *regs) +/** + * DEFINE_FREDENTRY_RAW - Emit code for raw FRED entry points + * @func: Function name of the entry point + * + * @func is called from the FRED event dispatcher with interrupts disabled. + * + * See @DEFINE_IDTENTRY_RAW for further details. 
+ */ +#define DEFINE_FREDENTRY_RAW(func) \ +noinstr void fred_##func(struct pt_regs *regs) + /** * DECLARE_IDTENTRY_RAW_ERRORCODE - Declare functions for raw IDT entry points * Error code pushed by hardware @@ -233,17 +248,27 @@ static noinline void __##func(struct pt_regs *regs, u32 vector) #define DEFINE_IDTENTRY_SYSVEC(func) \ static void __##func(struct pt_regs *regs); \ \ +static __always_inline void instr_##func(struct pt_regs *regs) \ +{ \ + kvm_set_cpu_l1tf_flush_l1d(); \ + run_sysvec_on_irqstack_cond(__##func, regs); \ +} \ + \ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ - run_sysvec_on_irqstack_cond(__##func, regs); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static noinline void __##func(struct pt_regs *regs) /** @@ -260,19 +285,29 @@ static noinline void __##func(struct pt_regs *regs) #define DEFINE_IDTENTRY_SYSVEC_SIMPLE(func) \ static __always_inline void __##func(struct pt_regs *regs); \ \ -__visible noinstr void func(struct pt_regs *regs) \ +static __always_inline void instr_##func(struct pt_regs *regs) \ { \ - irqentry_state_t state = irqentry_enter(regs); \ - \ - instrumentation_begin(); \ __irq_enter_raw(); \ kvm_set_cpu_l1tf_flush_l1d(); \ __##func (regs); \ __irq_exit_raw(); \ +} \ + \ +__visible noinstr void func(struct pt_regs *regs) \ +{ \ + irqentry_state_t state = irqentry_enter(regs); \ + \ + instrumentation_begin(); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static __always_inline void __##func(struct pt_regs *regs) /** @@ -410,17 +445,35 @@ __visible noinstr void func(struct pt_regs *regs, \ /* C-Code mapping */ #define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW #define 
DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW +#define DEFINE_FREDENTRY_NMI DEFINE_FREDENTRY_RAW #ifdef CONFIG_X86_64 #define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_MCE DEFINE_FREDENTRY_RAW #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_DEBUG DEFINE_FREDENTRY_RAW +#endif + +void idt_install_sysvec(unsigned int n, const void *function); + +#ifdef CONFIG_X86_FRED +void fred_install_sysvec(unsigned int vector, const idtentry_t function); +#else +static inline void fred_install_sysvec(unsigned int vector, const idtentry_t function) { } #endif +#define sysvec_install(vector, function) { \ + if (cpu_feature_enabled(X86_FEATURE_FRED)) \ + fred_install_sysvec(vector, function); \ + else \ + idt_install_sysvec(vector, asm_##function); \ +} + #else /* !__ASSEMBLY__ */ /* @@ -447,7 +500,7 @@ __visible noinstr void func(struct pt_regs *regs, \ /* System vector entries */ #define DECLARE_IDTENTRY_SYSVEC(vector, func) \ - idtentry_sysvec vector func + DECLARE_IDTENTRY(vector, func) #ifdef CONFIG_X86_64 # define DECLARE_IDTENTRY_MCE(vector, func) \ @@ -655,23 +708,36 @@ DECLARE_IDTENTRY(RESCHEDULE_VECTOR, sysvec_reschedule_ipi); DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); +#else +# define fred_sysvec_reschedule_ipi NULL +# define fred_sysvec_reboot NULL +# define fred_sysvec_call_function_single NULL +# define fred_sysvec_call_function NULL #endif #ifdef CONFIG_X86_LOCAL_APIC # ifdef CONFIG_X86_MCE_THRESHOLD DECLARE_IDTENTRY_SYSVEC(THRESHOLD_APIC_VECTOR, sysvec_threshold); +# else +# define fred_sysvec_threshold NULL # endif # ifdef CONFIG_X86_MCE_AMD 
DECLARE_IDTENTRY_SYSVEC(DEFERRED_ERROR_VECTOR, sysvec_deferred_error); +# else +# define fred_sysvec_deferred_error NULL # endif # ifdef CONFIG_X86_THERMAL_VECTOR DECLARE_IDTENTRY_SYSVEC(THERMAL_APIC_VECTOR, sysvec_thermal); +# else +# define fred_sysvec_thermal NULL # endif # ifdef CONFIG_IRQ_WORK DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); +# else +# define fred_sysvec_irq_work NULL # endif #endif @@ -679,12 +745,16 @@ DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi); +#else +# define fred_sysvec_kvm_posted_intr_ipi NULL +# define fred_sysvec_kvm_posted_intr_wakeup_ipi NULL +# define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif #if IS_ENABLED(CONFIG_HYPERV) DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); -DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); +DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); #endif #if IS_ENABLED(CONFIG_ACRN_GUEST) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 3814a9263d64eaa7b7ceeb696582b5584699940b..294cd2a40818129a1032f24b7d3ba89a99258e23 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -379,7 +379,7 @@ static inline void iosubmit_cmds512(void __iomem *dst, const void *src, const u8 *end = from + count * 64; while (from < end) { - movdir64b(dst, from); + movdir64b_io(dst, from); from += 64; } } diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 51c782600e0260054ffab6121a0afbcede078de0..0d806513c4b32a103a6f36c2286bd9dc4deec091 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -140,7 +140,6 
@@ extern void mask_ioapic_entries(void); extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); -extern void setup_ioapic_ids_from_mpc_nocheck(void); extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index c9f6a6c5de3cf6c72f53a62e25c0c548df85b28e..91ca9a9ee3a2b26a964254202c53ffb5ca544d3a 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -25,7 +25,6 @@ #include #include -#include struct kimage; diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h index 511b350691876dc1572183f60c282f22423494fc..f163176d6f7ffc580f0bc6f342ec741ebecaa9a5 100644 --- a/arch/x86/include/asm/kvmclock.h +++ b/arch/x86/include/asm/kvmclock.h @@ -4,8 +4,6 @@ #include -extern struct clocksource kvm_clock; - DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); static __always_inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 571fe4d2d2328c3b4899faf8085e0181403ed003..dc31b13b87a0d17020aaf2d3915da366b1b62d59 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -40,27 +40,27 @@ #ifdef __ASSEMBLY__ -#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk -#else /* CONFIG_RETPOLINE */ -#ifdef CONFIG_SLS +#else /* CONFIG_MITIGATION_RETPOLINE */ +#ifdef CONFIG_MITIGATION_SLS #define RET ret; int3 #else #define RET ret #endif -#endif /* CONFIG_RETPOLINE */ +#endif /* CONFIG_MITIGATION_RETPOLINE */ #else /* __ASSEMBLY__ */ -#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define ASM_RET 
"jmp __x86_return_thunk\n\t" -#else /* CONFIG_RETPOLINE */ -#ifdef CONFIG_SLS +#else /* CONFIG_MITIGATION_RETPOLINE */ +#ifdef CONFIG_MITIGATION_SLS #define ASM_RET "ret; int3\n\t" #else #define ASM_RET "ret\n\t" #endif -#endif /* CONFIG_RETPOLINE */ +#endif /* CONFIG_MITIGATION_RETPOLINE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 73dba8b9444308d603e5e5db6b76d412aea6b3b5..59aa966dc2127c61be2e3bbe443516a54e70c725 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -131,8 +131,20 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new) (typeof(l->a.counter) *) old, new); } -/* Always has a lock prefix */ -#define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) +/* + * Implement local_xchg using CMPXCHG instruction without the LOCK prefix. + * XCHG is expensive due to the implied LOCK prefix. The processor + * cannot prefetch cachelines if XCHG is used. + */ +static __always_inline long +local_xchg(local_t *l, long n) +{ + long c = local_read(l); + + do { } while (!local_try_cmpxchg(l, &c, n)); + + return c; +} /** * local_add_unless - add unless the number is already a given value diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 4b0f98a8d338d5c56371e4af161ae335cde7528a..c72c7ff78fcdc141e965910a89565a06a2a13027 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_MPSPEC_H #define _ASM_X86_MPSPEC_H +#include #include #include @@ -46,70 +47,31 @@ extern int smp_found_config; # define smp_found_config 0 #endif -static inline void get_smp_config(void) -{ - x86_init.mpparse.get_smp_config(0); -} - -static inline void early_get_smp_config(void) -{ - x86_init.mpparse.get_smp_config(1); -} - -static inline void find_smp_config(void) -{ - x86_init.mpparse.find_smp_config(); -} - #ifdef CONFIG_X86_MPPARSE extern void e820__memblock_alloc_reserved_mpc_new(void); extern int 
enable_update_mptable; -extern void default_find_smp_config(void); -extern void default_get_smp_config(unsigned int early); +extern void mpparse_find_mptable(void); +extern void mpparse_parse_early_smp_config(void); +extern void mpparse_parse_smp_config(void); #else static inline void e820__memblock_alloc_reserved_mpc_new(void) { } -#define enable_update_mptable 0 -#define default_find_smp_config x86_init_noop -#define default_get_smp_config x86_init_uint_noop +#define enable_update_mptable 0 +#define mpparse_find_mptable x86_init_noop +#define mpparse_parse_early_smp_config x86_init_noop +#define mpparse_parse_smp_config x86_init_noop #endif -int generic_processor_info(int apicid); +extern DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC); -#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) - -struct physid_mask { - unsigned long mask[PHYSID_ARRAY_SIZE]; -}; - -typedef struct physid_mask physid_mask_t; - -#define physid_set(physid, map) set_bit(physid, (map).mask) -#define physid_isset(physid, map) test_bit(physid, (map).mask) - -#define physids_or(dst, src1, src2) \ - bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) - -#define physids_clear(map) \ - bitmap_zero((map).mask, MAX_LOCAL_APIC) - -#define physids_empty(map) \ - bitmap_empty((map).mask, MAX_LOCAL_APIC) - -static inline void physids_promote(unsigned long physids, physid_mask_t *map) +static inline void reset_phys_cpu_present_map(u32 apicid) { - physids_clear(*map); - map->mask[0] = physids; + bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC); + set_bit(apicid, phys_cpu_present_map); } -static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) +static inline void copy_phys_cpu_present_map(unsigned long *dst) { - physids_clear(*map); - physid_set(physid, *map); + bitmap_copy(dst, phys_cpu_present_map, MAX_LOCAL_APIC); } -#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} } -#define PHYSID_MASK_NONE { {[0 ... 
PHYSID_ARRAY_SIZE-1] = 0UL} } - -extern physid_mask_t phys_cpu_present_map; - #endif /* _ASM_X86_MPSPEC_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f482bc6a5ae7cbe5878d64554995099ef3c31055..24c575cdd6b9fe240e563e63c191b7e1ab2e6909 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -36,8 +36,19 @@ #define EFER_FFXSR (1<<_EFER_FFXSR) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) -/* Intel MSRs. Some also available on other CPUs */ +/* FRED MSRs */ +#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ +#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ +#define MSR_IA32_FRED_RSP2 0x1ce /* Level 2 stack pointer */ +#define MSR_IA32_FRED_RSP3 0x1cf /* Level 3 stack pointer */ +#define MSR_IA32_FRED_STKLVLS 0x1d0 /* Exception stack levels */ +#define MSR_IA32_FRED_SSP0 MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */ +#define MSR_IA32_FRED_SSP1 0x1d1 /* Level 1 shadow stack pointer */ +#define MSR_IA32_FRED_SSP2 0x1d2 /* Level 2 shadow stack pointer */ +#define MSR_IA32_FRED_SSP3 0x1d3 /* Level 3 shadow stack pointer */ +#define MSR_IA32_FRED_CONFIG 0x1d4 /* Entrypoint and interrupt stack level */ +/* Intel MSRs. 
Some also available on other CPUs */ #define MSR_TEST_CTRL 0x00000033 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT 29 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT) @@ -594,36 +605,47 @@ #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 -#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 -#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) +#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) -#define MSR_AMD64_RMP_BASE 0xc0010132 -#define MSR_AMD64_RMP_END 0xc0010133 - -/* SNP feature bits enabled by the hypervisor */ -#define MSR_AMD64_SNP_VTOM BIT_ULL(3) -#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(4) -#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(5) -#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(6) -#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(7) -#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(8) -#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(9) -#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(10) -#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(11) -#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(12) -#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(14) -#define MSR_AMD64_SNP_VMSA_REG_PROTECTION BIT_ULL(16) -#define MSR_AMD64_SNP_SMT_PROTECTION BIT_ULL(17) - -/* SNP feature bits reserved for future use. 
*/ -#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) -#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) -#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, 18) +#define MSR_AMD64_SNP_VTOM_BIT 3 +#define MSR_AMD64_SNP_VTOM BIT_ULL(MSR_AMD64_SNP_VTOM_BIT) +#define MSR_AMD64_SNP_REFLECT_VC_BIT 4 +#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(MSR_AMD64_SNP_REFLECT_VC_BIT) +#define MSR_AMD64_SNP_RESTRICTED_INJ_BIT 5 +#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(MSR_AMD64_SNP_RESTRICTED_INJ_BIT) +#define MSR_AMD64_SNP_ALT_INJ_BIT 6 +#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(MSR_AMD64_SNP_ALT_INJ_BIT) +#define MSR_AMD64_SNP_DEBUG_SWAP_BIT 7 +#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(MSR_AMD64_SNP_DEBUG_SWAP_BIT) +#define MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT 8 +#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT) +#define MSR_AMD64_SNP_BTB_ISOLATION_BIT 9 +#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(MSR_AMD64_SNP_BTB_ISOLATION_BIT) +#define MSR_AMD64_SNP_VMPL_SSS_BIT 10 +#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(MSR_AMD64_SNP_VMPL_SSS_BIT) +#define MSR_AMD64_SNP_SECURE_TSC_BIT 11 +#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(MSR_AMD64_SNP_SECURE_TSC_BIT) +#define MSR_AMD64_SNP_VMGEXIT_PARAM_BIT 12 +#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(MSR_AMD64_SNP_VMGEXIT_PARAM_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) +#define MSR_AMD64_SNP_IBS_VIRT_BIT 14 +#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(MSR_AMD64_SNP_IBS_VIRT_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) +#define MSR_AMD64_SNP_VMSA_REG_PROT_BIT 16 +#define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) +#define MSR_AMD64_SNP_SMT_PROT_BIT 17 +#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) +#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f +#define MSR_AMD64_RMP_BASE 0xc0010132 +#define MSR_AMD64_RMP_END 0xc0010133 + /* AMD Collaborative Processor 
Performance Control MSRs */ #define MSR_AMD_CPPC_CAP1 0xc00102b0 #define MSR_AMD_CPPC_ENABLE 0xc00102b1 diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 65ec1965cd2810323ab71a8d5cb79851845237c4..d642037f9ed5d81d5af89986e19bf8c33c74c6c8 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -12,11 +12,13 @@ #include #include +#include + struct msr_info { - u32 msr_no; - struct msr reg; - struct msr *msrs; - int err; + u32 msr_no; + struct msr reg; + struct msr __percpu *msrs; + int err; }; struct msr_regs_info { @@ -97,6 +99,19 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) : : "c" (msr), "a"(low), "d" (high) : "memory"); } +/* + * WRMSRNS behaves exactly like WRMSR with the only difference being + * that it is not a serializing instruction by default. + */ +static __always_inline void __wrmsrns(u32 msr, u32 low, u32 high) +{ + /* Instruction opcode for WRMSRNS; supported in binutils >= 2.40. */ + asm volatile("1: .byte 0x0f,0x01,0xc6\n" + "2:\n" + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) + : : "c" (msr), "a"(low), "d" (high)); +} + #define native_rdmsr(msr, val1, val2) \ do { \ u64 __val = __rdmsr((msr)); \ @@ -297,6 +312,11 @@ do { \ #endif /* !CONFIG_PARAVIRT_XXL */ +static __always_inline void wrmsrns(u32 msr, u64 val) +{ + __wrmsrns(msr, val, val >> 32); +} + /* * 64-bit version of wrmsr_safe(): */ @@ -305,8 +325,8 @@ static inline int wrmsrl_safe(u32 msr, u64 val) return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); } -struct msr *msrs_alloc(void); -void msrs_free(struct msr *msrs); +struct msr __percpu *msrs_alloc(void); +void msrs_free(struct msr __percpu *msrs); int msr_set_bit(u32 msr, u8 bit); int msr_clear_bit(u32 msr, u8 bit); @@ -315,8 +335,8 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 
q); -void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); -void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); @@ -345,14 +365,14 @@ static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) return 0; } static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no, - struct msr *msrs) + struct msr __percpu *msrs) { - rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); + rdmsr_on_cpu(0, msr_no, raw_cpu_ptr(&msrs->l), raw_cpu_ptr(&msrs->h)); } static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no, - struct msr *msrs) + struct msr __percpu *msrs) { - wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); + wrmsr_on_cpu(0, msr_no, raw_cpu_read(msrs->l), raw_cpu_read(msrs->h)); } static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 5c5f1e56c4048db1a725b450e6f700b484d150d2..41a0ebb699ec64284832878191404a9cfa8ffbc4 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -14,9 +14,6 @@ extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); -struct ctl_table; -extern int proc_nmi_enabled(struct ctl_table *, int , - void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; #endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 2aa52cab1e463af6f4105e2f887acf185dec9f31..d0b8bb762838ebfb0e22edca2815f4e509ea3afa 100644 --- 
a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -59,13 +59,13 @@ #ifdef CONFIG_CALL_THUNKS_DEBUG # define CALL_THUNKS_DEBUG_INC_CALLS \ - incq %gs:__x86_call_count; + incq PER_CPU_VAR(__x86_call_count); # define CALL_THUNKS_DEBUG_INC_RETS \ - incq %gs:__x86_ret_count; + incq PER_CPU_VAR(__x86_ret_count); # define CALL_THUNKS_DEBUG_INC_STUFFS \ - incq %gs:__x86_stuffs_count; + incq PER_CPU_VAR(__x86_stuffs_count); # define CALL_THUNKS_DEBUG_INC_CTXSW \ - incq %gs:__x86_ctxsw_count; + incq PER_CPU_VAR(__x86_ctxsw_count); #else # define CALL_THUNKS_DEBUG_INC_CALLS # define CALL_THUNKS_DEBUG_INC_RETS @@ -73,16 +73,13 @@ # define CALL_THUNKS_DEBUG_INC_CTXSW #endif -#if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) +#if defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) #include #define CREDIT_CALL_DEPTH \ movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); -#define ASM_CREDIT_CALL_DEPTH \ - movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); - #define RESET_CALL_DEPTH \ xor %eax, %eax; \ bts $63, %rax; \ @@ -95,20 +92,14 @@ CALL_THUNKS_DEBUG_INC_CALLS #define INCREMENT_CALL_DEPTH \ - sarq $5, %gs:pcpu_hot + X86_call_depth; \ - CALL_THUNKS_DEBUG_INC_CALLS - -#define ASM_INCREMENT_CALL_DEPTH \ sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ CALL_THUNKS_DEBUG_INC_CALLS #else #define CREDIT_CALL_DEPTH -#define ASM_CREDIT_CALL_DEPTH #define RESET_CALL_DEPTH -#define INCREMENT_CALL_DEPTH -#define ASM_INCREMENT_CALL_DEPTH #define RESET_CALL_DEPTH_FROM_CALL +#define INCREMENT_CALL_DEPTH #endif /* @@ -158,7 +149,7 @@ jnz 771b; \ /* barrier for jnz misprediction */ \ lfence; \ - ASM_CREDIT_CALL_DEPTH \ + CREDIT_CALL_DEPTH \ CALL_THUNKS_DEBUG_INC_CTXSW #else /* @@ -212,7 +203,7 @@ */ .macro VALIDATE_UNRET_END #if defined(CONFIG_NOINSTR_VALIDATION) && \ - (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) + (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) 
ANNOTATE_RETPOLINE_SAFE nop #endif @@ -241,7 +232,7 @@ * instruction irrespective of kCFI. */ .macro JMP_NOSPEC reg:req -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE __CS_PREFIX \reg jmp __x86_indirect_thunk_\reg #else @@ -251,7 +242,7 @@ .endm .macro CALL_NOSPEC reg:req -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE __CS_PREFIX \reg call __x86_indirect_thunk_\reg #else @@ -271,7 +262,7 @@ .Lskip_rsb_\@: .endm -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) +#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO) #define CALL_UNTRAIN_RET "call entry_untrain_ret" #else #define CALL_UNTRAIN_RET "" @@ -289,7 +280,7 @@ * where we have a stack but before any RET instruction. */ .macro __UNTRAIN_RET ibpb_feature, call_depth_insns -#if defined(CONFIG_RETHUNK) || defined(CONFIG_CPU_IBPB_ENTRY) +#if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY) VALIDATE_UNRET_END ALTERNATIVE_3 "", \ CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ @@ -309,9 +300,9 @@ .macro CALL_DEPTH_ACCOUNT -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING ALTERNATIVE "", \ - __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH + __stringify(INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif .endm @@ -323,7 +314,7 @@ * Note: Only the memory operand variant of VERW clears the CPU buffers. 
*/ .macro CLEAR_CPU_BUFFERS - ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF + ALTERNATIVE "", __stringify(verw mds_verw_sel), X86_FEATURE_CLEAR_CPU_BUF .endm #else /* __ASSEMBLY__ */ @@ -339,19 +330,19 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[]; extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK extern void __x86_return_thunk(void); #else static inline void __x86_return_thunk(void) {} #endif -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY extern void retbleed_return_thunk(void); #else static inline void retbleed_return_thunk(void) {} #endif -#ifdef CONFIG_CPU_SRSO +#ifdef CONFIG_MITIGATION_SRSO extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); #else @@ -368,7 +359,9 @@ extern void entry_ibpb(void); extern void (*x86_return_thunk)(void); -#ifdef CONFIG_CALL_DEPTH_TRACKING +extern void __warn_thunk(void); + +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING extern void call_depth_return_thunk(void); #define CALL_DEPTH_ACCOUNT \ @@ -382,14 +375,14 @@ DECLARE_PER_CPU(u64, __x86_ret_count); DECLARE_PER_CPU(u64, __x86_stuffs_count); DECLARE_PER_CPU(u64, __x86_ctxsw_count); #endif -#else /* !CONFIG_CALL_DEPTH_TRACKING */ +#else /* !CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ static inline void call_depth_return_thunk(void) {} #define CALL_DEPTH_ACCOUNT "" -#endif /* CONFIG_CALL_DEPTH_TRACKING */ +#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE #define GEN(reg) \ extern retpoline_thunk_t __x86_indirect_thunk_ ## reg; @@ -410,7 +403,7 @@ static inline void call_depth_return_thunk(void) {} /* * Inline asm uses the %V modifier which is only in newer GCC - * which is ensured when CONFIG_RETPOLINE is defined. + * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. 
*/ # define CALL_NOSPEC \ ALTERNATIVE_2( \ diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index d18e5c332cb9f443b2279d0545779b67dcb7ec19..1b93ff80b43bcc229add1859fd5b14deb4e84b5d 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -66,10 +66,14 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, * virt_addr_valid(kaddr) returns true. */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) extern bool __virt_addr_valid(unsigned long kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) +static __always_inline void *pfn_to_kaddr(unsigned long pfn) +{ + return __va(pfn << PAGE_SHIFT); +} + static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits) { return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index b40c462b4af36cce4256923f06285e28b062ca51..b3ab80a03365cf1de1b5332e76494644e98eb938 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -10,7 +10,6 @@ #include #include #include -#include struct pci_sysdata { int domain; /* PCI domain */ @@ -124,16 +123,4 @@ cpumask_of_pcibus(const struct pci_bus *bus) } #endif -struct pci_setup_rom { - struct setup_data data; - uint16_t vendor; - uint16_t devid; - uint64_t pcilen; - unsigned long segment; - unsigned long bus; - unsigned long device; - unsigned long function; - uint8_t romdata[]; -}; - #endif /* _ASM_X86_PCI_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 5e01883eb51ee8e576e70db0577bfbe0c20c2e4f..44958ebaf626e20c970acaacaad012f93cba2671 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -4,17 +4,21 @@ #ifdef CONFIG_X86_64 #define __percpu_seg gs +#define __percpu_rel (%rip) #else #define __percpu_seg fs +#define __percpu_rel #endif #ifdef __ASSEMBLY__ #ifdef 
CONFIG_SMP -#define PER_CPU_VAR(var) %__percpu_seg:var -#else /* ! SMP */ -#define PER_CPU_VAR(var) var -#endif /* SMP */ +#define __percpu %__percpu_seg: +#else +#define __percpu +#endif + +#define PER_CPU_VAR(var) __percpu(var)__percpu_rel #ifdef CONFIG_X86_64_SMP #define INIT_PER_CPU_VAR(var) init_per_cpu__##var @@ -24,30 +28,84 @@ #else /* ...!ASSEMBLY */ +#include #include #include #ifdef CONFIG_SMP + +#ifdef CONFIG_CC_HAS_NAMED_AS + +#ifdef __CHECKER__ +#define __seg_gs __attribute__((address_space(__seg_gs))) +#define __seg_fs __attribute__((address_space(__seg_fs))) +#endif + +#ifdef CONFIG_X86_64 +#define __percpu_seg_override __seg_gs +#else +#define __percpu_seg_override __seg_fs +#endif + +#define __percpu_prefix "" + +#else /* CONFIG_CC_HAS_NAMED_AS */ + +#define __percpu_seg_override #define __percpu_prefix "%%"__stringify(__percpu_seg)":" + +#endif /* CONFIG_CC_HAS_NAMED_AS */ + +#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":" #define __my_cpu_offset this_cpu_read(this_cpu_off) +#ifdef CONFIG_USE_X86_SEG_SUPPORT +/* + * Efficient implementation for cases in which the compiler supports + * named address spaces. Allows the compiler to perform additional + * optimizations that can save more instructions. + */ +#define arch_raw_cpu_ptr(ptr) \ +({ \ + unsigned long tcp_ptr__; \ + tcp_ptr__ = __raw_cpu_read(, this_cpu_off); \ + \ + tcp_ptr__ += (unsigned long)(ptr); \ + (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ +}) +#else /* CONFIG_USE_X86_SEG_SUPPORT */ /* * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register. 
*/ -#define arch_raw_cpu_ptr(ptr) \ -({ \ - unsigned long tcp_ptr__; \ - asm ("add " __percpu_arg(1) ", %0" \ - : "=r" (tcp_ptr__) \ - : "m" (this_cpu_off), "0" (ptr)); \ - (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ +#define arch_raw_cpu_ptr(ptr) \ +({ \ + unsigned long tcp_ptr__; \ + asm ("mov " __percpu_arg(1) ", %0" \ + : "=r" (tcp_ptr__) \ + : "m" (__my_cpu_var(this_cpu_off))); \ + \ + tcp_ptr__ += (unsigned long)(ptr); \ + (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ }) -#else +#endif /* CONFIG_USE_X86_SEG_SUPPORT */ + +#define PER_CPU_VAR(var) %__percpu_seg:(var)__percpu_rel + +#else /* CONFIG_SMP */ +#define __percpu_seg_override #define __percpu_prefix "" -#endif +#define __force_percpu_prefix "" + +#define PER_CPU_VAR(var) (var)__percpu_rel +#endif /* CONFIG_SMP */ + +#define __my_cpu_type(var) typeof(var) __percpu_seg_override +#define __my_cpu_ptr(ptr) (__my_cpu_type(*ptr) *)(uintptr_t)(ptr) +#define __my_cpu_var(var) (*__my_cpu_ptr(&var)) #define __percpu_arg(x) __percpu_prefix "%" #x +#define __force_percpu_arg(x) __force_percpu_prefix "%" #x /* * Initialized pointers to per-cpu variables needed for the boot @@ -107,14 +165,14 @@ do { \ (void)pto_tmp__; \ } \ asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \ - : [var] "+m" (_var) \ + : [var] "+m" (__my_cpu_var(_var)) \ : [val] __pcpu_reg_imm_##size(pto_val__)); \ } while (0) #define percpu_unary_op(size, qual, op, _var) \ ({ \ asm qual (__pcpu_op1_##size(op, __percpu_arg([var])) \ - : [var] "+m" (_var)); \ + : [var] "+m" (__my_cpu_var(_var))); \ }) /* @@ -144,16 +202,16 @@ do { \ __pcpu_type_##size pfo_val__; \ asm qual (__pcpu_op2_##size(op, __percpu_arg([var]), "%[val]") \ : [val] __pcpu_reg_##size("=", pfo_val__) \ - : [var] "m" (_var)); \ + : [var] "m" (__my_cpu_var(_var))); \ (typeof(_var))(unsigned long) pfo_val__; \ }) #define percpu_stable_op(size, op, _var) \ ({ \ __pcpu_type_##size pfo_val__; \ - asm(__pcpu_op2_##size(op, __percpu_arg(P[var]), "%[val]") \ + 
asm(__pcpu_op2_##size(op, __force_percpu_arg(a[var]), "%[val]") \ : [val] __pcpu_reg_##size("=", pfo_val__) \ - : [var] "p" (&(_var))); \ + : [var] "i" (&(_var))); \ (typeof(_var))(unsigned long) pfo_val__; \ }) @@ -166,7 +224,7 @@ do { \ asm qual (__pcpu_op2_##size("xadd", "%[tmp]", \ __percpu_arg([var])) \ : [tmp] __pcpu_reg_##size("+", paro_tmp__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : : "memory"); \ (typeof(_var))(unsigned long) (paro_tmp__ + _val); \ }) @@ -187,7 +245,7 @@ do { \ __percpu_arg([var])) \ "\n\tjnz 1b" \ : [oval] "=&a" (pxo_old__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pxo_new__) \ : "memory"); \ (typeof(_var))(unsigned long) pxo_old__; \ @@ -204,7 +262,7 @@ do { \ asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ __percpu_arg([var])) \ : [oval] "+a" (pco_old__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pco_new__) \ : "memory"); \ (typeof(_var))(unsigned long) pco_old__; \ @@ -221,7 +279,7 @@ do { \ CC_SET(z) \ : CC_OUT(z) (success), \ [oval] "+a" (pco_old__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pco_new__) \ : "memory"); \ if (unlikely(!success)) \ @@ -244,7 +302,7 @@ do { \ \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ - : [var] "+m" (_var), \ + : [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ @@ -276,7 +334,7 @@ do { \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ CC_SET(z) \ : CC_OUT(z) (success), \ - [var] "+m" (_var), \ + [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ @@ -313,7 +371,7 @@ do { \ \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - : [var] "+m" (_var), \ + : [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" 
(old__.high) \ : "b" (new__.low), \ @@ -345,7 +403,7 @@ do { \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ CC_SET(z) \ : CC_OUT(z) (success), \ - [var] "+m" (_var), \ + [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ @@ -366,9 +424,9 @@ do { \ * accessed while this_cpu_read_stable() allows the value to be cached. * this_cpu_read_stable() is more efficient and can be used if its value * is guaranteed to be valid across cpus. The current users include - * get_current() and get_thread_info() both of which are actually - * per-thread variables implemented as per-cpu variables and thus - * stable for the duration of the respective task. + * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are + * actually per-thread variables implemented as per-CPU variables and + * thus stable for the duration of the respective task. */ #define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp) #define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp) @@ -376,13 +434,72 @@ do { \ #define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp) #define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp) +#ifdef CONFIG_USE_X86_SEG_SUPPORT + +#define __raw_cpu_read(qual, pcp) \ +({ \ + *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \ +}) + +#define __raw_cpu_write(qual, pcp, val) \ +do { \ + *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)) = (val); \ +} while (0) + +#define raw_cpu_read_1(pcp) __raw_cpu_read(, pcp) +#define raw_cpu_read_2(pcp) __raw_cpu_read(, pcp) +#define raw_cpu_read_4(pcp) __raw_cpu_read(, pcp) +#define raw_cpu_write_1(pcp, val) __raw_cpu_write(, pcp, val) +#define raw_cpu_write_2(pcp, val) __raw_cpu_write(, pcp, val) +#define raw_cpu_write_4(pcp, val) __raw_cpu_write(, pcp, val) + +#define this_cpu_read_1(pcp) __raw_cpu_read(volatile, pcp) +#define this_cpu_read_2(pcp) __raw_cpu_read(volatile, pcp) +#define this_cpu_read_4(pcp) 
__raw_cpu_read(volatile, pcp) +#define this_cpu_write_1(pcp, val) __raw_cpu_write(volatile, pcp, val) +#define this_cpu_write_2(pcp, val) __raw_cpu_write(volatile, pcp, val) +#define this_cpu_write_4(pcp, val) __raw_cpu_write(volatile, pcp, val) + +#ifdef CONFIG_X86_64 +#define raw_cpu_read_8(pcp) __raw_cpu_read(, pcp) +#define raw_cpu_write_8(pcp, val) __raw_cpu_write(, pcp, val) + +#define this_cpu_read_8(pcp) __raw_cpu_read(volatile, pcp) +#define this_cpu_write_8(pcp, val) __raw_cpu_write(volatile, pcp, val) +#endif + +#define this_cpu_read_const(pcp) __raw_cpu_read(, pcp) +#else /* CONFIG_USE_X86_SEG_SUPPORT */ + #define raw_cpu_read_1(pcp) percpu_from_op(1, , "mov", pcp) #define raw_cpu_read_2(pcp) percpu_from_op(2, , "mov", pcp) #define raw_cpu_read_4(pcp) percpu_from_op(4, , "mov", pcp) - #define raw_cpu_write_1(pcp, val) percpu_to_op(1, , "mov", (pcp), val) #define raw_cpu_write_2(pcp, val) percpu_to_op(2, , "mov", (pcp), val) #define raw_cpu_write_4(pcp, val) percpu_to_op(4, , "mov", (pcp), val) + +#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp) +#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp) +#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp) +#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val) +#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val) +#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val) + +#ifdef CONFIG_X86_64 +#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp) +#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val) + +#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp) +#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val) +#endif + +/* + * The generic per-cpu infrastructure is not suitable for + * reading const-qualified variables.
+ */ +#define this_cpu_read_const(pcp) ({ BUILD_BUG(); (typeof(pcp))0; }) +#endif /* CONFIG_USE_X86_SEG_SUPPORT */ + #define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val) #define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val) #define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val) @@ -408,12 +525,6 @@ do { \ #define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val) #define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val) -#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp) -#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp) -#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp) -#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val) -#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val) -#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val) #define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val) #define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val) #define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val) @@ -452,8 +563,6 @@ do { \ * 32 bit must fall back to generic operations. 
*/ #ifdef CONFIG_X86_64 -#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp) -#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val) #define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val) #define raw_cpu_and_8(pcp, val) percpu_to_op(8, , "and", (pcp), val) #define raw_cpu_or_8(pcp, val) percpu_to_op(8, , "or", (pcp), val) @@ -462,8 +571,6 @@ do { \ #define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval) #define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval) -#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp) -#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val) #define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val) #define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val) #define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val) @@ -494,7 +601,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, asm volatile("btl "__percpu_arg(2)",%1" CC_SET(c) : CC_OUT(c) (oldbit) - : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); + : "m" (*__my_cpu_ptr((unsigned long __percpu *)(addr))), "Ir" (nr)); return oldbit; } diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 94de1a05aebaac302a6e75aa68fe0bf39de7a629..d65e338b6a5fca1593733cb817423cc91b3296df 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -181,7 +181,7 @@ static inline u64 p4_clear_ht_bit(u64 config) static inline int p4_ht_active(void) { #ifdef CONFIG_SMP - return smp_num_siblings > 1; + return __max_threads_per_core > 1; #endif return 0; } @@ -189,7 +189,7 @@ static inline int p4_ht_active(void) static inline int p4_ht_thread(int cpu) { #ifdef CONFIG_SMP - if (smp_num_siblings == 2) + if (__max_threads_per_core == 2) return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); #endif return 0; diff --git 
a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index c7ec5bb88334eab119ccf78002be2e7679291113..dcd836b59bebd329c3d265b98e48ef6eb4c9e6fc 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -34,7 +34,7 @@ static inline void paravirt_release_p4d(unsigned long pfn) {} */ extern gfp_t __userpte_alloc_gfp; -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Instead of one PGD, we acquire two PGDs. Being order-1, it is * both 8k in size and 8k-aligned. That lets us just flip bit 12 diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 9e7c0b719c3c11361b60f51cb79a553322afeb76..dabafba957ea6f2e1d5d6796da07427dab2f22ba 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -52,7 +52,7 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) static inline void native_set_pud(pud_t *pudp, pud_t pud) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd); #endif pxx_xchg64(pud, pudp, native_pud_val(pud)); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9d077bca6a103ecfcbd02424483fd5971995bafd..df0f7d4a96f3284eb4ac9c0495b5a4ec8df27aa3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -909,7 +909,7 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, pmd_t *populate_extra_pmd(unsigned long vaddr); pte_t *populate_extra_pte(unsigned long vaddr); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); /* @@ -923,12 +923,12 @@ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) return pgd; return __pti_set_user_pgtbl(pgdp, pgd); } -#else /* CONFIG_PAGE_TABLE_ISOLATION */ +#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static inline pgd_t 
pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ #endif /* __ASSEMBLY__ */ @@ -1131,7 +1131,7 @@ static inline int p4d_bad(p4d_t p4d) { unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; - if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (p4d_flags(p4d) & ~ignore_flags) != 0; @@ -1177,7 +1177,7 @@ static inline int pgd_bad(pgd_t pgd) if (!pgtable_l5_enabled()) return 0; - if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; @@ -1422,9 +1422,9 @@ static inline bool pgdp_maps_userspace(void *__ptr) #define pgd_leaf pgd_large static inline int pgd_large(pgd_t pgd) { return 0; } -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and * the user one is in the last 4k. To switch between them, you * just need to flip the 12th bit in their addresses. 
@@ -1469,7 +1469,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) { return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); @@ -1484,7 +1484,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!static_cpu_has(X86_FEATURE_PTI)) return; /* Clone the user space pgd as well */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 24af25b1551a56597c988019b90e545db52442ff..7e9db77231ac7f596182b2f259c2798fbaf20f30 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -143,7 +143,8 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { pgd_t pgd; - if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) { + if (pgtable_l5_enabled() || + !IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) { WRITE_ONCE(*p4dp, p4d); return; } diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index af77235fded63b64ec99844778796a294cf29d0a..919909d8cb77e3d630af36bf7df00f10fc3765f2 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -91,7 +91,7 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e, + return GEN_UNARY_RMWcc("decl", __my_cpu_var(pcpu_hot.preempt_count), e, __percpu_arg([var])); } diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index d8cccadc83a6fb3d512df090538ac6bd1df23318..e5f204b9b33dfaa92ed1b05faa6b604e50d5f2f3 100644 --- a/arch/x86/include/asm/processor-flags.h +++ 
b/arch/x86/include/asm/processor-flags.h @@ -51,7 +51,7 @@ #define CR3_NOFLUSH 0 #endif -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION # define X86_CR3_PTI_PCID_USER_BIT 11 #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 26620d7642a9fcf9d4a822140a1dd009399ee16a..811548f131f4e30418bedfdf98f1f302fc06723d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -20,7 +20,6 @@ struct vm86; #include #include #include -#include #include #include #include @@ -100,6 +99,9 @@ struct cpuinfo_topology { u32 logical_pkg_id; u32 logical_die_id; + // AMD Node ID and Nodes per Package info + u32 amd_node_id; + // Cache level topology IDs u32 llc_id; u32 l2c_id; @@ -119,8 +121,6 @@ struct cpuinfo_x86 { #endif __u8 x86_virt_bits; __u8 x86_phys_bits; - /* CPUID returned core id bits: */ - __u8 x86_coreid_bits; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ @@ -148,8 +148,6 @@ struct cpuinfo_x86 { unsigned long loops_per_jiffy; /* protected processor identification number */ u64 ppin; - /* cpuid returned max cores value: */ - u16 x86_max_cores; u16 x86_clflush_size; /* number of cores as seen by the OS: */ u16 booted_cores; @@ -186,13 +184,8 @@ extern struct cpuinfo_x86 new_cpu_data; extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; -#ifdef CONFIG_SMP DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); #define cpu_data(cpu) per_cpu(cpu_info, cpu) -#else -#define cpu_info boot_cpu_data -#define cpu_data(cpu) boot_cpu_data -#endif extern const struct seq_operations cpuinfo_op; @@ -533,6 +526,9 @@ static __always_inline unsigned long current_top_of_stack(void) * and around vm86 mode and sp0 on x86_64 is special because of the * entry trampoline. 
*/ + if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) + return this_cpu_read_const(const_pcpu_hot.top_of_stack); + return this_cpu_read_stable(pcpu_hot.top_of_stack); } @@ -555,7 +551,7 @@ static inline void load_sp0(unsigned long sp0) unsigned long __get_wchan(struct task_struct *p); -extern void select_idle_routine(const struct cpuinfo_x86 *c); +extern void select_idle_routine(void); extern void amd_e400_c1e_apic_setup(void); extern unsigned long boot_option_idle_override; @@ -576,28 +572,6 @@ extern void cpu_init(void); extern void cpu_init_exception_handling(void); extern void cr4_init(void); -static inline unsigned long get_debugctlmsr(void) -{ - unsigned long debugctlmsr = 0; - -#ifndef CONFIG_X86_DEBUGCTLMSR - if (boot_cpu_data.x86 < 6) - return 0; -#endif - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); - - return debugctlmsr; -} - -static inline void update_debugctlmsr(unsigned long debugctlmsr) -{ -#ifndef CONFIG_X86_DEBUGCTLMSR - if (boot_cpu_data.x86 < 6) - return; -#endif - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); -} - extern void set_task_blockstep(struct task_struct *task, bool on); /* Boot loader type from the setup header: */ @@ -664,8 +638,10 @@ static __always_inline void prefetchw(const void *x) #else extern unsigned long __end_init_task[]; -#define INIT_THREAD { \ - .sp = (unsigned long)&__end_init_task - sizeof(struct pt_regs), \ +#define INIT_THREAD { \ + .sp = (unsigned long)&__end_init_task - \ + TOP_OF_KERNEL_STACK_PADDING - \ + sizeof(struct pt_regs), \ } extern unsigned long KSTK_ESP(struct task_struct *task); @@ -704,12 +680,10 @@ static inline u32 per_cpu_l2c_id(unsigned int cpu) } #ifdef CONFIG_CPU_SUP_AMD -extern u32 amd_get_nodes_per_socket(void); extern u32 amd_get_highest_perf(void); extern void amd_clear_divider(void); extern void amd_check_microcode(void); #else -static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } static inline void amd_clear_divider(void) { } static 
inline void amd_check_microcode(void) { } diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 65dee24206240641172ad509e6e06358e6fcf41f..043758a2e627037f544022197d41b8a762e6e973 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -23,11 +23,11 @@ extern int of_ioapic; extern u64 initial_dtb; extern void add_dtb(u64 data); void x86_of_pci_init(void); -void x86_dtb_init(void); +void x86_dtb_parse_smp_config(void); #else static inline void add_dtb(u64 data) { } static inline void x86_of_pci_init(void) { } -static inline void x86_dtb_init(void) { } +static inline void x86_dtb_parse_smp_config(void) { } #define of_ioapic 0 #endif diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index 07375b476c4fda6cfd89229826bf21e331ef5190..ab167c96b9ab474b33d778453db0bb550f42b0ac 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -3,7 +3,7 @@ #define _ASM_X86_PTI_H #ifndef __ASSEMBLY__ -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION extern void pti_init(void); extern void pti_check_boottime_disable(void); extern void pti_finalize(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index f4db78b09c8f0be1e0a904394d48f0a45b246cc8..5a83fbd9bc0b44f5bac9a4b3447e3b52fc5e1abc 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -56,18 +56,64 @@ struct pt_regs { #else /* __i386__ */ +struct fred_cs { + /* CS selector */ + u64 cs : 16, + /* Stack level at event time */ + sl : 2, + /* IBT in WAIT_FOR_ENDBRANCH state */ + wfe : 1, + : 45; +}; + +struct fred_ss { + /* SS selector */ + u64 ss : 16, + /* STI state */ + sti : 1, + /* Set if syscall, sysenter or INT n */ + swevent : 1, + /* Event is NMI type */ + nmi : 1, + : 13, + /* Event vector */ + vector : 8, + : 8, + /* Event type */ + type : 4, + : 4, + /* Event was incident to enclave execution */ + enclave : 1, + /* CPU was in long mode */ + lm : 1, + /* + 
* Nested exception during FRED delivery, not set + * for #DF. + */ + nested : 1, + : 1, + /* + * The length of the instruction causing the event. + * Only set for INTO, INT1, INT3, INT n, SYSCALL + * and SYSENTER. 0 otherwise. + */ + insnlen : 4; +}; + struct pt_regs { -/* - * C ABI says these regs are callee-preserved. They aren't saved on kernel entry - * unless syscall needs a complete, fully filled "struct pt_regs". - */ + /* + * C ABI says these regs are callee-preserved. They aren't saved on + * kernel entry unless syscall needs a complete, fully filled + * "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; -/* These regs are callee-clobbered. Always saved on kernel entry. */ + + /* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -77,18 +123,50 @@ struct pt_regs { unsigned long dx; unsigned long si; unsigned long di; -/* - * On syscall entry, this is syscall#. On CPU exception, this is error code. - * On hw interrupt, it's IRQ number: - */ + + /* + * orig_ax is used on entry for: + * - the syscall number (syscall, sysenter, int80) + * - error_code stored by the CPU on traps and exceptions + * - the interrupt number for device interrupts + * + * A FRED stack frame starts here: + * 1) It _always_ includes an error code; + * + * 2) The return frame for ERET[US] starts here, but + * the content of orig_ax is ignored. 
+ */ unsigned long orig_ax; -/* Return frame for iretq */ + + /* The IRETQ return frame starts here */ unsigned long ip; - unsigned long cs; + + union { + /* CS selector */ + u16 cs; + /* The extended 64-bit data slot containing CS */ + u64 csx; + /* The FRED CS extension */ + struct fred_cs fred_cs; + }; + unsigned long flags; unsigned long sp; - unsigned long ss; -/* top of stack page */ + + union { + /* SS selector */ + u16 ss; + /* The extended 64-bit data slot containing SS */ + u64 ssx; + /* The FRED SS extension */ + struct fred_ss fred_ss; + }; + + /* + * Top of stack on IDT systems, while FRED systems have extra fields + * defined above for storing exception related information, e.g. CR2 or + * DR6. + */ }; #endif /* !__i386__ */ diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 255a78d9d90672afb053875184d89b05bab52a0b..12dbd2588ca7ccdaa1ea641327b5cbf866f50a68 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -7,6 +7,13 @@ #include #include +/* + * This value can never be a valid CLOSID, and is used when mapping a + * (closid, rmid) pair to an index and back. On x86 only the RMID is + * needed. The index is a software defined value. 
+ */ +#define X86_RESCTRL_EMPTY_CLOSID ((u32)~0) + /** * struct resctrl_pqr_state - State cache for the PQR MSR * @cur_rmid: The cached Resource Monitoring ID @@ -31,10 +38,47 @@ struct resctrl_pqr_state { DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; + DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); +static inline bool resctrl_arch_alloc_capable(void) +{ + return rdt_alloc_capable; +} + +static inline void resctrl_arch_enable_alloc(void) +{ + static_branch_enable_cpuslocked(&rdt_alloc_enable_key); + static_branch_inc_cpuslocked(&rdt_enable_key); +} + +static inline void resctrl_arch_disable_alloc(void) +{ + static_branch_disable_cpuslocked(&rdt_alloc_enable_key); + static_branch_dec_cpuslocked(&rdt_enable_key); +} + +static inline bool resctrl_arch_mon_capable(void) +{ + return rdt_mon_capable; +} + +static inline void resctrl_arch_enable_mon(void) +{ + static_branch_enable_cpuslocked(&rdt_mon_enable_key); + static_branch_inc_cpuslocked(&rdt_enable_key); +} + +static inline void resctrl_arch_disable_mon(void) +{ + static_branch_disable_cpuslocked(&rdt_mon_enable_key); + static_branch_dec_cpuslocked(&rdt_enable_key); +} + /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * @@ -88,12 +132,58 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) return val * scale; } +static inline void resctrl_arch_set_closid_rmid(struct task_struct *tsk, + u32 closid, u32 rmid) +{ + WRITE_ONCE(tsk->closid, closid); + WRITE_ONCE(tsk->rmid, rmid); +} + +static inline bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid) +{ + return READ_ONCE(tsk->closid) == closid; +} + +static inline bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored, + u32 rmid) +{ + return READ_ONCE(tsk->rmid) == rmid; +} + static inline void resctrl_sched_in(struct task_struct *tsk) { 
if (static_branch_likely(&rdt_enable_key)) __resctrl_sched_in(tsk); } +static inline u32 resctrl_arch_system_num_rmid_idx(void) +{ + /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ + return boot_cpu_data.x86_cache_max_rmid + 1; +} + +static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) +{ + *rmid = idx; + *closid = X86_RESCTRL_EMPTY_CLOSID; +} + +static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid) +{ + return rmid; +} + +/* x86 can always read an rmid, nothing needs allocating */ +struct rdt_resource; +static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid) +{ + might_sleep(); + return NULL; +}; + +static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid, + void *ctx) { }; + void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index a5e89641bd2dac7e9fa5e1ab548369836640908a..9aee31862b4a8b8cbf2242db991a5cbeb3d41e21 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -47,6 +47,7 @@ int set_memory_uc(unsigned long addr, int numpages); int set_memory_wc(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_np(unsigned long addr, int numpages); +int set_memory_p(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); diff --git a/arch/x86/include/asm/setup_data.h b/arch/x86/include/asm/setup_data.h new file mode 100644 index 0000000000000000000000000000000000000000..77c51111a89394c7dffecb3086f341a4c0635a30 --- /dev/null +++ b/arch/x86/include/asm/setup_data.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SETUP_DATA_H +#define _ASM_X86_SETUP_DATA_H + +#include + +#ifndef __ASSEMBLY__ + +struct pci_setup_rom { + 
struct setup_data data; + uint16_t vendor; + uint16_t devid; + uint64_t pcilen; + unsigned long segment; + unsigned long bus; + unsigned long device; + unsigned long function; + uint8_t romdata[]; +}; + +/* kexec external ABI */ +struct efi_setup_data { + u64 fw_vendor; + u64 __unused; + u64 tables; + u64 smbios; + u64 reserved[8]; +}; + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_SETUP_DATA_H */ diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index cf671138feef489a5e0ce2e0b98f131da70cabe9..9477b4053bce2ccb9d6c1c0113fd6ee44bd34d50 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -13,7 +13,6 @@ #include #include -#include #include #define GHCB_PROTOCOL_MIN 1ULL @@ -22,6 +21,8 @@ #define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } +struct boot_params; + enum es_result { ES_OK, /* All good */ ES_UNSUPPORTED, /* Requested operation not supported */ @@ -228,6 +229,7 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 snp_get_unsupported_features(u64 status); u64 sev_get_status(void); void kdump_sev_callback(void); +void sev_show_status(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -257,6 +259,7 @@ static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 snp_get_unsupported_features(u64 status) { return 0; } static inline u64 sev_get_status(void) { return 0; } static inline void kdump_sev_callback(void) { } +static inline void sev_show_status(void) { } #endif #ifdef CONFIG_KVM_AMD_SEV diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4fab2ed454f3ab86a636ec9d2ac1ed682d1de624..a35936b512fee639db9930fa64b543687edbf0b8 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -8,9 +8,6 @@ #include #include -extern int smp_num_siblings; -extern unsigned int num_processors; - DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); 
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); @@ -59,11 +56,6 @@ static inline void stop_other_cpus(void) smp_ops.stop_other_cpus(1); } -static inline void smp_prepare_boot_cpu(void) -{ - smp_ops.smp_prepare_boot_cpu(); -} - static inline void smp_prepare_cpus(unsigned int max_cpus) { smp_ops.smp_prepare_cpus(max_cpus); @@ -110,7 +102,6 @@ void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void smp_prepare_cpus_common(void); void native_smp_prepare_cpus(unsigned int max_cpus); -void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_kick_ap(unsigned int cpu, struct task_struct *tidle); @@ -174,8 +165,6 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) } #endif /* CONFIG_SMP */ -extern unsigned disabled_cpus; - #ifdef CONFIG_DEBUG_NMI_SELFTEST extern void nmi_selftest(void); #else diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h index c648502e453579e3d0deeb9033f2832d9a2fd2a1..658b690b2ccb7d627c6418b32cba5c3a7d959832 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -96,4 +96,6 @@ static inline void speculative_store_bypass_ht_init(void) { } extern void speculation_ctrl_update(unsigned long tif); extern void speculation_ctrl_update_current(void); +extern bool itlb_multihit_kvm_mitigation; + #endif diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 48f8dd47cf6882ac9e3920d6e7105c0eff430528..2e9fc5c400cdc364a306f5d0f3c7dda305070627 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -2,11 +2,11 @@ #ifndef _ASM_X86_SPECIAL_INSNS_H #define _ASM_X86_SPECIAL_INSNS_H - #ifdef __KERNEL__ - #include #include + +#include #include #include @@ -224,10 +224,10 @@ static inline void serialize(void) } /* The dst 
parameter must be 64-bytes aligned */ -static inline void movdir64b(void __iomem *dst, const void *src) +static inline void movdir64b(void *dst, const void *src) { const struct { char _[64]; } *__src = src; - struct { char _[64]; } __iomem *__dst = dst; + struct { char _[64]; } *__dst = dst; /* * MOVDIR64B %(rdx), rax. @@ -245,6 +245,11 @@ static inline void movdir64b(void __iomem *dst, const void *src) : "m" (*__src), "a" (__dst), "d" (__src)); } +static inline void movdir64b_io(void __iomem *dst, const void *src) +{ + movdir64b((void __force *)dst, src); +} + /** * enqcmds - Enqueue a command in supervisor (CPL0) mode * @dst: destination, in MMIO space (must be 512-bit aligned) diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index 343b722ccaf21d0654b5d7dc19596a4ef548a8e5..125c407e2abe6da21a05f8a644ecce501ed1c910 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -46,7 +46,7 @@ #define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)") -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "jmp __x86_return_thunk") #else diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index f42dbf17f52b0ee12ec74f34d1fb551743be0e6c..c3bd0c0758c9a4366181b0e83c52c83142475d84 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -70,9 +70,13 @@ static inline void update_task_stack(struct task_struct *task) #ifdef CONFIG_X86_32 this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else - /* Xen PV enters the kernel on the thread stack. */ - if (cpu_feature_enabled(X86_FEATURE_XENPV)) + if (cpu_feature_enabled(X86_FEATURE_FRED)) { + /* WRMSRNS is a baseline feature for FRED. 
*/ + wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE); + } else if (cpu_feature_enabled(X86_FEATURE_XENPV)) { + /* Xen PV enters the kernel on the thread stack. */ load_sp0(task_top_of_stack(task)); + } #endif } diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 0b70653a98c1573a475edf808601cb1c33f9a45e..345aafbc19648865f7262b64e43d1ae5176aae4c 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -15,6 +15,8 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); +extern void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len); + /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index d63b02940747fad97cabfdb399b488d577d36529..12da7dfd5ef13b417eaae07d244edac620f5e8a2 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,7 +31,9 @@ * In vm86 mode, the hardware frame is much longer still, so add 16 * bytes to make room for the real-mode segments. * - * x86_64 has a fixed-length stack frame. + * x86-64 has a fixed-length stack frame, but it depends on whether + * or not FRED is enabled. Future versions of FRED might make this + * dynamic, but for now it is always 2 words longer. 
*/ #ifdef CONFIG_X86_32 # ifdef CONFIG_VM86 @@ -39,8 +41,12 @@ # else # define TOP_OF_KERNEL_STACK_PADDING 8 # endif -#else -# define TOP_OF_KERNEL_STACK_PADDING 0 +#else /* x86-64 */ +# ifdef CONFIG_X86_FRED +# define TOP_OF_KERNEL_STACK_PADDING (2 * 8) +# else +# define TOP_OF_KERNEL_STACK_PADDING 0 +# endif #endif /* diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 5f87f6b9b09e74c1a6d01a38777bdc9856ac2f81..abe3a8f22cbd969898505b7550f9503f33bbae5e 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -102,6 +102,35 @@ static inline void setup_node_to_cpumask_map(void) { } #include +/* Topology information */ +enum x86_topology_domains { + TOPO_SMT_DOMAIN, + TOPO_CORE_DOMAIN, + TOPO_MODULE_DOMAIN, + TOPO_TILE_DOMAIN, + TOPO_DIE_DOMAIN, + TOPO_DIEGRP_DOMAIN, + TOPO_PKG_DOMAIN, + TOPO_MAX_DOMAIN, +}; + +struct x86_topology_system { + unsigned int dom_shifts[TOPO_MAX_DOMAIN]; + unsigned int dom_size[TOPO_MAX_DOMAIN]; +}; + +extern struct x86_topology_system x86_topo_system; + +static inline unsigned int topology_get_domain_size(enum x86_topology_domains dom) +{ + return x86_topo_system.dom_size[dom]; +} + +static inline unsigned int topology_get_domain_shift(enum x86_topology_domains dom) +{ + return dom == TOPO_SMT_DOMAIN ? 
0 : x86_topo_system.dom_shifts[dom - 1]; +} + extern const struct cpumask *cpu_coregroup_mask(int cpu); extern const struct cpumask *cpu_clustergroup_mask(int cpu); @@ -112,7 +141,42 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu); #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) #define topology_ppin(cpu) (cpu_data(cpu).ppin) -extern unsigned int __max_die_per_package; +#define topology_amd_node_id(cpu) (cpu_data(cpu).topo.amd_node_id) + +extern unsigned int __max_dies_per_package; +extern unsigned int __max_logical_packages; +extern unsigned int __max_threads_per_core; +extern unsigned int __num_threads_per_package; +extern unsigned int __num_cores_per_package; + +static inline unsigned int topology_max_packages(void) +{ + return __max_logical_packages; +} + +static inline unsigned int topology_max_dies_per_package(void) +{ + return __max_dies_per_package; +} + +static inline unsigned int topology_num_cores_per_package(void) +{ + return __num_cores_per_package; +} + +static inline unsigned int topology_num_threads_per_package(void) +{ + return __num_threads_per_package; +} + +#ifdef CONFIG_X86_LOCAL_APIC +int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level); +#else +static inline int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level) +{ + return 0; +} +#endif #ifdef CONFIG_SMP #define topology_cluster_id(cpu) (cpu_data(cpu).topo.l2c_id) @@ -121,12 +185,11 @@ extern unsigned int __max_die_per_package; #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) -extern unsigned int __max_logical_packages; -#define topology_max_packages() (__max_logical_packages) -static inline int topology_max_die_per_package(void) +static inline int topology_phys_to_logical_pkg(unsigned int pkg) { - return __max_die_per_package; + return topology_get_logical_id(pkg << x86_topo_system.dom_shifts[TOPO_PKG_DOMAIN], + TOPO_PKG_DOMAIN); } extern 
int __max_smt_threads; @@ -138,9 +201,12 @@ static inline int topology_max_smt_threads(void) #include -int topology_update_package_map(unsigned int apicid, unsigned int cpu); -int topology_update_die_map(unsigned int dieid, unsigned int cpu); -int topology_phys_to_logical_pkg(unsigned int pkg); +extern unsigned int __amd_nodes_per_pkg; + +static inline unsigned int topology_amd_nodes_per_pkg(void) +{ + return __amd_nodes_per_pkg; +} extern struct cpumask __cpu_primary_thread_mask; #define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask) @@ -153,16 +219,12 @@ static inline bool topology_is_primary_thread(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_primary_thread_mask); } + #else /* CONFIG_SMP */ -#define topology_max_packages() (1) -static inline int -topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } -static inline int -topology_update_die_map(unsigned int dieid, unsigned int cpu) { return 0; } static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } -static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } +static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; } #endif /* !CONFIG_SMP */ static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h index f5d2325aa0b749db0b6743adb85c94cb0f303624..8d1154cdf7875c923645e3b5f5bf111ce9ffd1f0 100644 --- a/arch/x86/include/asm/trapnr.h +++ b/arch/x86/include/asm/trapnr.h @@ -2,6 +2,18 @@ #ifndef _ASM_X86_TRAPNR_H #define _ASM_X86_TRAPNR_H +/* + * Event type codes used by FRED, Intel VT-x and AMD SVM + */ +#define EVENT_TYPE_EXTINT 0 // External interrupt +#define EVENT_TYPE_RESERVED 1 +#define EVENT_TYPE_NMI 2 // NMI +#define EVENT_TYPE_HWEXC 3 // Hardware originated traps, exceptions 
+#define EVENT_TYPE_SWINT 4 // INT n +#define EVENT_TYPE_PRIV_SWEXC 5 // INT1 +#define EVENT_TYPE_SWEXC 6 // INTO, INT3 +#define EVENT_TYPE_OTHER 7 // FRED SYSCALL/SYSENTER, VT-x MTF + /* Interrupts/Exceptions */ #define X86_TRAP_DE 0 /* Divide-by-zero */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 594fce0ca744398f0086f43582f66fe870b12fa9..405efb3e4996e7f1b8a65e2be4454c94eea07288 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -5,8 +5,9 @@ #ifndef _ASM_X86_TSC_H #define _ASM_X86_TSC_H -#include #include +#include +#include /* * Standard way to access the cycle counter. diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index f2c02e4469ccc3db6265b2e36a6bd336f5e822fc..04789f45ab2b2ffb063045cb00c50b4974dfde49 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -11,6 +11,7 @@ #include #include #include +#include #ifdef CONFIG_ADDRESS_MASKING /* @@ -18,14 +19,10 @@ */ static inline unsigned long __untagged_addr(unsigned long addr) { - /* - * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation - * in alternative instructions. The relocation gets wrong when gets - * copied to the target place. - */ asm (ALTERNATIVE("", - "and %%gs:tlbstate_untag_mask, %[addr]\n\t", X86_FEATURE_LAM) - : [addr] "+r" (addr) : "m" (tlbstate_untag_mask)); + "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM) + : [addr] "+r" (addr) + : [mask] "m" (__my_cpu_var(tlbstate_untag_mask))); return addr; } @@ -54,7 +51,7 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, * half and a user half. When cast to a signed type, user pointers * are positive and kernel pointers are negative. */ -#define valid_user_address(x) ((long)(x) >= 0) +#define valid_user_address(x) ((__force long)(x) >= 0) /* * User pointers can have tag bits on x86-64. 
This scheme tolerates @@ -87,8 +84,9 @@ static inline bool __access_ok(const void __user *ptr, unsigned long size) if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) { return valid_user_address(ptr); } else { - unsigned long sum = size + (unsigned long)ptr; - return valid_user_address(sum) && sum >= (unsigned long)ptr; + unsigned long sum = size + (__force unsigned long)ptr; + + return valid_user_address(sum) && sum >= (__force unsigned long)ptr; } } #define __access_ok __access_ok diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0e73616b82f3469f47877e0cc56ca6b79f034173..4dba173630084f88cabc20216b03ed9c86a1a072 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -17,6 +17,7 @@ #include #include +#include #include #define VMCS_CONTROL_BIT(x) BIT(VMX_FEATURE_##x & 0x1f) @@ -374,14 +375,14 @@ enum vmcs_field { #define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK -#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ -#define INTR_TYPE_RESERVED (1 << 8) /* reserved */ -#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ -#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ -#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ -#define INTR_TYPE_PRIV_SW_EXCEPTION (5 << 8) /* ICE breakpoint - undocumented */ -#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ -#define INTR_TYPE_OTHER_EVENT (7 << 8) /* other event */ +#define INTR_TYPE_EXT_INTR (EVENT_TYPE_EXTINT << 8) /* external interrupt */ +#define INTR_TYPE_RESERVED (EVENT_TYPE_RESERVED << 8) /* reserved */ +#define INTR_TYPE_NMI_INTR (EVENT_TYPE_NMI << 8) /* NMI */ +#define INTR_TYPE_HARD_EXCEPTION (EVENT_TYPE_HWEXC << 8) /* processor exception */ +#define INTR_TYPE_SOFT_INTR (EVENT_TYPE_SWINT << 8) /* software interrupt */ +#define INTR_TYPE_PRIV_SW_EXCEPTION (EVENT_TYPE_PRIV_SWEXC << 8) /* ICE breakpoint */ +#define INTR_TYPE_SOFT_EXCEPTION 
(EVENT_TYPE_SWEXC << 8) /* software exception */ +#define INTR_TYPE_OTHER_EVENT (EVENT_TYPE_OTHER << 8) /* other event */ /* GUEST_INTERRUPTIBILITY_INFO flags. */ #define GUEST_INTR_STATE_STI 0x00000001 diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index c878616a18b85750c92f764f1895cafb0698dd85..b89b40f250e6f55c52cbd520bdbaeaff4eb77cf2 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -2,8 +2,6 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H -#include - struct ghcb; struct mpc_bus; struct mpc_cpu; @@ -15,13 +13,15 @@ struct irq_domain; /** * struct x86_init_mpparse - platform specific mpparse ops * @setup_ioapic_ids: platform specific ioapic id override - * @find_smp_config: find the smp configuration - * @get_smp_config: get the smp configuration + * @find_mptable: Find MPTABLE early to reserve the memory region + * @early_parse_smp_cfg: Parse the SMP configuration data early before initmem_init() + * @parse_smp_cfg: Parse the SMP configuration data */ struct x86_init_mpparse { void (*setup_ioapic_ids)(void); - void (*find_smp_config)(void); - void (*get_smp_config)(unsigned int early); + void (*find_mptable)(void); + void (*early_parse_smp_cfg)(void); + void (*parse_smp_cfg)(void); }; /** diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index eeea058cf6028ec15cd7a0524b1eca7824775e4f..9b82eebd7add55e918d50a1e3a47e5300f29c222 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -2,21 +2,7 @@ #ifndef _ASM_X86_BOOTPARAM_H #define _ASM_X86_BOOTPARAM_H -/* setup_data/setup_indirect types */ -#define SETUP_NONE 0 -#define SETUP_E820_EXT 1 -#define SETUP_DTB 2 -#define SETUP_PCI 3 -#define SETUP_EFI 4 -#define SETUP_APPLE_PROPERTIES 5 -#define SETUP_JAILHOUSE 6 -#define SETUP_CC_BLOB 7 -#define SETUP_IMA 8 -#define SETUP_RNG_SEED 9 -#define SETUP_ENUM_MAX SETUP_RNG_SEED - -#define SETUP_INDIRECT (1<<31) 
-#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT) +#include /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF @@ -49,22 +35,6 @@ #include #include