diff --git a/.clang-format b/.clang-format index f49620f506f17a95bda75dd4cbfd2a544ee0a8b4..f3923a1f98583bef70d7beeac5954da858079a3c 100644 --- a/.clang-format +++ b/.clang-format @@ -78,6 +78,8 @@ ForEachMacros: - 'ata_qc_for_each_with_internal' - 'ax25_for_each' - 'ax25_uid_for_each' + - '__bio_for_each_bvec' + - 'bio_for_each_bvec' - 'bio_for_each_integrity_vec' - '__bio_for_each_segment' - 'bio_for_each_segment' @@ -118,10 +120,12 @@ ForEachMacros: - 'drm_for_each_legacy_plane' - 'drm_for_each_plane' - 'drm_for_each_plane_mask' + - 'drm_for_each_privobj' - 'drm_mm_for_each_hole' - 'drm_mm_for_each_node' - 'drm_mm_for_each_node_in_range' - 'drm_mm_for_each_node_safe' + - 'flow_action_for_each' - 'for_each_active_drhd_unit' - 'for_each_active_iommu' - 'for_each_available_child_of_node' @@ -158,6 +162,9 @@ ForEachMacros: - 'for_each_dss_dev' - 'for_each_efi_memory_desc' - 'for_each_efi_memory_desc_in_map' + - 'for_each_element' + - 'for_each_element_extid' + - 'for_each_element_id' - 'for_each_endpoint_of_node' - 'for_each_evictable_lru' - 'for_each_fib6_node_rt_rcu' @@ -195,6 +202,7 @@ ForEachMacros: - 'for_each_net_rcu' - 'for_each_new_connector_in_state' - 'for_each_new_crtc_in_state' + - 'for_each_new_mst_mgr_in_state' - 'for_each_new_plane_in_state' - 'for_each_new_private_obj_in_state' - 'for_each_node' @@ -210,8 +218,10 @@ ForEachMacros: - 'for_each_of_pci_range' - 'for_each_old_connector_in_state' - 'for_each_old_crtc_in_state' + - 'for_each_old_mst_mgr_in_state' - 'for_each_oldnew_connector_in_state' - 'for_each_oldnew_crtc_in_state' + - 'for_each_oldnew_mst_mgr_in_state' - 'for_each_oldnew_plane_in_state' - 'for_each_oldnew_plane_in_state_reverse' - 'for_each_oldnew_private_obj_in_state' @@ -243,6 +253,9 @@ ForEachMacros: - 'for_each_sg_dma_page' - 'for_each_sg_page' - 'for_each_sibling_event' + - 'for_each_subelement' + - 'for_each_subelement_extid' + - 'for_each_subelement_id' - '__for_each_thread' - 'for_each_thread' - 'for_each_zone' @@ -252,6 +265,8 @@ ForEachMacros: - 'fwnode_for_each_child_node' - 'fwnode_graph_for_each_endpoint' - 'gadget_for_each_ep' + - 'genradix_for_each' + - 'genradix_for_each_from' - 'hash_for_each' - 'hash_for_each_possible' - 'hash_for_each_possible_rcu' @@ -293,7 +308,11 @@ ForEachMacros: - 'key_for_each' - 'key_for_each_safe' - 'klp_for_each_func' + - 'klp_for_each_func_safe' + - 'klp_for_each_func_static' - 'klp_for_each_object' + - 'klp_for_each_object_safe' + - 'klp_for_each_object_static' - 'kvm_for_each_memslot' - 'kvm_for_each_vcpu' - 'list_for_each' @@ -324,6 +343,8 @@ ForEachMacros: - 'media_device_for_each_intf' - 'media_device_for_each_link' - 'media_device_for_each_pad' + - 'mp_bvec_for_each_page' + - 'mp_bvec_for_each_segment' - 'nanddev_io_for_each_page' - 'netdev_for_each_lower_dev' - 'netdev_for_each_lower_private' @@ -375,6 +396,7 @@ ForEachMacros: - 'rht_for_each_rcu' - 'rht_for_each_rcu_continue' - '__rq_for_each_bio' + - 'rq_for_each_bvec' - 'rq_for_each_segment' - 'scsi_for_each_prot_sg' - 'scsi_for_each_sg' @@ -410,6 +432,8 @@ ForEachMacros: - 'v4l2_m2m_for_each_src_buf_safe' - 'virtio_device_for_each_vq' - 'xa_for_each' + - 'xa_for_each_marked' + - 'xa_for_each_start' - 'xas_for_each' - 'xas_for_each_conflict' - 'xas_for_each_marked' diff --git a/.mailmap b/.mailmap index b2cde8668dcc38f85c6f0f1bac9da5fa2e9f63b9..ae2bcad06f4b58eb3db18020df75cabeb59b6ed0 100644 --- a/.mailmap +++ b/.mailmap @@ -156,6 +156,8 @@ Morten Welinder Morten Welinder Mythri P K Nguyen Anh Quynh +Nicolas Pitre +Nicolas Pitre Paolo 'Blaisorblade' 
Giarrusso Patrick Mochel Paul Burton diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 9605dbd4b5b59ecc251913b1b327e6031cfb875f..4fb76c0e8d30a147a200de73e415d8df665e95c5 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -511,10 +511,30 @@ Description: Control Symetric Multi Threading (SMT) control: Read/write interface to control SMT. Possible values: - "on" SMT is enabled - "off" SMT is disabled - "forceoff" SMT is force disabled. Cannot be changed. - "notsupported" SMT is not supported by the CPU + "on" SMT is enabled + "off" SMT is disabled + "forceoff" SMT is force disabled. Cannot be changed. + "notsupported" SMT is not supported by the CPU + "notimplemented" SMT runtime toggling is not + implemented for the architecture If control status is "forceoff" or "notsupported" writes are rejected. + +What: /sys/devices/system/cpu/cpu#/power/energy_perf_bias +Date: March 2019 +Contact: linux-pm@vger.kernel.org +Description: Intel Energy and Performance Bias Hint (EPB) + + EPB for the given CPU in a sliding scale 0 - 15, where a value + of 0 corresponds to a hint preference for highest performance + and a value of 15 corresponds to the maximum energy savings. + + In order to change the EPB value for the CPU, write either + a number in the 0 - 15 sliding scale above, or one of the + strings: "performance", "balance-performance", "normal", + "balance-power", "power" (each of which is mapped to a + specific EPB value consistent with its meaning), to this + attribute. + + This attribute is present for all online CPUs supporting the + Intel EPB feature. diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 18f1798075633eff2235fa7f072054612dd5637e..c30c1957c7e6b866878d49d879f202ca3945813f 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -155,8 +155,7 @@ keeping lock contention under control at all tree levels regardless of the level of loading on the system.

RCU updaters wait for normal grace periods by registering -RCU callbacks, either directly via call_rcu() and -friends (namely call_rcu_bh() and call_rcu_sched()), +RCU callbacks, either directly via call_rcu() or indirectly via synchronize_rcu() and friends. RCU callbacks are represented by rcu_head structures, which are queued on rcu_data structures while they are diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html index 19e7a5fb6b739ec19ae337a12e3e4973f1cb70bc..57300db4b5ff607c30563bed1e8104c55f8e4b8e 100644 --- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html +++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html @@ -56,6 +56,7 @@ sections. RCU-preempt Expedited Grace Periods

+CONFIG_PREEMPT=y kernels implement RCU-preempt. The overall flow of the handling of a given CPU by an RCU-preempt expedited grace period is shown in the following diagram: @@ -139,6 +140,7 @@ or offline, among other things. RCU-sched Expedited Grace Periods

+CONFIG_PREEMPT=n kernels implement RCU-sched. The overall flow of the handling of a given CPU by an RCU-sched expedited grace period is shown in the following diagram: @@ -146,7 +148,7 @@ expedited grace period is shown in the following diagram:

As with RCU-preempt, RCU-sched's -synchronize_sched_expedited() ignores offline and +synchronize_rcu_expedited() ignores offline and idle CPUs, again because they are in remotely detectable quiescent states. However, because the diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html index 8d21af02b1f0722f179efa56da0e2f3977d8e4d1..c64f8d26609fb64ae34dfdf551624984f38e4c0c 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html @@ -34,12 +34,11 @@ Similarly, any code that happens before the beginning of a given RCU grace period is guaranteed to see the effects of all accesses following the end of that grace period that are within RCU read-side critical sections. -

This guarantee is particularly pervasive for synchronize_sched(), -for which RCU-sched read-side critical sections include any region +

Note well that RCU-sched read-side critical sections include any region of code for which preemption is disabled. Given that each individual machine instruction can be thought of as an extremely small region of preemption-disabled code, one can think of -synchronize_sched() as smp_mb() on steroids. +synchronize_rcu() as smp_mb() on steroids.
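To make this concrete, here is a minimal sketch of the pattern described above; the structure, variable, and lock names (struct foo, gp, gp_lock) are illustrative assumptions, not taken from this patch:

	struct foo {
		int a;
	};
	static struct foo __rcu *gp;
	static DEFINE_SPINLOCK(gp_lock);

	/* Reader: the preemption-disabled region is the critical section. */
	int reader(void)
	{
		struct foo *p;
		int ret;

		rcu_read_lock_sched();		/* preempt_disable() under the hood. */
		p = rcu_dereference_sched(gp);
		ret = p ? p->a : -1;
		rcu_read_unlock_sched();	/* preempt_enable(). */
		return ret;
	}

	/* Updater: synchronize_rcu() waits for all pre-existing readers. */
	void update(struct foo *newp)
	{
		struct foo *oldp;

		spin_lock(&gp_lock);
		oldp = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
		rcu_assign_pointer(gp, newp);
		spin_unlock(&gp_lock);
		synchronize_rcu();	/* Orders against all pre-existing readers. */
		kfree(oldp);
	}

Because every preemption-disabled region is a read-side critical section, the updater's synchronize_rcu() cannot return until any already-running reader() has reached rcu_read_unlock_sched().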

RCU updaters use this guarantee by splitting their updates into two phases, one of which is executed before the grace period and diff --git a/Documentation/RCU/NMI-RCU.txt b/Documentation/RCU/NMI-RCU.txt index 687777f83b2371d4bd24e5d7abff88d7cf90a5d8..881353fd5bff1cbc1f3dead8009f5b86cb65832a 100644 --- a/Documentation/RCU/NMI-RCU.txt +++ b/Documentation/RCU/NMI-RCU.txt @@ -81,18 +81,19 @@ currently executing on some other CPU. We therefore cannot free up any data structures used by the old NMI handler until execution of it completes on all other CPUs. -One way to accomplish this is via synchronize_sched(), perhaps as +One way to accomplish this is via synchronize_rcu(), perhaps as follows: unset_nmi_callback(); - synchronize_sched(); + synchronize_rcu(); kfree(my_nmi_data); -This works because synchronize_sched() blocks until all CPUs complete -any preemption-disabled segments of code that they were executing. -Since NMI handlers disable preemption, synchronize_sched() is guaranteed +This works because (as of v4.20) synchronize_rcu() blocks until all +CPUs complete any preemption-disabled segments of code that they were +executing. +Since NMI handlers disable preemption, synchronize_rcu() is guaranteed not to return until all ongoing NMI handlers exit. It is therefore safe -to free up the handler's data as soon as synchronize_sched() returns. +to free up the handler's data as soon as synchronize_rcu() returns. Important note: for this to work, the architecture in question must invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively. diff --git a/Documentation/RCU/UP.txt b/Documentation/RCU/UP.txt index 90ec5341ee981a0a397710d2ffd1261eb2557896..53bde717017bb8cde0fbec21f0dd65db33db2f91 100644 --- a/Documentation/RCU/UP.txt +++ b/Documentation/RCU/UP.txt @@ -86,10 +86,8 @@ even on a UP system. So do not do it! Even on a UP system, the RCU infrastructure -must- respect grace periods, and -must- invoke callbacks from a known environment in which no locks are held. -It -is- safe for synchronize_sched() and synchronize_rcu_bh() to return -immediately on an UP system. It is also safe for synchronize_rcu() -to return immediately on UP systems, except when running preemptable -RCU. +Note that it -is- safe for synchronize_rcu() to return immediately on +UP systems, including !PREEMPT SMP builds running on UP systems. Quick Quiz #3: Why can't synchronize_rcu() return immediately on UP systems running preemptable RCU? diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 6f469864d9f59aa5a4d2456f01d55409d58d32c6..e98ff261a438bd4e0858a943fc469437599cc284 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -182,16 +182,13 @@ over a rather long period of time, but improvements are always welcome! when publicizing a pointer to a structure that can be traversed by an RCU read-side critical section. -5. If call_rcu(), or a related primitive such as call_rcu_bh(), - call_rcu_sched(), or call_srcu() is used, the callback function - will be called from softirq context. In particular, it cannot - block. +5. If call_rcu() or call_srcu() is used, the callback function will + be called from softirq context. In particular, it cannot block. -6. Since synchronize_rcu() can block, it cannot be called from - any sort of irq context. 
The same rule applies for - synchronize_rcu_bh(), synchronize_sched(), synchronize_srcu(), - synchronize_rcu_expedited(), synchronize_rcu_bh_expedited(), - synchronize_sched_expedite(), and synchronize_srcu_expedited(). +6. Since synchronize_rcu() can block, it cannot be called + from any sort of irq context. The same rule applies + for synchronize_srcu(), synchronize_rcu_expedited(), and + synchronize_srcu_expedited(). The expedited forms of these primitives have the same semantics as the non-expedited forms, but expediting is both expensive and @@ -212,20 +209,20 @@ over a rather long period of time, but improvements are always welcome! of the system, especially to real-time workloads running on the rest of the system. -7. If the updater uses call_rcu() or synchronize_rcu(), then the - corresponding readers must use rcu_read_lock() and - rcu_read_unlock(). If the updater uses call_rcu_bh() or - synchronize_rcu_bh(), then the corresponding readers must - use rcu_read_lock_bh() and rcu_read_unlock_bh(). If the - updater uses call_rcu_sched() or synchronize_sched(), then - the corresponding readers must disable preemption, possibly - by calling rcu_read_lock_sched() and rcu_read_unlock_sched(). - If the updater uses synchronize_srcu() or call_srcu(), then - the corresponding readers must use srcu_read_lock() and +7. As of v4.20, a given kernel implements only one RCU flavor, + which is RCU-sched for PREEMPT=n and RCU-preempt for PREEMPT=y. + If the updater uses call_rcu() or synchronize_rcu(), + then the corresponding readers may use rcu_read_lock() and + rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(), + or any pair of primitives that disables and re-enables preemption, + for example, rcu_read_lock_sched() and rcu_read_unlock_sched(). + If the updater uses synchronize_srcu() or call_srcu(), + then the corresponding readers must use srcu_read_lock() and srcu_read_unlock(), and with the same srcu_struct. The rules for the expedited primitives are the same as for their non-expedited counterparts. Mixing things up will result in confusion and - broken kernels. + broken kernels, and has even resulted in an exploitable security + issue. One exception to this rule: rcu_read_lock() and rcu_read_unlock() may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh() @@ -288,8 +285,7 @@ over a rather long period of time, but improvements are always welcome! d. Periodically invoke synchronize_rcu(), permitting a limited number of updates per grace period. - The same cautions apply to call_rcu_bh(), call_rcu_sched(), - call_srcu(), and kfree_rcu(). + The same cautions apply to call_srcu() and kfree_rcu(). Note that although these primitives do take action to avoid memory exhaustion when any given CPU has too many callbacks, a determined @@ -322,7 +318,7 @@ over a rather long period of time, but improvements are always welcome! 11. Any lock acquired by an RCU callback must be acquired elsewhere with softirq disabled, e.g., via spin_lock_irqsave(), - spin_lock_bh(), etc. Failing to disable softirq on a given + spin_lock_bh(), etc. Failing to disable softirq on a given acquisition of that lock will result in deadlock as soon as the RCU softirq handler happens to run your RCU callback while interrupting that acquisition's critical section. @@ -335,13 +331,16 @@ over a rather long period of time, but improvements are always welcome! must use whatever locking or other synchronization is required to safely access and/or modify that data structure. 
- RCU callbacks are -usually- executed on the same CPU that executed - the corresponding call_rcu(), call_rcu_bh(), or call_rcu_sched(), - but are by -no- means guaranteed to be. For example, if a given - CPU goes offline while having an RCU callback pending, then that - RCU callback will execute on some surviving CPU. (If this was - not the case, a self-spawning RCU callback would prevent the - victim CPU from ever going offline.) + Do not assume that RCU callbacks will be executed on the same + CPU that executed the corresponding call_rcu() or call_srcu(). + For example, if a given CPU goes offline while having an RCU + callback pending, then that RCU callback will execute on some + surviving CPU. (If this was not the case, a self-spawning RCU + callback would prevent the victim CPU from ever going offline.) + Furthermore, CPUs designated by rcu_nocbs= might well -always- + have their RCU callbacks executed on some other CPUs; in fact, + for some real-time workloads, this is the whole point of using + the rcu_nocbs= kernel boot parameter. 13. Unlike other forms of RCU, it -is- permissible to block in an SRCU read-side critical section (demarked by srcu_read_lock() @@ -381,11 +380,11 @@ over a rather long period of time, but improvements are always welcome! SRCU's expedited primitive (synchronize_srcu_expedited()) never sends IPIs to other CPUs, so it is easier on - real-time workloads than is synchronize_rcu_expedited(), - synchronize_rcu_bh_expedited() or synchronize_sched_expedited(). + real-time workloads than is synchronize_rcu_expedited(). - Note that rcu_dereference() and rcu_assign_pointer() relate to - SRCU just as they do to other forms of RCU. + Note that rcu_assign_pointer() relates to SRCU just as it does to + other forms of RCU, but instead of rcu_dereference() you should + use srcu_dereference() in order to avoid lockdep splats. 14. The whole point of call_rcu(), synchronize_rcu(), and friends is to wait until all pre-existing readers have finished before @@ -405,6 +404,9 @@ over a rather long period of time, but improvements are always welcome! read-side critical sections. It is the responsibility of the RCU update-side primitives to deal with this. + For SRCU readers, you can use smp_mb__after_srcu_read_unlock() + immediately after an srcu_read_unlock() to get a full barrier. + 16. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the __rcu sparse checks to validate your RCU code. These can help find problems as follows: @@ -428,22 +430,19 @@ over a rather long period of time, but improvements are always welcome! These debugging aids can help you find problems that are otherwise extremely difficult to spot. -17. If you register a callback using call_rcu(), call_rcu_bh(), - call_rcu_sched(), or call_srcu(), and pass in a function defined - within a loadable module, then it in necessary to wait for - all pending callbacks to be invoked after the last invocation - and before unloading that module. Note that it is absolutely - -not- sufficient to wait for a grace period! The current (say) - synchronize_rcu() implementation waits only for all previous - callbacks registered on the CPU that synchronize_rcu() is running - on, but it is -not- guaranteed to wait for callbacks registered - on other CPUs. +17. If you register a callback using call_rcu() or call_srcu(), and + pass in a function defined within a loadable module, then it is + necessary to wait for all pending callbacks to be invoked after + the last invocation and before unloading that module. 
Note that + it is absolutely -not- sufficient to wait for a grace period! + The current (say) synchronize_rcu() implementation is -not- + guaranteed to wait for callbacks registered on other CPUs, or + even on the current CPU if that CPU recently went offline + and came back online. You instead need to use one of the barrier functions: o call_rcu() -> rcu_barrier() - o call_rcu_bh() -> rcu_barrier() - o call_rcu_sched() -> rcu_barrier() o call_srcu() -> srcu_barrier() However, these barrier functions are absolutely -not- guaranteed diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt index 721b3e4265155354137e199bbaadd82b9e2e4221..c818cf65c5a9a0068d87f207cad1dccd86603b2c 100644 --- a/Documentation/RCU/rcu.txt +++ b/Documentation/RCU/rcu.txt @@ -52,10 +52,10 @@ o If I am running on a uniprocessor kernel, which can only do one o How can I see where RCU is currently used in the Linux kernel? Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu", - "rcu_read_lock_bh", "rcu_read_unlock_bh", "call_rcu_bh", - "srcu_read_lock", "srcu_read_unlock", "synchronize_rcu", - "synchronize_net", "synchronize_srcu", and the other RCU - primitives. Or grab one of the cscope databases from: + "rcu_read_lock_bh", "rcu_read_unlock_bh", "srcu_read_lock", + "srcu_read_unlock", "synchronize_rcu", "synchronize_net", + "synchronize_srcu", and the other RCU primitives. Or grab one + of the cscope databases from: http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt index ab96227bad42663b749df40b17156bf915102fd7..bf699e8cfc75ca18fe2ee94472ee46e6d2a6a9c8 100644 --- a/Documentation/RCU/rcu_dereference.txt +++ b/Documentation/RCU/rcu_dereference.txt @@ -351,3 +351,106 @@ garbage values. In short, rcu_dereference() is -not- optional when you are going to dereference the resulting pointer. + + +WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE? + +First, please avoid using rcu_dereference_raw() and also please avoid +using rcu_dereference_check() and rcu_dereference_protected() with a +second argument with a constant value of 1 (or true, for that matter). +With that caution out of the way, here is some guidance for which +member of the rcu_dereference() family to use in various situations: + +1. If the access needs to be within an RCU read-side critical + section, use rcu_dereference(). With the new consolidated + RCU flavors, an RCU read-side critical section is entered + using rcu_read_lock(), anything that disables bottom halves, + anything that disables interrupts, or anything that disables + preemption. + +2. If the access might be within an RCU read-side critical section + on the one hand, or protected by (say) my_lock on the other, + use rcu_dereference_check(), for example: + + p1 = rcu_dereference_check(p->rcu_protected_pointer, + lockdep_is_held(&my_lock)); + + +3. If the access might be within an RCU read-side critical section + on the one hand, or protected by either my_lock or your_lock on + the other, again use rcu_dereference_check(), for example: + + p1 = rcu_dereference_check(p->rcu_protected_pointer, + lockdep_is_held(&my_lock) || + lockdep_is_held(&your_lock)); + +4. 
If the access is on the update side, so that it is always protected + by my_lock, use rcu_dereference_protected(): + + p1 = rcu_dereference_protected(p->rcu_protected_pointer, + lockdep_is_held(&my_lock)); + + This can be extended to handle multiple locks as in #3 above, + and both can be extended to check other conditions as well. + +5. If the protection is supplied by the caller, and is thus unknown + to this code, that is the rare case when rcu_dereference_raw() + is appropriate. In addition, rcu_dereference_raw() might be + appropriate when the lockdep expression would be excessively + complex, except that a better approach in that case might be to + take a long hard look at your synchronization design. Still, + there are data-locking cases where any one of a very large number + of locks or reference counters suffices to protect the pointer, + so rcu_dereference_raw() does have its place. + + However, its place is probably quite a bit smaller than one + might expect given the number of uses in the current kernel. + Ditto for its synonym, rcu_dereference_check( ... , 1), and + its close relative, rcu_dereference_protected(... , 1). + + +SPARSE CHECKING OF RCU-PROTECTED POINTERS + +The sparse static-analysis tool checks for direct access to RCU-protected +pointers, which can result in "interesting" bugs due to compiler +optimizations involving invented loads and perhaps also load tearing. +For example, suppose someone mistakenly does something like this: + + p = q->rcu_protected_pointer; + do_something_with(p->a); + do_something_else_with(p->b); + +If register pressure is high, the compiler might optimize "p" out +of existence, transforming the code to something like this: + + do_something_with(q->rcu_protected_pointer->a); + do_something_else_with(q->rcu_protected_pointer->b); + +This could fatally disappoint your code if q->rcu_protected_pointer +changed in the meantime. Nor is this a theoretical problem: Exactly +this sort of bug cost Paul E. McKenney (and several of his innocent +colleagues) a three-day weekend back in the early 1990s. + +Load tearing could of course result in dereferencing a mashup of a pair +of pointers, which also might fatally disappoint your code. + +These problems could have been avoided simply by making the code instead +read as follows: + + p = rcu_dereference(q->rcu_protected_pointer); + do_something_with(p->a); + do_something_else_with(p->b); + +Unfortunately, these sorts of bugs can be extremely hard to spot during +review. This is where the sparse tool comes into play, along with the +"__rcu" marker. If you mark a pointer declaration, whether in a structure +or as a formal parameter, with "__rcu", then sparse will complain if +this pointer is accessed directly. It will also cause sparse to complain +if a pointer not marked with "__rcu" is accessed using rcu_dereference() +and friends. For example, ->rcu_protected_pointer might be declared as +follows: + + struct foo __rcu *rcu_protected_pointer; + +Use of "__rcu" is opt-in. If you choose not to use it, then you should +ignore the sparse warnings. diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt index 5d7759071a3edbb3ef818e0d41a7081f39dc54ea..a2782df697328e3293769b429b5321c82e0e0b16 100644 --- a/Documentation/RCU/rcubarrier.txt +++ b/Documentation/RCU/rcubarrier.txt @@ -83,16 +83,15 @@ Pseudo-code using rcu_barrier() is as follows: 2. Execute rcu_barrier(). 3. Allow the module to be unloaded. 
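To make the above pseudo-code concrete, here is a minimal sketch of such an unload sequence; the flag, structure, and function names (mymod_exiting, struct mymod_obj, mymod_rcu_cb()) are hypothetical, not part of this patch:

	struct mymod_obj {
		struct rcu_head rh;
		/* ... payload ... */
	};

	static bool mymod_exiting;	/* Hypothetical: checked before each call_rcu(). */

	static void mymod_rcu_cb(struct rcu_head *rhp)
	{
		struct mymod_obj *obj = container_of(rhp, struct mymod_obj, rh);

		kfree(obj);
	}

	static void __exit mymod_exit(void)
	{
		/* 1. Prevent any new RCU callbacks from being posted. */
		WRITE_ONCE(mymod_exiting, true);

		/* 2. Wait for all previously posted callbacks to be invoked. */
		rcu_barrier();

		/* 3. mymod_rcu_cb()'s code may now safely be unloaded. */
	}
	module_exit(mymod_exit);

If the module also uses call_srcu(), an srcu_barrier() on the matching srcu_struct belongs in step 2 as well, as the text below explains.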
-There are also rcu_barrier_bh(), rcu_barrier_sched(), and srcu_barrier() -functions for the other flavors of RCU, and you of course must match -the flavor of rcu_barrier() with that of call_rcu(). If your module -uses multiple flavors of call_rcu(), then it must also use multiple +There is also an srcu_barrier() function for SRCU, and you of course +must match the flavor of rcu_barrier() with that of call_rcu(). If your +module uses multiple flavors of call_rcu(), then it must also use multiple flavors of rcu_barrier() when unloading that module. For example, if -it uses call_rcu_bh(), call_srcu() on srcu_struct_1, and call_srcu() on +it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on srcu_struct_2(), then the following three lines of code will be required when unloading: - 1 rcu_barrier_bh(); + 1 rcu_barrier(); 2 srcu_barrier(&srcu_struct_1); 3 srcu_barrier(&srcu_struct_2); @@ -185,12 +184,12 @@ module invokes call_rcu() from timers, you will need to first cancel all the timers, and only then invoke rcu_barrier() to wait for any remaining RCU callbacks to complete. -Of course, if you module uses call_rcu_bh(), you will need to invoke -rcu_barrier_bh() before unloading. Similarly, if your module uses -call_rcu_sched(), you will need to invoke rcu_barrier_sched() before -unloading. If your module uses call_rcu(), call_rcu_bh(), -and- -call_rcu_sched(), then you will need to invoke each of rcu_barrier(), -rcu_barrier_bh(), and rcu_barrier_sched(). +Of course, if your module uses call_rcu(), you will need to invoke +rcu_barrier() before unloading. Similarly, if your module uses +call_srcu(), you will need to invoke srcu_barrier() before unloading, +and on the same srcu_struct structure. If your module uses call_rcu() +-and- call_srcu(), then you will need to invoke rcu_barrier() -and- +srcu_barrier(). Implementing rcu_barrier() @@ -223,8 +222,8 @@ shown below. Note that the final "1" in on_each_cpu()'s argument list ensures that all the calls to rcu_barrier_func() will have completed before on_each_cpu() returns. Line 9 then waits for the completion. -This code was rewritten in 2008 to support rcu_barrier_bh() and -rcu_barrier_sched() in addition to the original rcu_barrier(). +This code was rewritten in 2008 and several times thereafter, but this +still gives the general idea. The rcu_barrier_func() runs on each CPU, where it invokes call_rcu() to post an RCU callback, as follows: diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 1ace20815bb1c97290f12fa317ec65b97b3220b4..981651a8b65d206bb073f40c982d3fc5314c4d70 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -310,7 +310,7 @@ reader, updater, and reclaimer. rcu_assign_pointer() - +--------+ + +--------+ +---------------------->| reader |---------+ | +--------+ | | | | @@ -318,12 +318,12 @@ reader, updater, and reclaimer. 
| | | rcu_read_lock() | | | rcu_read_unlock() | rcu_dereference() | | - +---------+ | | - | updater |<---------------------+ | - +---------+ V + +---------+ | | + | updater |<----------------+ | + +---------+ V | +-----------+ +----------------------------------->| reclaimer | - +-----------+ + +-----------+ Defer: synchronize_rcu() & call_rcu() diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt index b8ca28b60215a48f1ee99cfae36f20e0b8d0e8da..7e71c9c1d8e9c7eee70ef957610b483a58739232 100644 --- a/Documentation/accounting/psi.txt +++ b/Documentation/accounting/psi.txt @@ -56,12 +56,12 @@ situation from a state where some tasks are stalled but the CPU is still doing productive work. As such, time spent in this subset of the stall state is tracked separately and exported in the "full" averages. -The ratios are tracked as recent trends over ten, sixty, and three -hundred second windows, which gives insight into short term events as -well as medium and long term trends. The total absolute stall time is -tracked and exported as well, to allow detection of latency spikes -which wouldn't necessarily make a dent in the time averages, or to -average trends over custom time frames. +The ratios (in %) are tracked as recent trends over ten, sixty, and +three hundred second windows, which gives insight into short term events +as well as medium and long term trends. The total absolute stall time +(in us) is tracked and exported as well, to allow detection of latency +spikes which wouldn't necessarily make a dent in the time averages, +or to average trends over custom time frames. Cgroup2 interface ================= diff --git a/Documentation/acpi/aml-debugger.txt b/Documentation/acpi/aml-debugger.txt deleted file mode 100644 index 75ebeb64ab29a8806776f41901859495a1c52d64..0000000000000000000000000000000000000000 --- a/Documentation/acpi/aml-debugger.txt +++ /dev/null @@ -1,66 +0,0 @@ -The AML Debugger - -Copyright (C) 2016, Intel Corporation -Author: Lv Zheng - - -This document describes the usage of the AML debugger embedded in the Linux -kernel. - -1. Build the debugger - - The following kernel configuration items are required to enable the AML - debugger interface from the Linux kernel: - - CONFIG_ACPI_DEBUGGER=y - CONFIG_ACPI_DEBUGGER_USER=m - - The userspace utilities can be built from the kernel source tree using - the following commands: - - $ cd tools - $ make acpi - - The resultant userspace tool binary is then located at: - - tools/power/acpi/acpidbg - - It can be installed to system directories by running "make install" (as a - sufficiently privileged user). - -2. Start the userspace debugger interface - - After booting the kernel with the debugger built-in, the debugger can be - started by using the following commands: - - # mount -t debugfs none /sys/kernel/debug - # modprobe acpi_dbg - # tools/power/acpi/acpidbg - - That spawns the interactive AML debugger environment where you can execute - debugger commands. - - The commands are documented in the "ACPICA Overview and Programmer Reference" - that can be downloaded from - - https://acpica.org/documentation - - The detailed debugger commands reference is located in Chapter 12 "ACPICA - Debugger Reference". The "help" command can be used for a quick reference. - -3. Stop the userspace debugger interface - - The interactive debugger interface can be closed by pressing Ctrl+C or using - the "quit" or "exit" commands. 
When finished, unload the module with: - - # rmmod acpi_dbg - - The module unloading may fail if there is an acpidbg instance running. - -4. Run the debugger in a script - - It may be useful to run the AML debugger in a test script. "acpidbg" supports - this in a special "batch" mode. For example, the following command outputs - the entire ACPI namespace: - - # acpidbg -b "namespace" diff --git a/Documentation/acpi/apei/output_format.txt b/Documentation/acpi/apei/output_format.txt deleted file mode 100644 index 0c49c197c47a4c744e76e121b80893ca2711e92d..0000000000000000000000000000000000000000 --- a/Documentation/acpi/apei/output_format.txt +++ /dev/null @@ -1,147 +0,0 @@ - APEI output format - ~~~~~~~~~~~~~~~~~~ - -APEI uses printk as hardware error reporting interface, the output -format is as follow. - - := -APEI generic hardware error status -severity: , -section: , severity: , -flags: -

-fru_id: -fru_text: -section_type:
-
- -* := recoverable | fatal | corrected | info - -
# := -[primary][, containment warning][, reset][, threshold exceeded]\ -[, resource not accessible][, latent error] - -
:= generic processor error | memory error | \ -PCIe error | unknown, - -
:= - | | \ - | - - := -[processor_type: , ] -[processor_isa: , ] -[error_type: -] -[operation: , ] -[flags: -] -[level: ] -[version_info: ] -[processor_id: ] -[target_address: ] -[requestor_id: ] -[responder_id: ] -[IP: ] - -* := IA32/X64 | IA64 - -* := IA32 | IA64 | X64 - -# := -[cache error][, TLB error][, bus error][, micro-architectural error] - -* := unknown or generic | data read | data write | \ -instruction execution - -# := -[restartable][, precise IP][, overflow][, corrected] - - := -[error_status: ] -[physical_address: ] -[physical_address_mask: ] -[node: ] -[card: ] -[module: ] -[bank: ] -[device: ] -[row: ] -[column: ] -[bit_position: ] -[requestor_id: ] -[responder_id: ] -[target_id: ] -[error_type: , ] - -* := -unknown | no error | single-bit ECC | multi-bit ECC | \ -single-symbol chipkill ECC | multi-symbol chipkill ECC | master abort | \ -target abort | parity error | watchdog timeout | invalid address | \ -mirror Broken | memory sparing | scrub corrected error | \ -scrub uncorrected error - - := -[port_type: , ] -[version: .] -[command: , status: ] -[device_id: ::. -slot: -secondary_bus: -vendor_id: , device_id: -class_code: ] -[serial number: , ] -[bridge: secondary_status: , control: ] -[aer_status: , aer_mask: - -[aer_uncor_severity: ] -aer_layer=, aer_agent= -aer_tlp_header: ] - -* := PCIe end point | legacy PCI end point | \ -unknown | unknown | root port | upstream switch port | \ -downstream switch port | PCIe to PCI/PCI-X bridge | \ -PCI/PCI-X to PCIe bridge | root complex integrated endpoint device | \ -root complex event collector - -if section severity is fatal or recoverable -# := -unknown | unknown | unknown | unknown | Data Link Protocol | \ -unknown | unknown | unknown | unknown | unknown | unknown | unknown | \ -Poisoned TLP | Flow Control Protocol | Completion Timeout | \ -Completer Abort | Unexpected Completion | Receiver Overflow | \ -Malformed TLP | ECRC | Unsupported Request -else -# := -Receiver Error | unknown | unknown | unknown | unknown | unknown | \ -Bad TLP | Bad DLLP | RELAY_NUM Rollover | unknown | unknown | unknown | \ -Replay Timer Timeout | Advisory Non-Fatal -fi - - := -Physical Layer | Data Link Layer | Transaction Layer - - := -Receiver ID | Requester ID | Completer ID | Transmitter ID - -Where, [] designate corresponding content is optional - -All description with * has the following format: - -field: , - -Where value of should be the position of "string" in description. Otherwise, will be "unknown". - -All description with # has the following format: - -field: - - -Where each string in corresponding to one set bit of -. The bit position is the position of "string" in description. - -For more detailed explanation of every field, please refer to UEFI -specification version 2.3 or later, section Appendix N: Common -Platform Error Record. diff --git a/Documentation/acpi/i2c-muxes.txt b/Documentation/acpi/i2c-muxes.txt deleted file mode 100644 index 9fcc4f0b885e15a1f46e76f6bd630c609af71be5..0000000000000000000000000000000000000000 --- a/Documentation/acpi/i2c-muxes.txt +++ /dev/null @@ -1,58 +0,0 @@ -ACPI I2C Muxes --------------- - -Describing an I2C device hierarchy that includes I2C muxes requires an ACPI -Device () scope per mux channel. - -Consider this topology: - -+------+ +------+ -| SMB1 |-->| MUX0 |--CH00--> i2c client A (0x50) -| | | 0x70 |--CH01--> i2c client B (0x50) -+------+ +------+ - -which corresponds to the following ASL: - -Device (SMB1) -{ - Name (_HID, ...) - Device (MUX0) - { - Name (_HID, ...) 
- Name (_CRS, ResourceTemplate () { - I2cSerialBus (0x70, ControllerInitiated, I2C_SPEED, - AddressingMode7Bit, "^SMB1", 0x00, - ResourceConsumer,,) - } - - Device (CH00) - { - Name (_ADR, 0) - - Device (CLIA) - { - Name (_HID, ...) - Name (_CRS, ResourceTemplate () { - I2cSerialBus (0x50, ControllerInitiated, I2C_SPEED, - AddressingMode7Bit, "^CH00", 0x00, - ResourceConsumer,,) - } - } - } - - Device (CH01) - { - Name (_ADR, 1) - - Device (CLIB) - { - Name (_HID, ...) - Name (_CRS, ResourceTemplate () { - I2cSerialBus (0x50, ControllerInitiated, I2C_SPEED, - AddressingMode7Bit, "^CH01", 0x00, - ResourceConsumer,,) - } - } - } - } -} diff --git a/Documentation/acpi/initrd_table_override.txt b/Documentation/acpi/initrd_table_override.txt deleted file mode 100644 index 30437a6db373a1cb449d15c1b98f2552786ebcc6..0000000000000000000000000000000000000000 --- a/Documentation/acpi/initrd_table_override.txt +++ /dev/null @@ -1,111 +0,0 @@ -Upgrading ACPI tables via initrd -================================ - -1) Introduction (What is this about) -2) What is this for -3) How does it work -4) References (Where to retrieve userspace tools) - -1) What is this about ---------------------- - -If the ACPI_TABLE_UPGRADE compile option is true, it is possible to -upgrade the ACPI execution environment that is defined by the ACPI tables -via upgrading the ACPI tables provided by the BIOS with an instrumented, -modified, more recent version one, or installing brand new ACPI tables. - -When building initrd with kernel in a single image, option -ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD should also be true for this -feature to work. - -For a full list of ACPI tables that can be upgraded/installed, take a look -at the char *table_sigs[MAX_ACPI_SIGNATURE]; definition in -drivers/acpi/tables.c. -All ACPI tables iasl (Intel's ACPI compiler and disassembler) knows should -be overridable, except: - - ACPI_SIG_RSDP (has a signature of 6 bytes) - - ACPI_SIG_FACS (does not have an ordinary ACPI table header) -Both could get implemented as well. - - -2) What is this for -------------------- - -Complain to your platform/BIOS vendor if you find a bug which is so severe -that a workaround is not accepted in the Linux kernel. And this facility -allows you to upgrade the buggy tables before your platform/BIOS vendor -releases an upgraded BIOS binary. - -This facility can be used by platform/BIOS vendors to provide a Linux -compatible environment without modifying the underlying platform firmware. - -This facility also provides a powerful feature to easily debug and test -ACPI BIOS table compatibility with the Linux kernel by modifying old -platform provided ACPI tables or inserting new ACPI tables. - -It can and should be enabled in any kernel because there is no functional -change with not instrumented initrds. - - -3) How does it work -------------------- - -# Extract the machine's ACPI tables: -cd /tmp -acpidump >acpidump -acpixtract -a acpidump -# Disassemble, modify and recompile them: -iasl -d *.dat -# For example add this statement into a _PRT (PCI Routing Table) function -# of the DSDT: -Store("HELLO WORLD", debug) -# And increase the OEM Revision. For example, before modification: -DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000000) -# After modification: -DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000001) -iasl -sa dsdt.dsl -# Add the raw ACPI tables to an uncompressed cpio archive. -# They must be put into a /kernel/firmware/acpi directory inside the cpio -# archive. 
Note that if the table put here matches a platform table -# (similar Table Signature, and similar OEMID, and similar OEM Table ID) -# with a more recent OEM Revision, the platform table will be upgraded by -# this table. If the table put here doesn't match a platform table -# (dissimilar Table Signature, or dissimilar OEMID, or dissimilar OEM Table -# ID), this table will be appended. -mkdir -p kernel/firmware/acpi -cp dsdt.aml kernel/firmware/acpi -# A maximum of "NR_ACPI_INITRD_TABLES (64)" tables are currently allowed -# (see osl.c): -iasl -sa facp.dsl -iasl -sa ssdt1.dsl -cp facp.aml kernel/firmware/acpi -cp ssdt1.aml kernel/firmware/acpi -# The uncompressed cpio archive must be the first. Other, typically -# compressed cpio archives, must be concatenated on top of the uncompressed -# one. Following command creates the uncompressed cpio archive and -# concatenates the original initrd on top: -find kernel | cpio -H newc --create > /boot/instrumented_initrd -cat /boot/initrd >>/boot/instrumented_initrd -# reboot with increased acpi debug level, e.g. boot params: -acpi.debug_level=0x2 acpi.debug_layer=0xFFFFFFFF -# and check your syslog: -[ 1.268089] ACPI: PCI Interrupt Routing Table [\_SB_.PCI0._PRT] -[ 1.272091] [ACPI Debug] String [0x0B] "HELLO WORLD" - -iasl is able to disassemble and recompile quite a lot different, -also static ACPI tables. - - -4) Where to retrieve userspace tools ------------------------------------- - -iasl and acpixtract are part of Intel's ACPICA project: -http://acpica.org/ -and should be packaged by distributions (for example in the acpica package -on SUSE). - -acpidump can be found in Len Browns pmtools: -ftp://kernel.org/pub/linux/kernel/people/lenb/acpi/utils/pmtools/acpidump -This tool is also part of the acpica package on SUSE. -Alternatively, used ACPI tables can be retrieved via sysfs in latest kernels: -/sys/firmware/acpi/tables diff --git a/Documentation/acpi/method-customizing.txt b/Documentation/acpi/method-customizing.txt deleted file mode 100644 index 7235da975f23b4e916353710b6480ec65716175f..0000000000000000000000000000000000000000 --- a/Documentation/acpi/method-customizing.txt +++ /dev/null @@ -1,73 +0,0 @@ -Linux ACPI Custom Control Method How To -======================================= - -Written by Zhang Rui - - -Linux supports customizing ACPI control methods at runtime. - -Users can use this to -1. override an existing method which may not work correctly, - or just for debugging purposes. -2. insert a completely new method in order to create a missing - method such as _OFF, _ON, _STA, _INI, etc. -For these cases, it is far simpler to dynamically install a single -control method rather than override the entire DSDT, because kernel -rebuild/reboot is not needed and test result can be got in minutes. - -Note: Only ACPI METHOD can be overridden, any other object types like - "Device", "OperationRegion", are not recognized. Methods - declared inside scope operators are also not supported. -Note: The same ACPI control method can be overridden for many times, - and it's always the latest one that used by Linux/kernel. -Note: To get the ACPI debug object output (Store (AAAA, Debug)), - please run "echo 1 > /sys/module/acpi/parameters/aml_debug_output". - -1. override an existing method - a) get the ACPI table via ACPI sysfs I/F. e.g. to get the DSDT, - just run "cat /sys/firmware/acpi/tables/DSDT > /tmp/dsdt.dat" - b) disassemble the table by running "iasl -d dsdt.dat". 
- c) rewrite the ASL code of the method and save it in a new file, - d) package the new file (psr.asl) to an ACPI table format. - Here is an example of a customized \_SB._AC._PSR method, - - DefinitionBlock ("", "SSDT", 1, "", "", 0x20080715) - { - Method (\_SB_.AC._PSR, 0, NotSerialized) - { - Store ("In AC _PSR", Debug) - Return (ACON) - } - } - Note that the full pathname of the method in ACPI namespace - should be used. - e) assemble the file to generate the AML code of the method. - e.g. "iasl -vw 6084 psr.asl" (psr.aml is generated as a result) - If parameter "-vw 6084" is not supported by your iASL compiler, - please try a newer version. - f) mount debugfs by "mount -t debugfs none /sys/kernel/debug" - g) override the old method via the debugfs by running - "cat /tmp/psr.aml > /sys/kernel/debug/acpi/custom_method" - -2. insert a new method - This is easier than overriding an existing method. - We just need to create the ASL code of the method we want to - insert and then follow the step c) ~ g) in section 1. - -3. undo your changes - The "undo" operation is not supported for a new inserted method - right now, i.e. we can not remove a method currently. - For an overridden method, in order to undo your changes, please - save a copy of the method original ASL code in step c) section 1, - and redo step c) ~ g) to override the method with the original one. - - -Note: We can use a kernel with multiple custom ACPI method running, - But each individual write to debugfs can implement a SINGLE - method override. i.e. if we want to insert/override multiple - ACPI methods, we need to redo step c) ~ g) for multiple times. - -Note: Be aware that root can mis-use this driver to modify arbitrary - memory and gain additional rights, if root's privileges got - restricted (for example if root is not allowed to load additional - modules after boot). diff --git a/Documentation/acpi/method-tracing.txt b/Documentation/acpi/method-tracing.txt deleted file mode 100644 index 0aba14c8f459353a9d7efbbfda3f9f32a1bb3bbb..0000000000000000000000000000000000000000 --- a/Documentation/acpi/method-tracing.txt +++ /dev/null @@ -1,192 +0,0 @@ -ACPICA Trace Facility - -Copyright (C) 2015, Intel Corporation -Author: Lv Zheng - - -Abstract: - -This document describes the functions and the interfaces of the method -tracing facility. - -1. Functionalities and usage examples: - - ACPICA provides method tracing capability. And two functions are - currently implemented using this capability. - - A. Log reducer - ACPICA subsystem provides debugging outputs when CONFIG_ACPI_DEBUG is - enabled. The debugging messages which are deployed via - ACPI_DEBUG_PRINT() macro can be reduced at 2 levels - per-component - level (known as debug layer, configured via - /sys/module/acpi/parameters/debug_layer) and per-type level (known as - debug level, configured via /sys/module/acpi/parameters/debug_level). - - But when the particular layer/level is applied to the control method - evaluations, the quantity of the debugging outputs may still be too - large to be put into the kernel log buffer. The idea thus is worked out - to only enable the particular debug layer/level (normally more detailed) - logs when the control method evaluation is started, and disable the - detailed logging when the control method evaluation is stopped. - - The following command examples illustrate the usage of the "log reducer" - functionality: - a. 
Filter out the debug layer/level matched logs when control methods - are being evaluated: - # cd /sys/module/acpi/parameters - # echo "0xXXXXXXXX" > trace_debug_layer - # echo "0xYYYYYYYY" > trace_debug_level - # echo "enable" > trace_state - b. Filter out the debug layer/level matched logs when the specified - control method is being evaluated: - # cd /sys/module/acpi/parameters - # echo "0xXXXXXXXX" > trace_debug_layer - # echo "0xYYYYYYYY" > trace_debug_level - # echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name - # echo "method" > /sys/module/acpi/parameters/trace_state - c. Filter out the debug layer/level matched logs when the specified - control method is being evaluated for the first time: - # cd /sys/module/acpi/parameters - # echo "0xXXXXXXXX" > trace_debug_layer - # echo "0xYYYYYYYY" > trace_debug_level - # echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name - # echo "method-once" > /sys/module/acpi/parameters/trace_state - Where: - 0xXXXXXXXX/0xYYYYYYYY: Refer to Documentation/acpi/debug.txt for - possible debug layer/level masking values. - \PPPP.AAAA.TTTT.HHHH: Full path of a control method that can be found - in the ACPI namespace. It needn't be an entry - of a control method evaluation. - - B. AML tracer - - There are special log entries added by the method tracing facility at - the "trace points" the AML interpreter starts/stops to execute a control - method, or an AML opcode. Note that the format of the log entries are - subject to change: - [ 0.186427] exdebug-0398 ex_trace_point : Method Begin [0xf58394d8:\_SB.PCI0.LPCB.ECOK] execution. - [ 0.186630] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905c88:If] execution. - [ 0.186820] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905cc0:LEqual] execution. - [ 0.187010] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905a20:-NamePath-] execution. - [ 0.187214] exdebug-0398 ex_trace_point : Opcode End [0xf5905a20:-NamePath-] execution. - [ 0.187407] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905f60:One] execution. - [ 0.187594] exdebug-0398 ex_trace_point : Opcode End [0xf5905f60:One] execution. - [ 0.187789] exdebug-0398 ex_trace_point : Opcode End [0xf5905cc0:LEqual] execution. - [ 0.187980] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905cc0:Return] execution. - [ 0.188146] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905f60:One] execution. - [ 0.188334] exdebug-0398 ex_trace_point : Opcode End [0xf5905f60:One] execution. - [ 0.188524] exdebug-0398 ex_trace_point : Opcode End [0xf5905cc0:Return] execution. - [ 0.188712] exdebug-0398 ex_trace_point : Opcode End [0xf5905c88:If] execution. - [ 0.188903] exdebug-0398 ex_trace_point : Method End [0xf58394d8:\_SB.PCI0.LPCB.ECOK] execution. - - Developers can utilize these special log entries to track the AML - interpretion, thus can aid issue debugging and performance tuning. Note - that, as the "AML tracer" logs are implemented via ACPI_DEBUG_PRINT() - macro, CONFIG_ACPI_DEBUG is also required to be enabled for enabling - "AML tracer" logs. - - The following command examples illustrate the usage of the "AML tracer" - functionality: - a. Filter out the method start/stop "AML tracer" logs when control - methods are being evaluated: - # cd /sys/module/acpi/parameters - # echo "0x80" > trace_debug_layer - # echo "0x10" > trace_debug_level - # echo "enable" > trace_state - b. 
Filter out the method start/stop "AML tracer" when the specified - control method is being evaluated: - # cd /sys/module/acpi/parameters - # echo "0x80" > trace_debug_layer - # echo "0x10" > trace_debug_level - # echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name - # echo "method" > trace_state - c. Filter out the method start/stop "AML tracer" logs when the specified - control method is being evaluated for the first time: - # cd /sys/module/acpi/parameters - # echo "0x80" > trace_debug_layer - # echo "0x10" > trace_debug_level - # echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name - # echo "method-once" > trace_state - d. Filter out the method/opcode start/stop "AML tracer" when the - specified control method is being evaluated: - # cd /sys/module/acpi/parameters - # echo "0x80" > trace_debug_layer - # echo "0x10" > trace_debug_level - # echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name - # echo "opcode" > trace_state - e. Filter out the method/opcode start/stop "AML tracer" when the - specified control method is being evaluated for the first time: - # cd /sys/module/acpi/parameters - # echo "0x80" > trace_debug_layer - # echo "0x10" > trace_debug_level - # echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name - # echo "opcode-opcode" > trace_state - - Note that all above method tracing facility related module parameters can - be used as the boot parameters, for example: - acpi.trace_debug_layer=0x80 acpi.trace_debug_level=0x10 \ - acpi.trace_method_name=\_SB.LID0._LID acpi.trace_state=opcode-once - -2. Interface descriptions: - - All method tracing functions can be configured via ACPI module - parameters that are accessible at /sys/module/acpi/parameters/: - - trace_method_name - The full path of the AML method that the user wants to trace. - Note that the full path shouldn't contain the trailing "_"s in its - name segments but may contain "\" to form an absolute path. - - trace_debug_layer - The temporary debug_layer used when the tracing feature is enabled. - Using ACPI_EXECUTER (0x80) by default, which is the debug_layer - used to match all "AML tracer" logs. - - trace_debug_level - The temporary debug_level used when the tracing feature is enabled. - Using ACPI_LV_TRACE_POINT (0x10) by default, which is the - debug_level used to match all "AML tracer" logs. - - trace_state - The status of the tracing feature. - Users can enable/disable this debug tracing feature by executing - the following command: - # echo string > /sys/module/acpi/parameters/trace_state - Where "string" should be one of the following: - "disable" - Disable the method tracing feature. - "enable" - Enable the method tracing feature. - ACPICA debugging messages matching - "trace_debug_layer/trace_debug_level" during any method - execution will be logged. - "method" - Enable the method tracing feature. - ACPICA debugging messages matching - "trace_debug_layer/trace_debug_level" during method execution - of "trace_method_name" will be logged. - "method-once" - Enable the method tracing feature. - ACPICA debugging messages matching - "trace_debug_layer/trace_debug_level" during method execution - of "trace_method_name" will be logged only once. - "opcode" - Enable the method tracing feature. - ACPICA debugging messages matching - "trace_debug_layer/trace_debug_level" during method/opcode - execution of "trace_method_name" will be logged. - "opcode-once" - Enable the method tracing feature. 
- ACPICA debugging messages matching - "trace_debug_layer/trace_debug_level" during method/opcode - execution of "trace_method_name" will be logged only once. - Note that, the difference between the "enable" and other feature - enabling options are: - 1. When "enable" is specified, since - "trace_debug_layer/trace_debug_level" shall apply to all control - method evaluations, after configuring "trace_state" to "enable", - "trace_method_name" will be reset to NULL. - 2. When "method/opcode" is specified, if - "trace_method_name" is NULL when "trace_state" is configured to - these options, the "trace_debug_layer/trace_debug_level" will - apply to all control method evaluations. diff --git a/Documentation/acpi/ssdt-overlays.txt b/Documentation/acpi/ssdt-overlays.txt deleted file mode 100644 index 5ae13f161ea2b98d924b5e688a8b75274778f0dd..0000000000000000000000000000000000000000 --- a/Documentation/acpi/ssdt-overlays.txt +++ /dev/null @@ -1,172 +0,0 @@ - -In order to support ACPI open-ended hardware configurations (e.g. development -boards) we need a way to augment the ACPI configuration provided by the firmware -image. A common example is connecting sensors on I2C / SPI buses on development -boards. - -Although this can be accomplished by creating a kernel platform driver or -recompiling the firmware image with updated ACPI tables, neither is practical: -the former proliferates board specific kernel code while the latter requires -access to firmware tools which are often not publicly available. - -Because ACPI supports external references in AML code a more practical -way to augment firmware ACPI configuration is by dynamically loading -user defined SSDT tables that contain the board specific information. - -For example, to enumerate a Bosch BMA222E accelerometer on the I2C bus of the -Minnowboard MAX development board exposed via the LSE connector [1], the -following ASL code can be used: - -DefinitionBlock ("minnowmax.aml", "SSDT", 1, "Vendor", "Accel", 0x00000003) -{ - External (\_SB.I2C6, DeviceObj) - - Scope (\_SB.I2C6) - { - Device (STAC) - { - Name (_ADR, Zero) - Name (_HID, "BMA222E") - - Method (_CRS, 0, Serialized) - { - Name (RBUF, ResourceTemplate () - { - I2cSerialBus (0x0018, ControllerInitiated, 0x00061A80, - AddressingMode7Bit, "\\_SB.I2C6", 0x00, - ResourceConsumer, ,) - GpioInt (Edge, ActiveHigh, Exclusive, PullDown, 0x0000, - "\\_SB.GPO2", 0x00, ResourceConsumer, , ) - { // Pin list - 0 - } - }) - Return (RBUF) - } - } - } -} - -which can then be compiled to AML binary format: - -$ iasl minnowmax.asl - -Intel ACPI Component Architecture -ASL Optimizing Compiler version 20140214-64 [Mar 29 2014] -Copyright (c) 2000 - 2014 Intel Corporation - -ASL Input: minnomax.asl - 30 lines, 614 bytes, 7 keywords -AML Output: minnowmax.aml - 165 bytes, 6 named objects, 1 executable opcodes - -[1] http://wiki.minnowboard.org/MinnowBoard_MAX#Low_Speed_Expansion_Connector_.28Top.29 - -The resulting AML code can then be loaded by the kernel using one of the methods -below. - -== Loading ACPI SSDTs from initrd == - -This option allows loading of user defined SSDTs from initrd and it is useful -when the system does not support EFI or when there is not enough EFI storage. - -It works in a similar way with initrd based ACPI tables override/upgrade: SSDT -aml code must be placed in the first, uncompressed, initrd under the -"kernel/firmware/acpi" path. Multiple files can be used and this will translate -in loading multiple tables. Only SSDT and OEM tables are allowed. 
See -initrd_table_override.txt for more details. - -Here is an example: - -# Add the raw ACPI tables to an uncompressed cpio archive. -# They must be put into a /kernel/firmware/acpi directory inside the -# cpio archive. -# The uncompressed cpio archive must be the first. -# Other, typically compressed cpio archives, must be -# concatenated on top of the uncompressed one. -mkdir -p kernel/firmware/acpi -cp ssdt.aml kernel/firmware/acpi - -# Create the uncompressed cpio archive and concatenate the original initrd -# on top: -find kernel | cpio -H newc --create > /boot/instrumented_initrd -cat /boot/initrd >>/boot/instrumented_initrd - -== Loading ACPI SSDTs from EFI variables == - -This is the preferred method, when EFI is supported on the platform, because it -allows a persistent, OS independent way of storing the user defined SSDTs. There -is also work underway to implement EFI support for loading user defined SSDTs -and using this method will make it easier to convert to the EFI loading -mechanism when that will arrive. - -In order to load SSDTs from an EFI variable the efivar_ssdt kernel command line -parameter can be used. The argument for the option is the variable name to -use. If there are multiple variables with the same name but with different -vendor GUIDs, all of them will be loaded. - -In order to store the AML code in an EFI variable the efivarfs filesystem can be -used. It is enabled and mounted by default in /sys/firmware/efi/efivars in all -recent distribution. - -Creating a new file in /sys/firmware/efi/efivars will automatically create a new -EFI variable. Updating a file in /sys/firmware/efi/efivars will update the EFI -variable. Please note that the file name needs to be specially formatted as -"Name-GUID" and that the first 4 bytes in the file (little-endian format) -represent the attributes of the EFI variable (see EFI_VARIABLE_MASK in -include/linux/efi.h). Writing to the file must also be done with one write -operation. - -For example, you can use the following bash script to create/update an EFI -variable with the content from a given file: - -#!/bin/sh -e - -while ! [ -z "$1" ]; do - case "$1" in - "-f") filename="$2"; shift;; - "-g") guid="$2"; shift;; - *) name="$1";; - esac - shift -done - -usage() -{ - echo "Syntax: ${0##*/} -f filename [ -g guid ] name" - exit 1 -} - -[ -n "$name" -a -f "$filename" ] || usage - -EFIVARFS="/sys/firmware/efi/efivars" - -[ -d "$EFIVARFS" ] || exit 2 - -if stat -tf $EFIVARFS | grep -q -v de5e81e4; then - mount -t efivarfs none $EFIVARFS -fi - -# try to pick up an existing GUID -[ -n "$guid" ] || guid=$(find "$EFIVARFS" -name "$name-*" | head -n1 | cut -f2- -d-) - -# use a randomly generated GUID -[ -n "$guid" ] || guid="$(cat /proc/sys/kernel/random/uuid)" - -# efivarfs expects all of the data in one write -tmp=$(mktemp) -/bin/echo -ne "\007\000\000\000" | cat - $filename > $tmp -dd if=$tmp of="$EFIVARFS/$name-$guid" bs=$(stat -c %s $tmp) -rm $tmp - -== Loading ACPI SSDTs from configfs == - -This option allows loading of user defined SSDTs from userspace via the configfs -interface. The CONFIG_ACPI_CONFIGFS option must be select and configfs must be -mounted. In the following examples, we assume that configfs has been mounted in -/config. 
- -New tables can be loading by creating new directories in /config/acpi/table/ and -writing the SSDT aml code in the aml attribute: - -cd /config/acpi/table -mkdir my_ssdt -cat ~/ssdt.aml > my_ssdt/aml diff --git a/Documentation/acpi/cppc_sysfs.txt b/Documentation/admin-guide/acpi/cppc_sysfs.rst similarity index 51% rename from Documentation/acpi/cppc_sysfs.txt rename to Documentation/admin-guide/acpi/cppc_sysfs.rst index f20fb445135d31953db86edccacca29c519fbf69..a4b99afbe331b3e71e75c7e22b70da346423d5e2 100644 --- a/Documentation/acpi/cppc_sysfs.txt +++ b/Documentation/admin-guide/acpi/cppc_sysfs.rst @@ -1,5 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 - Collaborative Processor Performance Control (CPPC) +================================================== +Collaborative Processor Performance Control (CPPC) +================================================== + +CPPC +==== CPPC defined in the ACPI spec describes a mechanism for the OS to manage the performance of a logical processor on a contigious and abstract performance @@ -10,31 +16,28 @@ For more details on CPPC please refer to the ACPI specification at: http://uefi.org/specifications -Some of the CPPC registers are exposed via sysfs under: - -/sys/devices/system/cpu/cpuX/acpi_cppc/ - -for each cpu X +Some of the CPPC registers are exposed via sysfs under:: --------------------------------------------------------------------------------- + /sys/devices/system/cpu/cpuX/acpi_cppc/ -$ ls -lR /sys/devices/system/cpu/cpu0/acpi_cppc/ -/sys/devices/system/cpu/cpu0/acpi_cppc/: -total 0 --r--r--r-- 1 root root 65536 Mar 5 19:38 feedback_ctrs --r--r--r-- 1 root root 65536 Mar 5 19:38 highest_perf --r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_freq --r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_nonlinear_perf --r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_perf --r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_freq --r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_perf --r--r--r-- 1 root root 65536 Mar 5 19:38 reference_perf --r--r--r-- 1 root root 65536 Mar 5 19:38 wraparound_time +for each cpu X:: --------------------------------------------------------------------------------- + $ ls -lR /sys/devices/system/cpu/cpu0/acpi_cppc/ + /sys/devices/system/cpu/cpu0/acpi_cppc/: + total 0 + -r--r--r-- 1 root root 65536 Mar 5 19:38 feedback_ctrs + -r--r--r-- 1 root root 65536 Mar 5 19:38 highest_perf + -r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_freq + -r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_nonlinear_perf + -r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_perf + -r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_freq + -r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_perf + -r--r--r-- 1 root root 65536 Mar 5 19:38 reference_perf + -r--r--r-- 1 root root 65536 Mar 5 19:38 wraparound_time * highest_perf : Highest performance of this processor (abstract scale). -* nominal_perf : Highest sustained performance of this processor (abstract scale). +* nominal_perf : Highest sustained performance of this processor + (abstract scale). * lowest_nonlinear_perf : Lowest performance of this processor with nonlinear power savings (abstract scale). * lowest_perf : Lowest performance of this processor (abstract scale). @@ -48,22 +51,26 @@ total 0 * feedback_ctrs : Includes both Reference and delivered performance counter. Reference counter ticks up proportional to processor's reference performance. Delivered counter ticks up proportional to processor's delivered performance. 
-* wraparound_time: Minimum time for the feedback counters to wraparound (seconds).
+* wraparound_time: Minimum time for the feedback counters to wraparound
+  (seconds).
 * reference_perf : Performance level at which reference performance counter
 accumulates (abstract scale).
 
---------------------------------------------------------------------------------
-		Computing Average Delivered Performance
+Computing Average Delivered Performance
+=======================================
+
+The following describes the steps to compute the average performance delivered
+by taking two different snapshots of feedback counters at time T1 and T2.
+
+  T1: Read feedback_ctrs as fbc_t1
+      Wait or run some workload
 
-Below describes the steps to compute the average performance delivered by taking
-two different snapshots of feedback counters at time T1 and T2.
+  T2: Read feedback_ctrs as fbc_t2
 
-T1: Read feedback_ctrs as fbc_t1
-    Wait or run some workload
-T2: Read feedback_ctrs as fbc_t2
+::
 
-delivered_counter_delta = fbc_t2[del] - fbc_t1[del]
-reference_counter_delta = fbc_t2[ref] - fbc_t1[ref]
+  delivered_counter_delta = fbc_t2[del] - fbc_t1[del]
+  reference_counter_delta = fbc_t2[ref] - fbc_t1[ref]
 
-delivered_perf = (refernce_perf x delivered_counter_delta) / reference_counter_delta
+  delivered_perf = (reference_perf x delivered_counter_delta) / reference_counter_delta
diff --git a/Documentation/acpi/dsdt-override.txt b/Documentation/admin-guide/acpi/dsdt-override.rst
similarity index 56%
rename from Documentation/acpi/dsdt-override.txt
rename to Documentation/admin-guide/acpi/dsdt-override.rst
index 784841caa6e63824ff2503351fb12e682f01e3b3..50bd7f194bf440e40ff86f4b98670eff1c715c3c 100644
--- a/Documentation/acpi/dsdt-override.txt
+++ b/Documentation/admin-guide/acpi/dsdt-override.rst
@@ -1,6 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Overriding DSDT
+===============
+
 Linux supports a method of overriding the BIOS DSDT:
 
-CONFIG_ACPI_CUSTOM_DSDT builds the image into the kernel.
+CONFIG_ACPI_CUSTOM_DSDT - builds the image into the kernel.
 
 When to use this method is described in detail on the
 Linux/ACPI home page:
diff --git a/Documentation/admin-guide/acpi/index.rst b/Documentation/admin-guide/acpi/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4d13eeea1ecac3ca70d8a9214237e0dd2a05d331
--- /dev/null
+++ b/Documentation/admin-guide/acpi/index.rst
@@ -0,0 +1,14 @@
+============
+ACPI Support
+============
+
+Here we document in detail how to interact with various mechanisms in
+the Linux ACPI support.
+
+.. toctree::
+   :maxdepth: 1
+
+   initrd_table_override
+   dsdt-override
+   ssdt-overlays
+   cppc_sysfs
diff --git a/Documentation/admin-guide/acpi/initrd_table_override.rst b/Documentation/admin-guide/acpi/initrd_table_override.rst
new file mode 100644
index 0000000000000000000000000000000000000000..cbd768207631469dbe176e42d8348f170516c314
--- /dev/null
+++ b/Documentation/admin-guide/acpi/initrd_table_override.rst
@@ -0,0 +1,115 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+================================
+Upgrading ACPI tables via initrd
+================================
+
+What is this about
+==================
+
+If the ACPI_TABLE_UPGRADE compile option is true, it is possible to
+upgrade the ACPI execution environment that is defined by the ACPI tables
+by upgrading the ACPI tables provided by the BIOS with instrumented,
+modified, more recent versions, or by installing brand new ACPI tables.
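+
+As a quick sanity check (a sketch: it assumes the distribution installs the
+kernel configuration under /boot, which is common but not guaranteed), the
+running kernel can be checked for this option, which appears as
+CONFIG_ACPI_TABLE_UPGRADE in the configuration file::
+
+  grep CONFIG_ACPI_TABLE_UPGRADE /boot/config-$(uname -r)
+  # or, if the kernel was built with CONFIG_IKCONFIG_PROC:
+  zcat /proc/config.gz | grep CONFIG_ACPI_TABLE_UPGRADE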
+
+When the initrd is built into the kernel as a single image, the option
+ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD should also be true for this
+feature to work.
+
+For a full list of ACPI tables that can be upgraded/installed, take a look
+at the char `*table_sigs[MAX_ACPI_SIGNATURE];` definition in
+drivers/acpi/tables.c.
+
+All ACPI tables that iasl (Intel's ACPI compiler and disassembler) knows
+should be overridable, except:
+
+  - ACPI_SIG_RSDP (has a signature of 6 bytes)
+  - ACPI_SIG_FACS (does not have an ordinary ACPI table header)
+
+Both could get implemented as well.
+
+
+What is this for
+================
+
+Complain to your platform/BIOS vendor if you find a bug which is so severe
+that a workaround is not accepted in the Linux kernel. This facility
+allows you to upgrade the buggy tables before your platform/BIOS vendor
+releases an upgraded BIOS binary.
+
+This facility can be used by platform/BIOS vendors to provide a Linux
+compatible environment without modifying the underlying platform firmware.
+
+This facility also provides a powerful feature to easily debug and test
+ACPI BIOS table compatibility with the Linux kernel by modifying old
+platform provided ACPI tables or inserting new ACPI tables.
+
+It can and should be enabled in any kernel because there is no functional
+change with uninstrumented initrds.
+
+
+How does it work
+================
+::
+
+  # Extract the machine's ACPI tables:
+  cd /tmp
+  acpidump >acpidump
+  acpixtract -a acpidump
+  # Disassemble, modify and recompile them:
+  iasl -d *.dat
+  # For example add this statement into a _PRT (PCI Routing Table) function
+  # of the DSDT:
+  Store("HELLO WORLD", debug)
+  # And increase the OEM Revision. For example, before modification:
+  DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000000)
+  # After modification:
+  DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000001)
+  iasl -sa dsdt.dsl
+  # Add the raw ACPI tables to an uncompressed cpio archive.
+  # They must be put into a /kernel/firmware/acpi directory inside the cpio
+  # archive. Note that if the table put here matches a platform table
+  # (similar Table Signature, and similar OEMID, and similar OEM Table ID)
+  # with a more recent OEM Revision, the platform table will be upgraded by
+  # this table. If the table put here doesn't match a platform table
+  # (dissimilar Table Signature, or dissimilar OEMID, or dissimilar OEM Table
+  # ID), this table will be appended.
+  mkdir -p kernel/firmware/acpi
+  cp dsdt.aml kernel/firmware/acpi
+  # A maximum of "NR_ACPI_INITRD_TABLES (64)" tables are currently allowed
+  # (see osl.c):
+  iasl -sa facp.dsl
+  iasl -sa ssdt1.dsl
+  cp facp.aml kernel/firmware/acpi
+  cp ssdt1.aml kernel/firmware/acpi
+  # The uncompressed cpio archive must be the first. Other, typically
+  # compressed cpio archives, must be concatenated on top of the uncompressed
+  # one. The following command creates the uncompressed cpio archive and
+  # concatenates the original initrd on top:
+  find kernel | cpio -H newc --create > /boot/instrumented_initrd
+  cat /boot/initrd >>/boot/instrumented_initrd
+  # reboot with increased acpi debug level, e.g. boot params:
+  acpi.debug_level=0x2 acpi.debug_layer=0xFFFFFFFF
+  # and check your syslog:
+  [    1.268089] ACPI: PCI Interrupt Routing Table [\_SB_.PCI0._PRT]
+  [    1.272091] [ACPI Debug]  String [0x0B] "HELLO WORLD"
+
+iasl is able to disassemble and recompile quite a lot of different ACPI
+tables, including static ones.
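+
+To confirm that an upgraded table is the one actually in use, the active
+table can be read back from sysfs and compared against the instrumented
+build (a sketch: file names follow the example above, and reading the
+sysfs tables requires root)::
+
+  # Dump the DSDT the kernel is actually using:
+  cat /sys/firmware/acpi/tables/DSDT > dsdt-active.aml
+  # It should match the instrumented table placed in the initrd:
+  cmp dsdt-active.aml dsdt.aml && echo "upgraded DSDT is in use"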
+
+
+Where to retrieve userspace tools
+=================================
+
+iasl and acpixtract are part of Intel's ACPICA project:
+http://acpica.org/
+
+and should be packaged by distributions (for example in the acpica package
+on SUSE).
+
+acpidump can be found in Len Brown's pmtools:
+ftp://kernel.org/pub/linux/kernel/people/lenb/acpi/utils/pmtools/acpidump
+
+This tool is also part of the acpica package on SUSE.
+Alternatively, the ACPI tables used by the kernel can be retrieved via
+sysfs in recent kernels:
+/sys/firmware/acpi/tables
diff --git a/Documentation/admin-guide/acpi/ssdt-overlays.rst b/Documentation/admin-guide/acpi/ssdt-overlays.rst
new file mode 100644
index 0000000000000000000000000000000000000000..da37455f96c9b2490e43d38b0982f821cce91443
--- /dev/null
+++ b/Documentation/admin-guide/acpi/ssdt-overlays.rst
@@ -0,0 +1,180 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+SSDT Overlays
+=============
+
+In order to support ACPI open-ended hardware configurations (e.g. development
+boards) we need a way to augment the ACPI configuration provided by the firmware
+image. A common example is connecting sensors on I2C / SPI buses on development
+boards.
+
+Although this can be accomplished by creating a kernel platform driver or
+recompiling the firmware image with updated ACPI tables, neither is practical:
+the former proliferates board specific kernel code while the latter requires
+access to firmware tools which are often not publicly available.
+
+Because ACPI supports external references in AML code a more practical
+way to augment firmware ACPI configuration is by dynamically loading
+user defined SSDT tables that contain the board specific information.
+
+For example, to enumerate a Bosch BMA222E accelerometer on the I2C bus of the
+Minnowboard MAX development board exposed via the LSE connector [1], the
+following ASL code can be used::
+
+    DefinitionBlock ("minnowmax.aml", "SSDT", 1, "Vendor", "Accel", 0x00000003)
+    {
+        External (\_SB.I2C6, DeviceObj)
+
+        Scope (\_SB.I2C6)
+        {
+            Device (STAC)
+            {
+                Name (_ADR, Zero)
+                Name (_HID, "BMA222E")
+
+                Method (_CRS, 0, Serialized)
+                {
+                    Name (RBUF, ResourceTemplate ()
+                    {
+                        I2cSerialBus (0x0018, ControllerInitiated, 0x00061A80,
+                                      AddressingMode7Bit, "\\_SB.I2C6", 0x00,
+                                      ResourceConsumer, ,)
+                        GpioInt (Edge, ActiveHigh, Exclusive, PullDown, 0x0000,
+                                 "\\_SB.GPO2", 0x00, ResourceConsumer, , )
+                        { // Pin list
+                            0
+                        }
+                    })
+                    Return (RBUF)
+                }
+            }
+        }
+    }
+
+which can then be compiled to AML binary format::
+
+    $ iasl minnowmax.asl
+
+    Intel ACPI Component Architecture
+    ASL Optimizing Compiler version 20140214-64 [Mar 29 2014]
+    Copyright (c) 2000 - 2014 Intel Corporation
+
+    ASL Input:  minnowmax.asl - 30 lines, 614 bytes, 7 keywords
+    AML Output: minnowmax.aml - 165 bytes, 6 named objects, 1 executable opcodes
+
+[1] http://wiki.minnowboard.org/MinnowBoard_MAX#Low_Speed_Expansion_Connector_.28Top.29
+
+The resulting AML code can then be loaded by the kernel using one of the methods
+below.
+
+Loading ACPI SSDTs from initrd
+==============================
+
+This option allows loading of user defined SSDTs from initrd and it is useful
+when the system does not support EFI or when there is not enough EFI storage.
+
+It works in a similar way to the initrd-based ACPI table override/upgrade: the
+SSDT AML code must be placed in the first, uncompressed, initrd under the
+"kernel/firmware/acpi" path. Multiple files can be used and this will translate
+into loading multiple tables. Only SSDT and OEM tables are allowed. See
+initrd_table_override.rst for more details.
+
+Here is an example::
+
+    # Add the raw ACPI tables to an uncompressed cpio archive.
+    # They must be put into a /kernel/firmware/acpi directory inside the
+    # cpio archive.
+    # The uncompressed cpio archive must be the first.
+    # Other, typically compressed cpio archives, must be
+    # concatenated on top of the uncompressed one.
+    mkdir -p kernel/firmware/acpi
+    cp ssdt.aml kernel/firmware/acpi
+
+    # Create the uncompressed cpio archive and concatenate the original initrd
+    # on top:
+    find kernel | cpio -H newc --create > /boot/instrumented_initrd
+    cat /boot/initrd >>/boot/instrumented_initrd
+
+Loading ACPI SSDTs from EFI variables
+=====================================
+
+This is the preferred method, when EFI is supported on the platform, because it
+allows a persistent, OS independent way of storing the user defined SSDTs.
+There is also work underway to implement EFI support for loading user defined
+SSDTs, and using this method will make it easier to convert to the EFI loading
+mechanism when it arrives.
+
+In order to load SSDTs from an EFI variable the efivar_ssdt kernel command line
+parameter can be used. The argument for the option is the variable name to
+use. If there are multiple variables with the same name but with different
+vendor GUIDs, all of them will be loaded.
+
+In order to store the AML code in an EFI variable the efivarfs filesystem can be
+used. It is enabled and mounted by default in /sys/firmware/efi/efivars in all
+recent distributions.
+
+Creating a new file in /sys/firmware/efi/efivars will automatically create a new
+EFI variable. Updating a file in /sys/firmware/efi/efivars will update the EFI
+variable. Please note that the file name needs to be specially formatted as
+"Name-GUID" and that the first 4 bytes in the file (little-endian format)
+represent the attributes of the EFI variable (see EFI_VARIABLE_MASK in
+include/linux/efi.h). Writing to the file must also be done with one write
+operation.
+
+For example, you can use the following bash script to create/update an EFI
+variable with the content from a given file::
+
+    #!/bin/sh -e
+
+    while ! [ -z "$1" ]; do
+            case "$1" in
+            "-f") filename="$2"; shift;;
+            "-g") guid="$2"; shift;;
+            *) name="$1";;
+            esac
+            shift
+    done
+
+    usage()
+    {
+            echo "Syntax: ${0##*/} -f filename [ -g guid ] name"
+            exit 1
+    }
+
+    [ -n "$name" -a -f "$filename" ] || usage
+
+    EFIVARFS="/sys/firmware/efi/efivars"
+
+    [ -d "$EFIVARFS" ] || exit 2
+
+    if stat -tf $EFIVARFS | grep -q -v de5e81e4; then
+            mount -t efivarfs none $EFIVARFS
+    fi
+
+    # try to pick up an existing GUID
+    [ -n "$guid" ] || guid=$(find "$EFIVARFS" -name "$name-*" | head -n1 | cut -f2- -d-)
+
+    # use a randomly generated GUID
+    [ -n "$guid" ] || guid="$(cat /proc/sys/kernel/random/uuid)"
+
+    # efivarfs expects all of the data in one write
+    tmp=$(mktemp)
+    /bin/echo -ne "\007\000\000\000" | cat - $filename > $tmp
+    dd if=$tmp of="$EFIVARFS/$name-$guid" bs=$(stat -c %s $tmp)
+    rm $tmp
+
+Loading ACPI SSDTs from configfs
+================================
+
+This option allows loading of user defined SSDTs from userspace via the configfs
+interface. The CONFIG_ACPI_CONFIGFS option must be selected and configfs must be
+mounted. In the following examples, we assume that configfs has been mounted in
+/config.
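+
+If configfs is not mounted yet, it can be mounted manually, for example
+(a sketch assuming the /config mount point used below)::
+
+    mkdir -p /config
+    mount -t configfs none /config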
+
+New tables can be loaded by creating new directories in /config/acpi/table/
+and writing the SSDT AML code in the aml attribute::
+
+    cd /config/acpi/table
+    mkdir my_ssdt
+    cat ~/ssdt.aml > my_ssdt/aml
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 0a491676685e1e2e8e20b371fde657fef04ef73d..5b8286fdd91ba33804a91a830c5d102233b997c2 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -77,6 +77,7 @@ configure specific aspects of kernel behavior to your liking.
    LSM/index
    mm/index
    perf-security
+   acpi/index
 
 .. only::  subproject and html
diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index b8d0bc07ed0a62aa8e041596eac020dc86537597..0124980dca2db50b8076f0474400e8344ef40205 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -88,6 +88,7 @@ parameter is applicable::
         APIC    APIC support is enabled.
         APM     Advanced Power Management support is enabled.
         ARM     ARM architecture is enabled.
+        ARM64   ARM64 architecture is enabled.
         AX25    Appropriate AX.25 support is enabled.
         CLK     Common clock infrastructure is enabled.
         CMA     Contiguous Memory Area support is enabled.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2b8ee90bb64470d0d6d6ccadccf8b8fbbf86509d..fd03e2b629bbcfda55847e0cddd973bdd7b3a708 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -704,8 +704,11 @@
			upon panic. This parameter reserves the physical
			memory region [offset, offset + size] for that kernel
			image. If '@offset' is omitted, then a suitable offset
-			is selected automatically. Check
-			Documentation/kdump/kdump.txt for further details.
+			is selected automatically.
+			[KNL, x86_64] Select a region under 4G first, and
+			fall back to reserving a region above 4G when '@offset'
+			hasn't been specified.
+			See Documentation/kdump/kdump.txt for further details.
 
	crashkernel=range1:size1[,range2:size2,...][@offset]
			[KNL] Same as above, but depends on the memory
@@ -2544,6 +2547,40 @@
			in the "bleeding edge" mini2440 support kernel at
			http://repo.or.cz/w/linux-2.6/mini2440.git
 
+	mitigations=
+			[X86,PPC,S390,ARM64] Control optional mitigations for
+			CPU vulnerabilities.  This is a set of curated,
+			arch-independent options, each of which is an
+			aggregation of existing arch-specific options.
+
+			off
+				Disable all optional CPU mitigations.  This
+				improves system performance, but it may also
+				expose users to several CPU vulnerabilities.
+				Equivalent to: nopti [X86,PPC]
+					       kpti=0 [ARM64]
+					       nospectre_v1 [PPC]
+					       nobp=0 [S390]
+					       nospectre_v2 [X86,PPC,S390,ARM64]
+					       spectre_v2_user=off [X86]
+					       spec_store_bypass_disable=off [X86,PPC]
+					       ssbd=force-off [ARM64]
+					       l1tf=off [X86]
+
+			auto (default)
+				Mitigate all CPU vulnerabilities, but leave SMT
+				enabled, even if it's vulnerable.  This is for
+				users who don't want to be surprised by SMT
+				getting disabled across kernel upgrades, or who
+				have other ways of avoiding SMT-based attacks.
+				Equivalent to: (default behavior)
+
+			auto,nosmt
+				Mitigate all CPU vulnerabilities, disabling SMT
+				if needed.  This is for users who always want to
+				be fully mitigated, even if it means losing SMT.
+				Equivalent to: l1tf=flush,nosmt [X86]
+
	mminit_loglevel=
			[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
			parameter allows control of the logging verbosity for
@@ -2873,10 +2910,10 @@ check bypass).
With this option data leaks are possible in the system. - nospectre_v2 [X86,PPC_FSL_BOOK3E] Disable all mitigations for the Spectre variant 2 - (indirect branch prediction) vulnerability. System may - allow data leaks with this option, which is equivalent - to spectre_v2=off. + nospectre_v2 [X86,PPC_FSL_BOOK3E,ARM64] Disable all mitigations for + the Spectre variant 2 (indirect branch prediction) + vulnerability. System may allow data leaks with this + option. nospec_store_bypass_disable [HW] Disable all mitigations for the Speculative Store Bypass vulnerability @@ -3394,6 +3431,8 @@ bridges without forcing it upstream. Note: this removes isolation between devices and may put more devices in an IOMMU group. + force_floating [S390] Force usage of floating interrupts. + nomio [S390] Do not use MIO instructions. pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. @@ -3623,7 +3662,9 @@ see CONFIG_RAS_CEC help text. rcu_nocbs= [KNL] - The argument is a cpu list, as described above. + The argument is a cpu list, as described above, + except that the string "all" can be used to + specify every CPU on the system. In kernels built with CONFIG_RCU_NOCB_CPU=y, set the specified list of CPUs to be no-callback CPUs. @@ -4703,6 +4744,10 @@ [x86] unstable: mark the TSC clocksource as unstable, this marks the TSC unconditionally unstable at bootup and avoids any further wobbles once the TSC watchdog notices. + [x86] nowatchdog: disable clocksource watchdog. Used + in situations with strict latency requirements (where + interruptions from clocksource watchdog are not + acceptable). turbografx.map[2|3]= [HW,JOY] TurboGraFX parallel port interface diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst index 7eca9026a9ed2c3ed2a35b7e2184660e8caa9fdf..0c74a778496480ac9899b7bce415798cc274c362 100644 --- a/Documentation/admin-guide/pm/cpufreq.rst +++ b/Documentation/admin-guide/pm/cpufreq.rst @@ -1,3 +1,6 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + .. |struct cpufreq_policy| replace:: :c:type:`struct cpufreq_policy ` .. |intel_pstate| replace:: :doc:`intel_pstate ` @@ -5,9 +8,10 @@ CPU Performance Scaling ======================= -:: +:Copyright: |copy| 2017 Intel Corporation + +:Author: Rafael J. Wysocki - Copyright (c) 2017 Intel Corp., Rafael J. Wysocki The Concept of CPU Performance Scaling ====================================== @@ -396,8 +400,8 @@ RT or deadline scheduling classes, the governor will increase the frequency to the allowed maximum (that is, the ``scaling_max_freq`` policy limit). In turn, if it is invoked by the CFS scheduling class, the governor will use the Per-Entity Load Tracking (PELT) metric for the root control group of the -given CPU as the CPU utilization estimate (see the `Per-entity load tracking`_ -LWN.net article for a description of the PELT mechanism). Then, the new +given CPU as the CPU utilization estimate (see the *Per-entity load tracking* +LWN.net article [1]_ for a description of the PELT mechanism). Then, the new CPU frequency to apply is computed in accordance with the formula f = 1.25 * ``f_0`` * ``util`` / ``max`` @@ -698,4 +702,8 @@ hardware feature (e.g. all Intel ones), even if the :c:macro:`CONFIG_X86_ACPI_CPUFREQ_CPB` configuration option is set. -.. _Per-entity load tracking: https://lwn.net/Articles/531853/ +References +========== + +.. 
[1] Jonathan Corbet, *Per-entity load tracking*, + https://lwn.net/Articles/531853/ diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index 9c58b35a81cbcf38eab2db94a3f3e01f60601e6f..e70b365dbc6030e758c20f6aa6165498393dd42d 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -1,3 +1,6 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + .. |struct cpuidle_state| replace:: :c:type:`struct cpuidle_state ` .. |cpufreq| replace:: :doc:`CPU Performance Scaling ` @@ -5,9 +8,10 @@ CPU Idle Time Management ======================== -:: +:Copyright: |copy| 2018 Intel Corporation + +:Author: Rafael J. Wysocki - Copyright (c) 2018 Intel Corp., Rafael J. Wysocki Concepts ======== diff --git a/Documentation/admin-guide/pm/index.rst b/Documentation/admin-guide/pm/index.rst index 49237ac734428b63951837ad40fd110f3b346dfb..39f8f9f81e7a3a3e5537e9927845d627f0f44b36 100644 --- a/Documentation/admin-guide/pm/index.rst +++ b/Documentation/admin-guide/pm/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ================ Power Management ================ diff --git a/Documentation/admin-guide/pm/intel_epb.rst b/Documentation/admin-guide/pm/intel_epb.rst new file mode 100644 index 0000000000000000000000000000000000000000..005121167af7aaf761d4c8183dd8cd7858d1097f --- /dev/null +++ b/Documentation/admin-guide/pm/intel_epb.rst @@ -0,0 +1,41 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +====================================== +Intel Performance and Energy Bias Hint +====================================== + +:Copyright: |copy| 2019 Intel Corporation + +:Author: Rafael J. Wysocki + + +.. kernel-doc:: arch/x86/kernel/cpu/intel_epb.c + :doc: overview + +Intel Performance and Energy Bias Attribute in ``sysfs`` +======================================================== + +The Intel Performance and Energy Bias Hint (EPB) value for a given (logical) CPU +can be checked or updated through a ``sysfs`` attribute (file) under +:file:`/sys/devices/system/cpu/cpu/power/`, where the CPU number ```` +is allocated at the system initialization time: + +``energy_perf_bias`` + Shows the current EPB value for the CPU in a sliding scale 0 - 15, where + a value of 0 corresponds to a hint preference for highest performance + and a value of 15 corresponds to the maximum energy savings. + + In order to update the EPB value for the CPU, this attribute can be + written to, either with a number in the 0 - 15 sliding scale above, or + with one of the strings: "performance", "balance-performance", "normal", + "balance-power", "power" that represent values reflected by their + meaning. + + This attribute is present for all online CPUs supporting the EPB + feature. + +Note that while the EPB interface to the processor is defined at the logical CPU +level, the physical register backing it may be shared by multiple CPUs (for +example, SMT siblings or cores in one package). For this reason, updating the +EPB value for one CPU may cause the EPB values for other CPUs to change. diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index ec0f7c111f65b14c4328d7a9e0cd3314a8fba36f..67e414e34f37997e8bc620e904797cb3f3aa13c4 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -1,10 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. 
include:: + =============================================== ``intel_pstate`` CPU Performance Scaling Driver =============================================== -:: +:Copyright: |copy| 2017 Intel Corporation - Copyright (c) 2017 Intel Corp., Rafael J. Wysocki +:Author: Rafael J. Wysocki General Information @@ -20,11 +23,10 @@ you have not done that yet.] For the processors supported by ``intel_pstate``, the P-state concept is broader than just an operating frequency or an operating performance point (see the -`LinuxCon Europe 2015 presentation by Kristen Accardi `_ for more +LinuxCon Europe 2015 presentation by Kristen Accardi [1]_ for more information about that). For this reason, the representation of P-states used by ``intel_pstate`` internally follows the hardware specification (for details -refer to `Intel® 64 and IA-32 Architectures Software Developer’s Manual -Volume 3: System Programming Guide `_). However, the ``CPUFreq`` core +refer to Intel Software Developer’s Manual [2]_). However, the ``CPUFreq`` core uses frequencies for identifying operating performance points of CPUs and frequencies are involved in the user space interface exposed by it, so ``intel_pstate`` maps its internal representation of P-states to frequencies too @@ -561,9 +563,9 @@ or to pin every task potentially sensitive to them to a specific CPU.] On the majority of systems supported by ``intel_pstate``, the ACPI tables provided by the platform firmware contain ``_PSS`` objects returning information -that can be used for CPU performance scaling (refer to the `ACPI specification`_ -for details on the ``_PSS`` objects and the format of the information returned -by them). +that can be used for CPU performance scaling (refer to the ACPI specification +[3]_ for details on the ``_PSS`` objects and the format of the information +returned by them). The information returned by the ACPI ``_PSS`` objects is used by the ``acpi-cpufreq`` scaling driver. On systems supported by ``intel_pstate`` @@ -728,6 +730,14 @@ P-state is called, the ``ftrace`` filter can be set to to -0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func -.. _LCEU2015: http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf -.. _SDM: http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html -.. _ACPI specification: http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf +References +========== + +.. [1] Kristen Accardi, *Balancing Power and Performance in the Linux Kernel*, + http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf + +.. [2] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 3: System Programming Guide*, + http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html + +.. [3] *Advanced Configuration and Power Interface Specification*, + https://uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf diff --git a/Documentation/admin-guide/pm/sleep-states.rst b/Documentation/admin-guide/pm/sleep-states.rst index dbf5acd49f350de349bf228e9b7c824e2cfd04b1..cd3a28cb81f424d0962046ebb69292c214041150 100644 --- a/Documentation/admin-guide/pm/sleep-states.rst +++ b/Documentation/admin-guide/pm/sleep-states.rst @@ -1,10 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. 
include:: + =================== System Sleep States =================== -:: +:Copyright: |copy| 2017 Intel Corporation + +:Author: Rafael J. Wysocki - Copyright (c) 2017 Intel Corp., Rafael J. Wysocki Sleep states are global low-power states of the entire system in which user space code cannot be executed and the overall system activity is significantly diff --git a/Documentation/admin-guide/pm/strategies.rst b/Documentation/admin-guide/pm/strategies.rst index afe4d3f831fe06fb7e04989cf317f1d5caaaebd8..dd0362e32fa55e90cd13784e1402310717416b39 100644 --- a/Documentation/admin-guide/pm/strategies.rst +++ b/Documentation/admin-guide/pm/strategies.rst @@ -1,10 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + =========================== Power Management Strategies =========================== -:: +:Copyright: |copy| 2017 Intel Corporation + +:Author: Rafael J. Wysocki - Copyright (c) 2017 Intel Corp., Rafael J. Wysocki The Linux kernel supports two major high-level power management strategies. diff --git a/Documentation/admin-guide/pm/system-wide.rst b/Documentation/admin-guide/pm/system-wide.rst index 0c81e4c5de398db435e85d486cd821bebea65a33..2b1f987b34f007a3d2cffd56a4b64fee3f2aa4c4 100644 --- a/Documentation/admin-guide/pm/system-wide.rst +++ b/Documentation/admin-guide/pm/system-wide.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ============================ System-Wide Power Management ============================ diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst index b6cef9b5e961e6bdec653707e1457e0ddb87e013..fc298eb1234b07ba70e35ae26d1dd750834419d7 100644 --- a/Documentation/admin-guide/pm/working-state.rst +++ b/Documentation/admin-guide/pm/working-state.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ============================== Working-State Power Management ============================== @@ -8,3 +10,4 @@ Working-State Power Management cpuidle cpufreq intel_pstate + intel_epb diff --git a/Documentation/arm64/cpu-feature-registers.txt b/Documentation/arm64/cpu-feature-registers.txt index d4b4dd1fe786a16f9cd97d41c13dbf8425bb3321..684a0da3937863980bd84e92f025b6ee0141674a 100644 --- a/Documentation/arm64/cpu-feature-registers.txt +++ b/Documentation/arm64/cpu-feature-registers.txt @@ -209,6 +209,22 @@ infrastructure: | AT | [35-32] | y | x--------------------------------------------------x + 6) ID_AA64ZFR0_EL1 - SVE feature ID register 0 + + x--------------------------------------------------x + | Name | bits | visible | + |--------------------------------------------------| + | SM4 | [43-40] | y | + |--------------------------------------------------| + | SHA3 | [35-32] | y | + |--------------------------------------------------| + | BitPerm | [19-16] | y | + |--------------------------------------------------| + | AES | [7-4] | y | + |--------------------------------------------------| + | SVEVer | [3-0] | y | + x--------------------------------------------------x + Appendix I: Example --------------------------- diff --git a/Documentation/arm64/elf_hwcaps.txt b/Documentation/arm64/elf_hwcaps.txt index 13d6691b37bee5f6517d7234df4b59c74dcb8837..b73a2519ecf231d74ab36f68a9fa2cfdee8af6ac 100644 --- a/Documentation/arm64/elf_hwcaps.txt +++ b/Documentation/arm64/elf_hwcaps.txt @@ -13,9 +13,9 @@ architected discovery mechanism available to userspace code at EL0. The kernel exposes the presence of these features to userspace through a set of flags called hwcaps, exposed in the auxilliary vector. 
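+
+(As a quick shell-level check - a sketch relying on glibc's LD_SHOW_AUXV
+feature rather than on a kernel interface - the auxiliary vector of a new
+process, including AT_HWCAP and AT_HWCAP2, can be printed with:
+
+	LD_SHOW_AUXV=1 /bin/true | grep AT_HWCAP
+
+The decoded feature names in that output are the C library's
+interpretation; the authoritative definitions are the flags listed below.)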
-Userspace software can test for features by acquiring the AT_HWCAP entry -of the auxilliary vector, and testing whether the relevant flags are -set, e.g. +Userspace software can test for features by acquiring the AT_HWCAP or +AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant +flags are set, e.g. bool floating_point_is_present(void) { @@ -135,6 +135,10 @@ HWCAP_DCPOP Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0001. +HWCAP2_DCPODP + + Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010. + HWCAP_SHA3 Functionality implied by ID_AA64ISAR0_EL1.SHA3 == 0b0001. @@ -159,6 +163,30 @@ HWCAP_SVE Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001. +HWCAP2_SVE2 + + Functionality implied by ID_AA64ZFR0_EL1.SVEVer == 0b0001. + +HWCAP2_SVEAES + + Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0001. + +HWCAP2_SVEPMULL + + Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0010. + +HWCAP2_SVEBITPERM + + Functionality implied by ID_AA64ZFR0_EL1.BitPerm == 0b0001. + +HWCAP2_SVESHA3 + + Functionality implied by ID_AA64ZFR0_EL1.SHA3 == 0b0001. + +HWCAP2_SVESM4 + + Functionality implied by ID_AA64ZFR0_EL1.SM4 == 0b0001. + HWCAP_ASIMDFHM Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001. @@ -194,3 +222,10 @@ HWCAP_PACG Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or ID_AA64ISAR1_EL1.GPI == 0b0001, as described by Documentation/arm64/pointer-authentication.txt. + + +4. Unused AT_HWCAP bits +----------------------- + +For interoperation with userspace, the kernel guarantees that bits 62 +and 63 of AT_HWCAP will always be returned as 0. diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index d1e2bb801e1bdbec43f1cc4fded1f54ff3cd40d4..68d9b74fd751225998b9c5058a1ef2889e71c24f 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -61,6 +61,7 @@ stable kernels. | ARM | Cortex-A76 | #1188873 | ARM64_ERRATUM_1188873 | | ARM | Cortex-A76 | #1165522 | ARM64_ERRATUM_1165522 | | ARM | Cortex-A76 | #1286807 | ARM64_ERRATUM_1286807 | +| ARM | Neoverse-N1 | #1188873 | ARM64_ERRATUM_1188873 | | ARM | MMU-500 | #841119,#826419 | N/A | | | | | | | Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 | @@ -77,6 +78,7 @@ stable kernels. | Hisilicon | Hip0{5,6,7} | #161010101 | HISILICON_ERRATUM_161010101 | | Hisilicon | Hip0{6,7} | #161010701 | N/A | | Hisilicon | Hip07 | #161600802 | HISILICON_ERRATUM_161600802 | +| Hisilicon | Hip08 SMMU PMCG | #162001800 | N/A | | | | | | | Qualcomm Tech. | Kryo/Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 | | Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 | diff --git a/Documentation/arm64/sve.txt b/Documentation/arm64/sve.txt index 7169a0ec41d86911ad4a9c7fc34488841dc7c1e6..9940e924a47ed4cf378d083c76b4820e1cafd68d 100644 --- a/Documentation/arm64/sve.txt +++ b/Documentation/arm64/sve.txt @@ -34,6 +34,23 @@ model features for SVE is included in Appendix A. following sections: software that needs to verify that those interfaces are present must check for HWCAP_SVE instead. +* On hardware that supports the SVE2 extensions, HWCAP2_SVE2 will also + be reported in the AT_HWCAP2 aux vector entry. In addition to this, + optional extensions to SVE2 may be reported by the presence of: + + HWCAP2_SVE2 + HWCAP2_SVEAES + HWCAP2_SVEPMULL + HWCAP2_SVEBITPERM + HWCAP2_SVESHA3 + HWCAP2_SVESM4 + + This list may be extended over time as the SVE architecture evolves. 
+ + These extensions are also reported via the CPU ID register ID_AA64ZFR0_EL1, + which userspace can read using an MRS instruction. See elf_hwcaps.txt and + cpu-feature-registers.txt for details. + * Debuggers should restrict themselves to interacting with the target via the NT_ARM_SVE regset. The recommended way of detecting support for this regset is to connect to a target process first and then attempt a diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt index 913396ac582431cb3acbdc96a1bd7f4293661d0b..dca3fb0554db4928fa186e0826448d37e543a550 100644 --- a/Documentation/atomic_t.txt +++ b/Documentation/atomic_t.txt @@ -56,6 +56,23 @@ Barriers: smp_mb__{before,after}_atomic() +TYPES (signed vs unsigned) +----- + +While atomic_t, atomic_long_t and atomic64_t use int, long and s64 +respectively (for hysterical raisins), the kernel uses -fno-strict-overflow +(which implies -fwrapv) and defines signed overflow to behave like +2s-complement. + +Therefore, an explicitly unsigned variant of the atomic ops is strictly +unnecessary and we can simply cast, there is no UB. + +There was a bug in UBSAN prior to GCC-8 that would generate UB warnings for +signed types. + +With this we also conform to the C/C++ _Atomic behaviour and things like +P1236R1. + SEMANTICS --------- diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 9a60a5d60e380ab2204334086a58ebef1f3b11e5..7313d354f20e6402e23161f24ddab227afb7c0a1 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -148,16 +148,16 @@ The ``btf_type.size * 8`` must be equal to or greater than ``BTF_INT_BITS()`` for the type. The maximum value of ``BTF_INT_BITS()`` is 128. The ``BTF_INT_OFFSET()`` specifies the starting bit offset to calculate values -for this int. For example, a bitfield struct member has: * btf member bit -offset 100 from the start of the structure, * btf member pointing to an int -type, * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` +for this int. For example, a bitfield struct member has: + * btf member bit offset 100 from the start of the structure, + * btf member pointing to an int type, + * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` Then in the struct memory layout, this member will occupy ``4`` bits starting from bits ``100 + 2 = 102``. Alternatively, the bitfield struct member can be the following to access the same bits as the above: - * btf member bit offset 102, * btf member pointing to an int type, * the int type has ``BTF_INT_OFFSET() = 0`` and ``BTF_INT_BITS() = 4`` diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index 6eb9d3f090cdf5d9a82afa3bd46cec5554ca116a..93cb65d52720a0ef72b2ba527ef3135b579c113e 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -101,16 +101,6 @@ changes occur: translations for software managed TLB configurations. The sparc64 port currently does this. -6) ``void tlb_migrate_finish(struct mm_struct *mm)`` - - This interface is called at the end of an explicit - process migration. This interface provides a hook - to allow a platform to update TLB or context-specific - information for the address space. - - The ia64 sn2 platform is one example of a platform - that uses this interface. - Next, we have the cache flushing interfaces. 
In general, when Linux is changing an existing virtual-->physical mapping to a new value, the sequence will be in one of the following forms:: diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt index c6e7e9196a8b41cdd983ac1452c0552c88a9c981..cb61277e230828a28fa6ed0bfb1d0d13a4fde6c7 100644 --- a/Documentation/cputopology.txt +++ b/Documentation/cputopology.txt @@ -3,79 +3,79 @@ How CPU topology info is exported via sysfs =========================================== Export CPU topology info via sysfs. Items (attributes) are similar -to /proc/cpuinfo output of some architectures: +to /proc/cpuinfo output of some architectures. They reside in +/sys/devices/system/cpu/cpuX/topology/: -1) /sys/devices/system/cpu/cpuX/topology/physical_package_id: +physical_package_id: physical package id of cpuX. Typically corresponds to a physical socket number, but the actual value is architecture and platform dependent. -2) /sys/devices/system/cpu/cpuX/topology/core_id: +core_id: the CPU core ID of cpuX. Typically it is the hardware platform's identifier (rather than the kernel's). The actual value is architecture and platform dependent. -3) /sys/devices/system/cpu/cpuX/topology/book_id: +book_id: the book ID of cpuX. Typically it is the hardware platform's identifier (rather than the kernel's). The actual value is architecture and platform dependent. -4) /sys/devices/system/cpu/cpuX/topology/drawer_id: +drawer_id: the drawer ID of cpuX. Typically it is the hardware platform's identifier (rather than the kernel's). The actual value is architecture and platform dependent. -5) /sys/devices/system/cpu/cpuX/topology/thread_siblings: +thread_siblings: internal kernel map of cpuX's hardware threads within the same core as cpuX. -6) /sys/devices/system/cpu/cpuX/topology/thread_siblings_list: +thread_siblings_list: human-readable list of cpuX's hardware threads within the same core as cpuX. -7) /sys/devices/system/cpu/cpuX/topology/core_siblings: +core_siblings: internal kernel map of cpuX's hardware threads within the same physical_package_id. -8) /sys/devices/system/cpu/cpuX/topology/core_siblings_list: +core_siblings_list: human-readable list of cpuX's hardware threads within the same physical_package_id. -9) /sys/devices/system/cpu/cpuX/topology/book_siblings: +book_siblings: internal kernel map of cpuX's hardware threads within the same book_id. -10) /sys/devices/system/cpu/cpuX/topology/book_siblings_list: +book_siblings_list: human-readable list of cpuX's hardware threads within the same book_id. -11) /sys/devices/system/cpu/cpuX/topology/drawer_siblings: +drawer_siblings: internal kernel map of cpuX's hardware threads within the same drawer_id. -12) /sys/devices/system/cpu/cpuX/topology/drawer_siblings_list: +drawer_siblings_list: human-readable list of cpuX's hardware threads within the same drawer_id. -To implement it in an architecture-neutral way, a new source file, -drivers/base/topology.c, is to export the 6 to 12 attributes. The book -and drawer related sysfs files will only be created if CONFIG_SCHED_BOOK -and CONFIG_SCHED_DRAWER are selected. +Architecture-neutral, drivers/base/topology.c, exports these attributes. +However, the book and drawer related sysfs files will only be created if +CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively. -CONFIG_SCHED_BOOK and CONFIG_DRAWER are currently only used on s390, where -they reflect the cpu and cache hierarchy. 
+CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390,
+where they reflect the cpu and cache hierarchy.
 
 For an architecture to support this feature, it must define some of
 these macros in include/asm-XXX/topology.h::
@@ -98,10 +98,10 @@ To be consistent on all architectures, include/linux/topology.h provides
 default definitions for any of the above macros that are not defined
 by include/asm-XXX/topology.h:
 
-1) physical_package_id: -1
-2) core_id: 0
-3) sibling_cpumask: just the given CPU
-4) core_cpumask: just the given CPU
+1) topology_physical_package_id: -1
+2) topology_core_id: 0
+3) topology_sibling_cpumask: just the given CPU
+4) topology_core_cpumask: just the given CPU
 
 For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
 default definitions for topology_book_id() and topology_book_cpumask().
diff --git a/Documentation/devicetree/bindings/arm/cpus.yaml b/Documentation/devicetree/bindings/arm/cpus.yaml
index 365dcf384d73922a22bc70a32e98444122bc4fa7..82dd7582e945461efbdff77c8222bdc1e0e162b9 100644
--- a/Documentation/devicetree/bindings/arm/cpus.yaml
+++ b/Documentation/devicetree/bindings/arm/cpus.yaml
@@ -228,7 +228,7 @@ patternProperties:
               - renesas,r9a06g032-smp
               - rockchip,rk3036-smp
               - rockchip,rk3066-smp
-              - socionext,milbeaut-m10v-smp
+              - socionext,milbeaut-m10v-smp
               - ste,dbx500-smp
 
       cpu-release-addr:
diff --git a/Documentation/devicetree/bindings/edac/socfpga-eccmgr.txt b/Documentation/devicetree/bindings/edac/socfpga-eccmgr.txt
index 5626560a6cfdff805ebd1460a14ec2127747de4a..8f52206cfd2a1bcaa7f83b1bb127264b709bad3a 100644
--- a/Documentation/devicetree/bindings/edac/socfpga-eccmgr.txt
+++ b/Documentation/devicetree/bindings/edac/socfpga-eccmgr.txt
@@ -232,37 +232,152 @@ Example:
		};
	};
 
-Stratix10 SoCFPGA ECC Manager
+Stratix10 SoCFPGA ECC Manager (ARM64)
 The Stratix10 SoC ECC Manager handles the IRQs for each peripheral
-in a shared register similar to the Arria10. However, ECC requires
-access to registers that can only be read from Secure Monitor with
-SMC calls. Therefore the device tree is slightly different.
+in a shared register similar to the Arria10. However, Stratix10 ECC
+requires access to registers that can only be read from Secure Monitor
+with SMC calls. Therefore the device tree is slightly different. Note
+that only 1 interrupt is sent in Stratix10 because the double bit errors
+are treated as SErrors in ARM64 instead of IRQs in ARM32.
 
 Required Properties:
 - compatible : Should be "altr,socfpga-s10-ecc-manager"
-- interrupts : Should be single bit error interrupt, then double bit error
	interrupt.
+- altr,sysmgr-syscon : phandle to Stratix10 System Manager Block
+	containing the ECC manager registers.
+- interrupts : Should be single bit error interrupt.
 - interrupt-controller : boolean indicator that ECC Manager is an interrupt controller
 - #interrupt-cells : must be set to 2.
+- #address-cells: must be 1
+- #size-cells: must be 1
+- ranges : standard definition, should translate from local addresses
 
 Subcomponents:
 
 SDRAM ECC
 Required Properties:
 - compatible : Should be "altr,sdram-edac-s10"
-- interrupts : Should be single bit error interrupt, then double bit error
	interrupt, in this order.
+- interrupts : Should be single bit error interrupt.
+
+On-Chip RAM ECC
+Required Properties:
+- compatible : Should be "altr,socfpga-s10-ocram-ecc"
+- reg : Address and size for ECC block registers.
+- altr,ecc-parent : phandle to parent OCRAM node.
+- interrupts : Should be single bit error interrupt.
+
+Ethernet FIFO ECC
+Required Properties:
+- compatible : Should be "altr,socfpga-s10-eth-mac-ecc"
+- reg : Address and size for ECC block registers.
+- altr,ecc-parent : phandle to parent Ethernet node.
+- interrupts : Should be single bit error interrupt.
+
+NAND FIFO ECC
+Required Properties:
+- compatible : Should be "altr,socfpga-s10-nand-ecc"
+- reg : Address and size for ECC block registers.
+- altr,ecc-parent : phandle to parent NAND node.
+- interrupts : Should be single bit error interrupt.
+
+DMA FIFO ECC
+Required Properties:
+- compatible : Should be "altr,socfpga-s10-dma-ecc"
+- reg : Address and size for ECC block registers.
+- altr,ecc-parent : phandle to parent DMA node.
+- interrupts : Should be single bit error interrupt.
+
+USB FIFO ECC
+Required Properties:
+- compatible : Should be "altr,socfpga-s10-usb-ecc"
+- reg : Address and size for ECC block registers.
+- altr,ecc-parent : phandle to parent USB node.
+- interrupts : Should be single bit error interrupt.
+
+SDMMC FIFO ECC
+Required Properties:
+- compatible : Should be "altr,socfpga-s10-sdmmc-ecc"
+- reg : Address and size for ECC block registers.
+- altr,ecc-parent : phandle to parent SD/MMC node.
+- interrupts : Should be single bit error interrupt for port A
+	and then single bit error interrupt for port B.
 
 Example:
 
	eccmgr {
		compatible = "altr,socfpga-s10-ecc-manager";
-		interrupts = <0 15 4>, <0 95 4>;
+		altr,sysmgr-syscon = <&sysmgr>;
+		#address-cells = <1>;
+		#size-cells = <1>;
+		interrupts = <0 15 4>;
		interrupt-controller;
		#interrupt-cells = <2>;
+		ranges;
 
		sdramedac {
			compatible = "altr,sdram-edac-s10";
-			interrupts = <16 4>, <48 4>;
+			interrupts = <16 IRQ_TYPE_LEVEL_HIGH>;
+		};
+
+		ocram-ecc@ff8cc000 {
+			compatible = "altr,socfpga-s10-ocram-ecc";
+			reg = <0xff8cc000 0x100>;
+			altr,ecc-parent = <&ocram>;
+			interrupts = <1 IRQ_TYPE_LEVEL_HIGH>;
+		};
+
+		emac0-rx-ecc@ff8c0000 {
+			compatible = "altr,socfpga-s10-eth-mac-ecc";
+			reg = <0xff8c0000 0x100>;
+			altr,ecc-parent = <&gmac0>;
+			interrupts = <4 IRQ_TYPE_LEVEL_HIGH>;
+		};
+
+		emac0-tx-ecc@ff8c0400 {
+			compatible = "altr,socfpga-s10-eth-mac-ecc";
+			reg = <0xff8c0400 0x100>;
+			altr,ecc-parent = <&gmac0>;
+			interrupts = <5 IRQ_TYPE_LEVEL_HIGH>;
+		};
+
+		nand-buf-ecc@ff8c8000 {
+			compatible = "altr,socfpga-s10-nand-ecc";
+			reg = <0xff8c8000 0x100>;
+			altr,ecc-parent = <&nand>;
+			interrupts = <11 IRQ_TYPE_LEVEL_HIGH>;
+		};
+
+		nand-rd-ecc@ff8c8400 {
+			compatible = "altr,socfpga-s10-nand-ecc";
+			reg = <0xff8c8400 0x100>;
+			altr,ecc-parent = <&nand>;
+			interrupts = <13 IRQ_TYPE_LEVEL_HIGH>;
+		};
+
+		nand-wr-ecc@ff8c8800 {
+			compatible = "altr,socfpga-s10-nand-ecc";
+			reg = <0xff8c8800 0x100>;
+			altr,ecc-parent = <&nand>;
+			interrupts = <12 IRQ_TYPE_LEVEL_HIGH>;
+		};
+
+		dma-ecc@ff8c9000 {
+			compatible = "altr,socfpga-s10-dma-ecc";
+			reg = <0xff8c9000 0x100>;
+			altr,ecc-parent = <&pdma>;
+			interrupts = <10 IRQ_TYPE_LEVEL_HIGH>;
+		};
+
+		usb0-ecc@ff8c4000 {
+			compatible = "altr,socfpga-s10-usb-ecc";
+			reg = <0xff8c4000 0x100>;
+			altr,ecc-parent = <&usb0>;
+			interrupts = <2 IRQ_TYPE_LEVEL_HIGH>;
+		};
+
+		sdmmc-ecc@ff8c8c00 {
+			compatible = "altr,socfpga-s10-sdmmc-ecc";
+			reg = <0xff8c8c00 0x100>;
+			altr,ecc-parent = <&mmc>;
+			interrupts = <14 IRQ_TYPE_LEVEL_HIGH>,
+				     <15 IRQ_TYPE_LEVEL_HIGH>;
		};
	};
diff --git a/Documentation/devicetree/bindings/hwmon/adc128d818.txt b/Documentation/devicetree/bindings/hwmon/adc128d818.txt
index 08bab0e94d25a21b8ed4d87738641e5ffe74bfa2..d0ae46d7bac370d9db369d9e471ef632be2482d9 100644
--- a/Documentation/devicetree/bindings/hwmon/adc128d818.txt
+++ 
b/Documentation/devicetree/bindings/hwmon/adc128d818.txt @@ -26,7 +26,7 @@ Required node properties: Optional node properties: - - ti,mode: Operation mode (see above). + - ti,mode: Operation mode (u8) (see above). Example (operation mode 2): @@ -34,5 +34,5 @@ Example (operation mode 2): adc128d818@1d { compatible = "ti,adc128d818"; reg = <0x1d>; - ti,mode = <2>; + ti,mode = /bits/ 8 <2>; }; diff --git a/Documentation/devicetree/bindings/net/davinci_emac.txt b/Documentation/devicetree/bindings/net/davinci_emac.txt index 24c5cdaba8d279a4b132fbd2f964ae1460b3fd0f..ca83dcc84fb8ee5cfd876cf0bb3d8af5fd85ba6b 100644 --- a/Documentation/devicetree/bindings/net/davinci_emac.txt +++ b/Documentation/devicetree/bindings/net/davinci_emac.txt @@ -20,6 +20,8 @@ Required properties: Optional properties: - phy-handle: See ethernet.txt file in the same directory. If absent, davinci_emac driver defaults to 100/FULL. +- nvmem-cells: phandle, reference to an nvmem node for the MAC address +- nvmem-cell-names: string, should be "mac-address" if nvmem is to be used - ti,davinci-rmii-en: 1 byte, 1 means use RMII - ti,davinci-no-bd-ram: boolean, does EMAC have BD RAM? diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt index cfc376bc977aa0a25e64d4e1ef617a1a326fe634..a6862158058461f5af428498ea14c98aed1f7775 100644 --- a/Documentation/devicetree/bindings/net/ethernet.txt +++ b/Documentation/devicetree/bindings/net/ethernet.txt @@ -10,15 +10,14 @@ Documentation/devicetree/bindings/phy/phy-bindings.txt. the boot program; should be used in cases where the MAC address assigned to the device by the boot program is different from the "local-mac-address" property; -- nvmem-cells: phandle, reference to an nvmem node for the MAC address; -- nvmem-cell-names: string, should be "mac-address" if nvmem is to be used; - max-speed: number, specifies maximum speed in Mbit/s supported by the device; - max-frame-size: number, maximum transfer unit (IEEE defined MTU), rather than the maximum frame size (there's contradiction in the Devicetree Specification). - phy-mode: string, operation mode of the PHY interface. This is now a de-facto standard property; supported values are: - * "internal" + * "internal" (Internal means there is not a standard bus between the MAC and + the PHY, something proprietary is being used to embed the PHY in the MAC.) * "mii" * "gmii" * "sgmii" diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt index 174f292d8a3e8c14cf7d5d1105380b5f3d358544..8b80515729d7145cc05c9293857212ba914e0607 100644 --- a/Documentation/devicetree/bindings/net/macb.txt +++ b/Documentation/devicetree/bindings/net/macb.txt @@ -26,6 +26,10 @@ Required properties: Optional elements: 'tsu_clk' - clocks: Phandles to input clocks. +Optional properties: +- nvmem-cells: phandle, reference to an nvmem node for the MAC address +- nvmem-cell-names: string, should be "mac-address" if nvmem is to be used + Optional properties for PHY child node: - reset-gpios : Should specify the gpio for phy reset - magic-packet : If present, indicates that the hardware supports waking diff --git a/Documentation/driver-api/acpi/index.rst b/Documentation/driver-api/acpi/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..ace0008e54c2ff96c7ae6dc2a1e464c8f0996374 --- /dev/null +++ b/Documentation/driver-api/acpi/index.rst @@ -0,0 +1,9 @@ +============ +ACPI Support +============ + +.. 
toctree:: + :maxdepth: 2 + + linuxized-acpica + scan_handlers diff --git a/Documentation/acpi/linuxized-acpica.txt b/Documentation/driver-api/acpi/linuxized-acpica.rst similarity index 80% rename from Documentation/acpi/linuxized-acpica.txt rename to Documentation/driver-api/acpi/linuxized-acpica.rst index 3ad7b0dfb083377d043573de0de94206881853f4..0ca8f15385190b8d07931cf9e987e80afcc8ed2d 100644 --- a/Documentation/acpi/linuxized-acpica.txt +++ b/Documentation/driver-api/acpi/linuxized-acpica.rst @@ -1,31 +1,37 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +============================================================ Linuxized ACPICA - Introduction to ACPICA Release Automation +============================================================ -Copyright (C) 2013-2016, Intel Corporation -Author: Lv Zheng +:Copyright: |copy| 2013-2016, Intel Corporation +:Author: Lv Zheng -Abstract: +Abstract +======== This document describes the ACPICA project and the relationship between ACPICA and Linux. It also describes how ACPICA code in drivers/acpi/acpica, include/acpi and tools/power/acpi is automatically updated to follow the upstream. +ACPICA Project +============== -1. ACPICA Project - - The ACPI Component Architecture (ACPICA) project provides an operating - system (OS)-independent reference implementation of the Advanced - Configuration and Power Interface Specification (ACPI). It has been - adapted by various host OSes. By directly integrating ACPICA, Linux can - also benefit from the application experiences of ACPICA from other host - OSes. +The ACPI Component Architecture (ACPICA) project provides an operating +system (OS)-independent reference implementation of the Advanced +Configuration and Power Interface Specification (ACPI). It has been +adapted by various host OSes. By directly integrating ACPICA, Linux can +also benefit from the application experiences of ACPICA from other host +OSes. - The homepage of ACPICA project is: www.acpica.org, it is maintained and - supported by Intel Corporation. +The homepage of ACPICA project is: www.acpica.org, it is maintained and +supported by Intel Corporation. - The following figure depicts the Linux ACPI subsystem where the ACPICA - adaptation is included: +The following figure depicts the Linux ACPI subsystem where the ACPICA +adaptation is included:: +---------------------------------------------------------+ | | @@ -71,21 +77,27 @@ upstream. Figure 1. Linux ACPI Software Components - NOTE: +.. note:: A. OS Service Layer - Provided by Linux to offer OS dependent implementation of the predefined ACPICA interfaces (acpi_os_*). + :: + include/acpi/acpiosxf.h drivers/acpi/osl.c include/acpi/platform include/asm/acenv.h B. ACPICA Functionality - Released from ACPICA code base to offer OS independent implementation of the ACPICA interfaces (acpi_*). + :: + drivers/acpi/acpica include/acpi/ac*.h tools/power/acpi C. Linux/ACPI Functionality - Providing Linux specific ACPI functionality to the other Linux kernel subsystems and user space programs. + :: + drivers/acpi include/linux/acpi.h include/linux/acpi*.h @@ -95,24 +107,27 @@ upstream. ACPI subsystem to offer architecture specific implementation of the ACPI interfaces. They are Linux specific components and are out of the scope of this document. + :: + include/asm/acpi.h include/asm/acpi*.h arch/*/acpi -2. ACPICA Release +ACPICA Release +============== - The ACPICA project maintains its code base at the following repository URL: - https://github.com/acpica/acpica.git. 
As a rule, a release is made every - month. +The ACPICA project maintains its code base at the following repository URL: +https://github.com/acpica/acpica.git. As a rule, a release is made every +month. - As the coding style adopted by the ACPICA project is not acceptable by - Linux, there is a release process to convert the ACPICA git commits into - Linux patches. The patches generated by this process are referred to as - "linuxized ACPICA patches". The release process is carried out on a local - copy the ACPICA git repository. Each commit in the monthly release is - converted into a linuxized ACPICA patch. Together, they form the monthly - ACPICA release patchset for the Linux ACPI community. This process is - illustrated in the following figure: +As the coding style adopted by the ACPICA project is not acceptable to +Linux, there is a release process to convert the ACPICA git commits into +Linux patches. The patches generated by this process are referred to as +"linuxized ACPICA patches". The release process is carried out on a local +copy of the ACPICA git repository. Each commit in the monthly release is +converted into a linuxized ACPICA patch. Together, they form the monthly +ACPICA release patchset for the Linux ACPI community. This process is +illustrated in the following figure:: +-----------------------------+ | acpica / master (-) commits | @@ -153,7 +168,7 @@ upstream. Figure 2. ACPICA -> Linux Upstream Process - NOTE: +.. note:: A. Linuxize Utilities - Provided by the ACPICA repository, including a utility located in source/tools/acpisrc folder and a number of scripts located in generate/linux folder. @@ -170,19 +185,20 @@ upstream. following kernel configuration options: CONFIG_ACPI/CONFIG_ACPI_DEBUG/CONFIG_ACPI_DEBUGGER -3. ACPICA Divergences +ACPICA Divergences +================== - Ideally, all of the ACPICA commits should be converted into Linux patches - automatically without manual modifications, the "linux / master" tree should - contain the ACPICA code that exactly corresponds to the ACPICA code - contained in "new linuxized acpica" tree and it should be possible to run - the release process fully automatically. +Ideally, all of the ACPICA commits should be converted into Linux patches +automatically without manual modifications, the "linux / master" tree should +contain the ACPICA code that exactly corresponds to the ACPICA code +contained in "new linuxized acpica" tree and it should be possible to run +the release process fully automatically. - As a matter of fact, however, there are source code differences between - the ACPICA code in Linux and the upstream ACPICA code, referred to as - "ACPICA Divergences". +As a matter of fact, however, there are source code differences between +the ACPICA code in Linux and the upstream ACPICA code, referred to as +"ACPICA Divergences". - The various sources of ACPICA divergences include: +The various sources of ACPICA divergences include: 1. Legacy divergences - Before the current ACPICA release process was established, there already had been divergences between Linux and ACPICA. Over the past several years those divergences have been greatly @@ -213,11 +229,12 @@ upstream. rebased on the ACPICA side in order to offer better solutions, new ACPICA divergences are generated. -4. 
ACPICA Development +ACPICA Development +================== - This paragraph guides Linux developers to use the ACPICA upstream release - utilities to obtain Linux patches corresponding to upstream ACPICA commits - before they become available from the ACPICA release process. +This paragraph guides Linux developers to use the ACPICA upstream release +utilities to obtain Linux patches corresponding to upstream ACPICA commits +before they become available from the ACPICA release process. 1. Cherry-pick an ACPICA commit @@ -225,7 +242,7 @@ upstream. you want to cherry pick must be committed into the local repository. Then the gen-patch.sh command can help to cherry-pick an ACPICA commit - from the ACPICA local repository: + from the ACPICA local repository:: $ git clone https://github.com/acpica/acpica $ cd acpica @@ -240,7 +257,7 @@ upstream. changes that haven't been applied to Linux yet. You can generate the ACPICA release series yourself and rebase your code on - top of the generated ACPICA release patches: + top of the generated ACPICA release patches:: $ git clone https://github.com/acpica/acpica $ cd acpica @@ -254,7 +271,7 @@ upstream. 3. Inspect the current divergences If you have local copies of both Linux and upstream ACPICA, you can generate - a diff file indicating the state of the current divergences: + a diff file indicating the state of the current divergences:: # git clone https://github.com/acpica/acpica # git clone http://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git diff --git a/Documentation/acpi/scan_handlers.txt b/Documentation/driver-api/acpi/scan_handlers.rst similarity index 90% rename from Documentation/acpi/scan_handlers.txt rename to Documentation/driver-api/acpi/scan_handlers.rst index 3246ccf159925026e99451a581246f38eff8fff4..7a197b3a33fcb2476eb3d6d58a59717c4d6331cc 100644 --- a/Documentation/acpi/scan_handlers.txt +++ b/Documentation/driver-api/acpi/scan_handlers.rst @@ -1,7 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +================== ACPI Scan Handlers +================== + +:Copyright: |copy| 2012, Intel Corporation -Copyright (C) 2012, Intel Corporation -Author: Rafael J. Wysocki +:Author: Rafael J. Wysocki During system initialization and ACPI-based device hot-add, the ACPI namespace is scanned in search of device objects that generally represent various pieces @@ -30,14 +36,14 @@ to configure that link so that the kernel can use it. Those additional configuration tasks usually depend on the type of the hardware component represented by the given device node which can be determined on the basis of the device node's hardware ID (HID). 
They are performed by objects -called ACPI scan handlers represented by the following structure: +called ACPI scan handlers represented by the following structure:: -struct acpi_scan_handler { - const struct acpi_device_id *ids; - struct list_head list_node; - int (*attach)(struct acpi_device *dev, const struct acpi_device_id *id); - void (*detach)(struct acpi_device *dev); -}; + struct acpi_scan_handler { + const struct acpi_device_id *ids; + struct list_head list_node; + int (*attach)(struct acpi_device *dev, const struct acpi_device_id *id); + void (*detach)(struct acpi_device *dev); + }; where ids is the list of IDs of device nodes the given handler is supposed to take care of, list_node is the hook to the global list of ACPI scan handlers diff --git a/Documentation/driver-api/device-io.rst b/Documentation/driver-api/device-io.rst index b00b239030788ff68e0ecca8393a0d012c33ff2c..0e389378f71d525ada7091601af49084dfe8fef0 100644 --- a/Documentation/driver-api/device-io.rst +++ b/Documentation/driver-api/device-io.rst @@ -103,51 +103,6 @@ continuing execution:: ha->flags.ints_enabled = 0; } -In addition to write posting, on some large multiprocessing systems -(e.g. SGI Challenge, Origin and Altix machines) posted writes won't be -strongly ordered coming from different CPUs. Thus it's important to -properly protect parts of your driver that do memory-mapped writes with -locks and use the :c:func:`mmiowb()` to make sure they arrive in the -order intended. Issuing a regular readX() will also ensure write ordering, -but should only be used when the -driver has to be sure that the write has actually arrived at the device -(not that it's simply ordered with respect to other writes), since a -full readX() is a relatively expensive operation. - -Generally, one should use :c:func:`mmiowb()` prior to releasing a spinlock -that protects regions using :c:func:`writeb()` or similar functions that -aren't surrounded by readb() calls, which will ensure ordering -and flushing. The following pseudocode illustrates what might occur if -write ordering isn't guaranteed via :c:func:`mmiowb()` or one of the -readX() functions:: - - CPU A: spin_lock_irqsave(&dev_lock, flags) - CPU A: ... - CPU A: writel(newval, ring_ptr); - CPU A: spin_unlock_irqrestore(&dev_lock, flags) - ... - CPU B: spin_lock_irqsave(&dev_lock, flags) - CPU B: writel(newval2, ring_ptr); - CPU B: ... - CPU B: spin_unlock_irqrestore(&dev_lock, flags) - -In the case above, newval2 could be written to ring_ptr before newval. -Fixing it is easy though:: - - CPU A: spin_lock_irqsave(&dev_lock, flags) - CPU A: ... - CPU A: writel(newval, ring_ptr); - CPU A: mmiowb(); /* ensure no other writes beat us to the device */ - CPU A: spin_unlock_irqrestore(&dev_lock, flags) - ... - CPU B: spin_lock_irqsave(&dev_lock, flags) - CPU B: writel(newval2, ring_ptr); - CPU B: ... - CPU B: mmiowb(); - CPU B: spin_unlock_irqrestore(&dev_lock, flags) - -See tg3.c for a real world example of how to use :c:func:`mmiowb()` - PCI ordering rules also guarantee that PIO read responses arrive after any outstanding DMA writes from that bus, since for some devices the result of a readb() call may signal to the driver that a DMA transaction is diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index c0b600ed99613e42494d377406d969e6d39ee426..aa87075c78460a1c016d2621be87cb6426056e39 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -56,6 +56,7 @@ available subsections can be seen below. 
slimbus soundwire/index fpga/index + acpi/index .. only:: subproject and html diff --git a/Documentation/driver-api/pci/p2pdma.rst b/Documentation/driver-api/pci/p2pdma.rst index 6d85b5a2598db0fc1f957fe77fe6ebff42030560..44deb52beeb4766ae736c9a2645ce2be09857cc1 100644 --- a/Documentation/driver-api/pci/p2pdma.rst +++ b/Documentation/driver-api/pci/p2pdma.rst @@ -132,10 +132,6 @@ precludes passing these pages to userspace. P2P memory is also technically IO memory but should never have any side effects behind it. Thus, the order of loads and stores should not be important and ioreadX(), iowriteX() and friends should not be necessary. -However, as the memory is not cache coherent, if access ever needs to -be protected by a spinlock then :c:func:`mmiowb()` must be used before -unlocking the lock. (See ACQUIRES VS I/O ACCESSES in -Documentation/memory-barriers.txt) P2P DMA Support Library diff --git a/Documentation/driver-api/pm/cpuidle.rst b/Documentation/driver-api/pm/cpuidle.rst index 5842ab621a58c9c61fbfb13fb6b933a72bc889e6..006cf6db40c6d32c8a13b45930b1cdbc1d8f4c3e 100644 --- a/Documentation/driver-api/pm/cpuidle.rst +++ b/Documentation/driver-api/pm/cpuidle.rst @@ -1,3 +1,6 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + .. |struct cpuidle_governor| replace:: :c:type:`struct cpuidle_governor ` .. |struct cpuidle_device| replace:: :c:type:`struct cpuidle_device ` .. |struct cpuidle_driver| replace:: :c:type:`struct cpuidle_driver ` @@ -7,9 +10,9 @@ CPU Idle Time Management ======================== -:: +:Copyright: |copy| 2019 Intel Corporation - Copyright (c) 2019 Intel Corp., Rafael J. Wysocki +:Author: Rafael J. Wysocki CPU Idle Time Management Subsystem diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index 090c151aa86bd4f7728c7e72b2368c3c4cb0aeb6..30835683616a68b008eaa49968329d60fe368fe0 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -1,3 +1,6 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + .. |struct dev_pm_ops| replace:: :c:type:`struct dev_pm_ops ` .. |struct dev_pm_domain| replace:: :c:type:`struct dev_pm_domain ` .. |struct bus_type| replace:: :c:type:`struct bus_type ` @@ -12,11 +15,12 @@ Device Power Management Basics ============================== -:: +:Copyright: |copy| 2010-2011 Rafael J. Wysocki , Novell Inc. +:Copyright: |copy| 2010 Alan Stern +:Copyright: |copy| 2016 Intel Corporation + +:Author: Rafael J. Wysocki - Copyright (c) 2010-2011 Rafael J. Wysocki , Novell Inc. - Copyright (c) 2010 Alan Stern - Copyright (c) 2016 Intel Corp., Rafael J. Wysocki Most of the code in Linux is device drivers, so most of the Linux power management (PM) code is also driver-specific. Most drivers will do very diff --git a/Documentation/driver-api/pm/index.rst b/Documentation/driver-api/pm/index.rst index 56975c6bc78952d2e250a55542954a965df36413..c2a9ef8d115ceb91a646c6b9e4f41966b22116ce 100644 --- a/Documentation/driver-api/pm/index.rst +++ b/Documentation/driver-api/pm/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + =============================== CPU and Device Power Management =============================== diff --git a/Documentation/driver-api/pm/notifiers.rst b/Documentation/driver-api/pm/notifiers.rst index 62f860026992dc523f9e6e0c4a85fe85abc6b2d5..186435c43b77e59fb1d9c0676682885833e67cb8 100644 --- a/Documentation/driver-api/pm/notifiers.rst +++ b/Documentation/driver-api/pm/notifiers.rst @@ -1,10 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. 
include:: + ============================= Suspend/Hibernation Notifiers ============================= -:: +:Copyright: |copy| 2016 Intel Corporation + +:Author: Rafael J. Wysocki - Copyright (c) 2016 Intel Corp., Rafael J. Wysocki There are some operations that subsystems or drivers may want to carry out before hibernation/suspend or after restore/resume, but they require the system diff --git a/Documentation/driver-api/pm/types.rst b/Documentation/driver-api/pm/types.rst index 3ebdecc5410433bfffc1d17614e69600fd47e7f6..73a231caf764386d73c42073894e010b745a927b 100644 --- a/Documentation/driver-api/pm/types.rst +++ b/Documentation/driver-api/pm/types.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ================================== Device Power Management Data Types ================================== diff --git a/Documentation/driver-api/usb/power-management.rst b/Documentation/driver-api/usb/power-management.rst index 79beb807996b7a3a17e08b5f1d6e31d4176d3fc2..4a74cf6f2797274b96510685a44f4066fac558d3 100644 --- a/Documentation/driver-api/usb/power-management.rst +++ b/Documentation/driver-api/usb/power-management.rst @@ -370,11 +370,15 @@ autosuspend the interface's device. When the usage counter is = 0 then the interface is considered to be idle, and the kernel may autosuspend the device. -Drivers need not be concerned about balancing changes to the usage -counter; the USB core will undo any remaining "get"s when a driver -is unbound from its interface. As a corollary, drivers must not call -any of the ``usb_autopm_*`` functions after their ``disconnect`` -routine has returned. +Drivers must be careful to balance their overall changes to the usage +counter. Unbalanced "get"s will remain in effect when a driver is +unbound from its interface, preventing the device from going into +runtime suspend should the interface be bound to a driver again. On +the other hand, drivers are allowed to achieve this balance by calling +the ``usb_autopm_*`` functions even after their ``disconnect`` routine +has returned -- say from within a work-queue routine -- provided they +retain an active reference to the interface (via ``usb_get_intf`` and +``usb_put_intf``). Drivers using the async routines are responsible for their own synchronization and mutual exclusion. diff --git a/Documentation/features/time/modern-timekeeping/arch-support.txt b/Documentation/features/time/modern-timekeeping/arch-support.txt index 2855dfe2464d4a3408c60976ef0bdc9e79b4e050..1d46da165b75e127a0842e64b9febd7b2b82c31e 100644 --- a/Documentation/features/time/modern-timekeeping/arch-support.txt +++ b/Documentation/features/time/modern-timekeeping/arch-support.txt @@ -15,7 +15,7 @@ | h8300: | ok | | hexagon: | ok | | ia64: | ok | - | m68k: | TODO | + | m68k: | ok | | microblaze: | ok | | mips: | ok | | nds32: | ok | diff --git a/Documentation/acpi/DSD-properties-rules.txt b/Documentation/firmware-guide/acpi/DSD-properties-rules.rst similarity index 88% rename from Documentation/acpi/DSD-properties-rules.txt rename to Documentation/firmware-guide/acpi/DSD-properties-rules.rst index 3e4862bdad98cc792831bf3419a6b19910d59e26..4306f29b6103bd16f343274c8063079d398a86d8 100644 --- a/Documentation/acpi/DSD-properties-rules.txt +++ b/Documentation/firmware-guide/acpi/DSD-properties-rules.rst @@ -1,8 +1,11 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +================================== _DSD Device Properties Usage Rules ----------------------------------- +================================== Properties, Property Sets and Property Subsets ----------------------------------------------- +============================================== The _DSD (Device Specific Data) configuration object, introduced in ACPI 5.1, allows any type of device configuration data to be provided via the ACPI @@ -18,7 +21,7 @@ specific type) associated with it. In the ACPI _DSD context it is an element of the sub-package following the generic Device Properties UUID in the _DSD return package as specified in the -Device Properties UUID definition document [1]. +Device Properties UUID definition document [1]_. It also may be regarded as the definition of a key and the associated data type that can be returned by _DSD in the Device Properties UUID sub-package for a @@ -33,14 +36,14 @@ Property subsets are nested collections of properties. Each of them is associated with an additional key (name) allowing the subset to be referred to as a whole (and to be treated as a separate entity). The canonical representation of property subsets is via the mechanism specified in the -Hierarchical Properties Extension UUID definition document [2]. +Hierarchical Properties Extension UUID definition document [2]_. Property sets may be hierarchical. That is, a property set may contain multiple property subsets that each may contain property subsets of its own and so on. General Validity Rule for Property Sets ---------------------------------------- +======================================= Valid property sets must follow the guidance given by the Device Properties UUID definition document [1]. @@ -73,7 +76,7 @@ suitable for the ACPI environment and consequently they cannot belong to a valid property set. Property Sets and Device Tree Bindings --------------------------------------- +====================================== It often is useful to make _DSD return property sets that follow Device Tree bindings. @@ -91,7 +94,7 @@ expected to automatically work in the ACPI environment regardless of their contents. References ----------- +========== -[1] http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf -[2] http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf +.. [1] http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf +.. [2] http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf diff --git a/Documentation/acpi/acpi-lid.txt b/Documentation/firmware-guide/acpi/acpi-lid.rst similarity index 86% rename from Documentation/acpi/acpi-lid.txt rename to Documentation/firmware-guide/acpi/acpi-lid.rst index effe7af3a5af95d25f86efadaa7dd2b127a0dea9..874ce0ed340d8cfec5c772809747a0cb4d7c4dac 100644 --- a/Documentation/acpi/acpi-lid.txt +++ b/Documentation/firmware-guide/acpi/acpi-lid.rst @@ -1,13 +1,18 @@ -Special Usage Model of the ACPI Control Method Lid Device +.. SPDX-License-Identifier: GPL-2.0 +.. include:: -Copyright (C) 2016, Intel Corporation -Author: Lv Zheng +========================================================= +Special Usage Model of the ACPI Control Method Lid Device +========================================================= +:Copyright: |copy| 2016, Intel Corporation -Abstract: +:Author: Lv Zheng -Platforms containing lids convey lid state (open/close) to OSPMs using a -control method lid device. 
To implement this, the AML tables issue Notify(lid_device, 0x80) to notify the OSPMs whenever the lid state has changed. The _LID control method for the lid device must be implemented to report the "current" state of the lid as either "opened" or "closed". @@ -19,7 +24,8 @@ taken into account. This document describes the restrictions and the expectations of the Linux ACPI lid device driver. -1. Restrictions of the returning value of the _LID control method +Restrictions of the returning value of the _LID control method +============================================================== The _LID control method is described to return the "current" lid state. However, the word "current" is ambiguous; some buggy AML tables return @@ -30,7 +36,8 @@ initial returning value. When the AML tables implement this control method with cached value, the initial returning value is likely not reliable. There are platforms that always return "closed" as the initial lid state. -2. Restrictions of the lid state change notifications +Restrictions of the lid state change notifications +================================================== There are buggy AML tables that never notify when the lid device state is changed to "opened". Thus the "opened" notification is not guaranteed. But @@ -39,18 +46,22 @@ state is changed to "closed". The "closed" notification is normally used to trigger some system power saving operations on Windows. Since it is fully tested, it is reliable from all AML tables. -3. Expections for the userspace users of the ACPI lid device driver +Expectations for the userspace users of the ACPI lid device driver +================================================================== The ACPI button driver exports the lid state to the userspace via the -following file: +following file:: + /proc/acpi/button/lid/LID0/state + This file actually calls the _LID control method described above. And given the previous explanation, it is not reliable enough on some platforms. So it is advised that the userspace program not rely solely on this file to determine the actual lid state. The ACPI button driver emits the following input event to the userspace: - SW_LID + * SW_LID + The ACPI lid device driver is implemented to try to deliver the platform triggered events to the userspace. However, given the fact that the buggy firmware cannot make sure "opened"/"closed" events are paired, the ACPI @@ -59,20 +70,25 @@ button driver uses the following 3 modes in order not to trigger issues. If the userspace hasn't been prepared to ignore the unreliable "opened" events and the unreliable initial state notification, Linux users can use the following kernel parameters to handle the possible issues: + A. button.lid_init_state=method: When this option is specified, the ACPI button driver reports the initial lid state using the returning value of the _LID control method and whether the "opened"/"closed" events are paired fully relies on the firmware implementation. + This option can be used to fix some platforms where the returning value of the _LID control method is reliable but the initial lid state notification is missing. + This option is the default behavior during the period the userspace isn't ready to handle the buggy AML tables. + +B. 
button.lid_init_state=open: When this option is specified, the ACPI button driver always reports the initial lid state as "opened" and whether the "opened"/"closed" events are paired fully relies on the firmware implementation. + This may fix some platforms where the returning value of the _LID control method is not reliable and the initial lid state notification is missing. @@ -80,6 +96,7 @@ B. button.lid_init_state=open: If the userspace has been prepared to ignore the unreliable "opened" events and the unreliable initial state notification, Linux users should always use the following kernel parameter: + C. button.lid_init_state=ignore: When this option is specified, the ACPI button driver never reports the initial lid state and there is a compensation mechanism implemented to @@ -89,6 +106,7 @@ C. button.lid_init_state=ignore: notifications can be delivered to the userspace when the lid actually opens given that some AML tables do not send "opened" notifications reliably. + In this mode, if everything is correctly implemented by the platform firmware, the old userspace programs should still work. Otherwise, the new userspace programs are required to work with the ACPI button driver. diff --git a/Documentation/firmware-guide/acpi/aml-debugger.rst new file mode 100644 index 0000000000000000000000000000000000000000..a889d43bc6c5e3e5c50b1a2ab84a46f154352a4f --- /dev/null +++ b/Documentation/firmware-guide/acpi/aml-debugger.rst @@ -0,0 +1,75 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: <isonum.txt> + +================ +The AML Debugger +================ + +:Copyright: |copy| 2016, Intel Corporation +:Author: Lv Zheng + + +This document describes the usage of the AML debugger embedded in the Linux +kernel. + +1. Build the debugger +===================== + +The following kernel configuration items are required to enable the AML +debugger interface from the Linux kernel:: + + CONFIG_ACPI_DEBUGGER=y + CONFIG_ACPI_DEBUGGER_USER=m + +The userspace utilities can be built from the kernel source tree using +the following commands:: + + $ cd tools + $ make acpi + +The resultant userspace tool binary is then located at:: + + tools/power/acpi/acpidbg + +It can be installed to system directories by running "make install" (as a +sufficiently privileged user). + +2. Start the userspace debugger interface +========================================= + +After booting the kernel with the debugger built-in, the debugger can be +started by using the following commands:: + + # mount -t debugfs none /sys/kernel/debug + # modprobe acpi_dbg + # tools/power/acpi/acpidbg + +That spawns the interactive AML debugger environment where you can execute +debugger commands. + +The commands are documented in the "ACPICA Overview and Programmer Reference" +that can be downloaded from + +https://acpica.org/documentation + +The detailed debugger commands reference is located in Chapter 12 "ACPICA +Debugger Reference". The "help" command can be used for a quick reference. + +3. Stop the userspace debugger interface +======================================== + +The interactive debugger interface can be closed by pressing Ctrl+C or using +the "quit" or "exit" commands. When finished, unload the module with:: + + # rmmod acpi_dbg + +The module unloading may fail if there is an acpidbg instance running. + +4. Run the debugger in a script +=============================== + +It may be useful to run the AML debugger in a test script. "acpidbg" supports +this in a special "batch" mode. 
For example, the following command outputs +the entire ACPI namespace:: + + # acpidbg -b "namespace" diff --git a/Documentation/acpi/apei/einj.txt b/Documentation/firmware-guide/acpi/apei/einj.rst similarity index 67% rename from Documentation/acpi/apei/einj.txt rename to Documentation/firmware-guide/acpi/apei/einj.rst index e550c8b98139974343de5e4890c94f642f0a53d3..e588bccf5158370fb03dfe4db06b20ee93114108 100644 --- a/Documentation/acpi/apei/einj.txt +++ b/Documentation/firmware-guide/acpi/apei/einj.rst @@ -1,13 +1,16 @@ - APEI Error INJection - ~~~~~~~~~~~~~~~~~~~~ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +APEI Error INJection +==================== EINJ provides a hardware error injection mechanism. It is very useful for debugging and testing APEI and RAS features in general. You need to check whether your BIOS supports EINJ first. For that, look -for early boot messages similar to this one: +for early boot messages similar to this one:: -ACPI: EINJ 0x000000007370A000 000150 (v01 INTEL 00000001 INTL 00000001) + ACPI: EINJ 0x000000007370A000 000150 (v01 INTEL 00000001 INTL 00000001) which shows that the BIOS is exposing an EINJ table - it is the mechanism through which the injection is done. @@ -23,11 +26,11 @@ order to see the APEI,EINJ,... functionality supported and exposed by the BIOS menu. To use EINJ, make sure the following are options enabled in your kernel -configuration: +configuration:: -CONFIG_DEBUG_FS -CONFIG_ACPI_APEI -CONFIG_ACPI_APEI_EINJ + CONFIG_DEBUG_FS + CONFIG_ACPI_APEI + CONFIG_ACPI_APEI_EINJ The EINJ user interface is in /apei/einj. @@ -37,20 +40,22 @@ The following files belong to it: This file shows which error types are supported: + ================ =================================== Error Type Value Error Description - ================ ================= - 0x00000001 Processor Correctable - 0x00000002 Processor Uncorrectable non-fatal - 0x00000004 Processor Uncorrectable fatal - 0x00000008 Memory Correctable - 0x00000010 Memory Uncorrectable non-fatal - 0x00000020 Memory Uncorrectable fatal - 0x00000040 PCI Express Correctable - 0x00000080 PCI Express Uncorrectable fatal - 0x00000100 PCI Express Uncorrectable non-fatal - 0x00000200 Platform Correctable - 0x00000400 Platform Uncorrectable non-fatal - 0x00000800 Platform Uncorrectable fatal + ================ =================================== + 0x00000001 Processor Correctable + 0x00000002 Processor Uncorrectable non-fatal + 0x00000004 Processor Uncorrectable fatal + 0x00000008 Memory Correctable + 0x00000010 Memory Uncorrectable non-fatal + 0x00000020 Memory Uncorrectable fatal + 0x00000040 PCI Express Correctable + 0x00000080 PCI Express Uncorrectable fatal + 0x00000100 PCI Express Uncorrectable non-fatal + 0x00000200 Platform Correctable + 0x00000400 Platform Uncorrectable non-fatal + 0x00000800 Platform Uncorrectable fatal + ================ =================================== The format of the file contents are as above, except present are only the available error types. @@ -73,9 +78,12 @@ The following files belong to it: injection. Value is a bitmask as specified in ACPI5.0 spec for the SET_ERROR_TYPE_WITH_ADDRESS data structure: - Bit 0 - Processor APIC field valid (see param3 below). - Bit 1 - Memory address and mask valid (param1 and param2). - Bit 2 - PCIe (seg,bus,dev,fn) valid (see param4 below). + Bit 0 + Processor APIC field valid (see param3 below). + Bit 1 + Memory address and mask valid (param1 and param2). + Bit 2 + PCIe (seg,bus,dev,fn) valid (see param4 below). 
If set to zero, legacy behavior is mimicked where the type of injection specifies just one bit set, and param1 is multiplexed. @@ -121,7 +129,7 @@ BIOS versions based on the ACPI 5.0 specification have more control over the target of the injection. For processor-related errors (type 0x1, 0x2 and 0x4), you can set flags to 0x3 (param3 for bit 0, and param1 and param2 for bit 1) so that you have more information added to the error -signature being injected. The actual data passed is this: +signature being injected. The actual data passed is this:: memory_address = param1; memory_address_range = param2; @@ -131,7 +139,7 @@ signature being injected. The actual data passed is this: For memory errors (type 0x8, 0x10 and 0x20) the address is set using param1 with a mask in param2 (0x0 is equivalent to all ones). For PCI express errors (type 0x40, 0x80 and 0x100) the segment, bus, device and -function are specified using param1: +function are specified using param1:: 31 24 23 16 15 11 10 8 7 0 +-------------------------------------------------+ @@ -152,26 +160,26 @@ documentation for details (and expect changes to this API if vendors creativity in using this feature expands beyond our expectations). -An error injection example: +An error injection example:: -# cd /sys/kernel/debug/apei/einj -# cat available_error_type # See which errors can be injected -0x00000002 Processor Uncorrectable non-fatal -0x00000008 Memory Correctable -0x00000010 Memory Uncorrectable non-fatal -# echo 0x12345000 > param1 # Set memory address for injection -# echo $((-1 << 12)) > param2 # Mask 0xfffffffffffff000 - anywhere in this page -# echo 0x8 > error_type # Choose correctable memory error -# echo 1 > error_inject # Inject now + # cd /sys/kernel/debug/apei/einj + # cat available_error_type # See which errors can be injected + 0x00000002 Processor Uncorrectable non-fatal + 0x00000008 Memory Correctable + 0x00000010 Memory Uncorrectable non-fatal + # echo 0x12345000 > param1 # Set memory address for injection + # echo $((-1 << 12)) > param2 # Mask 0xfffffffffffff000 - anywhere in this page + # echo 0x8 > error_type # Choose correctable memory error + # echo 1 > error_inject # Inject now -You should see something like this in dmesg: +You should see something like this in dmesg:: -[22715.830801] EDAC sbridge MC3: HANDLING MCE MEMORY ERROR -[22715.834759] EDAC sbridge MC3: CPU 0: Machine Check Event: 0 Bank 7: 8c00004000010090 -[22715.834759] EDAC sbridge MC3: TSC 0 -[22715.834759] EDAC sbridge MC3: ADDR 12345000 EDAC sbridge MC3: MISC 144780c86 -[22715.834759] EDAC sbridge MC3: PROCESSOR 0:306e7 TIME 1422553404 SOCKET 0 APIC 0 -[22716.616173] EDAC MC3: 1 CE memory read error on CPU_SrcID#0_Channel#0_DIMM#0 (channel:0 slot:0 page:0x12345 offset:0x0 grain:32 syndrome:0x0 - area:DRAM err_code:0001:0090 socket:0 channel_mask:1 rank:0) + [22715.830801] EDAC sbridge MC3: HANDLING MCE MEMORY ERROR + [22715.834759] EDAC sbridge MC3: CPU 0: Machine Check Event: 0 Bank 7: 8c00004000010090 + [22715.834759] EDAC sbridge MC3: TSC 0 + [22715.834759] EDAC sbridge MC3: ADDR 12345000 EDAC sbridge MC3: MISC 144780c86 + [22715.834759] EDAC sbridge MC3: PROCESSOR 0:306e7 TIME 1422553404 SOCKET 0 APIC 0 + [22716.616173] EDAC MC3: 1 CE memory read error on CPU_SrcID#0_Channel#0_DIMM#0 (channel:0 slot:0 page:0x12345 offset:0x0 grain:32 syndrome:0x0 - area:DRAM err_code:0001:0090 socket:0 channel_mask:1 rank:0) For more information about EINJ, please refer to ACPI specification version 4.0, section 17.5 and ACPI 5.0, section 18.6. 
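As a further worked sketch of the param1 layout shown above (the bus, device and function values here are made up purely for illustration), targeting segment 0, bus 0x01, device 0x00, function 0x0 puts 0x01 into bits 23-16 of param1, i.e. param1 = 0x00010000::

  # cd /sys/kernel/debug/apei/einj
  # echo 0x4 > flags            # bit 2: param1 carries a PCIe seg/bus/dev/fn
  # echo 0x00010000 > param1    # segment 0, bus 0x01, device 0x00, function 0x0
  # echo 0x40 > error_type      # PCI Express Correctable
  # echo 1 > error_inject       # inject now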
diff --git a/Documentation/firmware-guide/acpi/apei/output_format.rst new file mode 100644 index 0000000000000000000000000000000000000000..c2e7ebddb5298ad11209446b4fd268db1af74304 --- /dev/null +++ b/Documentation/firmware-guide/acpi/apei/output_format.rst @@ -0,0 +1,150 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==================
+APEI output format
+==================
+
+APEI uses printk as hardware error reporting interface, the output
+format is as follow::
+
+  <error record> :=
+  APEI generic hardware error status
+  severity: <integer>, <severity string>
+  section: <integer>, severity: <integer>, <severity string>
+  flags: <integer>
+  <section flags strings>
+  fru_id: <uuid string>
+  fru_text: <string>
+  section_type: <section type string>
+  <section data>
+
+  * <severity string> := recoverable | fatal | corrected | info
+
+  # <section flags strings> :=
+  [primary][, containment warning][, reset][, threshold exceeded]\
+  [, resource not accessible][, latent error]
+
+  <section type string> := generic processor error | memory error | \
+  PCIe error | unknown, <uuid string>
+
+  <section data> :=
+  <generic processor section data> | <memory section data> | \
+  <pcie section data> | <null>
+
+  <generic processor section data> :=
+  [processor_type: <integer>, <proc type string>]
+  [processor_isa: <integer>, <proc isa string>]
+  [error_type: <integer>
+  <proc error type strings>]
+  [operation: <integer>, <proc operation string>]
+  [flags: <integer>
+  <proc flags strings>]
+  [level: <integer>]
+  [version_info: <integer>]
+  [processor_id: <integer>]
+  [target_address: <integer>]
+  [requestor_id: <integer>]
+  [responder_id: <integer>]
+  [IP: <integer>]
+
+  * <proc type string> := IA32/X64 | IA64
+
+  * <proc isa string> := IA32 | IA64 | X64
+
+  # <proc error type strings> :=
+  [cache error][, TLB error][, bus error][, micro-architectural error]
+
+  * <proc operation string> := unknown or generic | data read | data write | \
+  instruction execution
+
+  # <proc flags strings> :=
+  [restartable][, precise IP][, overflow][, corrected]
+
+  <memory section data> :=
+  [error_status: <integer>]
+  [physical_address: <integer>]
+  [physical_address_mask: <integer>]
+  [node: <integer>]
+  [card: <integer>]
+  [module: <integer>]
+  [bank: <integer>]
+  [device: <integer>]
+  [row: <integer>]
+  [column: <integer>]
+  [bit_position: <integer>]
+  [requestor_id: <integer>]
+  [responder_id: <integer>]
+  [target_id: <integer>]
+  [error_type: <integer>, <mem error type string>]
+
+  * <mem error type string> :=
+  unknown | no error | single-bit ECC | multi-bit ECC | \
+  single-symbol chipkill ECC | multi-symbol chipkill ECC | master abort | \
+  target abort | parity error | watchdog timeout | invalid address | \
+  mirror Broken | memory sparing | scrub corrected error | \
+  scrub uncorrected error
+
+  <pcie section data> :=
+  [port_type: <integer>, <pcie port type string>]
+  [version: <integer>.<integer>]
+  [command: <integer>, status: <integer>]
+  [device_id: <segment>:<bus>:<device>.<function>
+  slot: <integer>
+  secondary_bus: <integer>
+  vendor_id: <integer>, device_id: <integer>
+  class_code: <integer>]
+  [serial number: <integer>, <integer>]
+  [bridge: secondary_status: <integer>, control: <integer>]
+  [aer_status: <integer>, aer_mask: <integer>
+  <aer status strings>
+  [aer_uncor_severity: <integer>]
+  aer_layer=<aer layer string>, aer_agent=<aer agent string>
+  aer_tlp_header: <integer> <integer> <integer> <integer>]
+
+  * <pcie port type string> := PCIe end point | legacy PCI end point | \
+  unknown | unknown | root port | upstream switch port | \
+  downstream switch port | PCIe to PCI/PCI-X bridge | \
+  PCI/PCI-X to PCIe bridge | root complex integrated endpoint device | \
+  root complex event collector
+
+  if section severity is fatal or recoverable
+  # <aer status strings> :=
+  unknown | unknown | unknown | unknown | Data Link Protocol | \
+  unknown | unknown | unknown | unknown | unknown | unknown | unknown | \
+  Poisoned TLP | Flow Control Protocol | Completion Timeout | \
+  Completer Abort | Unexpected Completion | Receiver Overflow | \
+  Malformed TLP | ECRC | Unsupported Request
+  else
+  # <aer status strings> :=
+  Receiver Error | unknown | unknown | unknown | unknown | unknown | \
+  Bad TLP | Bad DLLP | RELAY_NUM Rollover | unknown | unknown | unknown | \
+  Replay Timer Timeout | Advisory Non-Fatal
+  fi
+
+  <aer layer string> :=
+  Physical Layer | Data Link Layer | Transaction Layer
+
+  <aer agent string> :=
+  Receiver ID | Requester ID | Completer ID | Transmitter ID
+
+Where, [] designate corresponding content is optional
+
+All description with * has the following format::
+
+  field: <integer>, <string>
+
+Where the value of <integer> should be the position of "string" in the
+description; otherwise, <string> will be "unknown".
+
+All description with # has the following format::
+
+  field: <integer>
+  <strings>
+
+Where each string in <strings> corresponds to one set bit of
+<integer>. The bit position is the position of "string" in the
+description.
+
+For more detailed explanation of every field, please refer to UEFI
+specification version 2.3 or later, section Appendix N: Common
+Platform Error Record. diff --git a/Documentation/acpi/debug.txt b/Documentation/firmware-guide/acpi/debug.rst similarity index 91% rename from Documentation/acpi/debug.txt rename to Documentation/firmware-guide/acpi/debug.rst index 65bf47c46b6dac88d00188ea13e23053c75faac8..1a152dd1d7654ae8139caf69875291e06bc820f0 100644 --- a/Documentation/acpi/debug.txt +++ b/Documentation/firmware-guide/acpi/debug.rst @@ -1,18 +1,21 @@ - ACPI Debug Output +.. SPDX-License-Identifier: GPL-2.0 +================= +ACPI Debug Output +================= The ACPI CA, the Linux ACPI core, and some ACPI drivers can generate debug output. 
This document describes how to use this facility. Compile-time configuration --------------------------- +========================== ACPI debug output is globally enabled by CONFIG_ACPI_DEBUG. If this config option is turned off, the debug messages are not even built into the kernel. Boot- and run-time configuration --------------------------------- +================================ When CONFIG_ACPI_DEBUG=y, you can select the component and level of messages you're interested in. At boot-time, use the acpi.debug_layer and @@ -21,7 +24,7 @@ debug_layer and debug_level files in /sys/module/acpi/parameters/ to control the debug messages. debug_layer (component) ------------------------ +======================= The "debug_layer" is a mask that selects components of interest, e.g., a specific driver or part of the ACPI interpreter. To build the debug_layer @@ -33,7 +36,7 @@ to /sys/module/acpi/parameters/debug_layer. The possible components are defined in include/acpi/acoutput.h and include/acpi/acpi_drivers.h. Reading /sys/module/acpi/parameters/debug_layer -shows the supported mask values, currently these: +shows the supported mask values, currently these:: ACPI_UTILITIES 0x00000001 ACPI_HARDWARE 0x00000002 @@ -65,7 +68,7 @@ shows the supported mask values, currently these: ACPI_PROCESSOR_COMPONENT 0x20000000 debug_level ------------ +=========== The "debug_level" is a mask that selects different types of messages, e.g., those related to initialization, method execution, informational messages, etc. @@ -81,7 +84,7 @@ to /sys/module/acpi/parameters/debug_level. The possible levels are defined in include/acpi/acoutput.h. Reading /sys/module/acpi/parameters/debug_level shows the supported mask values, -currently these: +currently these:: ACPI_LV_INIT 0x00000001 ACPI_LV_DEBUG_OBJECT 0x00000002 @@ -113,9 +116,9 @@ currently these: ACPI_LV_EVENTS 0x80000000 Examples --------- +======== -For example, drivers/acpi/bus.c contains this: +For example, drivers/acpi/bus.c contains this:: #define _COMPONENT ACPI_BUS_COMPONENT ... @@ -127,22 +130,22 @@ statement uses ACPI_DB_INFO, which is macro based on the ACPI_LV_INFO definition.) Enable all AML "Debug" output (stores to the Debug object while interpreting -AML) during boot: +AML) during boot:: acpi.debug_layer=0xffffffff acpi.debug_level=0x2 -Enable PCI and PCI interrupt routing debug messages: +Enable PCI and PCI interrupt routing debug messages:: acpi.debug_layer=0x400000 acpi.debug_level=0x4 -Enable all ACPI hardware-related messages: +Enable all ACPI hardware-related messages:: acpi.debug_layer=0x2 acpi.debug_level=0xffffffff -Enable all ACPI_DB_INFO messages after boot: +Enable all ACPI_DB_INFO messages after boot:: # echo 0x4 > /sys/module/acpi/parameters/debug_level -Show all valid component values: +Show all valid component values:: # cat /sys/module/acpi/parameters/debug_layer diff --git a/Documentation/acpi/dsd/data-node-references.txt b/Documentation/firmware-guide/acpi/dsd/data-node-references.rst similarity index 71% rename from Documentation/acpi/dsd/data-node-references.txt rename to Documentation/firmware-guide/acpi/dsd/data-node-references.rst index c3871565c8cfbf90d6118a6e76e2d2b689a45845..1351984e767c8c69dda03cdd968f61c0d0452ff8 100644 --- a/Documentation/acpi/dsd/data-node-references.txt +++ b/Documentation/firmware-guide/acpi/dsd/data-node-references.rst @@ -1,9 +1,12 @@ -Copyright (C) 2018 Intel Corporation -Author: Sakari Ailus - +.. SPDX-License-Identifier: GPL-2.0 +.. 
include:: <isonum.txt> +=================================== Referencing hierarchical data nodes ------------------------------------ +=================================== + +:Copyright: |copy| 2018 Intel Corporation +:Author: Sakari Ailus ACPI in general allows referring to device objects in the tree only. Hierarchical data extension nodes may not be referred to directly, hence this @@ -28,13 +31,14 @@ extension key. Example ------- +======= - In the ASL snippet below, the "reference" _DSD property [2] contains a - device object reference to DEV0 and under that device object, a - hierarchical data extension key "node@1" referring to the NOD1 object - and lastly, a hierarchical data extension key "anothernode" referring to - the ANOD object which is also the final target node of the reference. +In the ASL snippet below, the "reference" _DSD property [2] contains a +device object reference to DEV0 and under that device object, a +hierarchical data extension key "node@1" referring to the NOD1 object +and lastly, a hierarchical data extension key "anothernode" referring to +the ANOD object which is also the final target node of the reference. +:: Device (DEV0) { @@ -75,15 +79,15 @@ Example }) } -Please also see a graph example in graph.txt . +Please also see a graph example in :doc:`graph`. References ----------- +========== [1] Hierarchical Data Extension UUID For _DSD. - <http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf>, - referenced 2018-07-17. +http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf, +referenced 2018-07-17. [2] Device Properties UUID For _DSD. - <http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf>, - referenced 2016-10-04. +http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf, +referenced 2016-10-04. diff --git a/Documentation/acpi/dsd/graph.txt b/Documentation/firmware-guide/acpi/dsd/graph.rst similarity index 56% rename from Documentation/acpi/dsd/graph.txt rename to Documentation/firmware-guide/acpi/dsd/graph.rst index b9ce910781dcd09b9f308dac6218a0b3b27da7ed..e0baed35b037ed06ec5f78e1a7b2e1d5b09e53c1 100644 --- a/Documentation/acpi/dsd/graph.txt +++ b/Documentation/firmware-guide/acpi/dsd/graph.rst @@ -1,8 +1,11 @@ -Graphs +.. SPDX-License-Identifier: GPL-2.0 +====== +Graphs +====== _DSD ----- +==== _DSD (Device Specific Data) [7] is a predefined ACPI device configuration object that can be used to convey information on @@ -30,7 +33,7 @@ hierarchical data extension array on each depth. Ports and endpoints -------------------- +=================== The port and endpoint concepts are very similar to those in Devicetree [3]. A port represents an interface in a device, and an endpoint @@ -38,9 +41,9 @@ represents a connection to that interface. All port nodes are located under the device's "_DSD" node in the hierarchical data extension tree. The data extension related to each port node must begin -with "port" and must be followed by the "@" character and the number of the port -as its key. The target object it refers to should be called "PRTX", where "X" is -the number of the port. An example of such a package would be: +with "port" and must be followed by the "@" character and the number of the +port as its key. The target object it refers to should be called "PRTX", where +"X" is the number of the port. An example of such a package would be:: Package() { "port@4", PRT4 } @@ -49,7 +52,7 @@ data extension key of the endpoint nodes must begin with "endpoint" and must be followed by the "@" character and the number of the endpoint. The object it refers to should be called "EPXY", where "X" is the number of the port and "Y" is the number of the endpoint. An example of such a -package would be: +package would be:: Package() { "endpoint@0", EP40 } @@ -62,85 +65,85 @@ of that port shall be zero. 
Similarly, if a port may only have a single endpoint, the number of that endpoint shall be zero. The endpoint reference uses property extension with "remote-endpoint" property -name followed by a reference in the same package. Such references consist of the +name followed by a reference in the same package. Such references consist of the remote device reference, the first package entry of the port data extension reference under the device and finally the first package entry of the endpoint -data extension reference under the port. Individual references thus appear as: +data extension reference under the port. Individual references thus appear as:: Package() { device, "port@X", "endpoint@Y" } -In the above example, "X" is the number of the port and "Y" is the number of the -endpoint. +In the above example, "X" is the number of the port and "Y" is the number of +the endpoint. The references to endpoints must be always done both ways, to the remote endpoint and back from the referred remote endpoint node. -A simple example of this is show below: +A simple example of this is show below:: Scope (\_SB.PCI0.I2C2) { - Device (CAM0) - { - Name (_DSD, Package () { - ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), - Package () { - Package () { "compatible", Package () { "nokia,smia" } }, - }, - ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), - Package () { - Package () { "port@0", PRT0 }, - } - }) - Name (PRT0, Package() { - ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), - Package () { - Package () { "reg", 0 }, - }, - ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), - Package () { - Package () { "endpoint@0", EP00 }, - } - }) - Name (EP00, Package() { - ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), - Package () { - Package () { "reg", 0 }, - Package () { "remote-endpoint", Package() { \_SB.PCI0.ISP, "port@4", "endpoint@0" } }, - } - }) - } + Device (CAM0) + { + Name (_DSD, Package () { + ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), + Package () { + Package () { "compatible", Package () { "nokia,smia" } }, + }, + ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), + Package () { + Package () { "port@0", PRT0 }, + } + }) + Name (PRT0, Package() { + ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), + Package () { + Package () { "reg", 0 }, + }, + ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), + Package () { + Package () { "endpoint@0", EP00 }, + } + }) + Name (EP00, Package() { + ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), + Package () { + Package () { "reg", 0 }, + Package () { "remote-endpoint", Package() { \_SB.PCI0.ISP, "port@4", "endpoint@0" } }, + } + }) + } } Scope (\_SB.PCI0) { - Device (ISP) - { - Name (_DSD, Package () { - ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), - Package () { - Package () { "port@4", PRT4 }, - } - }) - - Name (PRT4, Package() { - ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), - Package () { - Package () { "reg", 4 }, /* CSI-2 port number */ - }, - ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), - Package () { - Package () { "endpoint@0", EP40 }, - } - }) - - Name (EP40, Package() { - ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), - Package () { - Package () { "reg", 0 }, - Package () { "remote-endpoint", Package () { \_SB.PCI0.I2C2.CAM0, "port@0", "endpoint@0" } }, - } - }) - } + Device (ISP) + { + Name (_DSD, Package () { + ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), + Package () { + Package () { "port@4", PRT4 }, + } + }) + + Name (PRT4, Package() { + ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), + Package () { + Package () { "reg", 4 }, /* CSI-2 port 
number */ + }, + ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), + Package () { + Package () { "endpoint@0", EP40 }, + } + }) + + Name (EP40, Package() { + ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), + Package () { + Package () { "reg", 0 }, + Package () { "remote-endpoint", Package () { \_SB.PCI0.I2C2.CAM0, "port@0", "endpoint@0" } }, + } + }) + } } Here, the port 0 of the "CAM0" device is connected to the port 4 of @@ -148,27 +151,27 @@ the "ISP" device and vice versa. References ----------- +========== [1] _DSD (Device Specific Data) Implementation Guide. - , + http://www.uefi.org/sites/default/files/resources/_DSD-implementation-guide-toplevel-1_1.htm, referenced 2016-10-03. -[2] Devicetree. , referenced 2016-10-03. +[2] Devicetree. http://www.devicetree.org, referenced 2016-10-03. [3] Documentation/devicetree/bindings/graph.txt [4] Device Properties UUID For _DSD. - , + http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf, referenced 2016-10-04. [5] Hierarchical Data Extension UUID For _DSD. - , + http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf, referenced 2016-10-04. [6] Advanced Configuration and Power Interface Specification. - , + http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf, referenced 2016-10-04. [7] _DSD Device Properties Usage Rules. - Documentation/acpi/DSD-properties-rules.txt + :doc:`../DSD-properties-rules` diff --git a/Documentation/acpi/enumeration.txt b/Documentation/firmware-guide/acpi/enumeration.rst similarity index 84% rename from Documentation/acpi/enumeration.txt rename to Documentation/firmware-guide/acpi/enumeration.rst index 7bcf9c3d9fbe27520f2167e77cba45c294cd9e89..6b32b7be8c85fc270841b12da316d47a15179733 100644 --- a/Documentation/acpi/enumeration.txt +++ b/Documentation/firmware-guide/acpi/enumeration.rst @@ -1,5 +1,9 @@ -ACPI based device enumeration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. SPDX-License-Identifier: GPL-2.0 + +============================= +ACPI Based Device Enumeration +============================= + ACPI 5 introduced a set of new resources (UartTSerialBus, I2cSerialBus, SpiSerialBus, GpioIo and GpioInt) which can be used in enumerating slave devices behind serial bus controllers. @@ -11,12 +15,12 @@ that are accessed through memory-mapped registers. In order to support this and re-use the existing drivers as much as possible we decided to do following: - o Devices that have no bus connector resource are represented as - platform devices. + - Devices that have no bus connector resource are represented as + platform devices. - o Devices behind real busses where there is a connector resource - are represented as struct spi_device or struct i2c_device - (standard UARTs are not busses so there is no struct uart_device). + - Devices behind real busses where there is a connector resource + are represented as struct spi_device or struct i2c_device + (standard UARTs are not busses so there is no struct uart_device). As both ACPI and Device Tree represent a tree of devices (and their resources) this implementation follows the Device Tree way as much as @@ -31,7 +35,8 @@ enumerated from ACPI namespace. This handle can be used to extract other device-specific configuration. There is an example of this below. Platform bus support -~~~~~~~~~~~~~~~~~~~~ +==================== + Since we are using platform devices to represent devices that are not connected to any physical bus we only need to implement a platform driver for the device and add supported ACPI IDs. 
If this same IP-block is used on @@ -39,7 +44,7 @@ some other non-ACPI platform, the driver might work out of the box or needs some minor changes. Adding ACPI support for an existing driver should be pretty -straightforward. Here is the simplest example: +straightforward. Here is the simplest example:: #ifdef CONFIG_ACPI static const struct acpi_device_id mydrv_acpi_match[] = { @@ -61,12 +66,13 @@ configuring GPIOs it can get its ACPI handle and extract this information from ACPI tables. DMA support -~~~~~~~~~~~ +=========== + DMA controllers enumerated via ACPI should be registered in the system to provide generic access to their resources. For example, a driver that would like to be accessible to slave devices via generic API call dma_request_slave_channel() must register itself at the end of the probe -function like this: +function like this:: err = devm_acpi_dma_controller_register(dev, xlate_func, dw); /* Handle the error if it's not a case of !CONFIG_ACPI */ @@ -74,7 +80,7 @@ function like this: and implement custom xlate function if needed (usually acpi_dma_simple_xlate() is enough) which converts the FixedDMA resource provided by struct acpi_dma_spec into the corresponding DMA channel. A piece of code for that case -could look like: +could look like:: #ifdef CONFIG_ACPI struct filter_args { @@ -114,7 +120,7 @@ provided by struct acpi_dma. Clients must call dma_request_slave_channel() with the string parameter that corresponds to a specific FixedDMA resource. By default "tx" means the first entry of the FixedDMA resource array, "rx" means the second entry. The table -below shows a layout: +below shows a layout:: Device (I2C0) { @@ -138,12 +144,13 @@ acpi_dma_request_slave_chan_by_index() directly and therefore choose the specific FixedDMA resource by its index. SPI serial bus support -~~~~~~~~~~~~~~~~~~~~~~ +====================== + Slave devices behind SPI bus have SpiSerialBus resource attached to them. This is extracted automatically by the SPI core and the slave devices are enumerated once spi_register_master() is called by the bus driver. -Here is what the ACPI namespace for a SPI slave might look like: +Here is what the ACPI namespace for a SPI slave might look like:: Device (EEP0) { @@ -163,7 +170,7 @@ Here is what the ACPI namespace for a SPI slave might look like: The SPI device drivers only need to add ACPI IDs in a similar way than with the platform device drivers. Below is an example where we add ACPI support -to at25 SPI eeprom driver (this is meant for the above ACPI snippet): +to at25 SPI eeprom driver (this is meant for the above ACPI snippet):: #ifdef CONFIG_ACPI static const struct acpi_device_id at25_acpi_match[] = { @@ -182,7 +189,7 @@ to at25 SPI eeprom driver (this is meant for the above ACPI snippet): Note that this driver actually needs more information like page size of the eeprom etc. but at the time writing this there is no standard way of -passing those. One idea is to return this in _DSM method like: +passing those. One idea is to return this in _DSM method like:: Device (EEP0) { @@ -202,7 +209,7 @@ passing those. 
One idea is to return this in _DSM method like: } Then the at25 SPI driver can get this configuration by calling _DSM on its -ACPI handle like: +ACPI handle like:: struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL }; struct acpi_object_list input; @@ -220,14 +227,15 @@ ACPI handle like: kfree(output.pointer); I2C serial bus support -~~~~~~~~~~~~~~~~~~~~~~ +====================== + The slaves behind I2C bus controller only need to add the ACPI IDs like with the platform and SPI drivers. The I2C core automatically enumerates any slave devices behind the controller device once the adapter is registered. Below is an example of how to add ACPI support to the existing mpu3050 -input driver: +input driver:: #ifdef CONFIG_ACPI static const struct acpi_device_id mpu3050_acpi_match[] = { @@ -251,56 +259,57 @@ input driver: }; GPIO support -~~~~~~~~~~~~ +============ + ACPI 5 introduced two new resources to describe GPIO connections: GpioIo and GpioInt. These resources can be used to pass GPIO numbers used by the device to the driver. ACPI 5.1 extended this with _DSD (Device Specific Data) which made it possible to name the GPIOs among other things. -For example: +For example:: -Device (DEV) -{ - Method (_CRS, 0, NotSerialized) + Device (DEV) { - Name (SBUF, ResourceTemplate() + Method (_CRS, 0, NotSerialized) { - ... - // Used to power on/off the device - GpioIo (Exclusive, PullDefault, 0x0000, 0x0000, - IoRestrictionOutputOnly, "\\_SB.PCI0.GPI0", - 0x00, ResourceConsumer,,) + Name (SBUF, ResourceTemplate() { - // Pin List - 0x0055 - } + ... + // Used to power on/off the device + GpioIo (Exclusive, PullDefault, 0x0000, 0x0000, + IoRestrictionOutputOnly, "\\_SB.PCI0.GPI0", + 0x00, ResourceConsumer,,) + { + // Pin List + 0x0055 + } + + // Interrupt for the device + GpioInt (Edge, ActiveHigh, ExclusiveAndWake, PullNone, + 0x0000, "\\_SB.PCI0.GPI0", 0x00, ResourceConsumer,,) + { + // Pin list + 0x0058 + } + + ... - // Interrupt for the device - GpioInt (Edge, ActiveHigh, ExclusiveAndWake, PullNone, - 0x0000, "\\_SB.PCI0.GPI0", 0x00, ResourceConsumer,,) - { - // Pin list - 0x0058 } - ... - + Return (SBUF) } - Return (SBUF) - } - - // ACPI 5.1 _DSD used for naming the GPIOs - Name (_DSD, Package () - { - ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), - Package () + // ACPI 5.1 _DSD used for naming the GPIOs + Name (_DSD, Package () { - Package () {"power-gpios", Package() {^DEV, 0, 0, 0 }}, - Package () {"irq-gpios", Package() {^DEV, 1, 0, 0 }}, - } - }) - ... + ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), + Package () + { + Package () {"power-gpios", Package() {^DEV, 0, 0, 0 }}, + Package () {"irq-gpios", Package() {^DEV, 1, 0, 0 }}, + } + }) + ... These GPIO numbers are controller relative and path "\\_SB.PCI0.GPI0" specifies the path to the controller. In order to use these GPIOs in Linux @@ -310,7 +319,7 @@ There is a standard GPIO API for that and is documented in Documentation/gpio/. In the above example we can get the corresponding two GPIO descriptors with -a code like this: +a code like this:: #include ... @@ -334,21 +343,22 @@ See Documentation/acpi/gpio-properties.txt for more information about the _DSD binding related to GPIOs. MFD devices -~~~~~~~~~~~ +=========== + The MFD devices register their children as platform devices. For the child devices there needs to be an ACPI handle that they can use to reference parts of the ACPI namespace that relate to them. In the Linux MFD subsystem we provide two ways: - o The children share the parent ACPI handle. 
- o The MFD cell can specify the ACPI id of the device.
+  - The children share the parent ACPI handle.
+  - The MFD cell can specify the ACPI id of the device.
 
 For the first case, the MFD drivers do not need to do anything. The
 resulting child platform device will have its ACPI_COMPANION() set to point
 to the parent device.
 
 If the ACPI namespace has a device that we can match using an ACPI id or ACPI
-adr, the cell should be set like:
+adr, the cell should be set like::
 
 	static struct mfd_cell_acpi_match my_subdevice_cell_acpi_match = {
 		.pnpid = "XYZ0001",
@@ -366,7 +376,8 @@ the MFD device and if found, that ACPI companion device is bound to the
 resulting child platform device.
 
 Device Tree namespace link device ID
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+====================================
+
 The Device Tree protocol uses device identification based on the "compatible"
 property whose value is a string or an array of strings recognized as device
 identifiers by drivers and the driver core. The set of all those strings may be
@@ -410,6 +421,32 @@ Specifically, the device IDs returned by _HID and preceding PRP0001 in the _CID
 return package will be checked first. Also in that case the bus type the device
 will be enumerated to depends on the device ID returned by _HID.
 
+For example, the following ACPI sample might be used to enumerate an lm75-type
+I2C temperature sensor and match it to the driver using the Device Tree
+namespace link::
+
+	Device (TMP0)
+	{
+		Name (_HID, "PRP0001")
+		Name (_DSD, Package() {
+			ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
+			Package () {
+				Package (2) { "compatible", "ti,tmp75" },
+			}
+		})
+		Method (_CRS, 0, Serialized)
+		{
+			Name (SBUF, ResourceTemplate ()
+			{
+				I2cSerialBusV2 (0x48, ControllerInitiated,
+					400000, AddressingMode7Bit,
+					"\\_SB.PCI0.I2C1", 0x00,
+					ResourceConsumer, , Exclusive,)
+			})
+			Return (SBUF)
+		}
+	}
+
 It is valid to define device objects with a _HID returning PRP0001 and without
 the "compatible" property in the _DSD or a _CID as long as one of their
 ancestors provides a _DSD with a valid "compatible" property. Such device
@@ -423,4 +460,4 @@ the _DSD of the device object itself or the _DSD of its ancestor in the
 Otherwise, the _DSD itself is regarded as invalid and therefore the
 "compatible" property returned by it is meaningless.
 
-Refer to DSD-properties-rules.txt for more information.
+Refer to :doc:`DSD-properties-rules` for more information.
diff --git a/Documentation/acpi/gpio-properties.txt b/Documentation/firmware-guide/acpi/gpio-properties.rst
similarity index 81%
rename from Documentation/acpi/gpio-properties.txt
rename to Documentation/firmware-guide/acpi/gpio-properties.rst
index 88c65cb5bf0a70ae66f4838dcd9be6aad9370127..bb6d74f23ee0827568456621599aabc4c898c713 100644
--- a/Documentation/acpi/gpio-properties.txt
+++ b/Documentation/firmware-guide/acpi/gpio-properties.rst
@@ -1,5 +1,8 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================================
 _DSD Device Properties Related to GPIO
---------------------------------------
+======================================
 
 With the release of ACPI 5.1, the _DSD configuration object finally
 allows names to be given to GPIOs (and other things as well) returned
 via _CRS. Previously, we were only able to use an integer index to find
 the corresponding GPIO, which is pretty error prone (it depends on
 the _CRS output ordering, for example).
With _DSD we can now query GPIOs using a name instead of an integer -index, like the ASL example below shows: +index, like the ASL example below shows:: // Bluetooth device with reset and shutdown GPIOs Device (BTH) @@ -34,15 +37,19 @@ index, like the ASL example below shows: }) } -The format of the supported GPIO property is: +The format of the supported GPIO property is:: Package () { "name", Package () { ref, index, pin, active_low }} - ref - The device that has _CRS containing GpioIo()/GpioInt() resources, - typically this is the device itself (BTH in our case). - index - Index of the GpioIo()/GpioInt() resource in _CRS starting from zero. - pin - Pin in the GpioIo()/GpioInt() resource. Typically this is zero. - active_low - If 1 the GPIO is marked as active_low. +ref + The device that has _CRS containing GpioIo()/GpioInt() resources, + typically this is the device itself (BTH in our case). +index + Index of the GpioIo()/GpioInt() resource in _CRS starting from zero. +pin + Pin in the GpioIo()/GpioInt() resource. Typically this is zero. +active_low + If 1 the GPIO is marked as active_low. Since ACPI GpioIo() resource does not have a field saying whether it is active low or high, the "active_low" argument can be used here. Setting @@ -55,7 +62,7 @@ It is possible to leave holes in the array of GPIOs. This is useful in cases like with SPI host controllers where some chip selects may be implemented as GPIOs and some as native signals. For example a SPI host controller can have chip selects 0 and 2 implemented as GPIOs and 1 as -native: +native:: Package () { "cs-gpios", @@ -67,7 +74,7 @@ native: } Other supported properties --------------------------- +========================== Following Device Tree compatible device properties are also supported by _DSD device properties for GPIO controllers: @@ -78,7 +85,7 @@ _DSD device properties for GPIO controllers: - input - line-name -Example: +Example:: Name (_DSD, Package () { // _DSD Hierarchical Properties Extension UUID @@ -100,7 +107,7 @@ Example: - gpio-line-names -Example: +Example:: Package () { "gpio-line-names", @@ -114,7 +121,7 @@ See Documentation/devicetree/bindings/gpio/gpio.txt for more information about these properties. ACPI GPIO Mappings Provided by Drivers --------------------------------------- +====================================== There are systems in which the ACPI tables do not contain _DSD but provide _CRS with GpioIo()/GpioInt() resources and device drivers still need to work with @@ -139,16 +146,16 @@ line in that resource starting from zero, and the active-low flag for that line, respectively, in analogy with the _DSD GPIO property format specified above. 
For the example Bluetooth device discussed previously the data structures in -question would look like this: +question would look like this:: -static const struct acpi_gpio_params reset_gpio = { 1, 1, false }; -static const struct acpi_gpio_params shutdown_gpio = { 0, 0, false }; + static const struct acpi_gpio_params reset_gpio = { 1, 1, false }; + static const struct acpi_gpio_params shutdown_gpio = { 0, 0, false }; -static const struct acpi_gpio_mapping bluetooth_acpi_gpios[] = { - { "reset-gpios", &reset_gpio, 1 }, - { "shutdown-gpios", &shutdown_gpio, 1 }, - { }, -}; + static const struct acpi_gpio_mapping bluetooth_acpi_gpios[] = { + { "reset-gpios", &reset_gpio, 1 }, + { "shutdown-gpios", &shutdown_gpio, 1 }, + { }, + }; Next, the mapping table needs to be passed as the second argument to acpi_dev_add_driver_gpios() that will register it with the ACPI device object @@ -158,12 +165,12 @@ calling acpi_dev_remove_driver_gpios() on the ACPI device object where that table was previously registered. Using the _CRS fallback ------------------------ +======================= If a device does not have _DSD or the driver does not create ACPI GPIO mapping, the Linux GPIO framework refuses to return any GPIOs. This is because the driver does not know what it actually gets. For example if we -have a device like below: +have a device like below:: Device (BTH) { @@ -177,7 +184,7 @@ have a device like below: }) } -The driver might expect to get the right GPIO when it does: +The driver might expect to get the right GPIO when it does:: desc = gpiod_get(dev, "reset", GPIOD_OUT_LOW); @@ -193,22 +200,25 @@ the ACPI GPIO mapping tables are hardly linked to ACPI ID and certain objects, as listed in the above chapter, of the device in question. Getting GPIO descriptor ------------------------ +======================= + +There are two main approaches to get GPIO resource from ACPI:: -There are two main approaches to get GPIO resource from ACPI: - desc = gpiod_get(dev, connection_id, flags); - desc = gpiod_get_index(dev, connection_id, index, flags); + desc = gpiod_get(dev, connection_id, flags); + desc = gpiod_get_index(dev, connection_id, index, flags); We may consider two different cases here, i.e. when connection ID is provided and otherwise. -Case 1: - desc = gpiod_get(dev, "non-null-connection-id", flags); - desc = gpiod_get_index(dev, "non-null-connection-id", index, flags); +Case 1:: + + desc = gpiod_get(dev, "non-null-connection-id", flags); + desc = gpiod_get_index(dev, "non-null-connection-id", index, flags); + +Case 2:: -Case 2: - desc = gpiod_get(dev, NULL, flags); - desc = gpiod_get_index(dev, NULL, index, flags); + desc = gpiod_get(dev, NULL, flags); + desc = gpiod_get_index(dev, NULL, index, flags); Case 1 assumes that corresponding ACPI device description must have defined device properties and will prevent to getting any GPIO resources diff --git a/Documentation/firmware-guide/acpi/i2c-muxes.rst b/Documentation/firmware-guide/acpi/i2c-muxes.rst new file mode 100644 index 0000000000000000000000000000000000000000..3a8997ccd7c4b63b6c9b3c1d34702fca1bfc27a6 --- /dev/null +++ b/Documentation/firmware-guide/acpi/i2c-muxes.rst @@ -0,0 +1,61 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +ACPI I2C Muxes +============== + +Describing an I2C device hierarchy that includes I2C muxes requires an ACPI +Device () scope per mux channel. 
+
+Consider this topology::
+
+    +------+   +------+
+    | SMB1 |-->| MUX0 |--CH00--> i2c client A (0x50)
+    |      |   | 0x70 |--CH01--> i2c client B (0x50)
+    +------+   +------+
+
+which corresponds to the following ASL::
+
+    Device (SMB1)
+    {
+        Name (_HID, ...)
+        Device (MUX0)
+        {
+            Name (_HID, ...)
+            Name (_CRS, ResourceTemplate () {
+                I2cSerialBus (0x70, ControllerInitiated, I2C_SPEED,
+                              AddressingMode7Bit, "^SMB1", 0x00,
+                              ResourceConsumer,,)
+            })
+
+            Device (CH00)
+            {
+                Name (_ADR, 0)
+
+                Device (CLIA)
+                {
+                    Name (_HID, ...)
+                    Name (_CRS, ResourceTemplate () {
+                        I2cSerialBus (0x50, ControllerInitiated, I2C_SPEED,
+                                      AddressingMode7Bit, "^CH00", 0x00,
+                                      ResourceConsumer,,)
+                    })
+                }
+            }
+
+            Device (CH01)
+            {
+                Name (_ADR, 1)
+
+                Device (CLIB)
+                {
+                    Name (_HID, ...)
+                    Name (_CRS, ResourceTemplate () {
+                        I2cSerialBus (0x50, ControllerInitiated, I2C_SPEED,
+                                      AddressingMode7Bit, "^CH01", 0x00,
+                                      ResourceConsumer,,)
+                    })
+                }
+            }
+        }
+    }
diff --git a/Documentation/firmware-guide/acpi/index.rst b/Documentation/firmware-guide/acpi/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ae609eec4679c0dc19387677a71c8fb36cf65504
--- /dev/null
+++ b/Documentation/firmware-guide/acpi/index.rst
@@ -0,0 +1,26 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+ACPI Support
+============
+
+.. toctree::
+   :maxdepth: 1
+
+   namespace
+   dsd/graph
+   dsd/data-node-references
+   enumeration
+   osi
+   method-customizing
+   method-tracing
+   DSD-properties-rules
+   debug
+   aml-debugger
+   apei/output_format
+   apei/einj
+   gpio-properties
+   i2c-muxes
+   acpi-lid
+   lpit
+   video_extension
diff --git a/Documentation/acpi/lpit.txt b/Documentation/firmware-guide/acpi/lpit.rst
similarity index 68%
rename from Documentation/acpi/lpit.txt
rename to Documentation/firmware-guide/acpi/lpit.rst
index b426398d2e97109704db0e715f6f1f1572580cfe..aca928fab027a5cc5104dad1c19ae23c57853883 100644
--- a/Documentation/acpi/lpit.txt
+++ b/Documentation/firmware-guide/acpi/lpit.rst
@@ -1,3 +1,9 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================
+Low Power Idle Table (LPIT)
+===========================
+
 To enumerate platform Low Power Idle states, Intel platforms
 are using “Low Power Idle Table” (LPIT). More details about this table
 can be downloaded from:
@@ -8,13 +14,15 @@ Residencies for each low power state can be read via FFH
 
 On platforms supporting S0ix sleep states, there can be two types of
 residencies:
-- CPU PKG C10 (Read via FFH interface)
-- Platform Controller Hub (PCH) SLP_S0 (Read via memory mapped interface)
+
+  - CPU PKG C10 (Read via FFH interface)
+  - Platform Controller Hub (PCH) SLP_S0 (Read via memory mapped interface)
 
 The following attributes are added dynamically to the cpuidle
-sysfs attribute group:
-  /sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us
-  /sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us
+sysfs attribute group::
+
+  /sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us
+  /sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us
 
 The "low_power_idle_cpu_residency_us" attribute shows time spent
 by the CPU package in PKG C10
diff --git a/Documentation/firmware-guide/acpi/method-customizing.rst b/Documentation/firmware-guide/acpi/method-customizing.rst
new file mode 100644
index 0000000000000000000000000000000000000000..de3ebcaed4cf0a27cb318919700fe5c1257bac13
--- /dev/null
+++ b/Documentation/firmware-guide/acpi/method-customizing.rst
@@ -0,0 +1,89 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================================
+Linux ACPI Custom Control Method How To
+=======================================
+
+:Author: Zhang Rui
+
+
+Linux supports customizing ACPI control methods at runtime.
+
+Users can use this to:
+
+1. override an existing method which may not work correctly,
+   or just for debugging purposes.
+2. insert a completely new method in order to create a missing
+   method such as _OFF, _ON, _STA, _INI, etc.
+
+For these cases, it is far simpler to dynamically install a single
+control method rather than override the entire DSDT, because a kernel
+rebuild/reboot is not needed and test results can be obtained in minutes.
+
+.. note::
+
+  - Only an ACPI METHOD can be overridden; other object types such as
+    "Device" or "OperationRegion" are not recognized. Methods
+    declared inside scope operators are also not supported.
+
+  - The same ACPI control method can be overridden many times,
+    and it is always the latest override that is used by the kernel.
+
+  - To get the ACPI debug object output (Store (AAAA, Debug)),
+    please run::
+
+      echo 1 > /sys/module/acpi/parameters/aml_debug_output
+
+
+1. override an existing method
+==============================
+a) get the ACPI table via the ACPI sysfs I/F, e.g. to get the DSDT,
+   just run "cat /sys/firmware/acpi/tables/DSDT > /tmp/dsdt.dat"
+b) disassemble the table by running "iasl -d dsdt.dat".
+c) rewrite the ASL code of the method and save it in a new file,
+d) package the new file (psr.asl) into an ACPI table format.
+   Here is an example of a customized \_SB._AC._PSR method::
+
+      DefinitionBlock ("", "SSDT", 1, "", "", 0x20080715)
+      {
+          Method (\_SB_.AC._PSR, 0, NotSerialized)
+          {
+              Store ("In AC _PSR", Debug)
+              Return (ACON)
+          }
+      }
+
+   Note that the full pathname of the method in the ACPI namespace
+   should be used.
+e) assemble the file to generate the AML code of the method,
+   e.g. "iasl -vw 6084 psr.asl" (psr.aml is generated as a result).
+   If the parameter "-vw 6084" is not supported by your iASL compiler,
+   please try a newer version.
+f) mount debugfs by "mount -t debugfs none /sys/kernel/debug"
+g) override the old method via the debugfs by running
+   "cat /tmp/psr.aml > /sys/kernel/debug/acpi/custom_method"
+
+2. insert a new method
+======================
+This is easier than overriding an existing method.
+We just need to create the ASL code of the method we want to
+insert and then follow steps c) ~ g) in section 1.
+
+3. undo your changes
+====================
+The "undo" operation is not supported for a newly inserted method
+right now, i.e. we cannot currently remove a method once it has been
+installed. For an overridden method, in order to undo your changes,
+please save a copy of the method's original ASL code in step c) of
+section 1, and redo steps c) ~ g) to override the method with the
+original one.
+
+
+.. note:: A kernel can run with multiple custom ACPI methods installed,
+   but each individual write to debugfs implements a SINGLE method
+   override, i.e. to insert/override multiple ACPI methods, steps
+   c) ~ g) need to be repeated for each method (a minimal loader is
+   sketched below).
+
+.. note:: Be aware that root can mis-use this driver to modify arbitrary
+   memory and gain additional rights, if root's privileges are
+   restricted (for example if root is not allowed to load additional
+   modules after boot).
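The one-override-per-write behaviour called out in the first note above is easy to script. Here is a minimal userspace sketch (not part of the original document; the default image path and buffer size are illustrative, and it must be run as root with debugfs mounted)::

    /* load_method.c - hypothetical helper: one run installs one override */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            const char *aml = argc > 1 ? argv[1] : "/tmp/psr.aml";
            char buf[4096];                 /* enough for a small SSDT */
            ssize_t len;
            int in, out;

            in = open(aml, O_RDONLY);
            out = open("/sys/kernel/debug/acpi/custom_method", O_WRONLY);
            if (in < 0 || out < 0) {
                    perror("open");
                    return 1;
            }

            /* One AML image installs one method override; rerun the
             * program with a different image for each additional method. */
            len = read(in, buf, sizeof(buf));
            if (len <= 0 || write(out, buf, len) != len) {
                    perror("write");
                    return 1;
            }

            close(in);
            close(out);
            return 0;
    }

Compiled with "gcc -o load_method load_method.c", this replays steps f) ~ g) for each assembled psr.aml-style image.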
diff --git a/Documentation/firmware-guide/acpi/method-tracing.rst b/Documentation/firmware-guide/acpi/method-tracing.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d0b077b73f5f21f5707d41f18e2b33fd56684686
--- /dev/null
+++ b/Documentation/firmware-guide/acpi/method-tracing.rst
@@ -0,0 +1,238 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+=====================
+ACPICA Trace Facility
+=====================
+
+:Copyright: |copy| 2015, Intel Corporation
+:Author: Lv Zheng
+
+
+Abstract
+========
+This document describes the functions and the interfaces of the
+method tracing facility.
+
+Functionalities and usage examples
+==================================
+
+ACPICA provides a method tracing capability, and two functions are
+currently implemented using it.
+
+Log reducer
+-----------
+
+The ACPICA subsystem produces debugging output when CONFIG_ACPI_DEBUG is
+enabled. The debugging messages, which are emitted via the
+ACPI_DEBUG_PRINT() macro, can be filtered at two levels: per-component
+(known as the debug layer, configured via
+/sys/module/acpi/parameters/debug_layer) and per-type (known as the
+debug level, configured via /sys/module/acpi/parameters/debug_level).
+
+But even when a particular layer/level is applied to control method
+evaluations, the quantity of the debugging output may still be too
+large to fit into the kernel log buffer. The idea, therefore, is to
+enable the particular (normally more detailed) debug layer/level logs
+only while a control method evaluation is running, and to disable the
+detailed logging when the control method evaluation stops.
+
+The following command examples illustrate the usage of the "log reducer"
+functionality:
+
+a. Filter out the debug layer/level matched logs when control methods
+   are being evaluated::
+
+      # cd /sys/module/acpi/parameters
+      # echo "0xXXXXXXXX" > trace_debug_layer
+      # echo "0xYYYYYYYY" > trace_debug_level
+      # echo "enable" > trace_state
+
+b. Filter out the debug layer/level matched logs when the specified
+   control method is being evaluated::
+
+      # cd /sys/module/acpi/parameters
+      # echo "0xXXXXXXXX" > trace_debug_layer
+      # echo "0xYYYYYYYY" > trace_debug_level
+      # echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
+      # echo "method" > /sys/module/acpi/parameters/trace_state
+
+c. Filter out the debug layer/level matched logs when the specified
+   control method is being evaluated for the first time::
+
+      # cd /sys/module/acpi/parameters
+      # echo "0xXXXXXXXX" > trace_debug_layer
+      # echo "0xYYYYYYYY" > trace_debug_level
+      # echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
+      # echo "method-once" > /sys/module/acpi/parameters/trace_state
+
+Where:
+   0xXXXXXXXX/0xYYYYYYYY
+     Refer to Documentation/acpi/debug.txt for possible debug layer/level
+     masking values.
+   \PPPP.AAAA.TTTT.HHHH
+     Full path of a control method that can be found in the ACPI namespace.
+     It need not be the entry point of a control method evaluation.
+
+AML tracer
+----------
+
+There are special log entries added by the method tracing facility at
+the "trace points" where the AML interpreter starts/stops executing a
+control method or an AML opcode. Note that the format of the log entries
+is subject to change::
+
+   [    0.186427] exdebug-0398 ex_trace_point : Method Begin [0xf58394d8:\_SB.PCI0.LPCB.ECOK] execution.
+   [    0.186630] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905c88:If] execution.
+   [    0.186820] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905cc0:LEqual] execution.
+   [    0.187010] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905a20:-NamePath-] execution.
+   [    0.187214] exdebug-0398 ex_trace_point : Opcode End [0xf5905a20:-NamePath-] execution.
+   [    0.187407] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905f60:One] execution.
+   [    0.187594] exdebug-0398 ex_trace_point : Opcode End [0xf5905f60:One] execution.
+   [    0.187789] exdebug-0398 ex_trace_point : Opcode End [0xf5905cc0:LEqual] execution.
+   [    0.187980] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905cc0:Return] execution.
+   [    0.188146] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905f60:One] execution.
+   [    0.188334] exdebug-0398 ex_trace_point : Opcode End [0xf5905f60:One] execution.
+   [    0.188524] exdebug-0398 ex_trace_point : Opcode End [0xf5905cc0:Return] execution.
+   [    0.188712] exdebug-0398 ex_trace_point : Opcode End [0xf5905c88:If] execution.
+   [    0.188903] exdebug-0398 ex_trace_point : Method End [0xf58394d8:\_SB.PCI0.LPCB.ECOK] execution.
+
+Developers can utilize these special log entries to track the AML
+interpretation, which can aid issue debugging and performance tuning. Note
+that, as the "AML tracer" logs are implemented via the ACPI_DEBUG_PRINT()
+macro, CONFIG_ACPI_DEBUG must also be enabled in order to produce the
+"AML tracer" logs.
+
+The following command examples illustrate the usage of the "AML tracer"
+functionality:
+
+a. Filter out the method start/stop "AML tracer" logs when control
+   methods are being evaluated::
+
+      # cd /sys/module/acpi/parameters
+      # echo "0x80" > trace_debug_layer
+      # echo "0x10" > trace_debug_level
+      # echo "enable" > trace_state
+
+b. Filter out the method start/stop "AML tracer" logs when the specified
+   control method is being evaluated::
+
+      # cd /sys/module/acpi/parameters
+      # echo "0x80" > trace_debug_layer
+      # echo "0x10" > trace_debug_level
+      # echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
+      # echo "method" > trace_state
+
+c. Filter out the method start/stop "AML tracer" logs when the specified
+   control method is being evaluated for the first time::
+
+      # cd /sys/module/acpi/parameters
+      # echo "0x80" > trace_debug_layer
+      # echo "0x10" > trace_debug_level
+      # echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
+      # echo "method-once" > trace_state
+
+d. Filter out the method/opcode start/stop "AML tracer" logs when the
+   specified control method is being evaluated::
+
+      # cd /sys/module/acpi/parameters
+      # echo "0x80" > trace_debug_layer
+      # echo "0x10" > trace_debug_level
+      # echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
+      # echo "opcode" > trace_state
+
+e. Filter out the method/opcode start/stop "AML tracer" logs when the
+   specified control method is being evaluated for the first time::
+
+      # cd /sys/module/acpi/parameters
+      # echo "0x80" > trace_debug_layer
+      # echo "0x10" > trace_debug_level
+      # echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
+      # echo "opcode-once" > trace_state
+
+Note that all of the above module parameters related to the method tracing
+facility can also be used as boot parameters, for example::
+
+      acpi.trace_debug_layer=0x80 acpi.trace_debug_level=0x10 \
+      acpi.trace_method_name=\_SB.LID0._LID acpi.trace_state=opcode-once
+
+
+Interface descriptions
+======================
+
+All method tracing functions can be configured via ACPI module
+parameters that are accessible at /sys/module/acpi/parameters/:
+
+trace_method_name
+  The full path of the AML method that the user wants to trace.
+
+  Note that the full path shouldn't contain the trailing "_"s in its
+  name segments but may contain "\" to form an absolute path.
+ +trace_debug_layer + The temporary debug_layer used when the tracing feature is enabled. + + Using ACPI_EXECUTER (0x80) by default, which is the debug_layer + used to match all "AML tracer" logs. + +trace_debug_level + The temporary debug_level used when the tracing feature is enabled. + + Using ACPI_LV_TRACE_POINT (0x10) by default, which is the + debug_level used to match all "AML tracer" logs. + +trace_state + The status of the tracing feature. + + Users can enable/disable this debug tracing feature by executing + the following command:: + + # echo string > /sys/module/acpi/parameters/trace_state + +Where "string" should be one of the following: + +"disable" + Disable the method tracing feature. + +"enable" + Enable the method tracing feature. + + ACPICA debugging messages matching "trace_debug_layer/trace_debug_level" + during any method execution will be logged. + +"method" + Enable the method tracing feature. + + ACPICA debugging messages matching "trace_debug_layer/trace_debug_level" + during method execution of "trace_method_name" will be logged. + +"method-once" + Enable the method tracing feature. + + ACPICA debugging messages matching "trace_debug_layer/trace_debug_level" + during method execution of "trace_method_name" will be logged only once. + +"opcode" + Enable the method tracing feature. + + ACPICA debugging messages matching "trace_debug_layer/trace_debug_level" + during method/opcode execution of "trace_method_name" will be logged. + +"opcode-once" + Enable the method tracing feature. + + ACPICA debugging messages matching "trace_debug_layer/trace_debug_level" + during method/opcode execution of "trace_method_name" will be logged only + once. + +Note that, the difference between the "enable" and other feature +enabling options are: + +1. When "enable" is specified, since + "trace_debug_layer/trace_debug_level" shall apply to all control + method evaluations, after configuring "trace_state" to "enable", + "trace_method_name" will be reset to NULL. +2. When "method/opcode" is specified, if + "trace_method_name" is NULL when "trace_state" is configured to + these options, the "trace_debug_layer/trace_debug_level" will + apply to all control method evaluations. diff --git a/Documentation/acpi/namespace.txt b/Documentation/firmware-guide/acpi/namespace.rst similarity index 56% rename from Documentation/acpi/namespace.txt rename to Documentation/firmware-guide/acpi/namespace.rst index 1860cb3865c6b26474a2d07d3944a9858a9f07c6..835521baeb89765ff722eec6d73704629bb763b9 100644 --- a/Documentation/acpi/namespace.txt +++ b/Documentation/firmware-guide/acpi/namespace.rst @@ -1,85 +1,90 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +=================================================== ACPI Device Tree - Representation of ACPI Namespace +=================================================== -Copyright (C) 2013, Intel Corporation -Author: Lv Zheng +:Copyright: |copy| 2013, Intel Corporation +:Author: Lv Zheng -Abstract: +:Credit: Thanks for the help from Zhang Rui and + Rafael J.Wysocki . +Abstract +======== The Linux ACPI subsystem converts ACPI namespace objects into a Linux device tree under the /sys/devices/LNXSYSTEM:00 and updates it upon -receiving ACPI hotplug notification events. For each device object in this -hierarchy there is a corresponding symbolic link in the +receiving ACPI hotplug notification events. For each device object +in this hierarchy there is a corresponding symbolic link in the /sys/bus/acpi/devices. 
+ This document illustrates the structure of the ACPI device tree. +ACPI Definition Blocks +====================== + +The ACPI firmware sets up RSDP (Root System Description Pointer) in the +system memory address space pointing to the XSDT (Extended System +Description Table). The XSDT always points to the FADT (Fixed ACPI +Description Table) using its first entry, the data within the FADT +includes various fixed-length entries that describe fixed ACPI features +of the hardware. The FADT contains a pointer to the DSDT +(Differentiated System Descripition Table). The XSDT also contains +entries pointing to possibly multiple SSDTs (Secondary System +Description Table). + +The DSDT and SSDT data is organized in data structures called definition +blocks that contain definitions of various objects, including ACPI +control methods, encoded in AML (ACPI Machine Language). The data block +of the DSDT along with the contents of SSDTs represents a hierarchical +data structure called the ACPI namespace whose topology reflects the +structure of the underlying hardware platform. + +The relationships between ACPI System Definition Tables described above +are illustrated in the following diagram:: + + +---------+ +-------+ +--------+ +------------------------+ + | RSDP | +->| XSDT | +->| FADT | | +-------------------+ | + +---------+ | +-------+ | +--------+ +-|->| DSDT | | + | Pointer | | | Entry |-+ | ...... | | | +-------------------+ | + +---------+ | +-------+ | X_DSDT |--+ | | Definition Blocks | | + | Pointer |-+ | ..... | | ...... | | +-------------------+ | + +---------+ +-------+ +--------+ | +-------------------+ | + | Entry |------------------|->| SSDT | | + +- - - -+ | +-------------------| | + | Entry | - - - - - - - -+ | | Definition Blocks | | + +- - - -+ | | +-------------------+ | + | | +- - - - - - - - - -+ | + +-|->| SSDT | | + | +-------------------+ | + | | Definition Blocks | | + | +- - - - - - - - - -+ | + +------------------------+ + | + OSPM Loading | + \|/ + +----------------+ + | ACPI Namespace | + +----------------+ + + Figure 1. ACPI Definition Blocks + +.. note:: RSDP can also contain a pointer to the RSDT (Root System + Description Table). Platforms provide RSDT to enable + compatibility with ACPI 1.0 operating systems. The OS is expected + to use XSDT, if present. + + +Example ACPI Namespace +====================== + +All definition blocks are loaded into a single namespace. The namespace +is a hierarchy of objects identified by names and paths. +The following naming conventions apply to object names in the ACPI +namespace: -Credit: - -Thanks for the help from Zhang Rui and Rafael J. -Wysocki . - - -1. ACPI Definition Blocks - - The ACPI firmware sets up RSDP (Root System Description Pointer) in the - system memory address space pointing to the XSDT (Extended System - Description Table). The XSDT always points to the FADT (Fixed ACPI - Description Table) using its first entry, the data within the FADT - includes various fixed-length entries that describe fixed ACPI features - of the hardware. The FADT contains a pointer to the DSDT - (Differentiated System Descripition Table). The XSDT also contains - entries pointing to possibly multiple SSDTs (Secondary System - Description Table). - - The DSDT and SSDT data is organized in data structures called definition - blocks that contain definitions of various objects, including ACPI - control methods, encoded in AML (ACPI Machine Language). 
The data block - of the DSDT along with the contents of SSDTs represents a hierarchical - data structure called the ACPI namespace whose topology reflects the - structure of the underlying hardware platform. - - The relationships between ACPI System Definition Tables described above - are illustrated in the following diagram. - - +---------+ +-------+ +--------+ +------------------------+ - | RSDP | +->| XSDT | +->| FADT | | +-------------------+ | - +---------+ | +-------+ | +--------+ +-|->| DSDT | | - | Pointer | | | Entry |-+ | ...... | | | +-------------------+ | - +---------+ | +-------+ | X_DSDT |--+ | | Definition Blocks | | - | Pointer |-+ | ..... | | ...... | | +-------------------+ | - +---------+ +-------+ +--------+ | +-------------------+ | - | Entry |------------------|->| SSDT | | - +- - - -+ | +-------------------| | - | Entry | - - - - - - - -+ | | Definition Blocks | | - +- - - -+ | | +-------------------+ | - | | +- - - - - - - - - -+ | - +-|->| SSDT | | - | +-------------------+ | - | | Definition Blocks | | - | +- - - - - - - - - -+ | - +------------------------+ - | - OSPM Loading | - \|/ - +----------------+ - | ACPI Namespace | - +----------------+ - - Figure 1. ACPI Definition Blocks - - NOTE: RSDP can also contain a pointer to the RSDT (Root System - Description Table). Platforms provide RSDT to enable - compatibility with ACPI 1.0 operating systems. The OS is expected - to use XSDT, if present. - - -2. Example ACPI Namespace - - All definition blocks are loaded into a single namespace. The namespace - is a hierarchy of objects identified by names and paths. - The following naming conventions apply to object names in the ACPI - namespace: 1. All names are 32 bits long. 2. The first byte of a name must be one of 'A' - 'Z', '_'. 3. Each of the remaining bytes of a name must be one of 'A' - 'Z', '0' @@ -91,7 +96,7 @@ Wysocki . (i.e. names prepended with '^' are relative to the parent of the current namespace node). - The figure below shows an example ACPI namespace. +The figure below shows an example ACPI namespace:: +------+ | \ | Root @@ -184,19 +189,20 @@ Wysocki . Figure 2. Example ACPI Namespace -3. Linux ACPI Device Objects +Linux ACPI Device Objects +========================= - The Linux kernel's core ACPI subsystem creates struct acpi_device - objects for ACPI namespace objects representing devices, power resources - processors, thermal zones. Those objects are exported to user space via - sysfs as directories in the subtree under /sys/devices/LNXSYSTM:00. The - format of their names is , where 'bus_id' refers to the - ACPI namespace representation of the given object and 'instance' is used - for distinguishing different object of the same 'bus_id' (it is - two-digit decimal representation of an unsigned integer). +The Linux kernel's core ACPI subsystem creates struct acpi_device +objects for ACPI namespace objects representing devices, power resources +processors, thermal zones. Those objects are exported to user space via +sysfs as directories in the subtree under /sys/devices/LNXSYSTM:00. The +format of their names is , where 'bus_id' refers to the +ACPI namespace representation of the given object and 'instance' is used +for distinguishing different object of the same 'bus_id' (it is +two-digit decimal representation of an unsigned integer). - The value of 'bus_id' depends on the type of the object whose name it is - part of as listed in the table below. 
+The value of 'bus_id' depends on the type of the object whose name it is +part of as listed in the table below:: +---+-----------------+-------+----------+ | | Object/Feature | Table | bus_id | @@ -226,10 +232,11 @@ Wysocki . Table 1. ACPI Namespace Objects Mapping - The following rules apply when creating struct acpi_device objects on - the basis of the contents of ACPI System Description Tables (as - indicated by the letter in the first column and the notation in the - second column of the table above): +The following rules apply when creating struct acpi_device objects on +the basis of the contents of ACPI System Description Tables (as +indicated by the letter in the first column and the notation in the +second column of the table above): + N: The object's source is an ACPI namespace node (as indicated by the named object's type in the second column). In that case the object's @@ -249,13 +256,14 @@ Wysocki . struct acpi_device object with LNXVIDEO 'bus_id' will be created for it. - The third column of the above table indicates which ACPI System - Description Tables contain information used for the creation of the - struct acpi_device objects represented by the given row (xSDT means DSDT - or SSDT). +The third column of the above table indicates which ACPI System +Description Tables contain information used for the creation of the +struct acpi_device objects represented by the given row (xSDT means DSDT +or SSDT). + +The forth column of the above table indicates the 'bus_id' generation +rule of the struct acpi_device object: - The forth column of the above table indicates the 'bus_id' generation - rule of the struct acpi_device object: _HID: _HID in the last column of the table means that the object's bus_id is derived from the _HID/_CID identification objects present under @@ -275,45 +283,47 @@ Wysocki . object's bus_id. -4. Linux ACPI Physical Device Glue - - ACPI device (i.e. struct acpi_device) objects may be linked to other - objects in the Linux' device hierarchy that represent "physical" devices - (for example, devices on the PCI bus). If that happens, it means that - the ACPI device object is a "companion" of a device otherwise - represented in a different way and is used (1) to provide configuration - information on that device which cannot be obtained by other means and - (2) to do specific things to the device with the help of its ACPI - control methods. One ACPI device object may be linked this way to - multiple "physical" devices. - - If an ACPI device object is linked to a "physical" device, its sysfs - directory contains the "physical_node" symbolic link to the sysfs - directory of the target device object. In turn, the target device's - sysfs directory will then contain the "firmware_node" symbolic link to - the sysfs directory of the companion ACPI device object. - The linking mechanism relies on device identification provided by the - ACPI namespace. For example, if there's an ACPI namespace object - representing a PCI device (i.e. a device object under an ACPI namespace - object representing a PCI bridge) whose _ADR returns 0x00020000 and the - bus number of the parent PCI bridge is 0, the sysfs directory - representing the struct acpi_device object created for that ACPI - namespace object will contain the 'physical_node' symbolic link to the - /sys/devices/pci0000:00/0000:00:02:0/ sysfs directory of the - corresponding PCI device. - - The linking mechanism is generally bus-specific. 
The core of its - implementation is located in the drivers/acpi/glue.c file, but there are - complementary parts depending on the bus types in question located - elsewhere. For example, the PCI-specific part of it is located in - drivers/pci/pci-acpi.c. - - -5. Example Linux ACPI Device Tree - - The sysfs hierarchy of struct acpi_device objects corresponding to the - example ACPI namespace illustrated in Figure 2 with the addition of - fixed PWR_BUTTON/SLP_BUTTON devices is shown below. +Linux ACPI Physical Device Glue +=============================== + +ACPI device (i.e. struct acpi_device) objects may be linked to other +objects in the Linux' device hierarchy that represent "physical" devices +(for example, devices on the PCI bus). If that happens, it means that +the ACPI device object is a "companion" of a device otherwise +represented in a different way and is used (1) to provide configuration +information on that device which cannot be obtained by other means and +(2) to do specific things to the device with the help of its ACPI +control methods. One ACPI device object may be linked this way to +multiple "physical" devices. + +If an ACPI device object is linked to a "physical" device, its sysfs +directory contains the "physical_node" symbolic link to the sysfs +directory of the target device object. In turn, the target device's +sysfs directory will then contain the "firmware_node" symbolic link to +the sysfs directory of the companion ACPI device object. +The linking mechanism relies on device identification provided by the +ACPI namespace. For example, if there's an ACPI namespace object +representing a PCI device (i.e. a device object under an ACPI namespace +object representing a PCI bridge) whose _ADR returns 0x00020000 and the +bus number of the parent PCI bridge is 0, the sysfs directory +representing the struct acpi_device object created for that ACPI +namespace object will contain the 'physical_node' symbolic link to the +/sys/devices/pci0000:00/0000:00:02:0/ sysfs directory of the +corresponding PCI device. + +The linking mechanism is generally bus-specific. The core of its +implementation is located in the drivers/acpi/glue.c file, but there are +complementary parts depending on the bus types in question located +elsewhere. For example, the PCI-specific part of it is located in +drivers/pci/pci-acpi.c. + + +Example Linux ACPI Device Tree +================================= + +The sysfs hierarchy of struct acpi_device objects corresponding to the +example ACPI namespace illustrated in Figure 2 with the addition of +fixed PWR_BUTTON/SLP_BUTTON devices is shown below:: +--------------+---+-----------------+ | LNXSYSTEM:00 | \ | acpi:LNXSYSTEM: | @@ -377,12 +387,14 @@ Wysocki . Figure 3. Example Linux ACPI Device Tree - NOTE: Each node is represented as "object/path/modalias", where: - 1. 'object' is the name of the object's directory in sysfs. - 2. 'path' is the ACPI namespace path of the corresponding - ACPI namespace object, as returned by the object's 'path' - sysfs attribute. - 3. 'modalias' is the value of the object's 'modalias' sysfs - attribute (as described earlier in this document). - NOTE: N/A indicates the device object does not have the 'path' or the - 'modalias' attribute. +.. note:: Each node is represented as "object/path/modalias", where: + + 1. 'object' is the name of the object's directory in sysfs. + 2. 'path' is the ACPI namespace path of the corresponding + ACPI namespace object, as returned by the object's 'path' + sysfs attribute. + 3. 
'modalias' is the value of the object's 'modalias' sysfs + attribute (as described earlier in this document). + +.. note:: N/A indicates the device object does not have the 'path' or the + 'modalias' attribute. diff --git a/Documentation/acpi/osi.txt b/Documentation/firmware-guide/acpi/osi.rst similarity index 97% rename from Documentation/acpi/osi.txt rename to Documentation/firmware-guide/acpi/osi.rst index 50cde0ceb9b005f2474817bfc126697bd9ea721f..29e9ef79ebc0fde0834a17d2eff60f66841c12c6 100644 --- a/Documentation/acpi/osi.txt +++ b/Documentation/firmware-guide/acpi/osi.rst @@ -1,5 +1,8 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== ACPI _OSI and _REV methods --------------------------- +========================== An ACPI BIOS can use the "Operating System Interfaces" method (_OSI) to find out what the operating system supports. Eg. If BIOS @@ -14,7 +17,7 @@ This document explains how and why the BIOS and Linux should use these methods. It also explains how and why they are widely misused. How to use _OSI ---------------- +=============== Linux runs on two groups of machines -- those that are tested by the OEM to be compatible with Linux, and those that were never tested with Linux, @@ -62,7 +65,7 @@ the string when that support is added to the kernel. That was easy. Read on, to find out how to do it wrong. Before _OSI, there was _OS --------------------------- +========================== ACPI 1.0 specified "_OS" as an "object that evaluates to a string that identifies the operating system." @@ -96,7 +99,7 @@ That is the *only* viable strategy, as that is what modern Windows does, and so doing otherwise could steer the BIOS down an untested path. _OSI is born, and immediately misused --------------------------------------- +===================================== With _OSI, the *BIOS* provides the string describing an interface, and asks the OS: "YES/NO, are you compatible with this interface?" @@ -144,7 +147,7 @@ catastrophic failure resulting from the BIOS taking paths that were never validated under *any* OS. Do not use _REV ---------------- +=============== Since _OSI("Linux") went away, some BIOS writers used _REV to support Linux and Windows differences in the same BIOS. @@ -164,7 +167,7 @@ from mid-2015 onward. The ACPI specification will also be updated to reflect that _REV is deprecated, and always returns 2. Apple Mac and _OSI("Darwin") ----------------------------- +============================ On Apple's Mac platforms, the ACPI BIOS invokes _OSI("Darwin") to determine if the machine is running Apple OSX. diff --git a/Documentation/acpi/video_extension.txt b/Documentation/firmware-guide/acpi/video_extension.rst similarity index 70% rename from Documentation/acpi/video_extension.txt rename to Documentation/firmware-guide/acpi/video_extension.rst index 79bf6a4921beed319291e4beb6497b642f76bcdc..099b8607e07b22e036281fe36fe2b6f27330deaa 100644 --- a/Documentation/acpi/video_extension.txt +++ b/Documentation/firmware-guide/acpi/video_extension.rst @@ -1,5 +1,8 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== ACPI video extensions -~~~~~~~~~~~~~~~~~~~~~ +===================== This driver implement the ACPI Extensions For Display Adapters for integrated graphics devices on motherboard, as specified in ACPI 2.0 @@ -8,9 +11,10 @@ defining the video POST device, retrieving EDID information or to setup a video output, etc. Note that this is an ref. implementation only. It may or may not work for your integrated video device. 
-The ACPI video driver does 3 things regarding backlight control: +The ACPI video driver does 3 things regarding backlight control. -1 Export a sysfs interface for user space to control backlight level +Export a sysfs interface for user space to control backlight level +================================================================== If the ACPI table has a video device, and acpi_backlight=vendor kernel command line is not present, the driver will register a backlight device @@ -22,36 +26,41 @@ The backlight sysfs interface has a standard definition here: Documentation/ABI/stable/sysfs-class-backlight. And what ACPI video driver does is: -actual_brightness: on read, control method _BQC will be evaluated to -get the brightness level the firmware thinks it is at; -bl_power: not implemented, will set the current brightness instead; -brightness: on write, control method _BCM will run to set the requested -brightness level; -max_brightness: Derived from the _BCL package(see below); -type: firmware + +actual_brightness: + on read, control method _BQC will be evaluated to + get the brightness level the firmware thinks it is at; +bl_power: + not implemented, will set the current brightness instead; +brightness: + on write, control method _BCM will run to set the requested brightness level; +max_brightness: + Derived from the _BCL package(see below); +type: + firmware Note that ACPI video backlight driver will always use index for brightness, actual_brightness and max_brightness. So if we have -the following _BCL package: +the following _BCL package:: -Method (_BCL, 0, NotSerialized) -{ - Return (Package (0x0C) + Method (_BCL, 0, NotSerialized) { - 0x64, - 0x32, - 0x0A, - 0x14, - 0x1E, - 0x28, - 0x32, - 0x3C, - 0x46, - 0x50, - 0x5A, - 0x64 - }) -} + Return (Package (0x0C) + { + 0x64, + 0x32, + 0x0A, + 0x14, + 0x1E, + 0x28, + 0x32, + 0x3C, + 0x46, + 0x50, + 0x5A, + 0x64 + }) + } The first two levels are for when laptop are on AC or on battery and are not used by Linux currently. The remaining 10 levels are supported levels @@ -62,13 +71,15 @@ as a "brightness level" indicator. Thus from the user space perspective the range of available brightness levels is from 0 to 9 (max_brightness) inclusive. -2 Notify user space about hotkey event +Notify user space about hotkey event +==================================== There are generally two cases for hotkey event reporting: + i) For some laptops, when user presses the hotkey, a scancode will be generated and sent to user space through the input device created by the keyboard driver as a key type input event, with proper remap, the - following key code will appear to user space: + following key code will appear to user space:: EV_KEY, KEY_BRIGHTNESSUP EV_KEY, KEY_BRIGHTNESSDOWN @@ -84,23 +95,27 @@ ii) For some laptops, the press of the hotkey will not generate the notify value it received and send the event to user space through the input device it created: + ===== ================== event keycode + ===== ================== 0x86 KEY_BRIGHTNESSUP 0x87 KEY_BRIGHTNESSDOWN etc. + ===== ================== so this would lead to the same effect as case i) now. Once user space tool receives this event, it can modify the backlight level through the sysfs interface. -3 Change backlight level in the kernel +Change backlight level in the kernel +==================================== This works for machines covered by case ii) in Section 2. Once the driver received a notification, it will set the backlight level accordingly. 
This does not affect the sending of event to user space, they are always sent to user space regardless of whether or not the video module controls the backlight level directly. This behaviour can be controlled through the brightness_switch_enabled -module parameter as documented in admin-guide/kernel-parameters.rst. It is recommended to -disable this behaviour once a GUI environment starts up and wants to have full -control of the backlight level. +module parameter as documented in admin-guide/kernel-parameters.rst. It is +recommended to disable this behaviour once a GUI environment starts up and +wants to have full control of the backlight level. diff --git a/Documentation/firmware-guide/index.rst b/Documentation/firmware-guide/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..5355784ca0a21d70f838e44b38a28ac825443b77 --- /dev/null +++ b/Documentation/firmware-guide/index.rst @@ -0,0 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================== +The Linux kernel firmware guide +=============================== + +This section describes the ACPI subsystem in Linux from firmware perspective. + +.. toctree:: + :maxdepth: 1 + + acpi/index + diff --git a/Documentation/index.rst b/Documentation/index.rst index 80a421cb935e78a5533e4e3faf8e3e550353ead9..fdfa85c56a50f6d3bc501dee1a8cf0bb2e502c03 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -35,6 +35,16 @@ trying to get it to work optimally on a given system. admin-guide/index +Firmware-related documentation +------------------------------ +The following holds information on the kernel's expectations regarding the +platform firmwares. + +.. toctree:: + :maxdepth: 2 + + firmware-guide/index + Application-developer documentation ----------------------------------- diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 10f4499e677c0863475c8583edfe83a224bc2bae..ee60e519438aaffefa2b29c9787ecc5335966a5f 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -243,10 +243,10 @@ Optimization ^^^^^^^^^^^^ The Kprobe-optimizer doesn't insert the jump instruction immediately; -rather, it calls synchronize_sched() for safety first, because it's +rather, it calls synchronize_rcu() for safety first, because it's possible for a CPU to be interrupted in the middle of executing the -optimized region [3]_. As you know, synchronize_sched() can ensure -that all interruptions that were active when synchronize_sched() +optimized region [3]_. As you know, synchronize_rcu() can ensure +that all interruptions that were active when synchronize_rcu() was called are done, but only if CONFIG_PREEMPT=n. So, this version of kprobe optimization supports only kernels with CONFIG_PREEMPT=n [4]_. diff --git a/Documentation/lzo.txt b/Documentation/lzo.txt index f79934225d8d35bd98d3ebd4dcc996324abd66b9..ca983328976bcf36e040ab7508eb29583ed5ca4e 100644 --- a/Documentation/lzo.txt +++ b/Documentation/lzo.txt @@ -102,9 +102,11 @@ Byte sequences dictionary which is empty, and that it will always be invalid at this place. - 17 : bitstream version. If the first byte is 17, the next byte - gives the bitstream version (version 1 only). If the first byte - is not 17, the bitstream version is 0. + 17 : bitstream version. If the first byte is 17, and compressed + stream length is at least 5 bytes (length of shortest possible + versioned bitstream), the next byte gives the bitstream version + (version 1 only). + Otherwise, the bitstream version is 0. 
18..21 : copy 0..3 literals state = (byte - 17) = 0..3 [ copy literals ] diff --git a/Documentation/media/uapi/rc/rc-tables.rst b/Documentation/media/uapi/rc/rc-tables.rst index f460031d85313821ac91d940b915d26994d1ed00..177ac44fa0fac33363d34f840c663d4699039a79 100644 --- a/Documentation/media/uapi/rc/rc-tables.rst +++ b/Documentation/media/uapi/rc/rc-tables.rst @@ -623,7 +623,7 @@ the remote via /dev/input/event devices. - .. row 78 - - ``KEY_SCREEN`` + - ``KEY_ASPECT_RATIO`` - Select screen aspect ratio @@ -631,7 +631,7 @@ the remote via /dev/input/event devices. - .. row 79 - - ``KEY_ZOOM`` + - ``KEY_FULL_SCREEN`` - Put device into zoom/full screen mode diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 1c22b21ae922f012f075e11a11a2cf77ba0bc824..f70ebcdfe592dbb5ab91f0a60561d3885d3a0aff 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1937,21 +1937,6 @@ There are some more advanced barrier functions: information on consistent memory. -MMIO WRITE BARRIER ------------------- - -The Linux kernel also has a special barrier for use with memory-mapped I/O -writes: - - mmiowb(); - -This is a variation on the mandatory write barrier that causes writes to weakly -ordered I/O regions to be partially ordered. Its effects may go beyond the -CPU->Hardware interface and actually affect the hardware at some level. - -See the subsection "Acquires vs I/O accesses" for more information. - - =============================== IMPLICIT KERNEL MEMORY BARRIERS =============================== @@ -2317,75 +2302,6 @@ But it won't see any of: *E, *F or *G following RELEASE Q - -ACQUIRES VS I/O ACCESSES ------------------------- - -Under certain circumstances (especially involving NUMA), I/O accesses within -two spinlocked sections on two different CPUs may be seen as interleaved by the -PCI bridge, because the PCI bridge does not necessarily participate in the -cache-coherence protocol, and is therefore incapable of issuing the required -read memory barriers. - -For example: - - CPU 1 CPU 2 - =============================== =============================== - spin_lock(Q) - writel(0, ADDR) - writel(1, DATA); - spin_unlock(Q); - spin_lock(Q); - writel(4, ADDR); - writel(5, DATA); - spin_unlock(Q); - -may be seen by the PCI bridge as follows: - - STORE *ADDR = 0, STORE *ADDR = 4, STORE *DATA = 1, STORE *DATA = 5 - -which would probably cause the hardware to malfunction. - - -What is necessary here is to intervene with an mmiowb() before dropping the -spinlock, for example: - - CPU 1 CPU 2 - =============================== =============================== - spin_lock(Q) - writel(0, ADDR) - writel(1, DATA); - mmiowb(); - spin_unlock(Q); - spin_lock(Q); - writel(4, ADDR); - writel(5, DATA); - mmiowb(); - spin_unlock(Q); - -this will ensure that the two stores issued on CPU 1 appear at the PCI bridge -before either of the stores issued on CPU 2. - - -Furthermore, following a store by a load from the same device obviates the need -for the mmiowb(), because the load forces the store to complete before the load -is performed: - - CPU 1 CPU 2 - =============================== =============================== - spin_lock(Q) - writel(0, ADDR) - a = readl(DATA); - spin_unlock(Q); - spin_lock(Q); - writel(4, ADDR); - b = readl(DATA); - spin_unlock(Q); - - -See Documentation/driver-api/device-io.rst for more information. - - ================================= WHERE ARE MEMORY BARRIERS NEEDED? 
================================= @@ -2532,16 +2448,9 @@ the device to malfunction. Inside of the Linux kernel, I/O should be done through the appropriate accessor routines - such as inb() or writel() - which know how to make such accesses appropriately sequential. While this, for the most part, renders the explicit -use of memory barriers unnecessary, there are a couple of situations where they -might be needed: - - (1) On some systems, I/O stores are not strongly ordered across all CPUs, and - so for _all_ general drivers locks should be used and mmiowb() must be - issued prior to unlocking the critical section. - - (2) If the accessor functions are used to refer to an I/O memory window with - relaxed memory access properties, then _mandatory_ memory barriers are - required to enforce ordering. +use of memory barriers unnecessary. However, if the accessor functions are +used to refer to an I/O memory window with relaxed memory access properties, +then _mandatory_ memory barriers are required to enforce ordering. See Documentation/driver-api/device-io.rst for more information. @@ -2586,8 +2495,7 @@ explicit barriers are used. Normally this won't be a problem because the I/O accesses done inside such sections will include synchronous load operations on strictly ordered I/O -registers that form implicit I/O barriers. If this isn't sufficient then an -mmiowb() may need to be used explicitly. +registers that form implicit I/O barriers. A similar situation may occur between an interrupt routine and two routines @@ -2599,71 +2507,114 @@ likely, then interrupt-disabling locks should be used to guarantee ordering. KERNEL I/O BARRIER EFFECTS ========================== -When accessing I/O memory, drivers should use the appropriate accessor -functions: - - (*) inX(), outX(): - - These are intended to talk to I/O space rather than memory space, but - that's primarily a CPU-specific concept. The i386 and x86_64 processors - do indeed have special I/O space access cycles and instructions, but many - CPUs don't have such a concept. - - The PCI bus, amongst others, defines an I/O space concept which - on such - CPUs as i386 and x86_64 - readily maps to the CPU's concept of I/O - space. However, it may also be mapped as a virtual I/O space in the CPU's - memory map, particularly on those CPUs that don't support alternate I/O - spaces. - - Accesses to this space may be fully synchronous (as on i386), but - intermediary bridges (such as the PCI host bridge) may not fully honour - that. - - They are guaranteed to be fully ordered with respect to each other. - - They are not guaranteed to be fully ordered with respect to other types of - memory and I/O operation. +Interfacing with peripherals via I/O accesses is deeply architecture and device +specific. Therefore, drivers which are inherently non-portable may rely on +specific behaviours of their target systems in order to achieve synchronization +in the most lightweight manner possible. For drivers intending to be portable +between multiple architectures and bus implementations, the kernel offers a +series of accessor functions that provide various degrees of ordering +guarantees: (*) readX(), writeX(): - Whether these are guaranteed to be fully ordered and uncombined with - respect to each other on the issuing CPU depends on the characteristics - defined for the memory window through which they're accessing. On later - i386 architecture machines, for example, this is controlled by way of the - MTRR registers.
+ The readX() and writeX() MMIO accessors take a pointer to the + peripheral being accessed as an __iomem * parameter. For pointers + mapped with the default I/O attributes (e.g. those returned by + ioremap()), the ordering guarantees are as follows: + + 1. All readX() and writeX() accesses to the same peripheral are ordered + with respect to each other. This ensures that MMIO register accesses + by the same CPU thread to a particular device will arrive in program + order. + + 2. A writeX() issued by a CPU thread holding a spinlock is ordered + before a writeX() to the same peripheral from another CPU thread + issued after a later acquisition of the same spinlock. This ensures + that MMIO register writes to a particular device issued while holding + a spinlock will arrive in an order consistent with acquisitions of + the lock. + + 3. A writeX() by a CPU thread to the peripheral will first wait for the + completion of all prior writes to memory either issued by, or + propagated to, the same thread. This ensures that writes by the CPU + to an outbound DMA buffer allocated by dma_alloc_coherent() will be + visible to a DMA engine when the CPU writes to its MMIO control + register to trigger the transfer. + + 4. A readX() by a CPU thread from the peripheral will complete before + any subsequent reads from memory by the same thread can begin. This + ensures that reads by the CPU from an incoming DMA buffer allocated + by dma_alloc_coherent() will not see stale data after reading from + the DMA engine's MMIO status register to establish that the DMA + transfer has completed. + + 5. A readX() by a CPU thread from the peripheral will complete before + any subsequent delay() loop can begin execution on the same thread. + This ensures that two MMIO register writes by the CPU to a peripheral + will arrive at least 1us apart if the first write is immediately read + back with readX() and udelay(1) is called prior to the second + writeX(): + + writel(42, DEVICE_REGISTER_0); // Arrives at the device... + readl(DEVICE_REGISTER_0); + udelay(1); + writel(42, DEVICE_REGISTER_1); // ...at least 1us before this. + + The ordering properties of __iomem pointers obtained with non-default + attributes (e.g. those returned by ioremap_wc()) are specific to the + underlying architecture and therefore the guarantees listed above cannot + generally be relied upon for accesses to these types of mappings. + + (*) readX_relaxed(), writeX_relaxed(): + + These are similar to readX() and writeX(), but provide weaker memory + ordering guarantees. Specifically, they do not guarantee ordering with + respect to locking, normal memory accesses or delay() loops (i.e. + bullets 2-5 above) but they are still guaranteed to be ordered with + respect to other accesses from the same CPU thread to the same + peripheral when operating on __iomem pointers mapped with the default + I/O attributes. + + (*) readsX(), writesX(): + + The readsX() and writesX() MMIO accessors are designed for accessing + register-based, memory-mapped FIFOs residing on peripherals that are not + capable of performing DMA. Consequently, they provide only the ordering + guarantees of readX_relaxed() and writeX_relaxed(), as documented above. - Ordinarily, these will be guaranteed to be fully ordered and uncombined, - provided they're not accessing a prefetchable device. 
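To make bullet 3 of the readX()/writeX() guarantees above concrete, consider the descriptor/doorbell pattern it enables. The following is a minimal sketch rather than code from this document: struct my_dev, my_start_xfer(), MY_DOORBELL, KICK_DMA and DESC_VALID are hypothetical names, while writel()/writel_relaxed() are the accessors being documented:

	#include <linux/io.h>
	#include <linux/types.h>

	#define MY_DOORBELL	0x10	/* hypothetical doorbell register offset */
	#define KICK_DMA	0x1	/* hypothetical "start transfer" command */
	#define DESC_VALID	0x1	/* hypothetical descriptor flag */

	struct my_desc {
		u32 len;
		u32 flags;
	};

	struct my_dev {
		void __iomem *regs;	/* MMIO window from ioremap() */
		struct my_desc *desc;	/* buffer from dma_alloc_coherent() */
	};

	static void my_start_xfer(struct my_dev *dev, u32 len)
	{
		/* Plain memory stores to the coherent DMA descriptor... */
		dev->desc->len = len;
		dev->desc->flags = DESC_VALID;

		/*
		 * ...are guaranteed to be visible to the DMA engine before
		 * this MMIO doorbell write arrives (bullet 3 above), so no
		 * explicit barrier is needed.  A writel_relaxed() here would
		 * not provide that guarantee and would need a preceding wmb().
		 */
		writel(KICK_DMA, dev->regs + MY_DOORBELL);
	}

The same reasoning applies in reverse to readX() and inbound DMA buffers (bullet 4 above).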
+ (*) inX(), outX(): - However, intermediary hardware (such as a PCI bridge) may indulge in - deferral if it so wishes; to flush a store, a load from the same location - is preferred[*], but a load from the same device or from configuration - space should suffice for PCI. + The inX() and outX() accessors are intended to access legacy port-mapped + I/O peripherals, which may require special instructions on some + architectures (notably x86). The port number of the peripheral being + accessed is passed as an argument. - [*] NOTE! attempting to load from the same location as was written to may - cause a malfunction - consider the 16550 Rx/Tx serial registers for - example. + Since many CPU architectures ultimately access these peripherals via an + internal virtual memory mapping, the portable ordering guarantees + provided by inX() and outX() are the same as those provided by readX() + and writeX() respectively when accessing a mapping with the default I/O + attributes. - Used with prefetchable I/O memory, an mmiowb() barrier may be required to - force stores to be ordered. + Device drivers may expect outX() to emit a non-posted write transaction + that waits for a completion response from the I/O peripheral before + returning. This is not guaranteed by all architectures and is therefore + not part of the portable ordering semantics. - Please refer to the PCI specification for more information on interactions - between PCI transactions. + (*) insX(), outsX(): - (*) readX_relaxed(), writeX_relaxed() + As above, the insX() and outsX() accessors provide the same ordering + guarantees as readsX() and writesX() respectively when accessing a + mapping with the default I/O + attributes. - These are similar to readX() and writeX(), but provide weaker memory - ordering guarantees. Specifically, they do not guarantee ordering with - respect to normal memory accesses (e.g. DMA buffers) nor do they guarantee - ordering with respect to LOCK or UNLOCK operations. If the latter is - required, an mmiowb() barrier can be used. Note that relaxed accesses to - the same peripheral are guaranteed to be ordered with respect to each - other. + (*) ioreadX(), iowriteX(): - (*) ioreadX(), iowriteX() + These will perform appropriately for the type of access they're actually + doing, be it inX()/outX() or readX()/writeX(). - These will perform appropriately for the type of access they're actually - doing, be it inX()/outX() or readX()/writeX(). +With the exception of the string accessors (insX(), outsX(), readsX() and +writesX()), all of the above assume that the underlying peripheral is +little-endian and will therefore perform byte-swapping operations on big-endian +architectures. ======================================== diff --git a/Documentation/networking/bpf_flow_dissector.rst b/Documentation/networking/bpf_flow_dissector.rst new file mode 100644 index 0000000000000000000000000000000000000000..b375ae2ec2c4f43d4c59a5f12cb26b3a8f205df5 --- /dev/null +++ b/Documentation/networking/bpf_flow_dissector.rst @@ -0,0 +1,126 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================== +BPF Flow Dissector +================== + +Overview +======== + +The flow dissector is a routine that parses metadata out of packets. It's +used in various places in the networking subsystem (RFS, flow hash, etc.). + +The BPF flow dissector is an attempt to reimplement the C-based flow dissector +logic in BPF to gain all the benefits of the BPF verifier (namely, limits on +the number of instructions and tail calls).
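As a heavily simplified sketch (not the reference implementation discussed later in this document), a flow dissector program that only accepts plain IPv4 might look as follows; it assumes the selftests' ``bpf_helpers.h`` and ``bpf_endian.h`` headers for the SEC() macro and bpf_htons():

.. code:: c

    #include <linux/bpf.h>
    #include <linux/if_ether.h>
    #include <linux/ip.h>
    #include "bpf_helpers.h"
    #include "bpf_endian.h"

    SEC("flow_dissector")
    int _dissect(struct __sk_buff *skb)
    {
            struct bpf_flow_keys *keys = skb->flow_keys;
            void *data = (void *)(long)skb->data;
            void *data_end = (void *)(long)skb->data_end;
            struct iphdr *iph;

            if (keys->n_proto != bpf_htons(ETH_P_IP))
                    return BPF_DROP;        /* unsupported L3 protocol */

            iph = data + keys->nhoff;
            if ((void *)(iph + 1) > data_end)
                    return BPF_DROP;        /* truncated header */

            keys->addr_proto = ETH_P_IP;
            keys->ipv4_src = iph->saddr;
            keys->ipv4_dst = iph->daddr;
            keys->ip_proto = iph->protocol;
            keys->thoff = keys->nhoff + iph->ihl * 4;  /* transport header */
            return BPF_OK;
    }

A real dissector additionally handles IPv6, VLAN tags, fragments and encapsulation, which is what the ``jmp_table``/``bpf_tail_call`` structure of the reference implementation is for.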
+ +API +=== + +BPF flow dissector programs operate on an ``__sk_buff``. However, only a +limited set of fields is allowed: ``data``, ``data_end`` and ``flow_keys``. +``flow_keys`` is ``struct bpf_flow_keys`` and contains flow dissector input +and output arguments. + +The inputs are: + * ``nhoff`` - initial offset of the networking header + * ``thoff`` - initial offset of the transport header, initialized to nhoff + * ``n_proto`` - L3 protocol type, parsed out of L2 header + +The flow dissector BPF program should fill out the rest of the ``struct +bpf_flow_keys`` fields. The input arguments ``nhoff/thoff/n_proto`` should +also be adjusted accordingly. + +The return code of the BPF program is either BPF_OK to indicate successful +dissection, or BPF_DROP to indicate a parsing error. + +__sk_buff->data +=============== + +In the VLAN-less case, this is what the initial state of the BPF flow +dissector looks like:: + + +------+------+------------+-----------+ + | DMAC | SMAC | ETHER_TYPE | L3_HEADER | + +------+------+------------+-----------+ + ^ + | + +-- flow dissector starts here + + +.. code:: c + + skb->data + flow_keys->nhoff point to the first byte of L3_HEADER + flow_keys->thoff = nhoff + flow_keys->n_proto = ETHER_TYPE + +In the VLAN case, the flow dissector can be called in two different states. + +Pre-VLAN parsing:: + + +------+------+------+-----+-----------+-----------+ + | DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | + +------+------+------+-----+-----------+-----------+ + ^ + | + +-- flow dissector starts here + +.. code:: c + + skb->data + flow_keys->nhoff point to the first byte of TCI + flow_keys->thoff = nhoff + flow_keys->n_proto = TPID + +Please note that the TPID can be 802.1AD and, hence, the BPF program would +have to parse VLAN information twice for double-tagged packets. + + +Post-VLAN parsing:: + + +------+------+------+-----+-----------+-----------+ + | DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | + +------+------+------+-----+-----------+-----------+ + ^ + | + +-- flow dissector starts here + +.. code:: c + + skb->data + flow_keys->nhoff point to the first byte of L3_HEADER + flow_keys->thoff = nhoff + flow_keys->n_proto = ETHER_TYPE + +In this case the VLAN information has been processed before the flow dissector +and the BPF flow dissector is not required to handle it. + + +The takeaway here is as follows: the same BPF flow dissector program may be +called with or without a VLAN header (single or double tagged) and must be +written carefully to gracefully handle both cases. + + +Reference Implementation +======================== + +See ``tools/testing/selftests/bpf/progs/bpf_flow.c`` for the reference +implementation and ``tools/testing/selftests/bpf/flow_dissector_load.[hc]`` +for the loader. bpftool can be used to load the BPF flow dissector program as +well. + +The reference implementation is organized as follows: + * ``jmp_table`` map that contains sub-programs for each supported L3 protocol + * ``_dissect`` routine - entry point; it parses the input ``n_proto`` and + does a ``bpf_tail_call`` to the appropriate L3 handler + +Since BPF at this point doesn't support looping (or any jumping back), +jmp_table is used instead to handle multiple levels of encapsulation (and +IPv6 options). + + +Current Limitations +=================== +The BPF flow dissector doesn't support exporting all the metadata that the +in-kernel C-based implementation can export.
A notable example is single VLAN (802.1Q) +and double VLAN (802.1AD) tags. Please refer to ``struct bpf_flow_keys`` +for the set of information that can currently be exported from the BPF context. diff --git a/Documentation/networking/decnet.txt b/Documentation/networking/decnet.txt index e12a4900cf72cb00b1ade4c0257a23c93d2d8f21..d192f8b9948b5483c16b83f41a5b25c1e5cda846 100644 --- a/Documentation/networking/decnet.txt +++ b/Documentation/networking/decnet.txt @@ -22,8 +22,6 @@ you'll need the following options as well... CONFIG_DECNET_ROUTER (to be able to add/delete routes) CONFIG_NETFILTER (will be required for the DECnet routing daemon) - CONFIG_DECNET_ROUTE_FWMARK is optional - Don't turn on SIOCGIFCONF support for DECnet unless you are really sure that you need it, in general you won't and it can cause ifconfig to malfunction. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 5449149be496fa8448fa5b74bafe2c5c796cb06d..984e68f9e0269507132846517a4c4c2b8d726216 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -9,6 +9,7 @@ Contents: netdev-FAQ af_xdp batman-adv + bpf_flow_dissector can can_ucan_protocol device_drivers/freescale/dpaa2/index diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index acdfb5d2bcaa44a8a0ecdcfcae14202d1ed75bc3..c4ac35234f0551bdca1f774f3570fe100912474c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -422,6 +422,7 @@ tcp_min_rtt_wlen - INTEGER minimum RTT when it is moved to a longer path (e.g., due to traffic engineering). A longer window makes the filter more resistant to RTT inflations such as transient congestion. The unit is seconds. + Possible values: 0 - 86400 (1 day) Default: 300 tcp_moderate_rcvbuf - BOOLEAN @@ -1336,6 +1337,7 @@ tag - INTEGER Default value is 0. xfrm4_gc_thresh - INTEGER + (Obsolete since linux-4.14) The threshold at which we will start garbage collecting for IPv4 destination cache entries. At twice this value the system will refuse new allocations. @@ -1919,6 +1921,7 @@ echo_ignore_all - BOOLEAN Default: 0 xfrm6_gc_thresh - INTEGER + (Obsolete since linux-4.14) The threshold at which we will start garbage collecting for IPv6 destination cache entries. At twice this value the system will refuse new allocations. diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index 8c7a713cf657a769f011dfd45676473e2ee94e2e..642fa963be3cf8f325c29947072e6c6851213d4b 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -132,7 +132,7 @@ version that should be applied. If there is any doubt, the maintainer will reply and ask what should be done. Q: I made changes to only a few patches in a patch series should I resend only those changed? --------------------------------------------------------------------------------------------- +--------------------------------------------------------------------------------------------- A: No, please resend the entire patch series and make sure you do number your patches such that it is clear this is the latest and greatest set of patches that can be applied.
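Returning to the tcp_min_rtt_wlen range documented above, a trivial userspace sketch of setting it within the new 0 - 86400 bound could look like this; it is illustrative only and simply writes the procfs file backing the sysctl:

	#include <stdio.h>

	/*
	 * Illustrative helper: set tcp_min_rtt_wlen, enforcing the
	 * documented 0 - 86400 (seconds) range.  Returns 0 on success.
	 */
	static int set_tcp_min_rtt_wlen(int seconds)
	{
		FILE *f;

		if (seconds < 0 || seconds > 86400)
			return -1;	/* outside the documented range */

		f = fopen("/proc/sys/net/ipv4/tcp_min_rtt_wlen", "w");
		if (!f)
			return -1;	/* needs procfs and privileges */
		if (fprintf(f, "%d\n", seconds) < 0) {
			fclose(f);
			return -1;
		}
		return fclose(f);
	}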
diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 2df5894353d6954f5c0dd26d2c149c6e9ee6ee4c..cd7303d7fa25dac9ae38d0e73186f3687b7872a7 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -1009,16 +1009,18 @@ The kernel interface functions are as follows: (*) Check call still alive. - u32 rxrpc_kernel_check_life(struct socket *sock, - struct rxrpc_call *call); + bool rxrpc_kernel_check_life(struct socket *sock, + struct rxrpc_call *call, + u32 *_life); void rxrpc_kernel_probe_life(struct socket *sock, struct rxrpc_call *call); - The first function returns a number that is updated when ACKs are received - from the peer (notably including PING RESPONSE ACKs which we can elicit by - sending PING ACKs to see if the call still exists on the server). The - caller should compare the numbers of two calls to see if the call is still - alive after waiting for a suitable interval. + The first function passes back in *_life a number that is updated when + ACKs are received from the peer (notably including PING RESPONSE ACKs + which we can elicit by sending PING ACKs to see if the call still exists + on the server). The caller should compare the numbers of two calls to see + if the call is still alive after waiting for a suitable interval. It also + returns true as long as the call hasn't yet reached the completed state. This allows the caller to work out if the server is still contactable and if the call is still alive on the server while waiting for the server to diff --git a/Documentation/robust-futexes.txt b/Documentation/robust-futexes.txt index 6c42c75103eb6db715cf8a03b49e85b38070e0ae..6361fb01c9c1e836abb3616de36f8ccf3bf86561 100644 --- a/Documentation/robust-futexes.txt +++ b/Documentation/robust-futexes.txt @@ -218,5 +218,4 @@ All other architectures should build just fine too - but they won't have the new syscalls yet. Architectures need to implement the new futex_atomic_cmpxchg_inatomic() -inline function before writing up the syscalls (that function returns --ENOSYS right now). +inline function before writing up the syscalls. diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6af24cdb25ccb51a947d0bf50f3442b53a963d81..3f13d8599337ea8a010d3a33ae605201691a427e 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -866,14 +866,14 @@ The intent is that compaction has less work to do in the future and to increase the success rate of future high-order allocations such as SLUB allocations, THP and hugetlbfs pages. -To make it sensible with respect to the watermark_scale_factor parameter, -the unit is in fractions of 10,000. The default value of 15,000 means -that up to 150% of the high watermark will be reclaimed in the event of -a pageblock being mixed due to fragmentation. The level of reclaim is -determined by the number of fragmentation events that occurred in the -recent past. If this value is smaller than a pageblock then a pageblocks -worth of pages will be reclaimed (e.g. 2MB on 64-bit x86). A boost factor -of 0 will disable the feature. +To make it sensible with respect to the watermark_scale_factor +parameter, the unit is in fractions of 10,000. The default value of +15,000 on !DISCONTIGMEM configurations means that up to 150% of the high +watermark will be reclaimed in the event of a pageblock being mixed due +to fragmentation. The level of reclaim is determined by the number of +fragmentation events that occurred in the recent past. 
If this value is +smaller than a pageblock then a pageblocks worth of pages will be reclaimed +(e.g. 2MB on 64-bit x86). A boost factor of 0 will disable the feature. ============================================================= diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index 7f01fb1c10842dbd87bf8b1da834272c2ac92a47..db0b9d8619f1aefbcc0c750d9211471228bb11db 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -493,10 +493,8 @@ CPU 에게 기대할 수 있는 최소한의 보장사항 몇가지가 있습니 이 타입의 오퍼레이션은 단방향의 투과성 배리어처럼 동작합니다. ACQUIRE 오퍼레이션 뒤의 모든 메모리 오퍼레이션들이 ACQUIRE 오퍼레이션 후에 일어난 것으로 시스템의 나머지 컴포넌트들에 보이게 될 것이 보장됩니다. - LOCK 오퍼레이션과 smp_load_acquire(), smp_cond_acquire() 오퍼레이션도 - ACQUIRE 오퍼레이션에 포함됩니다. smp_cond_acquire() 오퍼레이션은 컨트롤 - 의존성과 smp_rmb() 를 사용해서 ACQUIRE 의 의미적 요구사항(semantic)을 - 충족시킵니다. + LOCK 오퍼레이션과 smp_load_acquire(), smp_cond_load_acquire() 오퍼레이션도 + ACQUIRE 오퍼레이션에 포함됩니다. ACQUIRE 오퍼레이션 앞의 메모리 오퍼레이션들은 ACQUIRE 오퍼레이션 완료 후에 수행된 것처럼 보일 수 있습니다. @@ -2146,33 +2144,40 @@ set_current_state() 는 다음의 것들로 감싸질 수도 있습니다: event_indicated = 1; wake_up_process(event_daemon); -wake_up() 류에 의해 쓰기 메모리 배리어가 내포됩니다. 만약 그것들이 뭔가를 -깨운다면요. 이 배리어는 태스크 상태가 지워지기 전에 수행되므로, 이벤트를 -알리기 위한 STORE 와 태스크 상태를 TASK_RUNNING 으로 설정하는 STORE 사이에 -위치하게 됩니다. +wake_up() 이 무언가를 깨우게 되면, 이 함수는 범용 메모리 배리어를 수행합니다. +이 함수가 아무것도 깨우지 않는다면 메모리 배리어는 수행될 수도, 수행되지 않을 +수도 있습니다; 이 경우에 메모리 배리어를 수행할 거라 오해해선 안됩니다. 이 +배리어는 태스크 상태가 접근되기 전에 수행되는데, 자세히 말하면 이 이벤트를 +알리기 위한 STORE 와 TASK_RUNNING 으로 상태를 쓰는 STORE 사이에 수행됩니다: - CPU 1 CPU 2 + CPU 1 (Sleeper) CPU 2 (Waker) =============================== =============================== set_current_state(); STORE event_indicated smp_store_mb(); wake_up(); - STORE current->state <쓰기 배리어> - <범용 배리어> STORE current->state - LOAD event_indicated + STORE current->state ... + <범용 배리어> <범용 배리어> + LOAD event_indicated if ((LOAD task->state) & TASK_NORMAL) + STORE task->state -한번더 말합니다만, 이 쓰기 메모리 배리어는 이 코드가 정말로 뭔가를 깨울 때에만 -실행됩니다. 이걸 설명하기 위해, X 와 Y 는 모두 0 으로 초기화 되어 있다는 가정 -하에 아래의 이벤트 시퀀스를 생각해 봅시다: +여기서 "task" 는 깨어나지는 쓰레드이고 CPU 1 의 "current" 와 같습니다. + +반복하지만, wake_up() 이 무언가를 정말 깨운다면 범용 메모리 배리어가 수행될 +것이 보장되지만, 그렇지 않다면 그런 보장이 없습니다. 이걸 이해하기 위해, X 와 +Y 는 모두 0 으로 초기화 되어 있다는 가정 하에 아래의 이벤트 시퀀스를 생각해 +봅시다: CPU 1 CPU 2 =============================== =============================== - X = 1; STORE event_indicated + X = 1; Y = 1; smp_mb(); wake_up(); - Y = 1; wait_event(wq, Y == 1); - wake_up(); load from Y sees 1, no memory barrier - load from X might see 0 + LOAD Y LOAD X + +정말로 깨우기가 행해졌다면, 두 로드 중 (최소한) 하나는 1 을 보게 됩니다. +반면에, 실제 깨우기가 행해지지 않았다면, 두 로드 모두 0을 볼 수도 있습니다. -위 예제에서의 경우와 달리 깨우기가 정말로 행해졌다면, CPU 2 의 X 로드는 1 을 -본다고 보장될 수 있을 겁니다. +wake_up_process() 는 항상 범용 메모리 배리어를 수행합니다. 이 배리어 역시 +태스크 상태가 접근되기 전에 수행됩니다. 특히, 앞의 예제 코드에서 wake_up() 이 +wake_up_process() 로 대체된다면 두 로드 중 하나는 1을 볼 것이 보장됩니다. 사용 가능한 깨우기류 함수들로 다음과 같은 것들이 있습니다: @@ -2192,6 +2197,8 @@ wake_up() 류에 의해 쓰기 메모리 배리어가 내포됩니다. 만약 wake_up_poll(); wake_up_process(); +메모리 순서규칙 관점에서, 이 함수들은 모두 wake_up() 과 같거나 보다 강한 순서 +보장을 제공합니다. [!] 잠재우는 코드와 깨우는 코드에 내포되는 메모리 배리어들은 깨우기 전에 이루어진 스토어를 잠재우는 코드가 set_current_state() 를 호출한 후에 행하는 diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 67068c47c591a5ce8fc373313d46f434863ef54b..64b38dfcc243964bfdccc905908c1d39b965817d 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -321,7 +321,7 @@ cpu's hardware control block. 
4.8 KVM_GET_DIRTY_LOG (vm ioctl) Capability: basic -Architectures: x86 +Architectures: all Type: vm ioctl Parameters: struct kvm_dirty_log (in/out) Returns: 0 on success, -1 on error @@ -3810,7 +3810,7 @@ to I/O ports. 4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl) Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT -Architectures: x86 +Architectures: x86, arm, arm64, mips Type: vm ioctl Parameters: struct kvm_dirty_log (in) Returns: 0 on success, -1 on error @@ -3830,8 +3830,9 @@ The ioctl clears the dirty status of pages in a memory slot, according to the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap field. Bit 0 of the bitmap corresponds to page "first_page" in the memory slot, and num_pages is the size in bits of the input bitmap. -Both first_page and num_pages must be a multiple of 64. For each bit -that is set in the input bitmap, the corresponding page is marked "clean" +first_page must be a multiple of 64; num_pages must also be a multiple of +64 unless first_page + num_pages is the size of the memory slot. For each +bit that is set in the input bitmap, the corresponding page is marked "clean" in KVM's dirty bitmap, and dirty tracking is re-enabled for that page (for example via write-protection, or by clearing the dirty bit in a page table entry). @@ -4799,7 +4800,7 @@ and injected exceptions. 7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT -Architectures: all +Architectures: x86, arm, arm64, mips Parameters: args[0] whether feature should be enabled or not With this capability enabled, KVM_GET_DIRTY_LOG will not automatically diff --git a/Documentation/x86/kernel-stacks b/Documentation/x86/kernel-stacks index 9a0aa4d3a866938170c1b652577c9ed9d69aab29..d1bfb0b95ee0bcb3428598547742ff6460223f05 100644 --- a/Documentation/x86/kernel-stacks +++ b/Documentation/x86/kernel-stacks @@ -59,7 +59,7 @@ If that assumption is ever broken then the stacks will become corrupt. The currently assigned IST stacks are :- -* DOUBLEFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE). +* ESTACK_DF. EXCEPTION_STKSZ (PAGE_SIZE). Used for interrupt 8 - Double Fault Exception (#DF). @@ -68,7 +68,7 @@ The currently assigned IST stacks are :- Using a separate stack allows the kernel to recover from it well enough in many cases to still output an oops. -* NMI_STACK. EXCEPTION_STKSZ (PAGE_SIZE). +* ESTACK_NMI. EXCEPTION_STKSZ (PAGE_SIZE). Used for non-maskable interrupts (NMI). @@ -76,7 +76,7 @@ The currently assigned IST stacks are :- middle of switching stacks. Using IST for NMI events avoids making assumptions about the previous state of the kernel stack. -* DEBUG_STACK. DEBUG_STKSZ +* ESTACK_DB. EXCEPTION_STKSZ (PAGE_SIZE). Used for hardware debug interrupts (interrupt 1) and for software debug interrupts (INT3). @@ -86,7 +86,12 @@ The currently assigned IST stacks are :- avoids making assumptions about the previous state of the kernel stack. + To handle nested #DB correctly there exist two instances of the #DB stack. + On #DB entry the IST stack pointer for #DB is switched to the second + instance so that a nested #DB starts from a clean stack. The nested #DB + switches the IST stack pointer to a guard hole to catch triple nesting. + -* MCE_STACK. EXCEPTION_STKSZ (PAGE_SIZE). +* ESTACK_MCE. EXCEPTION_STKSZ (PAGE_SIZE). Used for interrupt 18 - Machine Check Exception (#MC).
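The relaxed first_page/num_pages alignment rule for KVM_CLEAR_DIRTY_LOG quoted above can be summarized as a small validity check. This is an illustrative sketch rather than KVM's actual code; clear_range_valid() and slot_npages are hypothetical names:

	#include <stdbool.h>
	#include <stdint.h>

	/*
	 * Sketch of the KVM_CLEAR_DIRTY_LOG argument rule: first_page must
	 * be 64-aligned, and num_pages must be a multiple of 64 unless the
	 * range ends exactly at the end of the memory slot, which is
	 * slot_npages pages long.
	 */
	static bool clear_range_valid(uint64_t first_page, uint64_t num_pages,
				      uint64_t slot_npages)
	{
		if (first_page % 64)
			return false;
		if ((num_pages % 64) && (first_page + num_pages != slot_npages))
			return false;
		return num_pages && first_page + num_pages <= slot_npages;
	}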
diff --git a/Documentation/x86/topology.txt b/Documentation/x86/topology.txt index 2953e3ec9a0259f102ce40b9c28340f761c2df4e..06b3cdbc404899de7fc5c9cea17dd8e134df86b4 100644 --- a/Documentation/x86/topology.txt +++ b/Documentation/x86/topology.txt @@ -51,7 +51,7 @@ The topology of a system is described in the units of: The physical ID of the package. This information is retrieved via CPUID and deduced from the APIC IDs of the cores in the package. - - cpuinfo_x86.logical_id: + - cpuinfo_x86.logical_proc_id: The logical ID of the package. As we do not trust BIOSes to enumerate the packages in a consistent way, we introduced the concept of logical package diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 804f9426ed17bdcf0c8fb6dc682ae9254050beb9..6cbe652d7a4973d665a3c8733d639d05da521332 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -72,7 +72,7 @@ Complete virtual memory map with 5-level page tables Notes: - With 56-bit addresses, user-space memory gets expanded by a factor of 512x, - from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting + from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PB starting offset and many of the regions expand to support the much larger physical memory supported. @@ -83,7 +83,7 @@ Notes: 0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm __________________|____________|__________________|_________|___________________________________________________________ | | | | - 0000800000000000 | +64 PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical + 0100000000000000 | +64 PB | feffffffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical | | | | virtual memory addresses up to the -64 PB | | | | starting offset of kernel mappings. __________________|____________|__________________|_________|___________________________________________________________ @@ -99,7 +99,7 @@ ____________________________________________________________|___________________ ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base) ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... 
unused hole - ffdf000000000000 | -8.25 PB | fffffdffffffffff | ~8 PB | KASAN shadow memory + ffdf000000000000 | -8.25 PB | fffffbffffffffff | ~8 PB | KASAN shadow memory __________________|____________|__________________|_________|____________________________________________________________ | | Identical layout to the 47-bit one from here on: diff --git a/MAINTAINERS b/MAINTAINERS index 43b36dbed48e77bffcc94ea79b8a2e6d79ad0aae..6d3c23bc6d15b1fcafa3d55e6f08dee8a3b46d1f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1893,14 +1893,15 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-nomadik.git ARM/NUVOTON NPCM ARCHITECTURE M: Avi Fishman M: Tomer Maimon +M: Tali Perry R: Patrick Venture R: Nancy Yuen -R: Brendan Higgins +R: Benjamin Fair L: openbmc@lists.ozlabs.org (moderated for non-subscribers) S: Supported F: arch/arm/mach-npcm/ F: arch/arm/boot/dts/nuvoton-npcm* -F: include/dt-bindings/clock/nuvoton,npcm7xx-clks.h +F: include/dt-bindings/clock/nuvoton,npcm7xx-clock.h F: drivers/*/*npcm* F: Documentation/devicetree/bindings/*/*npcm* F: Documentation/devicetree/bindings/*/*/*npcm* @@ -3120,6 +3121,7 @@ F: drivers/cpufreq/bmips-cpufreq.c BROADCOM BMIPS MIPS ARCHITECTURE M: Kevin Cernekee M: Florian Fainelli +L: bcm-kernel-feedback-list@broadcom.com L: linux-mips@vger.kernel.org T: git git://github.com/broadcom/stblinux.git S: Maintained @@ -4129,7 +4131,7 @@ F: drivers/cpuidle/* F: include/linux/cpuidle.h CRAMFS FILESYSTEM -M: Nicolas Pitre +M: Nicolas Pitre S: Maintained F: Documentation/filesystems/cramfs.txt F: fs/cramfs/ @@ -4551,6 +4553,7 @@ S: Maintained F: drivers/devfreq/ F: include/linux/devfreq.h F: Documentation/devicetree/bindings/devfreq/ +F: include/trace/events/devfreq.h DEVICE FREQUENCY EVENT (DEVFREQ-EVENT) M: Chanwoo Choi @@ -5596,6 +5599,12 @@ L: linux-edac@vger.kernel.org S: Maintained F: drivers/edac/ghes_edac.c +EDAC-I10NM +M: Tony Luck +L: linux-edac@vger.kernel.org +S: Maintained +F: drivers/edac/i10nm_base.c + EDAC-I3000 L: linux-edac@vger.kernel.org S: Orphan @@ -5677,7 +5686,7 @@ EDAC-SKYLAKE M: Tony Luck L: linux-edac@vger.kernel.org S: Maintained -F: drivers/edac/skx_edac.c +F: drivers/edac/skx_*.c EDAC-TI M: Tero Kristo @@ -5833,7 +5842,7 @@ L: netdev@vger.kernel.org S: Maintained F: Documentation/ABI/testing/sysfs-bus-mdio F: Documentation/devicetree/bindings/net/mdio* -F: Documentation/networking/phy.txt +F: Documentation/networking/phy.rst F: drivers/net/phy/ F: drivers/of/of_mdio.c F: drivers/of/of_net.c @@ -6460,7 +6469,7 @@ S: Maintained F: drivers/media/radio/radio-gemtek* GENERIC GPIO I2C DRIVER -M: Haavard Skinnemoen +M: Wolfram Sang S: Supported F: drivers/i2c/busses/i2c-gpio.c F: include/linux/platform_data/i2c-gpio.h @@ -6592,7 +6601,7 @@ M: Andy Shevchenko L: linux-gpio@vger.kernel.org L: linux-acpi@vger.kernel.org S: Maintained -F: Documentation/acpi/gpio-properties.txt +F: Documentation/firmware-guide/acpi/gpio-properties.rst F: drivers/gpio/gpiolib-acpi.c GPIO IR Transmitter @@ -7332,7 +7341,6 @@ F: Documentation/devicetree/bindings/i3c/ F: Documentation/driver-api/i3c F: drivers/i3c/ F: include/linux/i3c/ -F: include/dt-bindings/i3c/ I3C DRIVER FOR SYNOPSYS DESIGNWARE M: Vitor Soares @@ -7515,7 +7523,7 @@ F: include/net/mac802154.h F: include/net/af_ieee802154.h F: include/net/cfg802154.h F: include/net/ieee802154_netdev.h -F: Documentation/networking/ieee802154.txt +F: Documentation/networking/ieee802154.rst IFE PROTOCOL M: Yotam Gigi @@ -8707,6 +8715,7 @@ F: scripts/leaking_addresses.pl LED SUBSYSTEM M: Jacek Anaszewski M: 
Pavel Machek +R: Dan Murphy L: linux-leds@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/j.anaszewski/linux-leds.git S: Maintained @@ -8992,7 +9001,7 @@ R: Daniel Lustig L: linux-kernel@vger.kernel.org L: linux-arch@vger.kernel.org S: Supported -T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev F: tools/memory-model/ F: Documentation/atomic_bitops.txt F: Documentation/atomic_t.txt @@ -9098,7 +9107,6 @@ F: arch/*/include/asm/spinlock*.h F: include/linux/rwlock*.h F: include/linux/mutex*.h F: include/linux/rwsem*.h -F: arch/*/include/asm/rwsem.h F: include/linux/seqlock.h F: lib/locking*.[ch] F: kernel/locking/ @@ -10144,7 +10152,7 @@ F: drivers/spi/spi-at91-usart.c F: Documentation/devicetree/bindings/mfd/atmel-usart.txt MICROCHIP KSZ SERIES ETHERNET SWITCH DRIVER -M: Woojung Huh +M: Woojung Huh M: Microchip Linux Driver Support L: netdev@vger.kernel.org S: Maintained @@ -12174,6 +12182,7 @@ F: arch/*/kernel/*/*/perf_event*.c F: arch/*/include/asm/perf_event.h F: arch/*/kernel/perf_callchain.c F: arch/*/events/* +F: arch/*/events/*/* F: tools/perf/ PERSONALITY HANDLING @@ -12414,7 +12423,7 @@ M: Mark Rutland M: Lorenzo Pieralisi L: linux-arm-kernel@lists.infradead.org S: Maintained -F: drivers/firmware/psci*.c +F: drivers/firmware/psci/ F: include/linux/psci.h F: include/uapi/linux/psci.h @@ -13040,9 +13049,9 @@ M: Josh Triplett R: Steven Rostedt R: Mathieu Desnoyers R: Lai Jiangshan -L: linux-kernel@vger.kernel.org +L: rcu@vger.kernel.org S: Supported -T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev F: tools/testing/selftests/rcutorture RDC R-321X SoC @@ -13088,10 +13097,10 @@ R: Steven Rostedt R: Mathieu Desnoyers R: Lai Jiangshan R: Joel Fernandes -L: linux-kernel@vger.kernel.org +L: rcu@vger.kernel.org W: http://www.rdrop.com/users/paulmck/RCU/ S: Supported -T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev F: Documentation/RCU/ X: Documentation/RCU/torture.txt F: include/linux/rcu* @@ -13981,7 +13990,7 @@ F: drivers/media/rc/serial_ir.c SFC NETWORK DRIVER M: Solarflare linux maintainers M: Edward Cree -M: Bert Kenward +M: Martin Habets L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/sfc/ @@ -14243,10 +14252,10 @@ M: "Paul E. McKenney" M: Josh Triplett R: Steven Rostedt R: Mathieu Desnoyers -L: linux-kernel@vger.kernel.org +L: rcu@vger.kernel.org W: http://www.rdrop.com/users/paulmck/RCU/ S: Supported -T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev F: include/linux/srcu*.h F: kernel/rcu/srcu*.c @@ -15693,7 +15702,7 @@ M: "Paul E. McKenney" M: Josh Triplett L: linux-kernel@vger.kernel.org S: Supported -T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev F: Documentation/RCU/torture.txt F: kernel/torture.c F: kernel/rcu/rcutorture.c @@ -16508,7 +16517,7 @@ F: drivers/char/virtio_console.c F: include/linux/virtio_console.h F: include/uapi/linux/virtio_console.h -VIRTIO CORE, NET AND BLOCK DRIVERS +VIRTIO CORE AND NET DRIVERS M: "Michael S. 
Tsirkin" M: Jason Wang L: virtualization@lists.linux-foundation.org @@ -16523,6 +16532,19 @@ F: include/uapi/linux/virtio_*.h F: drivers/crypto/virtio/ F: mm/balloon_compaction.c +VIRTIO BLOCK AND SCSI DRIVERS +M: "Michael S. Tsirkin" +M: Jason Wang +R: Paolo Bonzini +R: Stefan Hajnoczi +L: virtualization@lists.linux-foundation.org +S: Maintained +F: drivers/block/virtio_blk.c +F: drivers/scsi/virtio_scsi.c +F: include/uapi/linux/virtio_blk.h +F: include/uapi/linux/virtio_scsi.h +F: drivers/vhost/scsi.c + VIRTIO CRYPTO DRIVER M: Gonglei L: virtualization@lists.linux-foundation.org @@ -16919,7 +16941,7 @@ M: Tony Luck M: Borislav Petkov L: linux-edac@vger.kernel.org S: Maintained -F: arch/x86/kernel/cpu/mcheck/* +F: arch/x86/kernel/cpu/mce/* X86 MICROCODE UPDATE SUPPORT M: Borislav Petkov diff --git a/Makefile b/Makefile index 026fbc450906ad0aba4bf587c880c5c51ed404f8..26c92f892d24b1481b3b3ee29e1a53224c4e704d 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 1 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = NAME = Shy Crocodile # *DOCUMENTATION* @@ -678,6 +678,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning,frame-address,) KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation) KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow) KBUILD_CFLAGS += $(call cc-disable-warning, int-in-bool-context) +KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os @@ -719,7 +720,6 @@ ifdef CONFIG_CC_IS_CLANG KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) -KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) # Quiet clang warning: comparison of unsigned expression < 0 is always false KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare) # CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the diff --git a/arch/Kconfig b/arch/Kconfig index 33687dddd86a7e04dfa7e7829788b4a0442ae61a..5e43fcbad4ca551493345f4a052c73461a85ae78 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -249,6 +249,10 @@ config ARCH_HAS_FORTIFY_SOURCE config ARCH_HAS_SET_MEMORY bool +# Select if arch has all set_direct_map_invalid/default() functions +config ARCH_HAS_SET_DIRECT_MAP + bool + # Select if arch init_task must go in the __init_task_data section config ARCH_TASK_STRUCT_ON_STACK bool @@ -383,7 +387,13 @@ config HAVE_ARCH_JUMP_LABEL_RELATIVE config HAVE_RCU_TABLE_FREE bool -config HAVE_RCU_TABLE_INVALIDATE +config HAVE_RCU_TABLE_NO_INVALIDATE + bool + +config HAVE_MMU_GATHER_PAGE_SIZE + bool + +config HAVE_MMU_GATHER_NO_GATHER bool config ARCH_HAVE_NMI_SAFE_CMPXCHG @@ -901,6 +911,15 @@ config HAVE_ARCH_PREL32_RELOCATIONS config ARCH_USE_MEMREMAP_PROT bool +config LOCK_EVENT_COUNTS + bool "Locking event counts collection" + depends on DEBUG_FS + ---help--- + Enable light-weight counting of various locking related events + in the system with minimal performance impact. This reduces + the chance of application behavior change because of timing + differences. The counts are reported via debugfs. 
+ source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 584a6e1148539682a34a4c480c3f5252ee169267..f7b19b813a70199bdf8cf48029c5d3f1932fc2aa 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -36,6 +36,7 @@ config ALPHA select ODD_RT_SIGACTION select OLD_SIGSUSPEND select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 + select MMU_GATHER_NO_RANGE help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, @@ -49,13 +50,6 @@ config MMU bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config ARCH_HAS_ILOG2_U32 bool default n diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 70b783333965e875a7cb4f0a110327cea46cd663..89e87bbc987fa362c81bc8226766916924793214 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -9,6 +9,7 @@ generic-y += irq_work.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += preempt.h generic-y += sections.h generic-y += trace_clock.h diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index 4c533fc94d62f372786331cd1fe305d2c8d83f99..ccf9d65166bb621da0086624458f442fa4f6608f 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -513,8 +513,6 @@ extern inline void writeq(u64 b, volatile void __iomem *addr) #define writel_relaxed(b, addr) __raw_writel(b, addr) #define writeq_relaxed(b, addr) __raw_writeq(b, addr) -#define mmiowb() - /* * String version of IO memory access ops: */ diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h deleted file mode 100644 index cf8fc8f9a2ed566067a3a67724f3a4a4389fab54..0000000000000000000000000000000000000000 --- a/arch/alpha/include/asm/rwsem.h +++ /dev/null @@ -1,211 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ALPHA_RWSEM_H -#define _ALPHA_RWSEM_H - -/* - * Written by Ivan Kokshaysky , 2001. 
- * Based on asm-alpha/semaphore.h and asm-i386/rwsem.h - */ - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ - -#include - -#define RWSEM_UNLOCKED_VALUE 0x0000000000000000L -#define RWSEM_ACTIVE_BIAS 0x0000000000000001L -#define RWSEM_ACTIVE_MASK 0x00000000ffffffffL -#define RWSEM_WAITING_BIAS (-0x0000000100000000L) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -static inline int ___down_read(struct rw_semaphore *sem) -{ - long oldcount; -#ifndef CONFIG_SMP - oldcount = sem->count.counter; - sem->count.counter += RWSEM_ACTIVE_READ_BIAS; -#else - long temp; - __asm__ __volatile__( - "1: ldq_l %0,%1\n" - " addq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - " mb\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) - :"Ir" (RWSEM_ACTIVE_READ_BIAS), "m" (sem->count) : "memory"); -#endif - return (oldcount < 0); -} - -static inline void __down_read(struct rw_semaphore *sem) -{ - if (unlikely(___down_read(sem))) - rwsem_down_read_failed(sem); -} - -static inline int __down_read_killable(struct rw_semaphore *sem) -{ - if (unlikely(___down_read(sem))) - if (IS_ERR(rwsem_down_read_failed_killable(sem))) - return -EINTR; - - return 0; -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - long old, new, res; - - res = atomic_long_read(&sem->count); - do { - new = res + RWSEM_ACTIVE_READ_BIAS; - if (new <= 0) - break; - old = res; - res = atomic_long_cmpxchg(&sem->count, old, new); - } while (res != old); - return res >= 0 ? 1 : 0; -} - -static inline long ___down_write(struct rw_semaphore *sem) -{ - long oldcount; -#ifndef CONFIG_SMP - oldcount = sem->count.counter; - sem->count.counter += RWSEM_ACTIVE_WRITE_BIAS; -#else - long temp; - __asm__ __volatile__( - "1: ldq_l %0,%1\n" - " addq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - " mb\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) - :"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory"); -#endif - return oldcount; -} - -static inline void __down_write(struct rw_semaphore *sem) -{ - if (unlikely(___down_write(sem))) - rwsem_down_write_failed(sem); -} - -static inline int __down_write_killable(struct rw_semaphore *sem) -{ - if (unlikely(___down_write(sem))) { - if (IS_ERR(rwsem_down_write_failed_killable(sem))) - return -EINTR; - } - - return 0; -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - long ret = atomic_long_cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - if (ret == RWSEM_UNLOCKED_VALUE) - return 1; - return 0; -} - -static inline void __up_read(struct rw_semaphore *sem) -{ - long oldcount; -#ifndef CONFIG_SMP - oldcount = sem->count.counter; - sem->count.counter -= RWSEM_ACTIVE_READ_BIAS; -#else - long temp; - __asm__ __volatile__( - " mb\n" - "1: ldq_l %0,%1\n" - " subq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) - :"Ir" (RWSEM_ACTIVE_READ_BIAS), "m" (sem->count) : "memory"); -#endif - if (unlikely(oldcount < 0)) - if ((int)oldcount - RWSEM_ACTIVE_READ_BIAS == 0) - rwsem_wake(sem); -} - -static inline void __up_write(struct 
rw_semaphore *sem) -{ - long count; -#ifndef CONFIG_SMP - sem->count.counter -= RWSEM_ACTIVE_WRITE_BIAS; - count = sem->count.counter; -#else - long temp; - __asm__ __volatile__( - " mb\n" - "1: ldq_l %0,%1\n" - " subq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - " subq %0,%3,%0\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (count), "=m" (sem->count), "=&r" (temp) - :"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory"); -#endif - if (unlikely(count)) - if ((int)count == 0) - rwsem_wake(sem); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - long oldcount; -#ifndef CONFIG_SMP - oldcount = sem->count.counter; - sem->count.counter -= RWSEM_WAITING_BIAS; -#else - long temp; - __asm__ __volatile__( - "1: ldq_l %0,%1\n" - " addq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - " mb\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) - :"Ir" (-RWSEM_WAITING_BIAS), "m" (sem->count) : "memory"); -#endif - if (unlikely(oldcount < 0)) - rwsem_downgrade_wake(sem); -} - -#endif /* __KERNEL__ */ -#endif /* _ALPHA_RWSEM_H */ diff --git a/arch/alpha/include/asm/tlb.h b/arch/alpha/include/asm/tlb.h index 8f5042b61875fdd4c308f2b3e3accab95ff571e8..4f79e331af5ea4237ba8200867bf2161f1676d7c 100644 --- a/arch/alpha/include/asm/tlb.h +++ b/arch/alpha/include/asm/tlb.h @@ -2,12 +2,6 @@ #ifndef _ALPHA_TLB_H #define _ALPHA_TLB_H -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, pte, addr) do { } while (0) - -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include #define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 63ed39cbd3bd13a40e98ec3a9aff3b3266b3e235..165f268beafc471e14eac4c8d6d08e52c5c89864 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -463,3 +463,7 @@ 532 common getppid sys_getppid # all other architectures have common numbers for new syscall, alpha # is the exception. 
+534 common pidfd_send_signal sys_pidfd_send_signal +535 common io_uring_setup sys_io_uring_setup +536 common io_uring_enter sys_io_uring_enter +537 common io_uring_register sys_io_uring_register diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index c781e45d1d9953267b977bc094e0d1acde0ee623..23e063df5d2cf1233665b575193e9c5e7e227e52 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -63,9 +63,6 @@ config SCHED_OMIT_FRAME_POINTER config GENERIC_CSUM def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - config ARCH_DISCONTIGMEM_ENABLE def_bool n diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index 69bc1c9e8e50d673729a6187fb4f1669971c9cb7..7425bb0f2d1b6a8942086bce942f86108c3ddd80 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -18,8 +18,8 @@ model = "snps,hsdk"; compatible = "snps,hsdk"; - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; chosen { bootargs = "earlycon=uart8250,mmio32,0xf0005000,115200n8 console=ttyS0,115200n8 debug print-fatal-signals=1"; @@ -105,7 +105,7 @@ #size-cells = <1>; interrupt-parent = <&idu_intc>; - ranges = <0x00000000 0xf0000000 0x10000000>; + ranges = <0x00000000 0x0 0xf0000000 0x10000000>; cgu_rst: reset-controller@8a0 { compatible = "snps,hsdk-reset"; @@ -269,9 +269,10 @@ }; memory@80000000 { - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; device_type = "memory"; - reg = <0x80000000 0x40000000>; /* 1 GiB */ + reg = <0x0 0x80000000 0x0 0x40000000>; /* 1 GB lowmem */ + /* 0x1 0x00000000 0x0 0x40000000>; 1 GB highmem */ }; }; diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index decc306a3b52c2b96c2e7af7108e7aa171c51c31..393d4f5e145032bafca421778f2daa0df6227438 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += msi.h generic-y += parport.h generic-y += percpu.h diff --git a/arch/arc/include/asm/syscall.h b/arch/arc/include/asm/syscall.h index 29de098043064a20112dd7ffd02f86059261ae44..c7a4201ed62ba70f9f37275475be215b0e7fb6d1 100644 --- a/arch/arc/include/asm/syscall.h +++ b/arch/arc/include/asm/syscall.h @@ -55,12 +55,11 @@ syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, */ static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) + unsigned long *args) { unsigned long *inside_ptregs = &(regs->r0); - inside_ptregs -= i; - - BUG_ON((i + n) > 6); + unsigned int n = 6; + unsigned int i = 0; while (n--) { args[i++] = (*inside_ptregs); diff --git a/arch/arc/include/asm/tlb.h b/arch/arc/include/asm/tlb.h index a9db5f62aaf37988fe8806ac4ee0a14edf713309..90cac97643a46949fd00f3d2678eebb4a10c525b 100644 --- a/arch/arc/include/asm/tlb.h +++ b/arch/arc/include/asm/tlb.h @@ -9,38 +9,6 @@ #ifndef _ASM_ARC_TLB_H #define _ASM_ARC_TLB_H -#define tlb_flush(tlb) \ -do { \ - if (tlb->fullmm) \ - flush_tlb_mm((tlb)->mm); \ -} while (0) - -/* - * This pair is called at time of munmap/exit to flush cache and TLB entries - * for mappings being torn down. 
- * 1) cache-flush part -implemented via tlb_start_vma( ) for VIPT aliasing D$ - * 2) tlb-flush part - implemted via tlb_end_vma( ) flushes the TLB range - * - * Note, read http://lkml.org/lkml/2004/1/15/6 - */ -#ifndef CONFIG_ARC_CACHE_VIPT_ALIASING -#define tlb_start_vma(tlb, vma) -#else -#define tlb_start_vma(tlb, vma) \ -do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ -} while(0) -#endif - -#define tlb_end_vma(tlb, vma) \ -do { \ - if (!tlb->fullmm) \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define __tlb_remove_tlb_entry(tlb, ptep, address) - #include #include diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S index f230bb7092fdb3d7d98883ab7310db1b4bc56654..b3373f5c88e0bf9267af0cb9dbc7f5b0f6cf6be8 100644 --- a/arch/arc/lib/memset-archs.S +++ b/arch/arc/lib/memset-archs.S @@ -30,10 +30,10 @@ #else -.macro PREALLOC_INSTR +.macro PREALLOC_INSTR reg, off .endm -.macro PREFETCHW_INSTR +.macro PREFETCHW_INSTR reg, off .endm #endif diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 4135abec3fb09cd714c4c48d056a010b37f58c48..63e6e65046992f1388a3ae44aae5150abb1ee3f7 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -113,10 +113,24 @@ static void read_decode_cache_bcr_arcv2(int cpu) } READ_BCR(ARC_REG_CLUSTER_BCR, cbcr); - if (cbcr.c) + if (cbcr.c) { ioc_exists = 1; - else + + /* + * As for today we don't support both IOC and ZONE_HIGHMEM enabled + * simultaneously. This happens because as of today IOC aperture covers + * only ZONE_NORMAL (low mem) and any dma transactions outside this + * region won't be HW coherent. + * If we want to use both IOC and ZONE_HIGHMEM we can use + * bounce_buffer to handle dma transactions to HIGHMEM. + * Also it is possible to modify dma_direct cache ops or increase IOC + * aperture size if we are planning to use HIGHMEM without PAE. + */ + if (IS_ENABLED(CONFIG_HIGHMEM) || is_pae40_enabled()) + ioc_enable = 0; + } else { ioc_enable = 0; + } /* HS 2.0 didn't have AUX_VOL */ if (cpuinfo_arc700[cpu].core.family > 0x51) { @@ -1158,19 +1172,6 @@ noinline void __init arc_ioc_setup(void) if (!ioc_enable) return; - /* - * As for today we don't support both IOC and ZONE_HIGHMEM enabled - * simultaneously. This happens because as of today IOC aperture covers - * only ZONE_NORMAL (low mem) and any dma transactions outside this - * region won't be HW coherent. - * If we want to use both IOC and ZONE_HIGHMEM we can use - * bounce_buffer to handle dma transactions to HIGHMEM. - * Also it is possible to modify dma_direct cache ops or increase IOC - * aperture size if we are planning to use HIGHMEM without PAE. 
- */ - if (IS_ENABLED(CONFIG_HIGHMEM)) - panic("IOC and HIGHMEM can't be used simultaneously"); - /* Flush + invalidate + disable L1 dcache */ __dc_disable(); diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 850b4805e2d171436e539b326867d6ce08a6f9d6..dc9855c4a3b404cff6a4dd2ac81ba92d12bbe4fc 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -73,7 +73,7 @@ config ARM select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL - select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL + select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG select HAVE_FUNCTION_TRACER if !XIP_KERNEL select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7) @@ -178,10 +178,6 @@ config TRACE_IRQFLAGS_SUPPORT bool default !CPU_V7M -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config ARCH_HAS_ILOG2_U32 bool diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 6d6e0330930b52f7369a46536473fa7174fad2d9..e388af4594a6e5e42a860469e10a53b89522e7bf 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -47,8 +47,8 @@ config DEBUG_WX choice prompt "Choose kernel unwinder" - default UNWINDER_ARM if AEABI && !FUNCTION_GRAPH_TRACER - default UNWINDER_FRAME_POINTER if !AEABI || FUNCTION_GRAPH_TRACER + default UNWINDER_ARM if AEABI + default UNWINDER_FRAME_POINTER if !AEABI help This determines which method will be used for unwinding kernel stack traces for panics, oopses, bugs, warnings, perf, /proc//stack, @@ -65,7 +65,7 @@ config UNWINDER_FRAME_POINTER config UNWINDER_ARM bool "ARM EABI stack unwinder" - depends on AEABI + depends on AEABI && !FUNCTION_GRAPH_TRACER select ARM_UNWIND help This option enables stack unwinding support in the kernel diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 6c7ccb428c079c3e43ef9cce2c344ec4b6809369..7135820f76d4f8b8d24374738332c0c4c0644bf7 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -1438,7 +1438,21 @@ ENTRY(efi_stub_entry) @ Preserve return value of efi_entry() in r4 mov r4, r0 - bl cache_clean_flush + + @ our cache maintenance code relies on CP15 barrier instructions + @ but since we arrived here with the MMU and caches configured + @ by UEFI, we must check that the CP15BEN bit is set in SCTLR. + @ Note that this bit is RAO/WI on v6 and earlier, so the ISB in + @ the enable path will be executed on v7+ only. + mrc p15, 0, r1, c1, c0, 0 @ read SCTLR + tst r1, #(1 << 5) @ CP15BEN bit set? 
+ bne 0f + orr r1, r1, #(1 << 5) @ CP15 barrier instructions + mcr p15, 0, r1, c1, c0, 0 @ write SCTLR + ARM( .inst 0xf57ff06f @ v7+ isb ) + THUMB( isb ) + +0: bl cache_clean_flush bl cache_off @ Set parameters for booting zImage according to boot protocol diff --git a/arch/arm/boot/dts/am335x-evm.dts b/arch/arm/boot/dts/am335x-evm.dts index dce5be5df97bd91abe3ff039e8befab58656124b..edcff79879e780e5aa307dfc0d18f393663a7f78 100644 --- a/arch/arm/boot/dts/am335x-evm.dts +++ b/arch/arm/boot/dts/am335x-evm.dts @@ -57,6 +57,24 @@ enable-active-high; }; + /* TPS79501 */ + v1_8d_reg: fixedregulator-v1_8d { + compatible = "regulator-fixed"; + regulator-name = "v1_8d"; + vin-supply = <&vbat>; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + }; + + /* TPS79501 */ + v3_3d_reg: fixedregulator-v3_3d { + compatible = "regulator-fixed"; + regulator-name = "v3_3d"; + vin-supply = <&vbat>; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + }; + matrix_keypad: matrix_keypad0 { compatible = "gpio-matrix-keypad"; debounce-delay-ms = <5>; @@ -499,10 +517,10 @@ status = "okay"; /* Regulators */ - AVDD-supply = <&vaux2_reg>; - IOVDD-supply = <&vaux2_reg>; - DRVDD-supply = <&vaux2_reg>; - DVDD-supply = <&vbat>; + AVDD-supply = <&v3_3d_reg>; + IOVDD-supply = <&v3_3d_reg>; + DRVDD-supply = <&v3_3d_reg>; + DVDD-supply = <&v1_8d_reg>; }; }; diff --git a/arch/arm/boot/dts/am335x-evmsk.dts b/arch/arm/boot/dts/am335x-evmsk.dts index b128998097ce7180cb2a72291bb83ea2a19d0f52..2c2d8b5b8cf52bf55b28b20a47488363c895681c 100644 --- a/arch/arm/boot/dts/am335x-evmsk.dts +++ b/arch/arm/boot/dts/am335x-evmsk.dts @@ -73,6 +73,24 @@ enable-active-high; }; + /* TPS79518 */ + v1_8d_reg: fixedregulator-v1_8d { + compatible = "regulator-fixed"; + regulator-name = "v1_8d"; + vin-supply = <&vbat>; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + }; + + /* TPS78633 */ + v3_3d_reg: fixedregulator-v3_3d { + compatible = "regulator-fixed"; + regulator-name = "v3_3d"; + vin-supply = <&vbat>; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + }; + leds { pinctrl-names = "default"; pinctrl-0 = <&user_leds_s0>; @@ -501,10 +519,10 @@ status = "okay"; /* Regulators */ - AVDD-supply = <&vaux2_reg>; - IOVDD-supply = <&vaux2_reg>; - DRVDD-supply = <&vaux2_reg>; - DVDD-supply = <&vbat>; + AVDD-supply = <&v3_3d_reg>; + IOVDD-supply = <&v3_3d_reg>; + DRVDD-supply = <&v3_3d_reg>; + DVDD-supply = <&v1_8d_reg>; }; }; diff --git a/arch/arm/boot/dts/am33xx-l4.dtsi b/arch/arm/boot/dts/am33xx-l4.dtsi index f459ec316a22d4cd723d43dd97d3709106aacedd..ca6d9f02a800c8a0e042d43280fa762ebce6fef4 100644 --- a/arch/arm/boot/dts/am33xx-l4.dtsi +++ b/arch/arm/boot/dts/am33xx-l4.dtsi @@ -1762,7 +1762,7 @@ reg = <0xcc000 0x4>; reg-names = "rev"; /* Domains (P, C): per_pwrdm, l4ls_clkdm */ - clocks = <&l4ls_clkctrl AM3_D_CAN0_CLKCTRL 0>; + clocks = <&l4ls_clkctrl AM3_L4LS_D_CAN0_CLKCTRL 0>; clock-names = "fck"; #address-cells = <1>; #size-cells = <1>; @@ -1785,7 +1785,7 @@ reg = <0xd0000 0x4>; reg-names = "rev"; /* Domains (P, C): per_pwrdm, l4ls_clkdm */ - clocks = <&l4ls_clkctrl AM3_D_CAN1_CLKCTRL 0>; + clocks = <&l4ls_clkctrl AM3_L4LS_D_CAN1_CLKCTRL 0>; clock-names = "fck"; #address-cells = <1>; #size-cells = <1>; diff --git a/arch/arm/boot/dts/rk3288-tinker.dtsi b/arch/arm/boot/dts/rk3288-tinker.dtsi index aa107ee41b8b8f3fbc13b92676224561fe0f92c0..ef653c3209bcc995aaa5d5cd79d0b3cf3fd0f8a0 100644 --- a/arch/arm/boot/dts/rk3288-tinker.dtsi +++ 
b/arch/arm/boot/dts/rk3288-tinker.dtsi @@ -254,6 +254,7 @@ }; vccio_sd: LDO_REG5 { + regulator-boot-on; regulator-min-microvolt = <1800000>; regulator-max-microvolt = <3300000>; regulator-name = "vccio_sd"; @@ -430,7 +431,7 @@ bus-width = <4>; cap-mmc-highspeed; cap-sd-highspeed; - card-detect-delay = <200>; + broken-cd; disable-wp; /* wp not hooked up */ pinctrl-names = "default"; pinctrl-0 = <&sdmmc_clk &sdmmc_cmd &sdmmc_cd &sdmmc_bus4>; diff --git a/arch/arm/boot/dts/rk3288-veyron.dtsi b/arch/arm/boot/dts/rk3288-veyron.dtsi index 0bc2409f6903ffec1512942e9e657d9d983a2c35..192dbc089ade1730b9dce6bca8d356f3b0c83a00 100644 --- a/arch/arm/boot/dts/rk3288-veyron.dtsi +++ b/arch/arm/boot/dts/rk3288-veyron.dtsi @@ -25,8 +25,6 @@ gpio_keys: gpio-keys { compatible = "gpio-keys"; - #address-cells = <1>; - #size-cells = <0>; pinctrl-names = "default"; pinctrl-0 = <&pwr_key_l>; diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index ca7d52daa8fb638641e3b90f633bcf1c1a1e5497..a024d1e7e74cd94eade3e5c00ed9d56353ef93aa 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -70,7 +70,7 @@ compatible = "arm,cortex-a12"; reg = <0x501>; resets = <&cru SRST_CORE1>; - operating-points = <&cpu_opp_table>; + operating-points-v2 = <&cpu_opp_table>; #cooling-cells = <2>; /* min followed by max */ clock-latency = <40000>; clocks = <&cru ARMCLK>; @@ -80,7 +80,7 @@ compatible = "arm,cortex-a12"; reg = <0x502>; resets = <&cru SRST_CORE2>; - operating-points = <&cpu_opp_table>; + operating-points-v2 = <&cpu_opp_table>; #cooling-cells = <2>; /* min followed by max */ clock-latency = <40000>; clocks = <&cru ARMCLK>; @@ -90,7 +90,7 @@ compatible = "arm,cortex-a12"; reg = <0x503>; resets = <&cru SRST_CORE3>; - operating-points = <&cpu_opp_table>; + operating-points-v2 = <&cpu_opp_table>; #cooling-cells = <2>; /* min followed by max */ clock-latency = <40000>; clocks = <&cru ARMCLK>; @@ -1119,8 +1119,6 @@ clock-names = "ref", "pclk"; power-domains = <&power RK3288_PD_VIO>; rockchip,grf = <&grf>; - #address-cells = <1>; - #size-cells = <0>; status = "disabled"; ports { @@ -1282,27 +1280,27 @@ gpu_opp_table: gpu-opp-table { compatible = "operating-points-v2"; - opp@100000000 { + opp-100000000 { opp-hz = /bits/ 64 <100000000>; opp-microvolt = <950000>; }; - opp@200000000 { + opp-200000000 { opp-hz = /bits/ 64 <200000000>; opp-microvolt = <950000>; }; - opp@300000000 { + opp-300000000 { opp-hz = /bits/ 64 <300000000>; opp-microvolt = <1000000>; }; - opp@400000000 { + opp-400000000 { opp-hz = /bits/ 64 <400000000>; opp-microvolt = <1100000>; }; - opp@500000000 { + opp-500000000 { opp-hz = /bits/ 64 <500000000>; opp-microvolt = <1200000>; }; - opp@600000000 { + opp-600000000 { opp-hz = /bits/ 64 <600000000>; opp-microvolt = <1250000>; }; diff --git a/arch/arm/boot/dts/sama5d2-pinfunc.h b/arch/arm/boot/dts/sama5d2-pinfunc.h index 1c01a6f843d8a43c07ab25dd18ffb57acd044c0b..28a2e45752fea34eb2efb576439409c0611d9d90 100644 --- a/arch/arm/boot/dts/sama5d2-pinfunc.h +++ b/arch/arm/boot/dts/sama5d2-pinfunc.h @@ -518,7 +518,7 @@ #define PIN_PC9__GPIO PINMUX_PIN(PIN_PC9, 0, 0) #define PIN_PC9__FIQ PINMUX_PIN(PIN_PC9, 1, 3) #define PIN_PC9__GTSUCOMP PINMUX_PIN(PIN_PC9, 2, 1) -#define PIN_PC9__ISC_D0 PINMUX_PIN(PIN_PC9, 2, 1) +#define PIN_PC9__ISC_D0 PINMUX_PIN(PIN_PC9, 3, 1) #define PIN_PC9__TIOA4 PINMUX_PIN(PIN_PC9, 4, 2) #define PIN_PC10 74 #define PIN_PC10__GPIO PINMUX_PIN(PIN_PC10, 0, 0) diff --git a/arch/arm/boot/dts/ste-nomadik-nhk15.dts b/arch/arm/boot/dts/ste-nomadik-nhk15.dts index 
f2f6558a00f188937ca55ee5f44e689da5e1bf7a..04066f9cb8a31c643cba82ea337dd22cb7a85626 100644 --- a/arch/arm/boot/dts/ste-nomadik-nhk15.dts +++ b/arch/arm/boot/dts/ste-nomadik-nhk15.dts @@ -213,13 +213,12 @@ gpio-sck = <&gpio0 5 GPIO_ACTIVE_HIGH>; gpio-mosi = <&gpio0 4 GPIO_ACTIVE_HIGH>; /* - * This chipselect is active high. Just setting the flags - * to GPIO_ACTIVE_HIGH is not enough for the SPI DT bindings, - * it will be ignored, only the special "spi-cs-high" flag - * really counts. + * It's not actually active high, but the frameworks assume + * the polarity of the passed-in GPIO is "normal" (active + * high) then actively drives the line low to select the + * chip. */ cs-gpios = <&gpio0 6 GPIO_ACTIVE_HIGH>; - spi-cs-high; num-chipselects = <1>; /* diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index a8a4eb7f6dae0371940a9cc70c077e2194fb02bc..41deac2451af48ada5b7f59ccea8091bdc78be18 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -9,10 +9,10 @@ generic-y += kdebug.h generic-y += local.h generic-y += local64.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += msi.h generic-y += parport.h generic-y += preempt.h -generic-y += rwsem.h generic-y += seccomp.h generic-y += segment.h generic-y += serial.h diff --git a/arch/arm/include/asm/arch_timer.h b/arch/arm/include/asm/arch_timer.h index 0a8d7bba2cb01235e21a54019e64db81741c482a..4b66ecd6be99df7cc83048381896e500df137c92 100644 --- a/arch/arm/include/asm/arch_timer.h +++ b/arch/arm/include/asm/arch_timer.h @@ -11,6 +11,10 @@ #include #ifdef CONFIG_ARM_ARCH_TIMER +/* 32bit ARM doesn't know anything about timer errata... */ +#define has_erratum_handler(h) (false) +#define erratum_handler(h) (arch_timer_##h) + int arch_timer_arch_init(void); /* @@ -79,7 +83,7 @@ static inline u32 arch_timer_get_cntfrq(void) return val; } -static inline u64 arch_counter_get_cntpct(void) +static inline u64 __arch_counter_get_cntpct(void) { u64 cval; @@ -88,7 +92,12 @@ static inline u64 arch_counter_get_cntpct(void) return cval; } -static inline u64 arch_counter_get_cntvct(void) +static inline u64 __arch_counter_get_cntpct_stable(void) +{ + return __arch_counter_get_cntpct(); +} + +static inline u64 __arch_counter_get_cntvct(void) { u64 cval; @@ -97,6 +106,11 @@ static inline u64 arch_counter_get_cntvct(void) return cval; } +static inline u64 __arch_counter_get_cntvct_stable(void) +{ + return __arch_counter_get_cntvct(); +} + static inline u32 arch_timer_get_cntkctl(void) { u32 cntkctl; diff --git a/arch/arm/include/asm/cp15.h b/arch/arm/include/asm/cp15.h index 07e27f212dc754b42b0361536e33ed8bed144f5a..d2453e2d3f1f3804db034c945f268ae614ff3c3f 100644 --- a/arch/arm/include/asm/cp15.h +++ b/arch/arm/include/asm/cp15.h @@ -68,6 +68,8 @@ #define BPIALL __ACCESS_CP15(c7, 0, c5, 6) #define ICIALLU __ACCESS_CP15(c7, 0, c5, 0) +#define CNTVCT __ACCESS_CP15_64(1, c14) + extern unsigned long cr_alignment; /* defined in entry-armv.S */ static inline unsigned long get_cr(void) diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 6b51826ab3d10752c1d5e98b0ea3d5450b5359b1..7e22c81398c4268e388c8390becf8bc111846f63 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -281,8 +281,6 @@ extern void _memcpy_fromio(void *, const volatile void __iomem *, size_t); extern void _memcpy_toio(volatile void __iomem *, const void *, size_t); extern void _memset_io(volatile void __iomem *, int, size_t); -#define mmiowb() - /* * Memory access primitives * ------------------------ diff 
--git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h index 9e11dce55e06f4e7359b7b779cc7814ae752c813..9587517649bd7123a4a1fa8caba11871d56e0849 100644 --- a/arch/arm/include/asm/stage2_pgtable.h +++ b/arch/arm/include/asm/stage2_pgtable.h @@ -32,14 +32,14 @@ #define stage2_pgd_present(kvm, pgd) pgd_present(pgd) #define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud) #define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address) -#define stage2_pud_free(kvm, pud) pud_free(NULL, pud) +#define stage2_pud_free(kvm, pud) do { } while (0) #define stage2_pud_none(kvm, pud) pud_none(pud) #define stage2_pud_clear(kvm, pud) pud_clear(pud) #define stage2_pud_present(kvm, pud) pud_present(pud) #define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd) #define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address) -#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd) +#define stage2_pmd_free(kvm, pmd) free_page((unsigned long)pmd) #define stage2_pud_huge(kvm, pud) pud_huge(pud) diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h index 06dea6bce293b934e1146d26aa316ea8e36e80b5..080ce70cab12a6944af4120ed5a3b9ca9889411d 100644 --- a/arch/arm/include/asm/syscall.h +++ b/arch/arm/include/asm/syscall.h @@ -55,53 +55,22 @@ static inline void syscall_set_return_value(struct task_struct *task, static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { - if (n == 0) - return; - - if (i + n > SYSCALL_MAX_ARGS) { - unsigned long *args_bad = args + SYSCALL_MAX_ARGS - i; - unsigned int n_bad = n + i - SYSCALL_MAX_ARGS; - pr_warn("%s called with max args %d, handling only %d\n", - __func__, i + n, SYSCALL_MAX_ARGS); - memset(args_bad, 0, n_bad * sizeof(args[0])); - n = SYSCALL_MAX_ARGS - i; - } - - if (i == 0) { - args[0] = regs->ARM_ORIG_r0; - args++; - i++; - n--; - } - - memcpy(args, &regs->ARM_r0 + i, n * sizeof(args[0])); + args[0] = regs->ARM_ORIG_r0; + args++; + + memcpy(args, &regs->ARM_r0 + 1, 5 * sizeof(args[0])); } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) { - if (n == 0) - return; - - if (i + n > SYSCALL_MAX_ARGS) { - pr_warn("%s called with max args %d, handling only %d\n", - __func__, i + n, SYSCALL_MAX_ARGS); - n = SYSCALL_MAX_ARGS - i; - } - - if (i == 0) { - regs->ARM_ORIG_r0 = args[0]; - args++; - i++; - n--; - } - - memcpy(&regs->ARM_r0 + i, args, n * sizeof(args[0])); + regs->ARM_ORIG_r0 = args[0]; + args++; + + memcpy(&regs->ARM_r0 + 1, args, 5 * sizeof(args[0])); } static inline int syscall_get_arch(void) diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index f854148c8d7c258927b031d0c87e8aa8a142e309..bc6d04a098998b5079a4e5e652d0ce540b0fafba 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -33,271 +33,42 @@ #include #include -#define MMU_GATHER_BUNDLE 8 - -#ifdef CONFIG_HAVE_RCU_TABLE_FREE static inline void __tlb_remove_table(void *_table) { free_page_and_swap_cache((struct page *)_table); } -struct mmu_table_batch { - struct rcu_head rcu; - unsigned int nr; - void *tables[0]; - }; - -#define MAX_TABLE_BATCH \ - ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) - -extern void tlb_table_flush(struct mmu_gather *tlb); -extern void tlb_remove_table(struct mmu_gather *tlb, void *table); - -#define tlb_remove_entry(tlb, entry) tlb_remove_table(tlb, entry) -#else
-#define tlb_remove_entry(tlb, entry) tlb_remove_page(tlb, entry) -#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ - -/* - * TLB handling. This allows us to remove pages from the page - * tables, and efficiently handle the TLB issues. - */ -struct mmu_gather { - struct mm_struct *mm; -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - struct mmu_table_batch *batch; - unsigned int need_flush; -#endif - unsigned int fullmm; - struct vm_area_struct *vma; - unsigned long start, end; - unsigned long range_start; - unsigned long range_end; - unsigned int nr; - unsigned int max; - struct page **pages; - struct page *local[MMU_GATHER_BUNDLE]; -}; - -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); - -/* - * This is unnecessarily complex. There's three ways the TLB shootdown - * code is used: - * 1. Unmapping a range of vmas. See zap_page_range(), unmap_region(). - * tlb->fullmm = 0, and tlb_start_vma/tlb_end_vma will be called. - * tlb->vma will be non-NULL. - * 2. Unmapping all vmas. See exit_mmap(). - * tlb->fullmm = 1, and tlb_start_vma/tlb_end_vma will be called. - * tlb->vma will be non-NULL. Additionally, page tables will be freed. - * 3. Unmapping argument pages. See shift_arg_pages(). - * tlb->fullmm = 0, but tlb_start_vma/tlb_end_vma will not be called. - * tlb->vma will be NULL. - */ -static inline void tlb_flush(struct mmu_gather *tlb) -{ - if (tlb->fullmm || !tlb->vma) - flush_tlb_mm(tlb->mm); - else if (tlb->range_end > 0) { - flush_tlb_range(tlb->vma, tlb->range_start, tlb->range_end); - tlb->range_start = TASK_SIZE; - tlb->range_end = 0; - } -} - -static inline void tlb_add_flush(struct mmu_gather *tlb, unsigned long addr) -{ - if (!tlb->fullmm) { - if (addr < tlb->range_start) - tlb->range_start = addr; - if (addr + PAGE_SIZE > tlb->range_end) - tlb->range_end = addr + PAGE_SIZE; - } -} - -static inline void __tlb_alloc_page(struct mmu_gather *tlb) -{ - unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); - - if (addr) { - tlb->pages = (void *)addr; - tlb->max = PAGE_SIZE / sizeof(struct page *); - } -} - -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ - tlb_flush(tlb); -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - tlb_table_flush(tlb); -#endif -} - -static inline void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - free_pages_and_swap_cache(tlb->pages, tlb->nr); - tlb->nr = 0; - if (tlb->pages == tlb->local) - __tlb_alloc_page(tlb); -} - -static inline void tlb_flush_mmu(struct mmu_gather *tlb) -{ - tlb_flush_mmu_tlbonly(tlb); - tlb_flush_mmu_free(tlb); -} - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->fullmm = !(start | (end+1)); - tlb->start = start; - tlb->end = end; - tlb->vma = NULL; - tlb->max = ARRAY_SIZE(tlb->local); - tlb->pages = tlb->local; - tlb->nr = 0; - __tlb_alloc_page(tlb); +#include -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - tlb->batch = NULL; +#ifndef CONFIG_HAVE_RCU_TABLE_FREE +#define tlb_remove_table(tlb, entry) tlb_remove_page(tlb, entry) #endif -} - -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (force) { - tlb->range_start = start; - tlb->range_end = end; - } - - tlb_flush_mmu(tlb); - - /* keep the page table cache within bounds */ - check_pgt_cache(); - - if (tlb->pages != tlb->local) - free_pages((unsigned long)tlb->pages, 0); -} - -/* - * Memorize the range for the TLB flush. 
- */ -static inline void -tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr) -{ - tlb_add_flush(tlb, addr); -} - -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) -/* - * In the case of tlb vma handling, we can optimise these away in the - * case where we're doing a full MM flush. When we're doing a munmap, - * the vmas are adjusted to only cover the region to be torn down. - */ -static inline void -tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) -{ - if (!tlb->fullmm) { - flush_cache_range(vma, vma->vm_start, vma->vm_end); - tlb->vma = vma; - tlb->range_start = TASK_SIZE; - tlb->range_end = 0; - } -} static inline void -tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) -{ - if (!tlb->fullmm) - tlb_flush(tlb); -} - -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - tlb->pages[tlb->nr++] = page; - VM_WARN_ON(tlb->nr > tlb->max); - if (tlb->nr == tlb->max) - return true; - return false; -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - if (__tlb_remove_page(tlb, page)) - tlb_flush_mmu(tlb); -} - -static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return __tlb_remove_page(tlb, page); -} - -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return tlb_remove_page(tlb, page); -} - -static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, - unsigned long addr) +__pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { pgtable_page_dtor(pte); -#ifdef CONFIG_ARM_LPAE - tlb_add_flush(tlb, addr); -#else +#ifndef CONFIG_ARM_LPAE /* * With the classic ARM MMU, a pte page has two corresponding pmd * entries, each covering 1MB. 
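 * The adjusted range below therefore spans the last page of the first
 * 1MB section and the first page of the second, so the TLB entries
 * for both pmd entries get flushed.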
*/ - addr &= PMD_MASK; - tlb_add_flush(tlb, addr + SZ_1M - PAGE_SIZE); - tlb_add_flush(tlb, addr + SZ_1M); + addr = (addr & PMD_MASK) + SZ_1M; + __tlb_adjust_range(tlb, addr - PAGE_SIZE, 2 * PAGE_SIZE); #endif - tlb_remove_entry(tlb, pte); -} - -static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, - unsigned long addr) -{ -#ifdef CONFIG_ARM_LPAE - tlb_add_flush(tlb, addr); - tlb_remove_entry(tlb, virt_to_page(pmdp)); -#endif + tlb_remove_table(tlb, pte); } static inline void -tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) -{ - tlb_add_flush(tlb, addr); -} - -#define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) -#define pmd_free_tlb(tlb, pmdp, addr) __pmd_free_tlb(tlb, pmdp, addr) -#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) - -#define tlb_migrate_finish(mm) do { } while (0) - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) +__pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) { -} - -static inline void tlb_flush_remove_tables(struct mm_struct *mm) -{ -} +#ifdef CONFIG_ARM_LPAE + struct page *page = virt_to_page(pmdp); -static inline void tlb_flush_remove_tables_local(void *arg) -{ + tlb_remove_table(tlb, page); +#endif } #endif /* CONFIG_MMU */ diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S index c08d2d890f7b918981c472c155c6df368a1b30b3..b38bbd011b358f433e3c9201fdb2015611286252 100644 --- a/arch/arm/kernel/head-nommu.S +++ b/arch/arm/kernel/head-nommu.S @@ -133,9 +133,9 @@ __secondary_data: */ .text __after_proc_init: -#ifdef CONFIG_ARM_MPU M_CLASS(movw r12, #:lower16:BASEADDR_V7M_SCB) M_CLASS(movt r12, #:upper16:BASEADDR_V7M_SCB) +#ifdef CONFIG_ARM_MPU M_CLASS(ldr r3, [r12, 0x50]) AR_CLASS(mrc p15, 0, r3, c0, c1, 4) @ Read ID_MMFR0 and r3, r3, #(MMFR0_PMSA) @ PMSA field diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 76bb8de6bf6b6983bf5a231ae66b8a162bd9e3ba..be5edfdde558d600494c6720850a7ed0c0e0b153 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -549,8 +549,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) int ret; /* - * Increment event counter and perform fixup for the pre-signal - * frame. + * Perform fixup for the pre-signal frame. */ rseq_signal_deliver(ksig, regs); diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c index a56e7c856ab5648995888ae5f47c5d1ab23b08bf..86870f40f9a07558877d1d7dba098ad5926c4bbf 100644 --- a/arch/arm/kernel/stacktrace.c +++ b/arch/arm/kernel/stacktrace.c @@ -115,8 +115,6 @@ static noinline void __save_stack_trace(struct task_struct *tsk, * running on another CPU? For now, ignore it as we * can't guarantee we won't explode. 
*/ - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; return; #else frame.fp = thread_saved_fp(tsk); @@ -134,8 +132,6 @@ static noinline void __save_stack_trace(struct task_struct *tsk, } walk_stackframe(&frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) @@ -153,8 +149,6 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) frame.pc = regs->ARM_pc; walk_stackframe(&frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index 51e808adb00cc23576b1f9148f0388542627d397..2a757dcaa1a5e9d63b5ae47833ef31d12ab94aa2 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -591,13 +591,13 @@ static int __init at91_pm_backup_init(void) np = of_find_compatible_node(NULL, NULL, "atmel,sama5d2-securam"); if (!np) - goto securam_fail; + goto securam_fail_no_ref_dev; pdev = of_find_device_by_node(np); of_node_put(np); if (!pdev) { pr_warn("%s: failed to find securam device!\n", __func__); - goto securam_fail; + goto securam_fail_no_ref_dev; } sram_pool = gen_pool_get(&pdev->dev, NULL); @@ -620,6 +620,8 @@ static int __init at91_pm_backup_init(void) return 0; securam_fail: + put_device(&pdev->dev); +securam_fail_no_ref_dev: iounmap(pm_data.sfrbu); pm_data.sfrbu = NULL; return ret; diff --git a/arch/arm/mach-iop13xx/setup.c b/arch/arm/mach-iop13xx/setup.c index 53c316f7301e69fcbebbfe5d73bb48664180f5b6..fe4932fda01d7d0bc819c0ca4e6dcedb6b061081 100644 --- a/arch/arm/mach-iop13xx/setup.c +++ b/arch/arm/mach-iop13xx/setup.c @@ -300,7 +300,7 @@ static struct resource iop13xx_adma_2_resources[] = { } }; -static u64 iop13xx_adma_dmamask = DMA_BIT_MASK(64); +static u64 iop13xx_adma_dmamask = DMA_BIT_MASK(32); static struct iop_adma_platform_data iop13xx_adma_0_data = { .hw_id = 0, .pool_size = PAGE_SIZE, @@ -324,7 +324,7 @@ static struct platform_device iop13xx_adma_0_channel = { .resource = iop13xx_adma_0_resources, .dev = { .dma_mask = &iop13xx_adma_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(64), + .coherent_dma_mask = DMA_BIT_MASK(32), .platform_data = (void *) &iop13xx_adma_0_data, }, }; @@ -336,7 +336,7 @@ static struct platform_device iop13xx_adma_1_channel = { .resource = iop13xx_adma_1_resources, .dev = { .dma_mask = &iop13xx_adma_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(64), + .coherent_dma_mask = DMA_BIT_MASK(32), .platform_data = (void *) &iop13xx_adma_1_data, }, }; @@ -348,7 +348,7 @@ static struct platform_device iop13xx_adma_2_channel = { .resource = iop13xx_adma_2_resources, .dev = { .dma_mask = &iop13xx_adma_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(64), + .coherent_dma_mask = DMA_BIT_MASK(32), .platform_data = (void *) &iop13xx_adma_2_data, }, }; diff --git a/arch/arm/mach-iop13xx/tpmi.c b/arch/arm/mach-iop13xx/tpmi.c index db511ec2b1df6824cb6d3d24659cfebe2428d5ec..116feb6b261eb7b0e08ee7ce248e44682e537898 100644 --- a/arch/arm/mach-iop13xx/tpmi.c +++ b/arch/arm/mach-iop13xx/tpmi.c @@ -152,7 +152,7 @@ static struct resource iop13xx_tpmi_3_resources[] = { } }; -u64 iop13xx_tpmi_mask = DMA_BIT_MASK(64); +u64 iop13xx_tpmi_mask = DMA_BIT_MASK(32); static struct platform_device iop13xx_tpmi_0_device = { .name = "iop-tpmi", .id = 0, @@ -160,7 +160,7 @@ 
static struct platform_device iop13xx_tpmi_0_device = { .resource = iop13xx_tpmi_0_resources, .dev = { .dma_mask = &iop13xx_tpmi_mask, - .coherent_dma_mask = DMA_BIT_MASK(64), + .coherent_dma_mask = DMA_BIT_MASK(32), }, }; @@ -171,7 +171,7 @@ static struct platform_device iop13xx_tpmi_1_device = { .resource = iop13xx_tpmi_1_resources, .dev = { .dma_mask = &iop13xx_tpmi_mask, - .coherent_dma_mask = DMA_BIT_MASK(64), + .coherent_dma_mask = DMA_BIT_MASK(32), }, }; @@ -182,7 +182,7 @@ static struct platform_device iop13xx_tpmi_2_device = { .resource = iop13xx_tpmi_2_resources, .dev = { .dma_mask = &iop13xx_tpmi_mask, - .coherent_dma_mask = DMA_BIT_MASK(64), + .coherent_dma_mask = DMA_BIT_MASK(32), }, }; @@ -193,7 +193,7 @@ static struct platform_device iop13xx_tpmi_3_device = { .resource = iop13xx_tpmi_3_resources, .dev = { .dma_mask = &iop13xx_tpmi_mask, - .coherent_dma_mask = DMA_BIT_MASK(64), + .coherent_dma_mask = DMA_BIT_MASK(32), }, }; diff --git a/arch/arm/mach-milbeaut/platsmp.c b/arch/arm/mach-milbeaut/platsmp.c index 591543c81399b4f976b52599cf84b7cb22e512ea..3ea880f5fcb7338b8e419db9fd49c06cee263ca0 100644 --- a/arch/arm/mach-milbeaut/platsmp.c +++ b/arch/arm/mach-milbeaut/platsmp.c @@ -65,6 +65,7 @@ static void m10v_smp_init(unsigned int max_cpus) writel(KERNEL_UNBOOT_FLAG, m10v_smp_base + cpu * 4); } +#ifdef CONFIG_HOTPLUG_CPU static void m10v_cpu_die(unsigned int l_cpu) { gic_cpu_if_down(0); @@ -83,12 +84,15 @@ static int m10v_cpu_kill(unsigned int l_cpu) return 1; } +#endif static struct smp_operations m10v_smp_ops __initdata = { .smp_prepare_cpus = m10v_smp_init, .smp_boot_secondary = m10v_boot_secondary, +#ifdef CONFIG_HOTPLUG_CPU .cpu_die = m10v_cpu_die, .cpu_kill = m10v_cpu_kill, +#endif }; CPU_METHOD_OF_DECLARE(m10v_smp, "socionext,milbeaut-m10v-smp", &m10v_smp_ops); diff --git a/arch/arm/mach-omap1/board-ams-delta.c b/arch/arm/mach-omap1/board-ams-delta.c index be30c3c061b46ee0c1adf3ce55a872eb7bc9c9c0..1b15d593837ed78ea22298ccc4ae60cb3de166f1 100644 --- a/arch/arm/mach-omap1/board-ams-delta.c +++ b/arch/arm/mach-omap1/board-ams-delta.c @@ -182,6 +182,7 @@ static struct resource latch1_resources[] = { static struct bgpio_pdata latch1_pdata = { .label = LATCH1_LABEL, + .base = -1, .ngpio = LATCH1_NGPIO, }; @@ -219,6 +220,7 @@ static struct resource latch2_resources[] = { static struct bgpio_pdata latch2_pdata = { .label = LATCH2_LABEL, + .base = -1, .ngpio = LATCH2_NGPIO, }; diff --git a/arch/arm/mach-omap2/display.c b/arch/arm/mach-omap2/display.c index 1444b4b4bd9f85e54368c0e18ac31f3f2fc033eb..439e143cad7b5d4d8ef48122816f9acf436570c3 100644 --- a/arch/arm/mach-omap2/display.c +++ b/arch/arm/mach-omap2/display.c @@ -250,8 +250,10 @@ static int __init omapdss_init_of(void) if (!node) return 0; - if (!of_device_is_available(node)) + if (!of_device_is_available(node)) { + of_node_put(node); return 0; + } pdev = of_find_device_by_node(node); diff --git a/arch/arm/plat-iop/adma.c b/arch/arm/plat-iop/adma.c index a4d1f8de3b5b23453ee4738723164a5ba8405424..d9612221e4848971f4ea27cf4f5d4c319073e439 100644 --- a/arch/arm/plat-iop/adma.c +++ b/arch/arm/plat-iop/adma.c @@ -143,7 +143,7 @@ struct platform_device iop3xx_dma_0_channel = { .resource = iop3xx_dma_0_resources, .dev = { .dma_mask = &iop3xx_adma_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(64), + .coherent_dma_mask = DMA_BIT_MASK(32), .platform_data = (void *) &iop3xx_dma_0_data, }, }; @@ -155,7 +155,7 @@ struct platform_device iop3xx_dma_1_channel = { .resource = iop3xx_dma_1_resources, .dev = { .dma_mask = &iop3xx_adma_dmamask, 
- .coherent_dma_mask = DMA_BIT_MASK(64), + .coherent_dma_mask = DMA_BIT_MASK(32), .platform_data = (void *) &iop3xx_dma_1_data, }, }; @@ -167,7 +167,7 @@ struct platform_device iop3xx_aau_channel = { .resource = iop3xx_aau_resources, .dev = { .dma_mask = &iop3xx_adma_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(64), + .coherent_dma_mask = DMA_BIT_MASK(32), .platform_data = (void *) &iop3xx_aau_data, }, }; diff --git a/arch/arm/plat-orion/common.c b/arch/arm/plat-orion/common.c index a6c81ce00f520625880c29c083b3f70384c3db1f..8647cb80a93bd222234f4951f2249ac0399ca025 100644 --- a/arch/arm/plat-orion/common.c +++ b/arch/arm/plat-orion/common.c @@ -622,7 +622,7 @@ static struct platform_device orion_xor0_shared = { .resource = orion_xor0_shared_resources, .dev = { .dma_mask = &orion_xor_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(64), + .coherent_dma_mask = DMA_BIT_MASK(32), .platform_data = &orion_xor0_pdata, }, }; @@ -683,7 +683,7 @@ static struct platform_device orion_xor1_shared = { .resource = orion_xor1_shared_resources, .dev = { .dma_mask = &orion_xor_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(64), + .coherent_dma_mask = DMA_BIT_MASK(32), .platform_data = &orion_xor1_pdata, }, }; diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 9016f4081bb9cff33886860e9a1d48f0ee58e47c..0393917eaa57aaf8cda4548b9a34a705be6c73a0 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -437,3 +437,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/arm/vdso/vgettimeofday.c b/arch/arm/vdso/vgettimeofday.c index a9dd619c6c290042d052f03c351d1dae4760f7e5..7bdbf5d5c47d3c0864ca9d48026989b94d06ffb5 100644 --- a/arch/arm/vdso/vgettimeofday.c +++ b/arch/arm/vdso/vgettimeofday.c @@ -18,9 +18,9 @@ #include #include #include -#include #include #include +#include #include #include #include @@ -123,7 +123,8 @@ static notrace u64 get_ns(struct vdso_data *vdata) u64 cycle_now; u64 nsec; - cycle_now = arch_counter_get_cntvct(); + isb(); + cycle_now = read_sysreg(CNTVCT); cycle_delta = (cycle_now - vdata->cs_cycle_last) & vdata->cs_mask; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7e34b9eba5de151572ba479d73ccf82ba18e8beb..df350f4e1e7ac479f03cf8b48b889a5fc84232ef 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -90,6 +90,7 @@ config ARM64 select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST select GENERIC_CPU_AUTOPROBE + select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_IDLE_POLL_SETUP select GENERIC_IRQ_MULTI_HANDLER @@ -148,8 +149,8 @@ config ARM64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_RCU_TABLE_FREE - select HAVE_RCU_TABLE_INVALIDATE select HAVE_RSEQ select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS @@ -237,9 +238,6 @@ config LOCKDEP_SUPPORT config TRACE_IRQFLAGS_SUPPORT def_bool y -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config GENERIC_BUG def_bool y depends on BUG @@ -297,7 +295,7 @@ menu "Kernel Features" menu "ARM errata workarounds via the alternatives framework" config ARM64_WORKAROUND_CLEAN_CACHE - def_bool n + bool config ARM64_ERRATUM_826319 bool "Cortex-A53: 826319: 
System might deadlock if a write cannot complete until read data is accepted" @@ -464,26 +462,28 @@ config ARM64_ERRATUM_1024718 bool "Cortex-A55: 1024718: Update of DBM/AP bits without break before make might result in incorrect update" default y help - This option adds work around for Arm Cortex-A55 Erratum 1024718. + This option adds a workaround for ARM Cortex-A55 Erratum 1024718. Affected Cortex-A55 cores (r0p0, r0p1, r1p0) could cause incorrect update of the hardware dirty bit when the DBM/AP bits are updated - without a break-before-make. The work around is to disable the usage + without a break-before-make. The workaround is to disable the usage of hardware DBM locally on the affected cores. CPUs not affected by - erratum will continue to use the feature. + this erratum will continue to use the feature. If unsure, say Y. config ARM64_ERRATUM_1188873 - bool "Cortex-A76: MRC read following MRRC read of specific Generic Timer in AArch32 might give incorrect result" + bool "Cortex-A76/Neoverse-N1: MRC read following MRRC read of specific Generic Timer in AArch32 might give incorrect result" default y + depends on COMPAT select ARM_ARCH_TIMER_OOL_WORKAROUND help - This option adds work arounds for ARM Cortex-A76 erratum 1188873 + This option adds a workaround for ARM Cortex-A76/Neoverse-N1 + erratum 1188873. - Affected Cortex-A76 cores (r0p0, r1p0, r2p0) could cause - register corruption when accessing the timer registers from - AArch32 userspace. + Affected Cortex-A76/Neoverse-N1 cores (r0p0, r1p0, r2p0) could + cause register corruption when accessing the timer registers + from AArch32 userspace. If unsure, say Y. @@ -491,7 +491,7 @@ config ARM64_ERRATUM_1165522 bool "Cortex-A76: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" default y help - This option adds work arounds for ARM Cortex-A76 erratum 1165522 + This option adds a workaround for ARM Cortex-A76 erratum 1165522. Affected Cortex-A76 cores (r0p0, r1p0, r2p0) could end-up with corrupted TLBs by speculating an AT instruction during a guest @@ -504,7 +504,7 @@ config ARM64_ERRATUM_1286807 default y select ARM64_WORKAROUND_REPEAT_TLBI help - This option adds workaround for ARM Cortex-A76 erratum 1286807 + This option adds a workaround for ARM Cortex-A76 erratum 1286807. On the affected Cortex-A76 cores (r0p0 to r3p0), if a virtual address for a cacheable mapping of a location is being @@ -521,10 +521,10 @@ config CAVIUM_ERRATUM_22375 bool "Cavium erratum 22375, 24313" default y help - Enable workaround for erratum 22375, 24313. + Enable workaround for errata 22375 and 24313. This implements two gicv3-its errata workarounds for ThunderX. Both - with small impact affecting only ITS table allocation. + with a small impact affecting only ITS table allocation. erratum 22375: only alloc 8MB table size erratum 24313: ignore memory access type @@ -588,9 +588,6 @@ config QCOM_FALKOR_ERRATUM_1003 config ARM64_WORKAROUND_REPEAT_TLBI bool - help - Enable the repeat TLBI workaround for Falkor erratum 1009 and - Cortex-A76 erratum 1286807. 
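All of the errata entries above apply only to specific core models and revisions, which the kernel identifies at boot by decoding MIDR_EL1. As a rough sketch of that kind of check (the field layout follows the ARM ARM; the helper and macro names here are illustrative, not the kernel's actual matching code):

/* Illustrative MIDR_EL1 decode; assumes the kernel's u32 from <linux/types.h>. */
#define MIDR_IMPLEMENTOR(midr)	(((midr) >> 24) & 0xff)
#define MIDR_PARTNUM(midr)	(((midr) >> 4) & 0xfff)

static bool midr_is_cortex_a76(u32 midr)
{
	/* 0x41 is ARM Ltd.; part 0xd0b is Cortex-A76, as in cputype.h below */
	return MIDR_IMPLEMENTOR(midr) == 0x41 &&
	       MIDR_PARTNUM(midr) == 0xd0b;
}

A real entry would additionally bound the variant and revision fields (bits [23:20] and [3:0]) to the affected window, e.g. r0p0 to r3p0 for erratum 1286807.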
config QCOM_FALKOR_ERRATUM_1009 bool "Falkor E1009: Prematurely complete a DSB after a TLBI" @@ -626,7 +623,7 @@ config HISILICON_ERRATUM_161600802 bool "Hip07 161600802: Erroneous redistributor VLPI base" default y help - The HiSilicon Hip07 SoC usees the wrong redistributor base + The HiSilicon Hip07 SoC uses the wrong redistributor base when issued ITS commands such as VMOVP and VMAPP, and requires a 128kB offset to be applied to the target address in this commands. @@ -646,7 +643,7 @@ config FUJITSU_ERRATUM_010001 bool "Fujitsu-A64FX erratum E#010001: Undefined fault may occur wrongly" default y help - This option adds workaround for Fujitsu-A64FX erratum E#010001. + This option adds a workaround for Fujitsu-A64FX erratum E#010001. On some variants of the Fujitsu-A64FX cores ver(1.0, 1.1), memory accesses may cause undefined fault (Data abort, DFSC=0b111111). This fault occurs under a specific hardware condition when a @@ -657,7 +654,7 @@ config FUJITSU_ERRATUM_010001 case-4 TTBR1_EL2 with TCR_EL2.NFD1 == 1. The workaround is to ensure these bits are clear in TCR_ELx. - The workaround only affect the Fujitsu-A64FX. + The workaround only affects the Fujitsu-A64FX. If unsure, say Y. @@ -889,6 +886,9 @@ config ARCH_WANT_HUGE_PMD_SHARE config ARCH_HAS_CACHE_LINE_SIZE def_bool y +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + def_bool y if PGTABLE_LEVELS > 2 + config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" ---help--- @@ -1078,9 +1078,65 @@ config RODATA_FULL_DEFAULT_ENABLED This requires the linear region to be mapped down to pages, which may adversely affect performance in some cases. +config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved + zeroed area and reserved ASID. The user access routines + restore the valid TTBR0_EL1 temporarily. + +menuconfig COMPAT + bool "Kernel support for 32-bit EL0" + depends on ARM64_4K_PAGES || EXPERT + select COMPAT_BINFMT_ELF if BINFMT_ELF + select HAVE_UID16 + select OLD_SIGSUSPEND3 + select COMPAT_OLD_SIGACTION + help + This option enables support for a 32-bit EL0 running under a 64-bit + kernel at EL1. AArch32-specific components such as system calls, + the user helper functions, VFP support and the ptrace interface are + handled appropriately by the kernel. + + If you use a page size other than 4KB (i.e, 16KB or 64KB), please be aware + that you will only be able to execute AArch32 binaries that were compiled + with page size aligned segments. + + If you want to execute 32-bit userspace applications, say Y. + +if COMPAT + +config KUSER_HELPERS + bool "Enable kuser helpers page for 32 bit applications" + default y + help + Warning: disabling this option may break 32-bit user programs. + + Provide kuser helpers to compat tasks. The kernel provides + helper code to userspace in read only form at a fixed location + to allow userspace to be independent of the CPU type fitted to + the system. This permits binaries to be run on ARMv4 through + to ARMv8 without modification. + + See Documentation/arm/kernel_user_helpers.txt for details. + + However, the fixed address nature of these helpers can be used + by ROP (return orientated programming) authors when creating + exploits. 
+ + If all of the binaries and libraries which run on your platform + are built specifically for your platform, and make no use of + these helpers, then you can turn this option off to hinder + such exploits. However, in that case, if a binary or library + relying on those helpers is run, it will not function correctly. + + Say N here only if you are absolutely certain that you do not + need these helpers; otherwise, the safe option is to say Y. + + menuconfig ARMV8_DEPRECATED bool "Emulate deprecated/obsolete ARMv8 instructions" - depends on COMPAT depends on SYSCTL help Legacy software support may require certain instructions @@ -1146,13 +1202,7 @@ config SETEND_EMULATION If unsure, say Y endif -config ARM64_SW_TTBR0_PAN - bool "Emulate Privileged Access Never using TTBR0_EL1 switching" - help - Enabling this option prevents the kernel from accessing - user-space memory directly by pointing TTBR0_EL1 to a reserved - zeroed area and reserved ASID. The user access routines - restore the valid TTBR0_EL1 temporarily. +endif menu "ARMv8.1 architectural features" @@ -1318,6 +1368,9 @@ config ARM64_SVE To enable use of this extension on CPUs that implement it, say Y. + On CPUs that support the SVE2 extensions, this option will enable + those too. + Note that for architectural reasons, firmware _must_ implement SVE support when running on SVE capable hardware. The required support is present in: @@ -1351,7 +1404,7 @@ config ARM64_PSEUDO_NMI help Adds support for mimicking Non-Maskable Interrupts through the use of GIC interrupt priority. This support requires version 3 or later of - Arm GIC. + ARM GIC. This high priority configuration for interrupts needs to be explicitly enabled by setting the kernel parameter @@ -1475,25 +1528,6 @@ config DMI endmenu -config COMPAT - bool "Kernel support for 32-bit EL0" - depends on ARM64_4K_PAGES || EXPERT - select COMPAT_BINFMT_ELF if BINFMT_ELF - select HAVE_UID16 - select OLD_SIGSUSPEND3 - select COMPAT_OLD_SIGACTION - help - This option enables support for a 32-bit EL0 running under a 64-bit - kernel at EL1. AArch32-specific components such as system calls, - the user helper functions, VFP support and the ptrace interface are - handled appropriately by the kernel. - - If you use a page size other than 4KB (i.e, 16KB or 64KB), please be aware - that you will only be able to execute AArch32 binaries that were compiled - with page size aligned segments. - - If you want to execute 32-bit userspace applications, say Y. 
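The fixed-address nature of the kuser helpers described above is exactly what compat binaries rely on: per the Documentation/arm/kernel_user_helpers.txt reference, each helper sits at a hard-coded address near the top of the 32-bit address space. A minimal userspace sketch, assuming the documented address of __kuser_memory_barrier:

/* How a 32-bit (compat) program calls a kuser helper (sketch only). */
typedef void (*kuser_barrier_fn)(void);

static void compat_memory_barrier(void)
{
	/* __kuser_memory_barrier is documented at 0xffff0fa0 */
	((kuser_barrier_fn)0xffff0fa0)();
}

With CONFIG_KUSER_HELPERS disabled, such a call faults, which is the breakage the help text warns about.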
- config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi index 7c649f6b14cb6eb73ea8fb23ded74ed5152d3a70..a2cec6218211b6aad8ff5e1da84c6aee750ebd50 100644 --- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi +++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi @@ -162,6 +162,7 @@ rx-fifo-depth = <16384>; snps,multicast-filter-bins = <256>; iommus = <&smmu 1>; + altr,sysmgr-syscon = <&sysmgr 0x44 0>; status = "disabled"; }; @@ -179,6 +180,7 @@ rx-fifo-depth = <16384>; snps,multicast-filter-bins = <256>; iommus = <&smmu 2>; + altr,sysmgr-syscon = <&sysmgr 0x48 0>; status = "disabled"; }; @@ -196,6 +198,7 @@ rx-fifo-depth = <16384>; snps,multicast-filter-bins = <256>; iommus = <&smmu 3>; + altr,sysmgr-syscon = <&sysmgr 0x4c 0>; status = "disabled"; }; @@ -531,11 +534,12 @@ }; eccmgr { - compatible = "altr,socfpga-a10-ecc-manager"; + compatible = "altr,socfpga-s10-ecc-manager", + "altr,socfpga-a10-ecc-manager"; altr,sysmgr-syscon = <&sysmgr>; #address-cells = <1>; #size-cells = <1>; - interrupts = <0 15 4>, <0 95 4>; + interrupts = <0 15 4>; interrupt-controller; #interrupt-cells = <2>; ranges; @@ -543,31 +547,31 @@ sdramedac { compatible = "altr,sdram-edac-s10"; altr,sdr-syscon = <&sdr>; - interrupts = <16 4>, <48 4>; + interrupts = <16 4>; }; usb0-ecc@ff8c4000 { - compatible = "altr,socfpga-usb-ecc"; + compatible = "altr,socfpga-s10-usb-ecc", + "altr,socfpga-usb-ecc"; reg = <0xff8c4000 0x100>; altr,ecc-parent = <&usb0>; - interrupts = <2 4>, - <34 4>; + interrupts = <2 4>; }; emac0-rx-ecc@ff8c0000 { - compatible = "altr,socfpga-eth-mac-ecc"; + compatible = "altr,socfpga-s10-eth-mac-ecc", + "altr,socfpga-eth-mac-ecc"; reg = <0xff8c0000 0x100>; altr,ecc-parent = <&gmac0>; - interrupts = <4 4>, - <36 4>; + interrupts = <4 4>; }; emac0-tx-ecc@ff8c0400 { - compatible = "altr,socfpga-eth-mac-ecc"; + compatible = "altr,socfpga-s10-eth-mac-ecc", + "altr,socfpga-eth-mac-ecc"; reg = <0xff8c0400 0x100>; altr,ecc-parent = <&gmac0>; - interrupts = <5 4>, - <37 4>; + interrupts = <5 4>; }; }; diff --git a/arch/arm64/boot/dts/mediatek/mt2712-pinfunc.h b/arch/arm64/boot/dts/mediatek/mt2712-pinfunc.h index 1b4cb0c55744a224b959a321e522f6e463aee702..385c455a7c985801827b577f973321c43f95eaa3 100644 --- a/arch/arm64/boot/dts/mediatek/mt2712-pinfunc.h +++ b/arch/arm64/boot/dts/mediatek/mt2712-pinfunc.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2018 MediaTek Inc. 
* Author: Zhiyong Tao diff --git a/arch/arm64/boot/dts/rockchip/rk3328-roc-cc.dts b/arch/arm64/boot/dts/rockchip/rk3328-roc-cc.dts index 33c44e857247e4a64847f22945c4c06bb7735ecf..0e34354b20927698482fddaf6814483394a18b93 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-roc-cc.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-roc-cc.dts @@ -108,8 +108,8 @@ snps,reset-gpio = <&gpio1 RK_PC2 GPIO_ACTIVE_LOW>; snps,reset-active-low; snps,reset-delays-us = <0 10000 50000>; - tx_delay = <0x25>; - rx_delay = <0x11>; + tx_delay = <0x24>; + rx_delay = <0x18>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts index 2157a528276bffae23afbaf3152db66292b7817a..79b4d1d4b5d6b67672dcbab1de19d274cecd5c5b 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts @@ -46,8 +46,7 @@ vcc_host1_5v: vcc_otg_5v: vcc-host1-5v-regulator { compatible = "regulator-fixed"; - enable-active-high; - gpio = <&gpio0 RK_PA2 GPIO_ACTIVE_HIGH>; + gpio = <&gpio0 RK_PA2 GPIO_ACTIVE_LOW>; pinctrl-names = "default"; pinctrl-0 = <&usb20_host_drv>; regulator-name = "vcc_host1_5v"; diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 84f14b132e8f5fb80bf3f178a72f5e138d144bd3..dabef1a21649ba44ee4b880d83d9b24591ac1d9d 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -1445,11 +1445,11 @@ sdmmc0 { sdmmc0_clk: sdmmc0-clk { - rockchip,pins = <1 RK_PA6 1 &pcfg_pull_none_4ma>; + rockchip,pins = <1 RK_PA6 1 &pcfg_pull_none_8ma>; }; sdmmc0_cmd: sdmmc0-cmd { - rockchip,pins = <1 RK_PA4 1 &pcfg_pull_up_4ma>; + rockchip,pins = <1 RK_PA4 1 &pcfg_pull_up_8ma>; }; sdmmc0_dectn: sdmmc0-dectn { @@ -1461,14 +1461,14 @@ }; sdmmc0_bus1: sdmmc0-bus1 { - rockchip,pins = <1 RK_PA0 1 &pcfg_pull_up_4ma>; + rockchip,pins = <1 RK_PA0 1 &pcfg_pull_up_8ma>; }; sdmmc0_bus4: sdmmc0-bus4 { - rockchip,pins = <1 RK_PA0 1 &pcfg_pull_up_4ma>, - <1 RK_PA1 1 &pcfg_pull_up_4ma>, - <1 RK_PA2 1 &pcfg_pull_up_4ma>, - <1 RK_PA3 1 &pcfg_pull_up_4ma>; + rockchip,pins = <1 RK_PA0 1 &pcfg_pull_up_8ma>, + <1 RK_PA1 1 &pcfg_pull_up_8ma>, + <1 RK_PA2 1 &pcfg_pull_up_8ma>, + <1 RK_PA3 1 &pcfg_pull_up_8ma>; }; sdmmc0_gpio: sdmmc0-gpio { @@ -1642,50 +1642,50 @@ rgmiim1_pins: rgmiim1-pins { rockchip,pins = /* mac_txclk */ - <1 RK_PB4 2 &pcfg_pull_none_12ma>, + <1 RK_PB4 2 &pcfg_pull_none_8ma>, /* mac_rxclk */ - <1 RK_PB5 2 &pcfg_pull_none_2ma>, + <1 RK_PB5 2 &pcfg_pull_none_4ma>, /* mac_mdio */ - <1 RK_PC3 2 &pcfg_pull_none_2ma>, + <1 RK_PC3 2 &pcfg_pull_none_4ma>, /* mac_txen */ - <1 RK_PD1 2 &pcfg_pull_none_12ma>, + <1 RK_PD1 2 &pcfg_pull_none_8ma>, /* mac_clk */ - <1 RK_PC5 2 &pcfg_pull_none_2ma>, + <1 RK_PC5 2 &pcfg_pull_none_4ma>, /* mac_rxdv */ - <1 RK_PC6 2 &pcfg_pull_none_2ma>, + <1 RK_PC6 2 &pcfg_pull_none_4ma>, /* mac_mdc */ - <1 RK_PC7 2 &pcfg_pull_none_2ma>, + <1 RK_PC7 2 &pcfg_pull_none_4ma>, /* mac_rxd1 */ - <1 RK_PB2 2 &pcfg_pull_none_2ma>, + <1 RK_PB2 2 &pcfg_pull_none_4ma>, /* mac_rxd0 */ - <1 RK_PB3 2 &pcfg_pull_none_2ma>, + <1 RK_PB3 2 &pcfg_pull_none_4ma>, /* mac_txd1 */ - <1 RK_PB0 2 &pcfg_pull_none_12ma>, + <1 RK_PB0 2 &pcfg_pull_none_8ma>, /* mac_txd0 */ - <1 RK_PB1 2 &pcfg_pull_none_12ma>, + <1 RK_PB1 2 &pcfg_pull_none_8ma>, /* mac_rxd3 */ - <1 RK_PB6 2 &pcfg_pull_none_2ma>, + <1 RK_PB6 2 &pcfg_pull_none_4ma>, /* mac_rxd2 */ - <1 RK_PB7 2 &pcfg_pull_none_2ma>, + <1 RK_PB7 2 &pcfg_pull_none_4ma>, /* mac_txd3 */ - <1 RK_PC0 2 
&pcfg_pull_none_12ma>, + <1 RK_PC0 2 &pcfg_pull_none_8ma>, /* mac_txd2 */ - <1 RK_PC1 2 &pcfg_pull_none_12ma>, + <1 RK_PC1 2 &pcfg_pull_none_8ma>, /* mac_txclk */ - <0 RK_PB0 1 &pcfg_pull_none>, + <0 RK_PB0 1 &pcfg_pull_none_8ma>, /* mac_txen */ - <0 RK_PB4 1 &pcfg_pull_none>, + <0 RK_PB4 1 &pcfg_pull_none_8ma>, /* mac_clk */ - <0 RK_PD0 1 &pcfg_pull_none>, + <0 RK_PD0 1 &pcfg_pull_none_4ma>, /* mac_txd1 */ - <0 RK_PC0 1 &pcfg_pull_none>, + <0 RK_PC0 1 &pcfg_pull_none_8ma>, /* mac_txd0 */ - <0 RK_PC1 1 &pcfg_pull_none>, + <0 RK_PC1 1 &pcfg_pull_none_8ma>, /* mac_txd3 */ - <0 RK_PC7 1 &pcfg_pull_none>, + <0 RK_PC7 1 &pcfg_pull_none_8ma>, /* mac_txd2 */ - <0 RK_PC6 1 &pcfg_pull_none>; + <0 RK_PC6 1 &pcfg_pull_none_8ma>; }; rmiim1_pins: rmiim1-pins { diff --git a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dts b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dts index 4a543f2117d4212b9e26578a64db9ad982ff5c59..844eac939a97c58f9aea4a2e681b39dd6648f4f1 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dts @@ -158,6 +158,7 @@ }; &hdmi { + ddc-i2c-bus = <&i2c3>; pinctrl-names = "default"; pinctrl-0 = <&hdmi_cec>; status = "okay"; diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index 5fc6f51908fdd916f80d08d2931185b37dcb1d50..036ea77f83bc582b9c1e19e9a76c037304feb2b5 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -372,7 +372,7 @@ static struct aead_alg ccm_aes_alg = { static int __init aes_mod_init(void) { - if (!(elf_hwcap & HWCAP_AES)) + if (!cpu_have_named_feature(AES)) return -ENODEV; return crypto_register_aead(&ccm_aes_alg); } diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index e7a95a566462f259c227d924ae61eaa5697a23d4..bf1b321ff4c1ffb4c0692c235821708f5e28be6e 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -440,7 +440,7 @@ static int __init aes_init(void) int err; int i; - if (!(elf_hwcap & HWCAP_ASIMD)) + if (!cpu_have_named_feature(ASIMD)) return -ENODEV; err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c index bece1d85bd816ad3103fd99375e24b2ec21e4c19..cb054f51c91788b7b03724d314ed51d17cb0c667 100644 --- a/arch/arm64/crypto/chacha-neon-glue.c +++ b/arch/arm64/crypto/chacha-neon-glue.c @@ -173,7 +173,7 @@ static struct skcipher_alg algs[] = { static int __init chacha_simd_mod_init(void) { - if (!(elf_hwcap & HWCAP_ASIMD)) + if (!cpu_have_named_feature(ASIMD)) return -ENODEV; return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index dd325829ee44f92d30c435392f2bcada92f29825..e81d5bd555c097db8aeadc7b4153312f40c539e4 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -101,7 +101,7 @@ static struct shash_alg crc_t10dif_alg[] = {{ static int __init crc_t10dif_mod_init(void) { - if (elf_hwcap & HWCAP_PMULL) + if (cpu_have_named_feature(PMULL)) return crypto_register_shashes(crc_t10dif_alg, ARRAY_SIZE(crc_t10dif_alg)); else @@ -111,7 +111,7 @@ static int __init crc_t10dif_mod_init(void) static void __exit crc_t10dif_mod_exit(void) { - if (elf_hwcap & HWCAP_PMULL) + if (cpu_have_named_feature(PMULL)) crypto_unregister_shashes(crc_t10dif_alg, ARRAY_SIZE(crc_t10dif_alg)); else diff --git a/arch/arm64/crypto/ghash-ce-glue.c 
b/arch/arm64/crypto/ghash-ce-glue.c index 791ad422c427dedc8eb022c609f86a7429b92c3d..4e69bb78ea89fe069059e39aef8e069829a4e436 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -704,10 +704,10 @@ static int __init ghash_ce_mod_init(void) { int ret; - if (!(elf_hwcap & HWCAP_ASIMD)) + if (!cpu_have_named_feature(ASIMD)) return -ENODEV; - if (elf_hwcap & HWCAP_PMULL) + if (cpu_have_named_feature(PMULL)) ret = crypto_register_shashes(ghash_alg, ARRAY_SIZE(ghash_alg)); else @@ -717,7 +717,7 @@ static int __init ghash_ce_mod_init(void) if (ret) return ret; - if (elf_hwcap & HWCAP_PMULL) { + if (cpu_have_named_feature(PMULL)) { ret = crypto_register_aead(&gcm_aes_alg); if (ret) crypto_unregister_shashes(ghash_alg, @@ -728,7 +728,7 @@ static int __init ghash_ce_mod_init(void) static void __exit ghash_ce_mod_exit(void) { - if (elf_hwcap & HWCAP_PMULL) + if (cpu_have_named_feature(PMULL)) crypto_unregister_shashes(ghash_alg, ARRAY_SIZE(ghash_alg)); else crypto_unregister_shash(ghash_alg); diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c index 22cc32ac9448dba73febc4ad447d5a8144307a4c..38a589044b6ccf285b1a2a2749be9929f2a51a4a 100644 --- a/arch/arm64/crypto/nhpoly1305-neon-glue.c +++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c @@ -56,7 +56,7 @@ static struct shash_alg nhpoly1305_alg = { static int __init nhpoly1305_mod_init(void) { - if (!(elf_hwcap & HWCAP_ASIMD)) + if (!cpu_have_named_feature(ASIMD)) return -ENODEV; return crypto_register_shash(&nhpoly1305_alg); diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index 4aedeaefd61f39de0cd518157d70559e50c8ed2d..0cccdb9cc2c0146b3690d67a84b024d05ab5ac7b 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -173,7 +173,7 @@ static int __init sha256_mod_init(void) if (ret) return ret; - if (elf_hwcap & HWCAP_ASIMD) { + if (cpu_have_named_feature(ASIMD)) { ret = crypto_register_shashes(neon_algs, ARRAY_SIZE(neon_algs)); if (ret) crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); @@ -183,7 +183,7 @@ static int __init sha256_mod_init(void) static void __exit sha256_mod_fini(void) { - if (elf_hwcap & HWCAP_ASIMD) + if (cpu_have_named_feature(ASIMD)) crypto_unregister_shashes(neon_algs, ARRAY_SIZE(neon_algs)); crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 1e17ea5c372b2782cb11bde1a8cfb9162bb4e9e8..eb0df239a759fcaa7ffb8db7b092b5246b446092 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -13,10 +13,10 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += msi.h generic-y += qrwlock.h generic-y += qspinlock.h -generic-y += rwsem.h generic-y += segment.h generic-y += serial.h generic-y += set_memory.h diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h index f2a234d6516cf5b537b80134b137729dd9abb798..b7bca1ae09e66df3a096c6b900c7dfe8ccd4ea9e 100644 --- a/arch/arm64/include/asm/arch_timer.h +++ b/arch/arm64/include/asm/arch_timer.h @@ -31,11 +31,23 @@ #include #if IS_ENABLED(CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND) -extern struct static_key_false arch_timer_read_ool_enabled; -#define needs_unstable_timer_counter_workaround() \ - static_branch_unlikely(&arch_timer_read_ool_enabled) +#define has_erratum_handler(h) \ + ({ \ + const struct arch_timer_erratum_workaround *__wa; \ + __wa = 
__this_cpu_read(timer_unstable_counter_workaround); \ + (__wa && __wa->h); \ + }) + +#define erratum_handler(h) \ + ({ \ + const struct arch_timer_erratum_workaround *__wa; \ + __wa = __this_cpu_read(timer_unstable_counter_workaround); \ + (__wa && __wa->h) ? __wa->h : arch_timer_##h; \ + }) + #else -#define needs_unstable_timer_counter_workaround() false +#define has_erratum_handler(h) false +#define erratum_handler(h) (arch_timer_##h) #endif enum arch_timer_erratum_match_type { @@ -61,23 +73,37 @@ struct arch_timer_erratum_workaround { DECLARE_PER_CPU(const struct arch_timer_erratum_workaround *, timer_unstable_counter_workaround); +/* inline sysreg accessors that make erratum_handler() work */ +static inline notrace u32 arch_timer_read_cntp_tval_el0(void) +{ + return read_sysreg(cntp_tval_el0); +} + +static inline notrace u32 arch_timer_read_cntv_tval_el0(void) +{ + return read_sysreg(cntv_tval_el0); +} + +static inline notrace u64 arch_timer_read_cntpct_el0(void) +{ + return read_sysreg(cntpct_el0); +} + +static inline notrace u64 arch_timer_read_cntvct_el0(void) +{ + return read_sysreg(cntvct_el0); +} + #define arch_timer_reg_read_stable(reg) \ -({ \ - u64 _val; \ - if (needs_unstable_timer_counter_workaround()) { \ - const struct arch_timer_erratum_workaround *wa; \ + ({ \ + u64 _val; \ + \ preempt_disable_notrace(); \ - wa = __this_cpu_read(timer_unstable_counter_workaround); \ - if (wa && wa->read_##reg) \ - _val = wa->read_##reg(); \ - else \ - _val = read_sysreg(reg); \ + _val = erratum_handler(read_ ## reg)(); \ preempt_enable_notrace(); \ - } else { \ - _val = read_sysreg(reg); \ - } \ - _val; \ -}) + \ + _val; \ + }) /* * These register accessors are marked inline so the compiler can @@ -148,18 +174,67 @@ static inline void arch_timer_set_cntkctl(u32 cntkctl) isb(); } -static inline u64 arch_counter_get_cntpct(void) +/* + * Ensure that reads of the counter are treated the same as memory reads + * for the purposes of ordering by subsequent memory barriers. + * + * This insanity brought to you by speculative system register reads, + * out-of-order memory accesses, sequence locks and Thomas Gleixner. 
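+ * (The asm below XORs the counter value with itself, which always
+ * yields zero, then loads from sp plus that zero: the dummy load's
+ * address is data-dependent on the counter read, and that dependency
+ * is what ties the read into the normal memory ordering rules.)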
+ * + * http://lists.infradead.org/pipermail/linux-arm-kernel/2019-February/631195.html + */ +#define arch_counter_enforce_ordering(val) do { \ + u64 tmp, _val = (val); \ + \ + asm volatile( \ + " eor %0, %1, %1\n" \ + " add %0, sp, %0\n" \ + " ldr xzr, [%0]" \ + : "=r" (tmp) : "r" (_val)); \ +} while (0) + +static inline u64 __arch_counter_get_cntpct_stable(void) +{ + u64 cnt; + + isb(); + cnt = arch_timer_reg_read_stable(cntpct_el0); + arch_counter_enforce_ordering(cnt); + return cnt; +} + +static inline u64 __arch_counter_get_cntpct(void) { + u64 cnt; + isb(); - return arch_timer_reg_read_stable(cntpct_el0); + cnt = read_sysreg(cntpct_el0); + arch_counter_enforce_ordering(cnt); + return cnt; } -static inline u64 arch_counter_get_cntvct(void) +static inline u64 __arch_counter_get_cntvct_stable(void) { + u64 cnt; + isb(); - return arch_timer_reg_read_stable(cntvct_el0); + cnt = arch_timer_reg_read_stable(cntvct_el0); + arch_counter_enforce_ordering(cnt); + return cnt; } +static inline u64 __arch_counter_get_cntvct(void) +{ + u64 cnt; + + isb(); + cnt = read_sysreg(cntvct_el0); + arch_counter_enforce_ordering(cnt); + return cnt; +} + +#undef arch_counter_enforce_ordering + static inline int arch_timer_arch_init(void) { return 0; diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index c5308d01e228c3ff08d4cda6d0b6457c80aa5b31..039fbd822ec64025665291187941231b79cc7213 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -407,10 +407,14 @@ alternative_endif .ifc \op, cvap sys 3, c7, c12, 1, \kaddr // dc cvap .else + .ifc \op, cvadp + sys 3, c7, c13, 1, \kaddr // dc cvadp + .else dc \op, \kaddr .endif .endif .endif + .endif add \kaddr, \kaddr, \tmp1 cmp \kaddr, \size b.lo 9998b @@ -442,8 +446,8 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present */ .macro reset_pmuserenr_el0, tmpreg - mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer - sbfx \tmpreg, \tmpreg, #8, #4 + mrs \tmpreg, id_aa64dfr0_el1 + sbfx \tmpreg, \tmpreg, #ID_AA64DFR0_PMUVER_SHIFT, #4 cmp \tmpreg, #1 // Skip if no PMU present b.lt 9000f msr pmuserenr_el0, xzr // Disable PMU access from EL0 diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index f66bb04fdf2dd3cc21f79a36e3d2d79115c7f670..85b6bedbcc68037a907f462e1c63691c4f5d26de 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -20,6 +20,8 @@ #ifndef __ASSEMBLY__ +#include + #define __nops(n) ".rept " #n "\nnop\n.endr\n" #define nops(n) asm volatile(__nops(n)) @@ -72,31 +74,33 @@ static inline unsigned long array_index_mask_nospec(unsigned long idx, #define __smp_store_release(p, v) \ do { \ + typeof(p) __p = (p); \ union { typeof(*p) __val; char __c[1]; } __u = \ - { .__val = (__force typeof(*p)) (v) }; \ + { .__val = (__force typeof(*p)) (v) }; \ compiletime_assert_atomic_type(*p); \ + kasan_check_write(__p, sizeof(*p)); \ switch (sizeof(*p)) { \ case 1: \ asm volatile ("stlrb %w1, %0" \ - : "=Q" (*p) \ + : "=Q" (*__p) \ : "r" (*(__u8 *)__u.__c) \ : "memory"); \ break; \ case 2: \ asm volatile ("stlrh %w1, %0" \ - : "=Q" (*p) \ + : "=Q" (*__p) \ : "r" (*(__u16 *)__u.__c) \ : "memory"); \ break; \ case 4: \ asm volatile ("stlr %w1, %0" \ - : "=Q" (*p) \ + : "=Q" (*__p) \ : "r" (*(__u32 *)__u.__c) \ : "memory"); \ break; \ case 8: \ asm volatile ("stlr %1, %0" \ - : "=Q" (*p) \ + : "=Q" (*__p) \ : "r" (*(__u64 *)__u.__c) \ : "memory"); \ break; \ @@ -106,27 +110,29 
@@ do { \ #define __smp_load_acquire(p) \ ({ \ union { typeof(*p) __val; char __c[1]; } __u; \ + typeof(p) __p = (p); \ compiletime_assert_atomic_type(*p); \ + kasan_check_read(__p, sizeof(*p)); \ switch (sizeof(*p)) { \ case 1: \ asm volatile ("ldarb %w0, %1" \ : "=r" (*(__u8 *)__u.__c) \ - : "Q" (*p) : "memory"); \ + : "Q" (*__p) : "memory"); \ break; \ case 2: \ asm volatile ("ldarh %w0, %1" \ : "=r" (*(__u16 *)__u.__c) \ - : "Q" (*p) : "memory"); \ + : "Q" (*__p) : "memory"); \ break; \ case 4: \ asm volatile ("ldar %w0, %1" \ : "=r" (*(__u32 *)__u.__c) \ - : "Q" (*p) : "memory"); \ + : "Q" (*__p) : "memory"); \ break; \ case 8: \ asm volatile ("ldar %0, %1" \ : "=r" (*(__u64 *)__u.__c) \ - : "Q" (*p) : "memory"); \ + : "Q" (*__p) : "memory"); \ break; \ } \ __u.__val; \ diff --git a/arch/arm64/include/asm/brk-imm.h b/arch/arm64/include/asm/brk-imm.h index 2945fe6cd863c2712451d13f544b961de13a6ea1..d84294064e6a086c654843d09636f6351dfa7afd 100644 --- a/arch/arm64/include/asm/brk-imm.h +++ b/arch/arm64/include/asm/brk-imm.h @@ -11,6 +11,8 @@ /* * #imm16 values used for BRK instruction generation + * 0x004: for installing kprobes + * 0x005: for installing uprobes * Allowed values for kgdb are 0x400 - 0x7ff * 0x100: for triggering a fault on purpose (reserved) * 0x400: for dynamic BRK instruction @@ -18,10 +20,13 @@ * 0x800: kernel-mode BUG() and WARN() traps * 0x9xx: tag-based KASAN trap (allowed values 0x900 - 0x9ff) */ +#define KPROBES_BRK_IMM 0x004 +#define UPROBES_BRK_IMM 0x005 #define FAULT_BRK_IMM 0x100 #define KGDB_DYN_DBG_BRK_IMM 0x400 #define KGDB_COMPILED_DBG_BRK_IMM 0x401 #define BUG_BRK_IMM 0x800 #define KASAN_BRK_IMM 0x900 +#define KASAN_BRK_MASK 0x0ff #endif diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index f6a76e43f39ed434aad795378be444b0ce121474..defdc67d9ab49c8542e42e79203db5a9df1b0754 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -61,7 +61,8 @@ #define ARM64_HAS_GENERIC_AUTH_ARCH 40 #define ARM64_HAS_GENERIC_AUTH_IMP_DEF 41 #define ARM64_HAS_IRQ_PRIO_MASKING 42 +#define ARM64_HAS_DCPODP 43 -#define ARM64_NCAPS 43 +#define ARM64_NCAPS 44 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index e505e1fbd2b933b42af6f717e48e7eb284248659..f210bcf096f76543aca975aa34b9228ae8ef6487 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -14,15 +14,8 @@ #include #include -/* - * In the arm64 world (as in the ARM world), elf_hwcap is used both internally - * in the kernel and for user space to keep track of which optional features - * are supported by the current system. So let's map feature 'x' to HWCAP_x. - * Note that HWCAP_x constants are bit fields so we need to take the log. 
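
The new KPROBES_BRK_IMM and UPROBES_BRK_IMM values end up in the 16-bit comment field of the A64 BRK instruction, which occupies bits [20:5] of the opcode; debug-monitors.h (further down) ORs them into AARCH64_BREAK_MON. A quick standalone check of the resulting opcodes; the AARCH64_BREAK_MON value is quoted from the kernel headers from memory, so verify it against your tree:

#include <stdint.h>
#include <stdio.h>

#define AARCH64_BREAK_MON	0xd4200000u	/* BRK #0 with the monitor bit set */
#define KPROBES_BRK_IMM		0x004
#define UPROBES_BRK_IMM		0x005

int main(void)
{
	/* the BRK comment immediate lives at bits [20:5] of the opcode */
	uint32_t kprobes = AARCH64_BREAK_MON | (KPROBES_BRK_IMM << 5);
	uint32_t uprobes = AARCH64_BREAK_MON | (UPROBES_BRK_IMM << 5);

	printf("BRK64_OPCODE_KPROBES = 0x%08x\n", kprobes);	/* 0xd4200080 */
	printf("BRK64_OPCODE_UPROBES = 0x%08x\n", uprobes);	/* 0xd42000a0 */
	return 0;
}
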
- */ - -#define MAX_CPU_FEATURES (8 * sizeof(elf_hwcap)) -#define cpu_feature(x) ilog2(HWCAP_ ## x) +#define MAX_CPU_FEATURES 64 +#define cpu_feature(x) KERNEL_HWCAP_ ## x #ifndef __ASSEMBLY__ @@ -399,11 +392,13 @@ extern DECLARE_BITMAP(boot_capabilities, ARM64_NPATCHABLE); for_each_set_bit(cap, cpu_hwcaps, ARM64_NCAPS) bool this_cpu_has_cap(unsigned int cap); +void cpu_set_feature(unsigned int num); +bool cpu_have_feature(unsigned int num); +unsigned long cpu_get_elf_hwcap(void); +unsigned long cpu_get_elf_hwcap2(void); -static inline bool cpu_have_feature(unsigned int num) -{ - return elf_hwcap & (1UL << num); -} +#define cpu_set_named_feature(name) cpu_set_feature(cpu_feature(name)) +#define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name)) /* System capability check for constant caps */ static inline bool __cpus_have_const_cap(int num) @@ -638,11 +633,7 @@ static inline int arm64_get_ssbd_state(void) #endif } -#ifdef CONFIG_ARM64_SSBD void arm64_set_ssbd_mitigation(bool state); -#else -static inline void arm64_set_ssbd_mitigation(bool state) {} -#endif extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 5f1437099b9979ac983ae0896272229e2b04f1e3..2602bae334fb70a825475011d269e5c5af4c671d 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -89,6 +89,7 @@ #define ARM_CPU_PART_CORTEX_A35 0xD04 #define ARM_CPU_PART_CORTEX_A55 0xD05 #define ARM_CPU_PART_CORTEX_A76 0xD0B +#define ARM_CPU_PART_NEOVERSE_N1 0xD0C #define APM_CPU_PART_POTENZA 0x000 @@ -118,6 +119,7 @@ #define MIDR_CORTEX_A35 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A35) #define MIDR_CORTEX_A55 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A55) #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76) +#define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index a44cf522542961b7d4a6faee5f6edc3a8dca4720..0679f781696d410240184e05710123c5e7ab70ec 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -65,12 +65,9 @@ #define CACHE_FLUSH_IS_SAFE 1 /* kprobes BRK opcodes with ESR encoding */ -#define BRK64_ESR_MASK 0xFFFF -#define BRK64_ESR_KPROBES 0x0004 -#define BRK64_OPCODE_KPROBES (AARCH64_BREAK_MON | (BRK64_ESR_KPROBES << 5)) +#define BRK64_OPCODE_KPROBES (AARCH64_BREAK_MON | (KPROBES_BRK_IMM << 5)) /* uprobes BRK opcodes with ESR encoding */ -#define BRK64_ESR_UPROBES 0x0005 -#define BRK64_OPCODE_UPROBES (AARCH64_BREAK_MON | (BRK64_ESR_UPROBES << 5)) +#define BRK64_OPCODE_UPROBES (AARCH64_BREAK_MON | (UPROBES_BRK_IMM << 5)) /* AArch32 */ #define DBG_ESR_EVT_BKPT 0x4 @@ -94,18 +91,24 @@ struct step_hook { int (*fn)(struct pt_regs *regs, unsigned int esr); }; -void register_step_hook(struct step_hook *hook); -void unregister_step_hook(struct step_hook *hook); +void register_user_step_hook(struct step_hook *hook); +void unregister_user_step_hook(struct step_hook *hook); + +void register_kernel_step_hook(struct step_hook *hook); +void unregister_kernel_step_hook(struct step_hook *hook); struct break_hook { struct 
list_head node; - u32 esr_val; - u32 esr_mask; int (*fn)(struct pt_regs *regs, unsigned int esr); + u16 imm; + u16 mask; /* These bits are ignored when comparing with imm */ }; -void register_break_hook(struct break_hook *hook); -void unregister_break_hook(struct break_hook *hook); +void register_user_break_hook(struct break_hook *hook); +void unregister_user_break_hook(struct break_hook *hook); + +void register_kernel_break_hook(struct break_hook *hook); +void unregister_kernel_break_hook(struct break_hook *hook); u8 debug_monitors_arch(void); diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 6adc1a90e7e6b62b752e4da7a433bb5163517598..355d120b78cb651bdb6bca6c5ab70fe572134fb6 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -214,10 +214,10 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; set_thread_flag(TIF_32BIT); \ }) #define COMPAT_ARCH_DLINFO -extern int aarch32_setup_vectors_page(struct linux_binprm *bprm, - int uses_interp); +extern int aarch32_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp); #define compat_arch_setup_additional_pages \ - aarch32_setup_vectors_page + aarch32_setup_additional_pages #endif /* CONFIG_COMPAT */ diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 52233f00d53d8af8974f74c870e42b3fa93f533c..0e27fe91d5ea8e0e2703f89754da159ea6c90cd9 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -156,9 +156,7 @@ ESR_ELx_WFx_ISS_WFI) /* BRK instruction trap from AArch64 state */ -#define ESR_ELx_VAL_BRK64(imm) \ - ((ESR_ELx_EC_BRK64 << ESR_ELx_EC_SHIFT) | ESR_ELx_IL | \ - ((imm) & 0xffff)) +#define ESR_ELx_BRK64_ISS_COMMENT_MASK 0xffff /* ISS field definitions for System instruction traps */ #define ESR_ELx_SYS64_ISS_RES0_SHIFT 22 @@ -198,9 +196,10 @@ /* * User space cache operations have the following sysreg encoding * in System instructions. - * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 12, 14 }, WRITE (L=0) + * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 12, 13, 14 }, WRITE (L=0) */ #define ESR_ELx_SYS64_ISS_CRM_DC_CIVAC 14 +#define ESR_ELx_SYS64_ISS_CRM_DC_CVADP 13 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAP 12 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAU 11 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAC 10 diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index cccb83ad7fa8ea2e1f4251dd724edc62c754771b..a56efb5626fa25264d3de7d34c06ba024cc2c131 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -23,26 +23,34 @@ #include +#define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? 
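
The reworked struct break_hook replaces the old esr_val/esr_mask pair with an immediate plus a mask of bits to ignore, and ESR_ELx_BRK64_ISS_COMMENT_MASK extracts the comment field from the ESR. The matching this implies (the actual dispatch lives in debug-monitors.c and is not part of these hunks, so hook_matches below is an illustrative sketch) lets KASAN claim a whole range of immediates with a single hook:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ESR_ELx_BRK64_ISS_COMMENT_MASK	0xffff

struct break_hook_like {
	uint16_t imm;
	uint16_t mask;	/* these bits are ignored when comparing with imm */
};

static bool hook_matches(const struct break_hook_like *h, uint32_t esr)
{
	uint16_t comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK;

	return (comment & ~h->mask) == h->imm;
}

int main(void)
{
	/* KASAN_BRK_IMM/KASAN_BRK_MASK cover the 0x900-0x9ff range */
	struct break_hook_like kasan = { .imm = 0x900, .mask = 0x0ff };

	printf("0x912: %d\n", hook_matches(&kasan, 0x912));	/* 1 */
	printf("0x800: %d\n", hook_matches(&kasan, 0x800));	/* 0 */
	return 0;
}
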
*/ + #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ do { \ + unsigned int loops = FUTEX_MAX_LOOPS; \ + \ uaccess_enable(); \ asm volatile( \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ insn "\n" \ -"2: stlxr %w3, %w0, %2\n" \ -" cbnz %w3, 1b\n" \ -" dmb ish\n" \ +"2: stlxr %w0, %w3, %2\n" \ +" cbz %w0, 3f\n" \ +" sub %w4, %w4, %w0\n" \ +" cbnz %w4, 1b\n" \ +" mov %w0, %w7\n" \ "3:\n" \ +" dmb ish\n" \ " .pushsection .fixup,\"ax\"\n" \ " .align 2\n" \ -"4: mov %w0, %w5\n" \ +"4: mov %w0, %w6\n" \ " b 3b\n" \ " .popsection\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ - : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ - : "r" (oparg), "Ir" (-EFAULT) \ + : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp), \ + "+r" (loops) \ + : "r" (oparg), "Ir" (-EFAULT), "Ir" (-EAGAIN) \ : "memory"); \ uaccess_disable(); \ } while (0) @@ -57,23 +65,23 @@ arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *_uaddr) switch (op) { case FUTEX_OP_SET: - __futex_atomic_op("mov %w0, %w4", + __futex_atomic_op("mov %w3, %w5", ret, oldval, uaddr, tmp, oparg); break; case FUTEX_OP_ADD: - __futex_atomic_op("add %w0, %w1, %w4", + __futex_atomic_op("add %w3, %w1, %w5", ret, oldval, uaddr, tmp, oparg); break; case FUTEX_OP_OR: - __futex_atomic_op("orr %w0, %w1, %w4", + __futex_atomic_op("orr %w3, %w1, %w5", ret, oldval, uaddr, tmp, oparg); break; case FUTEX_OP_ANDN: - __futex_atomic_op("and %w0, %w1, %w4", + __futex_atomic_op("and %w3, %w1, %w5", ret, oldval, uaddr, tmp, ~oparg); break; case FUTEX_OP_XOR: - __futex_atomic_op("eor %w0, %w1, %w4", + __futex_atomic_op("eor %w3, %w1, %w5", ret, oldval, uaddr, tmp, oparg); break; default: @@ -93,6 +101,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, u32 oldval, u32 newval) { int ret = 0; + unsigned int loops = FUTEX_MAX_LOOPS; u32 val, tmp; u32 __user *uaddr; @@ -104,24 +113,30 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, asm volatile("// futex_atomic_cmpxchg_inatomic\n" " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" -" sub %w3, %w1, %w4\n" -" cbnz %w3, 3f\n" -"2: stlxr %w3, %w5, %2\n" -" cbnz %w3, 1b\n" -" dmb ish\n" +" sub %w3, %w1, %w5\n" +" cbnz %w3, 4f\n" +"2: stlxr %w3, %w6, %2\n" +" cbz %w3, 3f\n" +" sub %w4, %w4, %w3\n" +" cbnz %w4, 1b\n" +" mov %w0, %w8\n" "3:\n" +" dmb ish\n" +"4:\n" " .pushsection .fixup,\"ax\"\n" -"4: mov %w0, %w6\n" -" b 3b\n" +"5: mov %w0, %w7\n" +" b 4b\n" " .popsection\n" - _ASM_EXTABLE(1b, 4b) - _ASM_EXTABLE(2b, 4b) - : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) - : "r" (oldval), "r" (newval), "Ir" (-EFAULT) + _ASM_EXTABLE(1b, 5b) + _ASM_EXTABLE(2b, 5b) + : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops) + : "r" (oldval), "r" (newval), "Ir" (-EFAULT), "Ir" (-EAGAIN) : "memory"); uaccess_disable(); - *uval = val; + if (!ret) + *uval = val; + return ret; } diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 400b80b49595dc147fb5a2110ff347405f58bd0b..b4bfb6672168b3f24eb18bd1822e3a6dd7441580 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -17,6 +17,7 @@ #define __ASM_HWCAP_H #include +#include #define COMPAT_HWCAP_HALF (1 << 1) #define COMPAT_HWCAP_THUMB (1 << 2) @@ -40,11 +41,67 @@ #define COMPAT_HWCAP2_CRC32 (1 << 4) #ifndef __ASSEMBLY__ +#include + +/* + * For userspace we represent hwcaps as a collection of HWCAP{,2}_x bitfields + * as described in uapi/asm/hwcap.h. 
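
Stepping back to the futex rework above: the LDXR/STLXR retry loop is now capped at FUTEX_MAX_LOOPS iterations, and the operation fails with -EAGAIN instead of spinning indefinitely when the store-exclusive keeps losing the reservation. The same bounded-retry shape, sketched with C11 atomics rather than raw exclusives (names here are illustrative):

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

#define FUTEX_MAX_LOOPS	128

static int bounded_fetch_add(atomic_uint *uaddr, unsigned int oparg,
			     unsigned int *oldval)
{
	unsigned int loops = FUTEX_MAX_LOOPS;
	unsigned int old = atomic_load_explicit(uaddr, memory_order_relaxed);

	do {
		/* on failure, 'old' is refreshed with the current value */
		if (atomic_compare_exchange_weak(uaddr, &old, old + oparg)) {
			*oldval = old;
			return 0;
		}
	} while (--loops);

	return -EAGAIN;	/* bounded, like the new futex fixup path */
}

int main(void)
{
	atomic_uint v = 5;
	unsigned int old;
	int ret = bounded_fetch_add(&v, 3, &old);

	printf("ret=%d old=%u new=%u\n", ret, old, (unsigned int)v);
	return 0;
}

Note also that futex_atomic_cmpxchg_inatomic() now only writes *uval back on success, so a bounded failure cannot leak an uninitialized or stale value to the caller.
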
For the kernel we represent hwcaps as + * natural numbers (in a single range of size MAX_CPU_FEATURES) defined here + * with prefix KERNEL_HWCAP_ mapped to their HWCAP{,2}_x counterpart. + * + * Hwcaps should be set and tested within the kernel via the + * cpu_{set,have}_named_feature(feature) where feature is the unique suffix + * of KERNEL_HWCAP_{feature}. + */ +#define __khwcap_feature(x) const_ilog2(HWCAP_ ## x) +#define KERNEL_HWCAP_FP __khwcap_feature(FP) +#define KERNEL_HWCAP_ASIMD __khwcap_feature(ASIMD) +#define KERNEL_HWCAP_EVTSTRM __khwcap_feature(EVTSTRM) +#define KERNEL_HWCAP_AES __khwcap_feature(AES) +#define KERNEL_HWCAP_PMULL __khwcap_feature(PMULL) +#define KERNEL_HWCAP_SHA1 __khwcap_feature(SHA1) +#define KERNEL_HWCAP_SHA2 __khwcap_feature(SHA2) +#define KERNEL_HWCAP_CRC32 __khwcap_feature(CRC32) +#define KERNEL_HWCAP_ATOMICS __khwcap_feature(ATOMICS) +#define KERNEL_HWCAP_FPHP __khwcap_feature(FPHP) +#define KERNEL_HWCAP_ASIMDHP __khwcap_feature(ASIMDHP) +#define KERNEL_HWCAP_CPUID __khwcap_feature(CPUID) +#define KERNEL_HWCAP_ASIMDRDM __khwcap_feature(ASIMDRDM) +#define KERNEL_HWCAP_JSCVT __khwcap_feature(JSCVT) +#define KERNEL_HWCAP_FCMA __khwcap_feature(FCMA) +#define KERNEL_HWCAP_LRCPC __khwcap_feature(LRCPC) +#define KERNEL_HWCAP_DCPOP __khwcap_feature(DCPOP) +#define KERNEL_HWCAP_SHA3 __khwcap_feature(SHA3) +#define KERNEL_HWCAP_SM3 __khwcap_feature(SM3) +#define KERNEL_HWCAP_SM4 __khwcap_feature(SM4) +#define KERNEL_HWCAP_ASIMDDP __khwcap_feature(ASIMDDP) +#define KERNEL_HWCAP_SHA512 __khwcap_feature(SHA512) +#define KERNEL_HWCAP_SVE __khwcap_feature(SVE) +#define KERNEL_HWCAP_ASIMDFHM __khwcap_feature(ASIMDFHM) +#define KERNEL_HWCAP_DIT __khwcap_feature(DIT) +#define KERNEL_HWCAP_USCAT __khwcap_feature(USCAT) +#define KERNEL_HWCAP_ILRCPC __khwcap_feature(ILRCPC) +#define KERNEL_HWCAP_FLAGM __khwcap_feature(FLAGM) +#define KERNEL_HWCAP_SSBS __khwcap_feature(SSBS) +#define KERNEL_HWCAP_SB __khwcap_feature(SB) +#define KERNEL_HWCAP_PACA __khwcap_feature(PACA) +#define KERNEL_HWCAP_PACG __khwcap_feature(PACG) + +#define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 32) +#define KERNEL_HWCAP_DCPODP __khwcap2_feature(DCPODP) +#define KERNEL_HWCAP_SVE2 __khwcap2_feature(SVE2) +#define KERNEL_HWCAP_SVEAES __khwcap2_feature(SVEAES) +#define KERNEL_HWCAP_SVEPMULL __khwcap2_feature(SVEPMULL) +#define KERNEL_HWCAP_SVEBITPERM __khwcap2_feature(SVEBITPERM) +#define KERNEL_HWCAP_SVESHA3 __khwcap2_feature(SVESHA3) +#define KERNEL_HWCAP_SVESM4 __khwcap2_feature(SVESM4) + /* * This yields a mask that user programs can use to figure out what * instruction set this cpu supports. */ -#define ELF_HWCAP (elf_hwcap) +#define ELF_HWCAP cpu_get_elf_hwcap() +#define ELF_HWCAP2 cpu_get_elf_hwcap2() #ifdef CONFIG_COMPAT #define COMPAT_ELF_HWCAP (compat_elf_hwcap) @@ -60,6 +117,5 @@ enum { #endif }; -extern unsigned long elf_hwcap; #endif #endif diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 8bb7210ac286c8d817931ed9821d43bd861f520f..b807cb9b517d3848a76ddd696ec71f3e90b6484a 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -124,8 +124,6 @@ static inline u64 __raw_readq(const volatile void __iomem *addr) #define __io_par(v) __iormb(v) #define __iowmb() wmb() -#define mmiowb() do { } while (0) - /* * Relaxed I/O memory access primitives. 
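
With elf_hwcap widened to a 64-bit kernel-internal bitmap, userspace sees the upper word through the new AT_HWCAP2 auxiliary vector entry (ELF_HWCAP2 above). Querying it from a program is a one-liner with glibc's getauxval(); the fallback HWCAP2_SVE2 value mirrors the uapi header added later in this patch, in case the toolchain headers predate it:

#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP2_SVE2
#define HWCAP2_SVE2	(1 << 1)	/* matches uapi/asm/hwcap.h below */
#endif

int main(void)
{
	unsigned long hwcap2 = getauxval(AT_HWCAP2);

	printf("AT_HWCAP2 = %#lx, SVE2 %s\n", hwcap2,
	       (hwcap2 & HWCAP2_SVE2) ? "present" : "absent");
	return 0;
}

On kernels without AT_HWCAP2 support, getauxval() simply returns 0, so the probe degrades gracefully.
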
These follow the Device memory * ordering rules but do not guarantee any ordering relative to Normal memory diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h index 43d8366c1e878ef2635d9c3b69733021c804e6f5..6299631890857365626641c225c06645beb123e8 100644 --- a/arch/arm64/include/asm/irqflags.h +++ b/arch/arm64/include/asm/irqflags.h @@ -43,7 +43,7 @@ static inline void arch_local_irq_enable(void) asm volatile(ALTERNATIVE( "msr daifclr, #2 // arch_local_irq_enable\n" "nop", - "msr_s " __stringify(SYS_ICC_PMR_EL1) ",%0\n" + __msr_s(SYS_ICC_PMR_EL1, "%0") "dsb sy", ARM64_HAS_IRQ_PRIO_MASKING) : @@ -55,7 +55,7 @@ static inline void arch_local_irq_disable(void) { asm volatile(ALTERNATIVE( "msr daifset, #2 // arch_local_irq_disable", - "msr_s " __stringify(SYS_ICC_PMR_EL1) ", %0", + __msr_s(SYS_ICC_PMR_EL1, "%0"), ARM64_HAS_IRQ_PRIO_MASKING) : : "r" ((unsigned long) GIC_PRIO_IRQOFF) @@ -86,7 +86,7 @@ static inline unsigned long arch_local_save_flags(void) "mov %0, %1\n" "nop\n" "nop", - "mrs_s %0, " __stringify(SYS_ICC_PMR_EL1) "\n" + __mrs_s("%0", SYS_ICC_PMR_EL1) "ands %1, %1, " __stringify(PSR_I_BIT) "\n" "csel %0, %0, %2, eq", ARM64_HAS_IRQ_PRIO_MASKING) @@ -116,7 +116,7 @@ static inline void arch_local_irq_restore(unsigned long flags) asm volatile(ALTERNATIVE( "msr daif, %0\n" "nop", - "msr_s " __stringify(SYS_ICC_PMR_EL1) ", %0\n" + __msr_s(SYS_ICC_PMR_EL1, "%0") "dsb sy", ARM64_HAS_IRQ_PRIO_MASKING) : "+r" (flags) diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h index d5a44cf859e94284a1a38990105107fe3de28b2f..21721fbf44e749fdcc892428270daed95c99c4d5 100644 --- a/arch/arm64/include/asm/kprobes.h +++ b/arch/arm64/include/asm/kprobes.h @@ -54,8 +54,6 @@ void arch_remove_kprobe(struct kprobe *); int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr); int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); -int kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr); -int kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr); void kretprobe_trampoline(void); void __kprobes *trampoline_probe_handler(struct pt_regs *regs); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 4da765f2cca589a6ba0761b5002e876269b6a21f..c3060833b7a5aae918bd902963b1dff2843beae1 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -30,7 +30,7 @@ ({ \ u64 reg; \ asm volatile(ALTERNATIVE("mrs %0, " __stringify(r##nvh),\ - "mrs_s %0, " __stringify(r##vh),\ + __mrs_s("%0", r##vh), \ ARM64_HAS_VIRT_HOST_EXTN) \ : "=r" (reg)); \ reg; \ @@ -40,7 +40,7 @@ do { \ u64 __val = (u64)(v); \ asm volatile(ALTERNATIVE("msr " __stringify(r##nvh) ", %x0",\ - "msr_s " __stringify(r##vh) ", %x0",\ + __msr_s(r##vh, "%x0"), \ ARM64_HAS_VIRT_HOST_EXTN) \ : : "rZ" (__val)); \ } while (0) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 290195168bb3c7f3aa90820e49c2364e62ce04d2..2cb8248fa2c893debb6b70cecd60dabbe2884531 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -302,7 +302,7 @@ static inline void *phys_to_virt(phys_addr_t x) */ #define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET) -#ifndef CONFIG_SPARSEMEM_VMEMMAP +#if !defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_DEBUG_VIRTUAL) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define _virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) #else diff --git 
a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index 905e1bb0e7bd023b7174da7a6a81459df87b40dc..cd9f4e9d04d3be6564843e821b2a612642717210 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -73,4 +73,9 @@ static inline bool is_forbidden_offset_for_adrp(void *place) struct plt_entry get_plt_entry(u64 dst, void *pc); bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b); +static inline bool plt_entry_is_initialized(const struct plt_entry *e) +{ + return e->adrp || e->add || e->br; +} + #endif /* __ASM_MODULE_H */ diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 52fa47c73bf042efc0c84796dcf1a21c10c534c1..dabba4b2c61f17fa9898777b12c549d07517fc77 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -33,12 +33,22 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pmd_t *)__get_free_page(PGALLOC_GFP); + struct page *page; + + page = alloc_page(PGALLOC_GFP); + if (!page) + return NULL; + if (!pgtable_pmd_page_ctor(page)) { + __free_page(page); + return NULL; + } + return page_address(page); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmdp) { BUG_ON((unsigned long)pmdp & (PAGE_SIZE-1)); + pgtable_pmd_page_dtor(virt_to_page(pmdp)); free_page((unsigned long)pmdp); } diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index de70c1eabf336abc2473dddb4e788d15599bc58b..2c41b04708fe33465b8193a65410d74efcd56be1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -478,6 +478,8 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) return __pmd_to_phys(pmd); } +static inline void pte_unmap(pte_t *pte) { } + /* Find an entry in the third-level page table. */ #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) @@ -485,9 +487,6 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr)))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #define pte_set_fixmap(addr) ((pte_t *)set_fixmap_offset(FIX_PTE, addr)) #define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr)) diff --git a/arch/arm64/include/asm/pointer_auth.h b/arch/arm64/include/asm/pointer_auth.h index 15d49515efdd767a37b5e2b768e6f91d3ae9c1b3..d328540cb85edc5f272190e3d36481b4c1987694 100644 --- a/arch/arm64/include/asm/pointer_auth.h +++ b/arch/arm64/include/asm/pointer_auth.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_POINTER_AUTH_H #define __ASM_POINTER_AUTH_H diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 5d9ce62bdebdebfb1c438f9c458cca6dcd4b711c..fcd0e691b1ea242600f025be0467df6405c67671 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -57,7 +57,15 @@ #define TASK_SIZE_64 (UL(1) << vabits_user) #ifdef CONFIG_COMPAT +#if defined(CONFIG_ARM64_64K_PAGES) && defined(CONFIG_KUSER_HELPERS) +/* + * With CONFIG_ARM64_64K_PAGES enabled, the last page is occupied + * by the compat vectors page. 
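
pmd_alloc_one() above gains the standard construct-or-roll-back shape: the freshly allocated page is only returned once pgtable_pmd_page_ctor() has succeeded, and is freed on the spot otherwise (the dtor calls added to pmd_free() and, later, __pmd_free_tlb() are the matching teardown). The shape in isolation, with userspace stand-ins for the kernel helpers:

#include <stdbool.h>
#include <stdlib.h>

static bool table_ctor(void *page)
{
	/* stand-in for pgtable_pmd_page_ctor(); in the kernel this can
	 * fail, e.g. when a split page-table lock needs allocating */
	return page != NULL;
}

static void *table_alloc_one(void)
{
	void *page = aligned_alloc(4096, 4096);

	if (!page)
		return NULL;
	if (!table_ctor(page)) {
		free(page);	/* roll back: never leak a half-built table */
		return NULL;
	}
	return page;
}
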
+ */ #define TASK_SIZE_32 UL(0x100000000) +#else +#define TASK_SIZE_32 (UL(0x100000000) - PAGE_SIZE) +#endif /* CONFIG_ARM64_64K_PAGES */ #define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \ TASK_SIZE_32 : TASK_SIZE_64) #define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ TASK_SIZE_32 : TASK_SIZE_64) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index ec60174c8c18417354fa9bac578b8dde5284f7bd..b2de32939ada8454b9af594c251a2d85f029ef36 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -305,6 +305,28 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) { return regs->regs[0]; } +/** + * regs_get_kernel_argument() - get Nth function argument in kernel + * @regs: pt_regs of that context + * @n: function argument number (starting from 0) + * + * regs_get_kernel_argument() returns the @n'th argument of the function call. + * + * Note that this chooses the most likely register mapping. In very rare + * cases this may not return correct data, for example, if one of the + * function parameters is 16 bytes or bigger. In such cases, we cannot + * access the parameter correctly and the register assignment of + * subsequent parameters will be shifted. + */ +static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, + unsigned int n) +{ +#define NR_REG_ARGUMENTS 8 + if (n < NR_REG_ARGUMENTS) + return pt_regs_read_reg(regs, n); + return 0; +} + /* We must avoid circular header include via sched.h */ struct task_struct; int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task); diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h index ffe47d766c2593601573f748a18c84645bec79ad..63e0b92a5fbb069bc232a93e32e17a39d6df9bd1 100644 --- a/arch/arm64/include/asm/sdei.h +++ b/arch/arm64/include/asm/sdei.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2017 Arm Ltd.
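
regs_get_kernel_argument() exists so that generic probing code can fetch function arguments without knowing the arm64 calling convention; a natural consumer is a kprobes pre-handler. A hedged module sketch (do_sys_open is only an example probe target, and symbol availability varies across kernel versions):

#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/ptrace.h>

static int pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	/* x0..x7 carry the first eight arguments on arm64 */
	unsigned long arg0 = regs_get_kernel_argument(regs, 0);

	pr_info("%s: arg0 = %#lx\n", p->symbol_name, arg0);
	return 0;
}

static struct kprobe kp = {
	.symbol_name	= "do_sys_open",	/* example target */
	.pre_handler	= pre_handler,
};

static int __init kp_init(void)
{
	return register_kprobe(&kp);
}

static void __exit kp_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kp_init);
module_exit(kp_exit);
MODULE_LICENSE("GPL");

Arguments past the eighth live on the stack, which is why the helper deliberately returns 0 rather than guessing at a stack layout.
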
#ifndef __ASM_SDEI_H #define __ASM_SDEI_H diff --git a/arch/arm64/include/asm/signal32.h b/arch/arm64/include/asm/signal32.h index 81abea0b7650867d86385fc0c4fc145fefd24c46..58e288aaf0bae3663ca0019945c1c7a79aabd730 100644 --- a/arch/arm64/include/asm/signal32.h +++ b/arch/arm64/include/asm/signal32.h @@ -20,8 +20,6 @@ #ifdef CONFIG_COMPAT #include -#define AARCH32_KERN_SIGRET_CODE_OFFSET 0x500 - int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs); int compat_setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 5412fa40825e83d892b19d379083d3b8cf479bb7..915809e4ac32d4339452ea9c8be76a6499ecc2ca 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -119,7 +119,7 @@ static inline pud_t *stage2_pud_offset(struct kvm *kvm, static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) { if (kvm_stage2_has_pud(kvm)) - pud_free(NULL, pud); + free_page((unsigned long)pud); } static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp) @@ -192,7 +192,7 @@ static inline pmd_t *stage2_pmd_offset(struct kvm *kvm, static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd) { if (kvm_stage2_has_pmd(kvm)) - pmd_free(NULL, pmd); + free_page((unsigned long)pmd); } static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud) diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h index ad8be16a39c9d18bdbd406f522c02432529c4cf6..a179df3674a1aa207dfdead37e47219353b67b91 100644 --- a/arch/arm64/include/asm/syscall.h +++ b/arch/arm64/include/asm/syscall.h @@ -65,52 +65,22 @@ static inline void syscall_set_return_value(struct task_struct *task, static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { - if (n == 0) - return; - - if (i + n > SYSCALL_MAX_ARGS) { - unsigned long *args_bad = args + SYSCALL_MAX_ARGS - i; - unsigned int n_bad = n + i - SYSCALL_MAX_ARGS; - pr_warning("%s called with max args %d, handling only %d\n", - __func__, i + n, SYSCALL_MAX_ARGS); - memset(args_bad, 0, n_bad * sizeof(args[0])); - } - - if (i == 0) { - args[0] = regs->orig_x0; - args++; - i++; - n--; - } - - memcpy(args, ®s->regs[i], n * sizeof(args[0])); + args[0] = regs->orig_x0; + args++; + + memcpy(args, ®s->regs[1], 5 * sizeof(args[0])); } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) { - if (n == 0) - return; - - if (i + n > SYSCALL_MAX_ARGS) { - pr_warning("%s called with max args %d, handling only %d\n", - __func__, i + n, SYSCALL_MAX_ARGS); - n = SYSCALL_MAX_ARGS - i; - } - - if (i == 0) { - regs->orig_x0 = args[0]; - args++; - i++; - n--; - } - - memcpy(®s->regs[i], args, n * sizeof(args[0])); + regs->orig_x0 = args[0]; + args++; + + memcpy(®s->regs[1], args, 5 * sizeof(args[0])); } /* diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 5b267dec6194e9675bb48f710a0d16b58eca3d64..3f7b917e8f3a76aef3787aa7cd3c356e9f04b1b5 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -606,6 +606,20 @@ #define ID_AA64PFR1_SSBS_PSTATE_ONLY 1 #define ID_AA64PFR1_SSBS_PSTATE_INSNS 2 +/* id_aa64zfr0 */ +#define ID_AA64ZFR0_SM4_SHIFT 40 +#define ID_AA64ZFR0_SHA3_SHIFT 32 +#define ID_AA64ZFR0_BITPERM_SHIFT 16 +#define ID_AA64ZFR0_AES_SHIFT 4 +#define 
ID_AA64ZFR0_SVEVER_SHIFT 0 + +#define ID_AA64ZFR0_SM4 0x1 +#define ID_AA64ZFR0_SHA3 0x1 +#define ID_AA64ZFR0_BITPERM 0x1 +#define ID_AA64ZFR0_AES 0x1 +#define ID_AA64ZFR0_AES_PMULL 0x2 +#define ID_AA64ZFR0_SVEVER_SVE2 0x1 + /* id_aa64mmfr0 */ #define ID_AA64MMFR0_TGRAN4_SHIFT 28 #define ID_AA64MMFR0_TGRAN64_SHIFT 24 @@ -746,20 +760,39 @@ #include #include -asm( -" .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" -" .equ .L__reg_num_x\\num, \\num\n" -" .endr\n" +#define __DEFINE_MRS_MSR_S_REGNUM \ +" .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" \ +" .equ .L__reg_num_x\\num, \\num\n" \ +" .endr\n" \ " .equ .L__reg_num_xzr, 31\n" -"\n" -" .macro mrs_s, rt, sreg\n" - __emit_inst(0xd5200000|(\\sreg)|(.L__reg_num_\\rt)) + +#define DEFINE_MRS_S \ + __DEFINE_MRS_MSR_S_REGNUM \ +" .macro mrs_s, rt, sreg\n" \ + __emit_inst(0xd5200000|(\\sreg)|(.L__reg_num_\\rt)) \ " .endm\n" -"\n" -" .macro msr_s, sreg, rt\n" - __emit_inst(0xd5000000|(\\sreg)|(.L__reg_num_\\rt)) + +#define DEFINE_MSR_S \ + __DEFINE_MRS_MSR_S_REGNUM \ +" .macro msr_s, sreg, rt\n" \ + __emit_inst(0xd5000000|(\\sreg)|(.L__reg_num_\\rt)) \ " .endm\n" -); + +#define UNDEFINE_MRS_S \ +" .purgem mrs_s\n" + +#define UNDEFINE_MSR_S \ +" .purgem msr_s\n" + +#define __mrs_s(v, r) \ + DEFINE_MRS_S \ +" mrs_s " v ", " __stringify(r) "\n" \ + UNDEFINE_MRS_S + +#define __msr_s(r, v) \ + DEFINE_MSR_S \ +" msr_s " __stringify(r) ", " v "\n" \ + UNDEFINE_MSR_S /* * Unlike read_cpuid, calls to read_sysreg are never expected to be @@ -787,13 +820,13 @@ asm( */ #define read_sysreg_s(r) ({ \ u64 __val; \ - asm volatile("mrs_s %0, " __stringify(r) : "=r" (__val)); \ + asm volatile(__mrs_s("%0", r) : "=r" (__val)); \ __val; \ }) #define write_sysreg_s(v, r) do { \ u64 __val = (u64)(v); \ - asm volatile("msr_s " __stringify(r) ", %x0" : : "rZ" (__val)); \ + asm volatile(__msr_s(r, "%x0") : : "rZ" (__val)); \ } while (0) /* diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h index 32693f34f43172ba25e419b98292609c4bc11373..fca95424e873522a7d8c4c8eca6b69b59f7ec2e5 100644 --- a/arch/arm64/include/asm/system_misc.h +++ b/arch/arm64/include/asm/system_misc.h @@ -41,7 +41,6 @@ void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned int, int sig, int code, const char *name); struct mm_struct; -extern void show_pte(unsigned long addr); extern void __show_regs(struct pt_regs *); extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 106fdc951b6eefdda0a97c877c2493b7bdfac1f8..a287189ca8b4ea4124c0139cd15c683f6afdcd87 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -27,6 +27,7 @@ static inline void __tlb_remove_table(void *_table) free_page_and_swap_cache((struct page *)_table); } +#define tlb_flush tlb_flush static void tlb_flush(struct mmu_gather *tlb); #include @@ -62,7 +63,10 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) { - tlb_remove_table(tlb, virt_to_page(pmdp)); + struct page *page = virt_to_page(pmdp); + + pgtable_pmd_page_dtor(page); + tlb_remove_table(tlb, page); } #endif diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index d1dd93436e1eedad0ea3cf83ba1cdc6b3fd50c22..f2a83ff6b73c2414110c02dc14aa24686d6ada9c 100644 --- 
a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -44,7 +44,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 424 +#define __NR_compat_syscalls 428 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 5590f262369079bca3b66561a51e9b3f4705cdd7..23f1a44acada413fb4e2ad5411624d2925c71835 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -866,6 +866,14 @@ __SYSCALL(__NR_rt_sigtimedwait_time64, compat_sys_rt_sigtimedwait_time64) __SYSCALL(__NR_futex_time64, sys_futex) #define __NR_sched_rr_get_interval_time64 423 __SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval) +#define __NR_pidfd_send_signal 424 +__SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal) +#define __NR_io_uring_setup 425 +__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) +#define __NR_io_uring_enter 426 +__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) +#define __NR_io_uring_register 427 +__SYSCALL(__NR_io_uring_register, sys_io_uring_register) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/include/asm/vdso_datapage.h b/arch/arm64/include/asm/vdso_datapage.h index 2b9a63771eda8c81b12ec4279a9e279543739ecc..f89263c8e11affe95f628b848dd344a766860b9c 100644 --- a/arch/arm64/include/asm/vdso_datapage.h +++ b/arch/arm64/include/asm/vdso_datapage.h @@ -38,6 +38,7 @@ struct vdso_data { __u32 tz_minuteswest; /* Whacky timezone stuff */ __u32 tz_dsttime; __u32 use_syscall; + __u32 hrtimer_res; }; #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h index 0b5ec6e08c10181d1c552aa8ce0d39546dd56eab..0a12115d96384f94b7f7b7902d490137bbe9c4e4 100644 --- a/arch/arm64/include/asm/vmap_stack.h +++ b/arch/arm64/include/asm/vmap_stack.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2017 Arm Ltd. 
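
The four new unistd32.h entries wire up pidfd_send_signal and the io_uring syscalls for 32-bit tasks, bumping __NR_compat_syscalls to 428. From userspace either ABI reaches them through syscall(2); a minimal native probe is sketched below. The literal 425 fallback mirrors the asm-generic number, which matches the compat one here, and the zeroed buffer stands in for struct io_uring_params (120 bytes in this kernel generation; 128 is deliberately generous):

#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_io_uring_setup
#define __NR_io_uring_setup 425	/* matches the table added above */
#endif

int main(void)
{
	unsigned char params[128];	/* zeroed stand-in for io_uring_params */
	long fd;

	memset(params, 0, sizeof(params));
	fd = syscall(__NR_io_uring_setup, 4, params);
	if (fd < 0)
		perror("io_uring_setup");	/* ENOSYS on older kernels */
	else
		printf("io_uring fd = %ld\n", fd);
	return 0;
}
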
#ifndef __ASM_VMAP_STACK_H #define __ASM_VMAP_STACK_H diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 5f0750c2199ccfa0ed8bd04c2c729149b62f55c5..1a772b162191ad71f2a07620c9d4f31b7ab59d65 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -18,7 +18,7 @@ #define _UAPI__ASM_HWCAP_H /* - * HWCAP flags - for elf_hwcap (in kernel) and AT_HWCAP + * HWCAP flags - for AT_HWCAP */ #define HWCAP_FP (1 << 0) #define HWCAP_ASIMD (1 << 1) @@ -53,4 +53,15 @@ #define HWCAP_PACA (1 << 30) #define HWCAP_PACG (1UL << 31) +/* + * HWCAP2 flags - for AT_HWCAP2 + */ +#define HWCAP2_DCPODP (1 << 0) +#define HWCAP2_SVE2 (1 << 1) +#define HWCAP2_SVEAES (1 << 2) +#define HWCAP2_SVEPMULL (1 << 3) +#define HWCAP2_SVEBITPERM (1 << 4) +#define HWCAP2_SVESHA3 (1 << 5) +#define HWCAP2_SVESM4 (1 << 6) + #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index cd434d0719c1cf8ade1aaacdec52a7c69d11d605..9e7dcb2c31c7a6a1ef3b12e6308683e20e7b28fd 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -7,9 +7,9 @@ CPPFLAGS_vmlinux.lds := -DTEXT_OFFSET=$(TEXT_OFFSET) AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) CFLAGS_armv8_deprecated.o := -I$(src) -CFLAGS_REMOVE_ftrace.o = -pg -CFLAGS_REMOVE_insn.o = -pg -CFLAGS_REMOVE_return_address.o = -pg +CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_insn.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) # Object file lists. obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ @@ -27,8 +27,9 @@ OBJCOPYFLAGS := --prefix-symbols=__efistub_ $(obj)/%.stub.o: $(obj)/%.o FORCE $(call if_changed,objcopy) -obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ - sys_compat.o +obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ + sigreturn32.o sys_compat.o +obj-$(CONFIG_KUSER_HELPERS) += kuser32.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 7f40dcbdd51d0a0d80d56a0b8315dcc2ed419d44..e10e2a5d9ddcf2ca1f60d55e7a02b5baff6bf509 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -94,7 +94,7 @@ int main(void) DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); DEFINE(CLOCK_MONOTONIC_RAW, CLOCK_MONOTONIC_RAW); - DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC); + DEFINE(CLOCK_REALTIME_RES, offsetof(struct vdso_data, hrtimer_res)); DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); DEFINE(CLOCK_MONOTONIC_COARSE,CLOCK_MONOTONIC_COARSE); DEFINE(CLOCK_COARSE_RES, LOW_RES_NSEC); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 9950bb0cbd52167c6b3b76e9d122291d70264df4..e88d4e7bdfc71c0a054f730e6722ae741a761b8e 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -109,7 +110,6 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *__unused) atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1); -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR #include #include @@ -131,9 +131,9 @@ static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); } -static void __install_bp_hardening_cb(bp_hardening_cb_t fn, - const char *hyp_vecs_start, - const char *hyp_vecs_end) +static void 
install_bp_hardening_cb(bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) { static DEFINE_RAW_SPINLOCK(bp_lock); int cpu, slot = -1; @@ -169,7 +169,7 @@ static void __install_bp_hardening_cb(bp_hardening_cb_t fn, #define __smccc_workaround_1_smc_start NULL #define __smccc_workaround_1_smc_end NULL -static void __install_bp_hardening_cb(bp_hardening_cb_t fn, +static void install_bp_hardening_cb(bp_hardening_cb_t fn, const char *hyp_vecs_start, const char *hyp_vecs_end) { @@ -177,23 +177,6 @@ static void __install_bp_hardening_cb(bp_hardening_cb_t fn, } #endif /* CONFIG_KVM_INDIRECT_VECTORS */ -static void install_bp_hardening_cb(const struct arm64_cpu_capabilities *entry, - bp_hardening_cb_t fn, - const char *hyp_vecs_start, - const char *hyp_vecs_end) -{ - u64 pfr0; - - if (!entry->matches(entry, SCOPE_LOCAL_CPU)) - return; - - pfr0 = read_cpuid(ID_AA64PFR0_EL1); - if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT)) - return; - - __install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end); -} - #include #include #include @@ -220,60 +203,83 @@ static void qcom_link_stack_sanitization(void) : "=&r" (tmp)); } -static void -enable_smccc_arch_workaround_1(const struct arm64_cpu_capabilities *entry) +static bool __nospectre_v2; +static int __init parse_nospectre_v2(char *str) +{ + __nospectre_v2 = true; + return 0; +} +early_param("nospectre_v2", parse_nospectre_v2); + +/* + * -1: No workaround + * 0: No workaround required + * 1: Workaround installed + */ +static int detect_harden_bp_fw(void) { bp_hardening_cb_t cb; void *smccc_start, *smccc_end; struct arm_smccc_res res; u32 midr = read_cpuid_id(); - if (!entry->matches(entry, SCOPE_LOCAL_CPU)) - return; - if (psci_ops.smccc_version == SMCCC_VERSION_1_0) - return; + return -1; switch (psci_ops.conduit) { case PSCI_CONDUIT_HVC: arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_ARCH_WORKAROUND_1, &res); - if ((int)res.a0 < 0) - return; - cb = call_hvc_arch_workaround_1; - /* This is a guest, no need to patch KVM vectors */ - smccc_start = NULL; - smccc_end = NULL; + switch ((int)res.a0) { + case 1: + /* Firmware says we're just fine */ + return 0; + case 0: + cb = call_hvc_arch_workaround_1; + /* This is a guest, no need to patch KVM vectors */ + smccc_start = NULL; + smccc_end = NULL; + break; + default: + return -1; + } break; case PSCI_CONDUIT_SMC: arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_ARCH_WORKAROUND_1, &res); - if ((int)res.a0 < 0) - return; - cb = call_smc_arch_workaround_1; - smccc_start = __smccc_workaround_1_smc_start; - smccc_end = __smccc_workaround_1_smc_end; + switch ((int)res.a0) { + case 1: + /* Firmware says we're just fine */ + return 0; + case 0: + cb = call_smc_arch_workaround_1; + smccc_start = __smccc_workaround_1_smc_start; + smccc_end = __smccc_workaround_1_smc_end; + break; + default: + return -1; + } break; default: - return; + return -1; } if (((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR) || ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1)) cb = qcom_link_stack_sanitization; - install_bp_hardening_cb(entry, cb, smccc_start, smccc_end); + if (IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR)) + install_bp_hardening_cb(cb, smccc_start, smccc_end); - return; + return 1; } -#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ -#ifdef CONFIG_ARM64_SSBD DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); int ssbd_state __read_mostly = ARM64_SSBD_KERNEL; +static bool __ssb_safe = true; static const struct ssbd_options { const char *str; @@ 
-343,6 +349,11 @@ void __init arm64_enable_wa2_handling(struct alt_instr *alt, void arm64_set_ssbd_mitigation(bool state) { + if (!IS_ENABLED(CONFIG_ARM64_SSBD)) { + pr_info_once("SSBD disabled by kernel configuration\n"); + return; + } + if (this_cpu_has_cap(ARM64_SSBS)) { if (state) asm volatile(SET_PSTATE_SSBS(0)); @@ -372,16 +383,28 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry, struct arm_smccc_res res; bool required = true; s32 val; + bool this_cpu_safe = false; WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + if (cpu_mitigations_off()) + ssbd_state = ARM64_SSBD_FORCE_DISABLE; + + /* delay setting __ssb_safe until we get a firmware response */ + if (is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list)) + this_cpu_safe = true; + if (this_cpu_has_cap(ARM64_SSBS)) { + if (!this_cpu_safe) + __ssb_safe = false; required = false; goto out_printmsg; } if (psci_ops.smccc_version == SMCCC_VERSION_1_0) { ssbd_state = ARM64_SSBD_UNKNOWN; + if (!this_cpu_safe) + __ssb_safe = false; return false; } @@ -398,6 +421,8 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry, default: ssbd_state = ARM64_SSBD_UNKNOWN; + if (!this_cpu_safe) + __ssb_safe = false; return false; } @@ -406,14 +431,18 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry, switch (val) { case SMCCC_RET_NOT_SUPPORTED: ssbd_state = ARM64_SSBD_UNKNOWN; + if (!this_cpu_safe) + __ssb_safe = false; return false; + /* machines with mixed mitigation requirements must not return this */ case SMCCC_RET_NOT_REQUIRED: pr_info_once("%s mitigation not required\n", entry->desc); ssbd_state = ARM64_SSBD_MITIGATED; return false; case SMCCC_RET_SUCCESS: + __ssb_safe = false; required = true; break; @@ -423,6 +452,8 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry, default: WARN_ON(1); + if (!this_cpu_safe) + __ssb_safe = false; return false; } @@ -462,7 +493,14 @@ out_printmsg: return required; } -#endif /* CONFIG_ARM64_SSBD */ + +/* known invulnerable cores */ +static const struct midr_range arm64_ssb_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + {}, +}; static void __maybe_unused cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused) @@ -507,26 +545,67 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused) .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, \ CAP_MIDR_RANGE_LIST(midr_list) -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR +/* Track overall mitigation state. We are only mitigated if all cores are ok */ +static bool __hardenbp_enab = true; +static bool __spectrev2_safe = true; /* - * List of CPUs where we need to issue a psci call to - * harden the branch predictor. + * List of CPUs that do not need any Spectre-v2 mitigation at all. 
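
All of this bookkeeping (__ssb_safe here, and __spectrev2_safe/__hardenbp_enab below) feeds the cpu_show_*() handlers added at the bottom of cpu_errata.c, which surface the per-machine mitigation state through the standard sysfs vulnerability files. Reading the result back from userspace:

#include <stdio.h>

static void show(const char *name)
{
	char path[128], buf[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/vulnerabilities/%s", name);
	f = fopen(path, "r");
	if (!f)
		return;		/* file absent on kernels without the patch */
	if (fgets(buf, sizeof(buf), f))
		printf("%-18s %s", name, buf);
	fclose(f);
}

int main(void)
{
	show("spectre_v1");
	show("spectre_v2");
	show("spec_store_bypass");
	show("meltdown");
	return 0;
}

Because heterogeneous systems may mix affected and unaffected cores, the reported state is only "Not affected" or "Mitigation: ..." when every booted CPU qualifies, which is exactly why the flags above are cleared sticky-low as each CPU is checked.
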
*/ -static const struct midr_range arm64_bp_harden_smccc_cpus[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), - MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), - MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), - MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1), - MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR), - MIDR_ALL_VERSIONS(MIDR_NVIDIA_DENVER), - {}, +static const struct midr_range spectre_v2_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + { /* sentinel */ } }; -#endif +/* + * Track overall bp hardening for all heterogeneous cores in the machine. + * We are only considered "safe" if all booted cores are known safe. + */ +static bool __maybe_unused +check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope) +{ + int need_wa; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + /* If the CPU has CSV2 set, we're safe */ + if (cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64PFR0_EL1), + ID_AA64PFR0_CSV2_SHIFT)) + return false; + + /* Alternatively, we have a list of unaffected CPUs */ + if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list)) + return false; + + /* Fallback to firmware detection */ + need_wa = detect_harden_bp_fw(); + if (!need_wa) + return false; + + __spectrev2_safe = false; + + if (!IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR)) { + pr_warn_once("spectrev2 mitigation disabled by kernel configuration\n"); + __hardenbp_enab = false; + return false; + } + + /* forced off */ + if (__nospectre_v2 || cpu_mitigations_off()) { + pr_info_once("spectrev2 mitigation disabled by command line option\n"); + __hardenbp_enab = false; + return false; + } + + if (need_wa < 0) { + pr_warn_once("ARM_SMCCC_ARCH_WORKAROUND_1 missing from firmware\n"); + __hardenbp_enab = false; + } + + return (need_wa > 0); +} #ifdef CONFIG_HARDEN_EL2_VECTORS @@ -603,6 +682,16 @@ static const struct midr_range workaround_clean_cache[] = { }; #endif +#ifdef CONFIG_ARM64_ERRATUM_1188873 +static const struct midr_range erratum_1188873_list[] = { + /* Cortex-A76 r0p0 to r2p0 */ + MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0), + /* Neoverse-N1 r0p0 to r2p0 */ + MIDR_RANGE(MIDR_NEOVERSE_N1, 0, 0, 2, 0), + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { @@ -701,13 +790,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), }, #endif -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR { .capability = ARM64_HARDEN_BRANCH_PREDICTOR, - .cpu_enable = enable_smccc_arch_workaround_1, - ERRATA_MIDR_RANGE_LIST(arm64_bp_harden_smccc_cpus), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = check_branch_predictor, }, -#endif #ifdef CONFIG_HARDEN_EL2_VECTORS { .desc = "EL2 vector hardening", @@ -715,20 +802,18 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_RANGE_LIST(arm64_harden_el2_vectors), }, #endif -#ifdef CONFIG_ARM64_SSBD { .desc = "Speculative Store Bypass Disable", .capability = ARM64_SSBD, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, .matches = has_ssbd_mitigation, + .midr_range_list = arm64_ssb_cpus, }, -#endif #ifdef CONFIG_ARM64_ERRATUM_1188873 { - /* Cortex-A76 r0p0 to r2p0 */ .desc = "ARM erratum 1188873", .capability = ARM64_WORKAROUND_1188873, - ERRATA_MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0), + ERRATA_MIDR_RANGE_LIST(erratum_1188873_list), }, #endif #ifdef CONFIG_ARM64_ERRATUM_1165522 @@ -742,3 
+827,38 @@ const struct arm64_cpu_capabilities arm64_errata[] = { { } }; + +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, + char *buf) +{ + if (__spectrev2_safe) + return sprintf(buf, "Not affected\n"); + + if (__hardenbp_enab) + return sprintf(buf, "Mitigation: Branch predictor hardening\n"); + + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spec_store_bypass(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (__ssb_safe) + return sprintf(buf, "Not affected\n"); + + switch (ssbd_state) { + case ARM64_SSBD_KERNEL: + case ARM64_SSBD_FORCE_ENABLE: + if (IS_ENABLED(CONFIG_ARM64_SSBD)) + return sprintf(buf, + "Mitigation: Speculative Store Bypass disabled via prctl\n"); + } + + return sprintf(buf, "Vulnerable\n"); +} diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c index ea001241bdd470ab4a0a13ba4dad9bdb5a818bae..00f8b8612b69f87bd3b9dbaa4f7c9a08254d2bca 100644 --- a/arch/arm64/kernel/cpu_ops.c +++ b/arch/arm64/kernel/cpu_ops.c @@ -85,6 +85,7 @@ static const char *__init cpu_read_enable_method(int cpu) pr_err("%pOF: missing enable-method property\n", dn); } + of_node_put(dn); } else { enable_method = acpi_get_enable_method(cpu); if (!enable_method) { diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 4061de10cea6ccb0ccaa890c4f2dd98d4a2bcd91..2b807f129e602bd3644e8be50c77b75452d637b3 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -35,8 +36,8 @@ #include #include -unsigned long elf_hwcap __read_mostly; -EXPORT_SYMBOL_GPL(elf_hwcap); +/* Kernel representation of AT_HWCAP and AT_HWCAP2 */ +static unsigned long elf_hwcap __read_mostly; #ifdef CONFIG_COMPAT #define COMPAT_ELF_HWCAP_DEFAULT \ @@ -184,6 +185,15 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SM4_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SHA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_BITPERM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_AES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SVEVER_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN4_SHIFT, 4, ID_AA64MMFR0_TGRAN4_NI), S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN64_SHIFT, 4, ID_AA64MMFR0_TGRAN64_NI), @@ -392,7 +402,7 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 4 */ ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0), ARM64_FTR_REG(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1), - ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_raz), + ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0), /* Op1 = 0, CRn = 0, CRm = 5 */ ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0), @@ -947,7 +957,7 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope) return has_cpuid_feature(entry, scope); } -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +static bool __meltdown_safe = true; static int __kpti_forced; /* 0: not 
forced, >0: forced on, <0: forced off */ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, @@ -966,7 +976,17 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), { /* sentinel */ } }; - char const *str = "command line option"; + char const *str = "kpti command line option"; + bool meltdown_safe; + + meltdown_safe = is_midr_in_range_list(read_cpuid_id(), kpti_safe_list); + + /* Defer to CPU feature registers */ + if (has_cpuid_feature(entry, scope)) + meltdown_safe = true; + + if (!meltdown_safe) + __meltdown_safe = false; /* * For reasons that aren't entirely clear, enabling KPTI on Cavium @@ -978,6 +998,24 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, __kpti_forced = -1; } + /* Useful for KASLR robustness */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset() > 0) { + if (!__kpti_forced) { + str = "KASLR"; + __kpti_forced = 1; + } + } + + if (cpu_mitigations_off() && !__kpti_forced) { + str = "mitigations=off"; + __kpti_forced = -1; + } + + if (!IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) { + pr_info_once("kernel page table isolation disabled by kernel configuration\n"); + return false; + } + /* Forced? */ if (__kpti_forced) { pr_info_once("kernel page table isolation forced %s by %s\n", @@ -985,18 +1023,10 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, return __kpti_forced > 0; } - /* Useful for KASLR robustness */ - if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - return kaslr_offset() > 0; - - /* Don't force KPTI for CPUs that are not vulnerable */ - if (is_midr_in_range_list(read_cpuid_id(), kpti_safe_list)) - return false; - - /* Defer to CPU feature registers */ - return !has_cpuid_feature(entry, scope); + return !meltdown_safe; } +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 static void kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) { @@ -1026,6 +1056,12 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) return; } +#else +static void +kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) +{ +} +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ static int __init parse_kpti(char *str) { @@ -1039,7 +1075,6 @@ static int __init parse_kpti(char *str) return 0; } early_param("kpti", parse_kpti); -#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ #ifdef CONFIG_ARM64_HW_AFDBM static inline void __cpu_enable_hw_dbm(void) @@ -1306,7 +1341,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .field_pos = ID_AA64PFR0_EL0_SHIFT, .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, }, -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 { .desc = "Kernel page table isolation (KPTI)", .capability = ARM64_UNMAP_KERNEL_AT_EL0, @@ -1322,7 +1356,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = unmap_kernel_at_el0, .cpu_enable = kpti_install_ng_mappings, }, -#endif { /* FP/SIMD is not implemented */ .capability = ARM64_HAS_NO_FPSIMD, @@ -1340,6 +1373,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .field_pos = ID_AA64ISAR1_DPB_SHIFT, .min_field_value = 1, }, + { + .desc = "Data cache clean to Point of Deep Persistence", + .capability = ARM64_HAS_DCPODP, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR1_DPB_SHIFT, + .min_field_value = 2, + }, #endif #ifdef CONFIG_ARM64_SVE { @@ -1571,39 +1614,46 @@ static const struct arm64_cpu_capabilities ptr_auth_hwcap_gen_matches[] = 
{ #endif static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_PMULL), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_AES), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA1), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA2), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_SHA512), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_CRC32), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_ATOMICS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_ATOMICS), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RDM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDRDM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA3), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM3), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM4), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDDP), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDFHM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FLAGM), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_FP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_ASIMDHP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_DIT_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_DIT), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_DCPOP), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_JSCVT), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FCMA), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_LRCPC), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_ILRCPC), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_SB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SB), - HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_USCAT), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_PMULL), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AES), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA1), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA2), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_SHA512), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_CRC32), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_ATOMICS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ATOMICS), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RDM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM), + 
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA3), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM3), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM4), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FLAGM), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_FP), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FPHP), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_ASIMD), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_DIT_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DIT), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DCPOP), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_DCPODP), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_JSCVT), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FCMA), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_LRCPC), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_SB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SB), + HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, ID_AA64PFR0_SVE, CAP_HWCAP, HWCAP_SVE), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, ID_AA64PFR0_SVE, CAP_HWCAP, KERNEL_HWCAP_SVE), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SVEVER_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_SVEVER_SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_AES_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_AES, CAP_HWCAP, KERNEL_HWCAP_SVEAES), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_AES_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_AES_PMULL, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_BITPERM_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_BITPERM, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SHA3_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_SHA3, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SM4_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_SM4, CAP_HWCAP, KERNEL_HWCAP_SVESM4), #endif - HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, HWCAP_SSBS), + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, KERNEL_HWCAP_SSBS), #ifdef CONFIG_ARM64_PTR_AUTH - HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, HWCAP_PACA), - HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, HWCAP_PACG), + HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA), + 
HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG), #endif {}, }; @@ -1623,7 +1673,7 @@ static void __init cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap) { switch (cap->hwcap_type) { case CAP_HWCAP: - elf_hwcap |= cap->hwcap; + cpu_set_feature(cap->hwcap); break; #ifdef CONFIG_COMPAT case CAP_COMPAT_HWCAP: @@ -1646,7 +1696,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) switch (cap->hwcap_type) { case CAP_HWCAP: - rc = (elf_hwcap & cap->hwcap) != 0; + rc = cpu_have_feature(cap->hwcap); break; #ifdef CONFIG_COMPAT case CAP_COMPAT_HWCAP: @@ -1667,7 +1717,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) { /* We support emulation of accesses to CPU ID feature registers */ - elf_hwcap |= HWCAP_CPUID; + cpu_set_named_feature(CPUID); for (; hwcaps->matches; hwcaps++) if (hwcaps->matches(hwcaps, cpucap_default_scope(hwcaps))) cap_set_elf_hwcap(hwcaps); @@ -1947,6 +1997,35 @@ bool this_cpu_has_cap(unsigned int n) return false; } +void cpu_set_feature(unsigned int num) +{ + WARN_ON(num >= MAX_CPU_FEATURES); + elf_hwcap |= BIT(num); +} +EXPORT_SYMBOL_GPL(cpu_set_feature); + +bool cpu_have_feature(unsigned int num) +{ + WARN_ON(num >= MAX_CPU_FEATURES); + return elf_hwcap & BIT(num); +} +EXPORT_SYMBOL_GPL(cpu_have_feature); + +unsigned long cpu_get_elf_hwcap(void) +{ + /* + * We currently only populate the first 32 bits of AT_HWCAP. Please + * note that for userspace compatibility we guarantee that bits 62 + * and 63 will always be returned as 0. + */ + return lower_32_bits(elf_hwcap); +} + +unsigned long cpu_get_elf_hwcap2(void) +{ + return upper_32_bits(elf_hwcap); +} + static void __init setup_system_capabilities(void) { /* @@ -2101,3 +2180,15 @@ static int __init enable_mrs_emulation(void) } core_initcall(enable_mrs_emulation); + +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, + char *buf) +{ + if (__meltdown_safe) + return sprintf(buf, "Not affected\n"); + + if (arm64_kernel_unmapped_at_el0()) + return sprintf(buf, "Mitigation: PTI\n"); + + return sprintf(buf, "Vulnerable\n"); +} diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index ca0685f339002a97d73651db213cdeb369465e34..f6f7936be6e7f81b0e39a5db68afdd864edda775 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -85,6 +85,13 @@ static const char *const hwcap_str[] = { "sb", "paca", "pacg", + "dcpodp", + "sve2", + "sveaes", + "svepmull", + "svebitperm", + "svesha3", + "svesm4", NULL }; @@ -167,7 +174,7 @@ static int c_show(struct seq_file *m, void *v) #endif /* CONFIG_COMPAT */ } else { for (j = 0; hwcap_str[j]; j++) - if (elf_hwcap & (1 << j)) + if (cpu_have_feature(j)) seq_printf(m, " %s", hwcap_str[j]); } seq_puts(m, "\n"); diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index d7bb6aefae0a8b4c19387208e71ef6021c234d3a..555b6bd2f3d68f1fb451af110e26a500c8f2738a 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -135,6 +135,7 @@ NOKPROBE_SYMBOL(disable_debug_monitors); */ static int clear_os_lock(unsigned int cpu) { + write_sysreg(0, osdlr_el1); write_sysreg(0, oslar_el1); isb(); return 0; @@ -163,25 +164,46 @@ static void clear_regs_spsr_ss(struct pt_regs *regs) } NOKPROBE_SYMBOL(clear_regs_spsr_ss); -/* EL1 Single Step Handler hooks */ -static LIST_HEAD(step_hook); -static DEFINE_SPINLOCK(step_hook_lock); +static 
DEFINE_SPINLOCK(debug_hook_lock); +static LIST_HEAD(user_step_hook); +static LIST_HEAD(kernel_step_hook); -void register_step_hook(struct step_hook *hook) +static void register_debug_hook(struct list_head *node, struct list_head *list) { - spin_lock(&step_hook_lock); - list_add_rcu(&hook->node, &step_hook); - spin_unlock(&step_hook_lock); + spin_lock(&debug_hook_lock); + list_add_rcu(node, list); + spin_unlock(&debug_hook_lock); + } -void unregister_step_hook(struct step_hook *hook) +static void unregister_debug_hook(struct list_head *node) { - spin_lock(&step_hook_lock); - list_del_rcu(&hook->node); - spin_unlock(&step_hook_lock); + spin_lock(&debug_hook_lock); + list_del_rcu(node); + spin_unlock(&debug_hook_lock); synchronize_rcu(); } +void register_user_step_hook(struct step_hook *hook) +{ + register_debug_hook(&hook->node, &user_step_hook); +} + +void unregister_user_step_hook(struct step_hook *hook) +{ + unregister_debug_hook(&hook->node); +} + +void register_kernel_step_hook(struct step_hook *hook) +{ + register_debug_hook(&hook->node, &kernel_step_hook); +} + +void unregister_kernel_step_hook(struct step_hook *hook) +{ + unregister_debug_hook(&hook->node); +} + /* * Call registered single step handlers * There is no Syndrome info to check for determining the handler. @@ -191,11 +213,14 @@ void unregister_step_hook(struct step_hook *hook) static int call_step_hook(struct pt_regs *regs, unsigned int esr) { struct step_hook *hook; + struct list_head *list; int retval = DBG_HOOK_ERROR; + list = user_mode(regs) ? &user_step_hook : &kernel_step_hook; + rcu_read_lock(); - list_for_each_entry_rcu(hook, &step_hook, node) { + list_for_each_entry_rcu(hook, list, node) { retval = hook->fn(regs, esr); if (retval == DBG_HOOK_HANDLED) break; @@ -222,7 +247,7 @@ static void send_user_sigtrap(int si_code) "User debug trap"); } -static int single_step_handler(unsigned long addr, unsigned int esr, +static int single_step_handler(unsigned long unused, unsigned int esr, struct pt_regs *regs) { bool handler_found = false; @@ -234,10 +259,6 @@ static int single_step_handler(unsigned long addr, unsigned int esr, if (!reinstall_suspended_bps(regs)) return 0; -#ifdef CONFIG_KPROBES - if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; -#endif if (!handler_found && call_step_hook(regs, esr) == DBG_HOOK_HANDLED) handler_found = true; @@ -264,61 +285,59 @@ static int single_step_handler(unsigned long addr, unsigned int esr, } NOKPROBE_SYMBOL(single_step_handler); -/* - * Breakpoint handler is re-entrant as another breakpoint can - * hit within breakpoint handler, especically in kprobes. - * Use reader/writer locks instead of plain spinlock. 
- */ -static LIST_HEAD(break_hook); -static DEFINE_SPINLOCK(break_hook_lock); +static LIST_HEAD(user_break_hook); +static LIST_HEAD(kernel_break_hook); -void register_break_hook(struct break_hook *hook) +void register_user_break_hook(struct break_hook *hook) { - spin_lock(&break_hook_lock); - list_add_rcu(&hook->node, &break_hook); - spin_unlock(&break_hook_lock); + register_debug_hook(&hook->node, &user_break_hook); } -void unregister_break_hook(struct break_hook *hook) +void unregister_user_break_hook(struct break_hook *hook) { - spin_lock(&break_hook_lock); - list_del_rcu(&hook->node); - spin_unlock(&break_hook_lock); - synchronize_rcu(); + unregister_debug_hook(&hook->node); +} + +void register_kernel_break_hook(struct break_hook *hook) +{ + register_debug_hook(&hook->node, &kernel_break_hook); +} + +void unregister_kernel_break_hook(struct break_hook *hook) +{ + unregister_debug_hook(&hook->node); } static int call_break_hook(struct pt_regs *regs, unsigned int esr) { struct break_hook *hook; + struct list_head *list; int (*fn)(struct pt_regs *regs, unsigned int esr) = NULL; + list = user_mode(regs) ? &user_break_hook : &kernel_break_hook; + rcu_read_lock(); - list_for_each_entry_rcu(hook, &break_hook, node) - if ((esr & hook->esr_mask) == hook->esr_val) + list_for_each_entry_rcu(hook, list, node) { + unsigned int comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; + + if ((comment & ~hook->mask) == hook->imm) fn = hook->fn; + } rcu_read_unlock(); return fn ? fn(regs, esr) : DBG_HOOK_ERROR; } NOKPROBE_SYMBOL(call_break_hook); -static int brk_handler(unsigned long addr, unsigned int esr, +static int brk_handler(unsigned long unused, unsigned int esr, struct pt_regs *regs) { - bool handler_found = false; - -#ifdef CONFIG_KPROBES - if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) { - if (kprobe_breakpoint_handler(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; - } -#endif - if (!handler_found && call_break_hook(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; + if (call_break_hook(regs, esr) == DBG_HOOK_HANDLED) + return 0; - if (!handler_found && user_mode(regs)) { + if (user_mode(regs)) { send_user_sigtrap(TRAP_BRKPT); - } else if (!handler_found) { + } else { pr_warn("Unexpected kernel BRK exception at EL1\n"); return -EFAULT; } diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index c50a7a75f2e0f791fc0a53f85e66fcf0140f3c7e..1a7811b7e3c4611fc132d44996955e7632a6876a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -336,6 +336,21 @@ alternative_if ARM64_WORKAROUND_845719 alternative_else_nop_endif #endif 3: +#ifdef CONFIG_ARM64_ERRATUM_1188873 +alternative_if_not ARM64_WORKAROUND_1188873 + b 4f +alternative_else_nop_endif + /* + * if (x22.mode32 == cntkctl_el1.el0vcten) + * cntkctl_el1.el0vcten = ~cntkctl_el1.el0vcten + */ + mrs x1, cntkctl_el1 + eon x0, x1, x22, lsr #3 + tbz x0, #1, 4f + eor x1, x1, #2 // ARCH_TIMER_USR_VCT_ACCESS_EN + msr cntkctl_el1, x1 +4: +#endif apply_ssbd 0, x0, x1 .endif @@ -362,11 +377,11 @@ alternative_else_nop_endif .if \el == 0 alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - bne 4f + bne 5f msr far_el1, x30 tramp_alias x30, tramp_exit_native br x30 -4: +5: tramp_alias x30, tramp_exit_compat br x30 #endif diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 5ebe73b6996186b3475745cac1d90abafdb737ad..735cf1f8b109c730a48df5099fc3128f5c4905fb 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1258,14 +1258,14 @@ static 
inline void fpsimd_hotplug_init(void) { } */ static int __init fpsimd_init(void) { - if (elf_hwcap & HWCAP_FP) { + if (cpu_have_named_feature(FP)) { fpsimd_pm_init(); fpsimd_hotplug_init(); } else { pr_notice("Floating-point is not implemented\n"); } - if (!(elf_hwcap & HWCAP_ASIMD)) + if (!cpu_have_named_feature(ASIMD)) pr_notice("Advanced SIMD is not implemented\n"); return sve_sysctl_init(); diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 8e4431a8821f5920e49910dd287db5882e73f330..65a51331088eb0afd0db70e52fb03335cc8151dc 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -103,12 +103,16 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) * to be revisited if support for multiple ftrace entry points * is added in the future, but for now, the pr_err() below * deals with a theoretical issue only. + * + * Note that PLTs are place relative, and plt_entries_equal() + * checks whether they point to the same target. Here, we need + * to check if the actual opcodes are in fact identical, + * regardless of the offset in memory so use memcmp() instead. */ trampoline = get_plt_entry(addr, mod->arch.ftrace_trampoline); - if (!plt_entries_equal(mod->arch.ftrace_trampoline, - &trampoline)) { - if (!plt_entries_equal(mod->arch.ftrace_trampoline, - &(struct plt_entry){})) { + if (memcmp(mod->arch.ftrace_trampoline, &trampoline, + sizeof(trampoline))) { + if (plt_entry_is_initialized(mod->arch.ftrace_trampoline)) { pr_err("ftrace: far branches to multiple entry points unsupported inside a single module\n"); return -EINVAL; } diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index eecf7927dab08bf66176841004f7e39bc7605c47..fcae3f85c6cdf80d77c700241f5a9e9482809d32 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -505,7 +505,7 @@ ENTRY(el2_setup) * kernel is intended to run at EL2. */ mrs x2, id_aa64mmfr1_el1 - ubfx x2, x2, #8, #4 + ubfx x2, x2, #ID_AA64MMFR1_VHE_SHIFT, #4 #else mov x2, xzr #endif @@ -538,7 +538,7 @@ set_hcr: #ifdef CONFIG_ARM_GIC_V3 /* GICv3 system register access */ mrs x0, id_aa64pfr0_el1 - ubfx x0, x0, #24, #4 + ubfx x0, x0, #ID_AA64PFR0_GIC_SHIFT, #4 cbz x0, 3f mrs_s x0, SYS_ICC_SRE_EL2 @@ -564,8 +564,8 @@ set_hcr: #endif /* EL2 debug */ - mrs x1, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer - sbfx x0, x1, #8, #4 + mrs x1, id_aa64dfr0_el1 + sbfx x0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4 cmp x0, #1 b.lt 4f // Skip if no PMU present mrs x0, pmcr_el0 // Disable debug access traps @@ -574,7 +574,7 @@ set_hcr: csel x3, xzr, x0, lt // all PMU counters from EL1 /* Statistical profiling */ - ubfx x0, x1, #32, #4 // Check ID_AA64DFR0_EL1 PMSVer + ubfx x0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4 cbz x0, 7f // Skip if SPE not present cbnz x2, 6f // VHE? mrs_s x4, SYS_PMBIDR_EL1 // If SPE available at EL2, @@ -684,7 +684,7 @@ ENTRY(__boot_cpu_mode) * with MMU turned off. 
*/ ENTRY(__early_cpu_boot_status) - .long 0 + .quad 0 .popsection diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index 691854b77c7fe67b984c1ffb4f89677136f06251..30853d5b785928d51666207c4af0fa61cb065adf 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -244,9 +244,6 @@ int kgdb_arch_handle_exception(int exception_vector, int signo, static int kgdb_brk_fn(struct pt_regs *regs, unsigned int esr) { - if (user_mode(regs)) - return DBG_HOOK_ERROR; - kgdb_handle_exception(1, SIGTRAP, 0, regs); return DBG_HOOK_HANDLED; } @@ -254,9 +251,6 @@ NOKPROBE_SYMBOL(kgdb_brk_fn) static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr) { - if (user_mode(regs)) - return DBG_HOOK_ERROR; - compiled_break = 1; kgdb_handle_exception(1, SIGTRAP, 0, regs); @@ -266,7 +260,7 @@ NOKPROBE_SYMBOL(kgdb_compiled_brk_fn); static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr) { - if (user_mode(regs) || !kgdb_single_step) + if (!kgdb_single_step) return DBG_HOOK_ERROR; kgdb_handle_exception(1, SIGTRAP, 0, regs); @@ -275,15 +269,13 @@ static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr) NOKPROBE_SYMBOL(kgdb_step_brk_fn); static struct break_hook kgdb_brkpt_hook = { - .esr_mask = 0xffffffff, - .esr_val = (u32)ESR_ELx_VAL_BRK64(KGDB_DYN_DBG_BRK_IMM), - .fn = kgdb_brk_fn + .fn = kgdb_brk_fn, + .imm = KGDB_DYN_DBG_BRK_IMM, }; static struct break_hook kgdb_compiled_brkpt_hook = { - .esr_mask = 0xffffffff, - .esr_val = (u32)ESR_ELx_VAL_BRK64(KGDB_COMPILED_DBG_BRK_IMM), - .fn = kgdb_compiled_brk_fn + .fn = kgdb_compiled_brk_fn, + .imm = KGDB_COMPILED_DBG_BRK_IMM, }; static struct step_hook kgdb_step_hook = { @@ -332,9 +324,9 @@ int kgdb_arch_init(void) if (ret != 0) return ret; - register_break_hook(&kgdb_brkpt_hook); - register_break_hook(&kgdb_compiled_brkpt_hook); - register_step_hook(&kgdb_step_hook); + register_kernel_break_hook(&kgdb_brkpt_hook); + register_kernel_break_hook(&kgdb_compiled_brkpt_hook); + register_kernel_step_hook(&kgdb_step_hook); return 0; } @@ -345,9 +337,9 @@ int kgdb_arch_init(void) */ void kgdb_arch_exit(void) { - unregister_break_hook(&kgdb_brkpt_hook); - unregister_break_hook(&kgdb_compiled_brkpt_hook); - unregister_step_hook(&kgdb_step_hook); + unregister_kernel_break_hook(&kgdb_brkpt_hook); + unregister_kernel_break_hook(&kgdb_compiled_brkpt_hook); + unregister_kernel_step_hook(&kgdb_step_hook); unregister_die_notifier(&kgdb_notifier); } diff --git a/arch/arm64/kernel/kuser32.S b/arch/arm64/kernel/kuser32.S index 997e6b27ff6a41a424cec2503bae44807f576888..49825e9e421e0d9aa8df026cbc46c0308f1d3928 100644 --- a/arch/arm64/kernel/kuser32.S +++ b/arch/arm64/kernel/kuser32.S @@ -1,29 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* - * Low-level user helpers placed in the vectors page for AArch32. + * AArch32 user helpers. * Based on the kuser helpers in arch/arm/kernel/entry-armv.S. * * Copyright (C) 2005-2011 Nicolas Pitre - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * Copyright (C) 2012-2018 ARM Ltd. 
* - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * - * AArch32 user helpers. - * - * Each segment is 32-byte aligned and will be moved to the top of the high - * vector page. New segments (if ever needed) must be added in front of - * existing ones. This mechanism should be used only for things that are - * really small and justified, and not be abused freely. + * The kuser helpers below are mapped at a fixed address by + * aarch32_setup_additional_pages() and are provided for compatibility + * reasons with 32 bit (aarch32) applications that need them. * * See Documentation/arm/kernel_user_helpers.txt for formal definitions. */ @@ -77,42 +62,3 @@ __kuser_helper_version: // 0xffff0ffc .word ((__kuser_helper_end - __kuser_helper_start) >> 5) .globl __kuser_helper_end __kuser_helper_end: - -/* - * AArch32 sigreturn code - * - * For ARM syscalls, the syscall number has to be loaded into r7. - * We do not support an OABI userspace. - * - * For Thumb syscalls, we also pass the syscall number via r7. We therefore - * need two 16-bit instructions. - */ - .globl __aarch32_sigret_code_start -__aarch32_sigret_code_start: - - /* - * ARM Code - */ - .byte __NR_compat_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_sigreturn - .byte __NR_compat_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_sigreturn - - /* - * Thumb code - */ - .byte __NR_compat_sigreturn, 0x27 // svc #__NR_compat_sigreturn - .byte __NR_compat_sigreturn, 0xdf // mov r7, #__NR_compat_sigreturn - - /* - * ARM code - */ - .byte __NR_compat_rt_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_rt_sigreturn - .byte __NR_compat_rt_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_rt_sigreturn - - /* - * Thumb code - */ - .byte __NR_compat_rt_sigreturn, 0x27 // svc #__NR_compat_rt_sigreturn - .byte __NR_compat_rt_sigreturn, 0xdf // mov r7, #__NR_compat_rt_sigreturn - - .globl __aarch32_sigret_code_end -__aarch32_sigret_code_end: diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 4addb38bc250b711aff75278fff59746a356db02..6164d389eed6065867eae5e55e4f8b459d9d9b42 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -431,7 +431,7 @@ static inline u64 armv8pmu_read_hw_counter(struct perf_event *event) return val; } -static inline u64 armv8pmu_read_counter(struct perf_event *event) +static u64 armv8pmu_read_counter(struct perf_event *event) { struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; @@ -468,7 +468,7 @@ static inline void armv8pmu_write_hw_counter(struct perf_event *event, } } -static inline void armv8pmu_write_counter(struct perf_event *event, u64 value) +static void armv8pmu_write_counter(struct perf_event *event, u64 value) { struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 7a679caf45856e75c21860aaa7522c080fb48e41..2509fcb6d4048a17c20760741e8b33f22137f7fd 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -439,15 +439,12 @@ kprobe_ss_hit(struct kprobe_ctlblk *kcb, unsigned long addr) return DBG_HOOK_ERROR; } -int __kprobes +static int __kprobes kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); int retval; - if (user_mode(regs)) - return DBG_HOOK_ERROR; - /* return error if this is not our step */ retval = kprobe_ss_hit(kcb,
instruction_pointer(regs)); @@ -461,16 +458,22 @@ kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr) return retval; } -int __kprobes +static struct step_hook kprobes_step_hook = { + .fn = kprobe_single_step_handler, +}; + +static int __kprobes kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr) { - if (user_mode(regs)) - return DBG_HOOK_ERROR; - kprobe_handler(regs); return DBG_HOOK_HANDLED; } +static struct break_hook kprobes_break_hook = { + .imm = KPROBES_BRK_IMM, + .fn = kprobe_breakpoint_handler, +}; + /* * Provide a blacklist of symbols identifying ranges which cannot be kprobed. * This blacklist is exposed to userspace via debugfs (kprobes/blacklist). @@ -599,5 +602,8 @@ int __kprobes arch_trampoline_kprobe(struct kprobe *p) int __init arch_init_kprobes(void) { + register_kernel_break_hook(&kprobes_break_hook); + register_kernel_step_hook(&kprobes_step_hook); + return 0; } diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 636ca0119c0efa7cb1254568d04edfdddf791db9..605945eac1f843d221d47c67177ee46068a9916d 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -171,7 +171,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, static int uprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr) { - if (user_mode(regs) && uprobe_pre_sstep_notifier(regs)) + if (uprobe_pre_sstep_notifier(regs)) return DBG_HOOK_HANDLED; return DBG_HOOK_ERROR; @@ -182,21 +182,16 @@ static int uprobe_single_step_handler(struct pt_regs *regs, { struct uprobe_task *utask = current->utask; - if (user_mode(regs)) { - WARN_ON(utask && - (instruction_pointer(regs) != utask->xol_vaddr + 4)); - - if (uprobe_post_sstep_notifier(regs)) - return DBG_HOOK_HANDLED; - } + WARN_ON(utask && (instruction_pointer(regs) != utask->xol_vaddr + 4)); + if (uprobe_post_sstep_notifier(regs)) + return DBG_HOOK_HANDLED; return DBG_HOOK_ERROR; } /* uprobe breakpoint handler hook */ static struct break_hook uprobes_break_hook = { - .esr_mask = BRK64_ESR_MASK, - .esr_val = BRK64_ESR_UPROBES, + .imm = UPROBES_BRK_IMM, .fn = uprobe_breakpoint_handler, }; @@ -207,8 +202,8 @@ static struct step_hook uprobes_step_hook = { static int __init arch_init_uprobes(void) { - register_break_hook(&uprobes_break_hook); - register_step_hook(&uprobes_step_hook); + register_user_break_hook(&uprobes_break_hook); + register_user_step_hook(&uprobes_step_hook); return 0; } diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 5ba4465e44f09028c89fb190b7d65927635a9d10..ea94cf8f9dc6d15f58a7c8e298eba6d8bfdecede 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -94,6 +94,9 @@ static bool on_sdei_normal_stack(unsigned long sp, struct stack_info *info) unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr); unsigned long high = low + SDEI_STACK_SIZE; + if (!low) + return false; + if (sp < low || sp >= high) return false; @@ -111,6 +114,9 @@ static bool on_sdei_critical_stack(unsigned long sp, struct stack_info *info) unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr); unsigned long high = low + SDEI_STACK_SIZE; + if (!low) + return false; + if (sp < low || sp >= high) return false; diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index cb7800acd19fbd4554b1d25adab62c908421a0cf..caea6e25db2acf5f71c2ed021247eb6fc972a747 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -403,8 +403,7 @@ static void 
compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, if (ka->sa.sa_flags & SA_SIGINFO) idx += 3; - retcode = AARCH32_VECTORS_BASE + - AARCH32_KERN_SIGRET_CODE_OFFSET + + retcode = (unsigned long)current->mm->context.vdso + (idx << 2) + thumb; } diff --git a/arch/arm64/kernel/sigreturn32.S b/arch/arm64/kernel/sigreturn32.S new file mode 100644 index 0000000000000000000000000000000000000000..475d30d471ac1634364bab74e7f3d58c0dfc1fb6 --- /dev/null +++ b/arch/arm64/kernel/sigreturn32.S @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AArch32 sigreturn code. + * Based on the kuser helpers in arch/arm/kernel/entry-armv.S. + * + * Copyright (C) 2005-2011 Nicolas Pitre + * Copyright (C) 2012-2018 ARM Ltd. + * + * For ARM syscalls, the syscall number has to be loaded into r7. + * We do not support an OABI userspace. + * + * For Thumb syscalls, we also pass the syscall number via r7. We therefore + * need two 16-bit instructions. + */ + +#include <asm/unistd.h> + + .globl __aarch32_sigret_code_start +__aarch32_sigret_code_start: + + /* + * ARM Code + */ + .byte __NR_compat_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_sigreturn + .byte __NR_compat_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_sigreturn + + /* + * Thumb code + */ + .byte __NR_compat_sigreturn, 0x27 // svc #__NR_compat_sigreturn + .byte __NR_compat_sigreturn, 0xdf // mov r7, #__NR_compat_sigreturn + + /* + * ARM code + */ + .byte __NR_compat_rt_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_rt_sigreturn + .byte __NR_compat_rt_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_rt_sigreturn + + /* + * Thumb code + */ + .byte __NR_compat_rt_sigreturn, 0x27 // svc #__NR_compat_rt_sigreturn + .byte __NR_compat_rt_sigreturn, 0xdf // mov r7, #__NR_compat_rt_sigreturn + + .globl __aarch32_sigret_code_end +__aarch32_sigret_code_end: diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index d908b5e9e949c6745598abd90f6c3afef89ae040..b00ec7d483d1c33b848e869885067551da214252 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -140,8 +140,6 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) #endif walk_stackframe(current, &frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_regs); @@ -172,8 +170,6 @@ static noinline void __save_stack_trace(struct task_struct *tsk, #endif walk_stackframe(tsk, &frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; put_task_stack(tsk); } diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c index b44065fb16160c62d3fa113d0a86f1dd1e4564dc..6f91e81165147dd5ef665c292dda6763af5fbabf 100644 --- a/arch/arm64/kernel/sys.c +++ b/arch/arm64/kernel/sys.c @@ -31,7 +31,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, - unsigned long, fd, off_t, off) + unsigned long, fd, unsigned long, off) { if (offset_in_page(off) != 0) return -EINVAL; diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8ad119c3f665d4e8001038ccf3bd6dcb62e2e224..ade32046f3fea606172536f16912ed3fbc34b9e4 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -102,10 +102,16 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; - int skip; + int skip = 0; pr_debug("%s(regs
= %p tsk = %p)\n", __func__, regs, tsk); + if (regs) { + if (user_mode(regs)) + return; + skip = 1; + } + if (!tsk) tsk = current; @@ -126,7 +132,6 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) frame.graph = 0; #endif - skip = !!regs; printk("Call trace:\n"); do { /* skip until specified stack frame */ @@ -176,15 +181,13 @@ static int __die(const char *str, int err, struct pt_regs *regs) return ret; print_modules(); - __show_regs(regs); pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n", TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), end_of_stack(tsk)); + show_regs(regs); - if (!user_mode(regs)) { - dump_backtrace(regs, tsk); + if (!user_mode(regs)) dump_instr(KERN_EMERG, regs); - } return ret; } @@ -459,6 +462,9 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) case ESR_ELx_SYS64_ISS_CRM_DC_CVAC: /* DC CVAC, gets promoted */ __user_cache_maint("dc civac", address, ret); break; + case ESR_ELx_SYS64_ISS_CRM_DC_CVADP: /* DC CVADP */ + __user_cache_maint("sys 3, c7, c13, 1", address, ret); + break; case ESR_ELx_SYS64_ISS_CRM_DC_CVAP: /* DC CVAP */ __user_cache_maint("sys 3, c7, c12, 1", address, ret); break; @@ -493,7 +499,7 @@ static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs) { int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_counter_get_cntvct()); + pt_regs_write_reg(regs, rt, arch_timer_read_counter()); arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } @@ -665,7 +671,7 @@ static void compat_cntvct_read_handler(unsigned int esr, struct pt_regs *regs) { int rt = (esr & ESR_ELx_CP15_64_ISS_RT_MASK) >> ESR_ELx_CP15_64_ISS_RT_SHIFT; int rt2 = (esr & ESR_ELx_CP15_64_ISS_RT2_MASK) >> ESR_ELx_CP15_64_ISS_RT2_SHIFT; - u64 val = arch_counter_get_cntvct(); + u64 val = arch_timer_read_counter(); pt_regs_write_reg(regs, rt, lower_32_bits(val)); pt_regs_write_reg(regs, rt2, upper_32_bits(val)); @@ -947,9 +953,6 @@ int is_valid_bugaddr(unsigned long addr) static int bug_handler(struct pt_regs *regs, unsigned int esr) { - if (user_mode(regs)) - return DBG_HOOK_ERROR; - switch (report_bug(regs->pc, regs)) { case BUG_TRAP_TYPE_BUG: die("Oops - BUG", regs, 0); @@ -969,9 +972,8 @@ static int bug_handler(struct pt_regs *regs, unsigned int esr) } static struct break_hook bug_break_hook = { - .esr_val = 0xf2000000 | BUG_BRK_IMM, - .esr_mask = 0xffffffff, .fn = bug_handler, + .imm = BUG_BRK_IMM, }; #ifdef CONFIG_KASAN_SW_TAGS @@ -989,9 +991,6 @@ static int kasan_handler(struct pt_regs *regs, unsigned int esr) u64 addr = regs->regs[0]; u64 pc = regs->pc; - if (user_mode(regs)) - return DBG_HOOK_ERROR; - kasan_report(addr, size, write, pc); /* @@ -1016,13 +1015,10 @@ static int kasan_handler(struct pt_regs *regs, unsigned int esr) return DBG_HOOK_HANDLED; } -#define KASAN_ESR_VAL (0xf2000000 | KASAN_BRK_IMM) -#define KASAN_ESR_MASK 0xffffff00 - static struct break_hook kasan_break_hook = { - .esr_val = KASAN_ESR_VAL, - .esr_mask = KASAN_ESR_MASK, - .fn = kasan_handler, + .fn = kasan_handler, + .imm = KASAN_BRK_IMM, + .mask = KASAN_BRK_MASK, }; #endif @@ -1034,7 +1030,9 @@ int __init early_brk64(unsigned long addr, unsigned int esr, struct pt_regs *regs) { #ifdef CONFIG_KASAN_SW_TAGS - if ((esr & KASAN_ESR_MASK) == KASAN_ESR_VAL) + unsigned int comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; + + if ((comment & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) return kasan_handler(regs, esr) != DBG_HOOK_HANDLED; #endif return bug_handler(regs, esr) != DBG_HOOK_HANDLED; @@ -1043,8 +1041,8 @@ int __init early_brk64(unsigned 
long addr, unsigned int esr, /* This registration must happen early, before debug_traps_init(). */ void __init trap_init(void) { - register_break_hook(&bug_break_hook); + register_kernel_break_hook(&bug_break_hook); #ifdef CONFIG_KASAN_SW_TAGS - register_break_hook(&kasan_break_hook); + register_kernel_break_hook(&kasan_break_hook); #endif } diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 2d419006ad4330c5a76cb0cf9372fdb0c76cf279..8074cbd3a3a854f293d7145e98fe725fd1d9229f 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -1,5 +1,5 @@ /* - * VDSO implementation for AArch64 and vector page setup for AArch32. + * VDSO implementations. * * Copyright (C) 2012 ARM Limited * @@ -53,61 +53,129 @@ struct vdso_data *vdso_data = &vdso_data_store.data; /* * Create and map the vectors page for AArch32 tasks. */ -static struct page *vectors_page[1] __ro_after_init; +#define C_VECTORS 0 +#define C_SIGPAGE 1 +#define C_PAGES (C_SIGPAGE + 1) +static struct page *aarch32_vdso_pages[C_PAGES] __ro_after_init; +static const struct vm_special_mapping aarch32_vdso_spec[C_PAGES] = { + { + .name = "[vectors]", /* ABI */ + .pages = &aarch32_vdso_pages[C_VECTORS], + }, + { + .name = "[sigpage]", /* ABI */ + .pages = &aarch32_vdso_pages[C_SIGPAGE], + }, +}; -static int __init alloc_vectors_page(void) +static int aarch32_alloc_kuser_vdso_page(void) { extern char __kuser_helper_start[], __kuser_helper_end[]; - extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; - int kuser_sz = __kuser_helper_end - __kuser_helper_start; - int sigret_sz = __aarch32_sigret_code_end - __aarch32_sigret_code_start; - unsigned long vpage; + unsigned long vdso_page; - vpage = get_zeroed_page(GFP_ATOMIC); + if (!IS_ENABLED(CONFIG_KUSER_HELPERS)) + return 0; - if (!vpage) + vdso_page = get_zeroed_page(GFP_ATOMIC); + if (!vdso_page) return -ENOMEM; - /* kuser helpers */ - memcpy((void *)vpage + 0x1000 - kuser_sz, __kuser_helper_start, - kuser_sz); + memcpy((void *)(vdso_page + 0x1000 - kuser_sz), __kuser_helper_start, + kuser_sz); + aarch32_vdso_pages[C_VECTORS] = virt_to_page(vdso_page); + flush_dcache_page(aarch32_vdso_pages[C_VECTORS]); + return 0; +} - /* sigreturn code */ - memcpy((void *)vpage + AARCH32_KERN_SIGRET_CODE_OFFSET, - __aarch32_sigret_code_start, sigret_sz); +static int __init aarch32_alloc_vdso_pages(void) +{ + extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; + int sigret_sz = __aarch32_sigret_code_end - __aarch32_sigret_code_start; + unsigned long sigpage; + int ret; - flush_icache_range(vpage, vpage + PAGE_SIZE); - vectors_page[0] = virt_to_page(vpage); + sigpage = get_zeroed_page(GFP_ATOMIC); + if (!sigpage) + return -ENOMEM; - return 0; + memcpy((void *)sigpage, __aarch32_sigret_code_start, sigret_sz); + aarch32_vdso_pages[C_SIGPAGE] = virt_to_page(sigpage); + flush_dcache_page(aarch32_vdso_pages[C_SIGPAGE]); + + ret = aarch32_alloc_kuser_vdso_page(); + if (ret) + free_page(sigpage); + + return ret; } -arch_initcall(alloc_vectors_page); +arch_initcall(aarch32_alloc_vdso_pages); -int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp) +static int aarch32_kuser_helpers_setup(struct mm_struct *mm) { - struct mm_struct *mm = current->mm; - unsigned long addr = AARCH32_VECTORS_BASE; - static const struct vm_special_mapping spec = { - .name = "[vectors]", - .pages = vectors_page, + void *ret; + + if (!IS_ENABLED(CONFIG_KUSER_HELPERS)) + return 0; + + /* + * Avoid VM_MAYWRITE for compatibility with arch/arm/, where it's + * 
not safe to CoW the page containing the CPU exception vectors. + */ + ret = _install_special_mapping(mm, AARCH32_VECTORS_BASE, PAGE_SIZE, + VM_READ | VM_EXEC | + VM_MAYREAD | VM_MAYEXEC, + &aarch32_vdso_spec[C_VECTORS]); - }; + return PTR_ERR_OR_ZERO(ret); +} + +static int aarch32_sigreturn_setup(struct mm_struct *mm) +{ + unsigned long addr; void *ret; - if (down_write_killable(&mm->mmap_sem)) - return -EINTR; - current->mm->context.vdso = (void *)addr; + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = ERR_PTR(addr); + goto out; + } - /* Map vectors page at the high address. */ + /* + * VM_MAYWRITE is required to allow gdb to Copy-on-Write and + * set breakpoints. + */ ret = _install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC, - &spec); + VM_READ | VM_EXEC | VM_MAYREAD | + VM_MAYWRITE | VM_MAYEXEC, + &aarch32_vdso_spec[C_SIGPAGE]); + if (IS_ERR(ret)) + goto out; - up_write(&mm->mmap_sem); + mm->context.vdso = (void *)addr; +out: return PTR_ERR_OR_ZERO(ret); } + +int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + struct mm_struct *mm = current->mm; + int ret; + + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + + ret = aarch32_kuser_helpers_setup(mm); + if (ret) + goto out; + + ret = aarch32_sigreturn_setup(mm); + +out: + up_write(&mm->mmap_sem); + return ret; +} #endif /* CONFIG_COMPAT */ static int vdso_mremap(const struct vm_special_mapping *sm, @@ -146,8 +214,6 @@ static int __init vdso_init(void) } vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT; - pr_info("vdso: %ld pages (%ld code @ %p, %ld data @ %p)\n", - vdso_pages + 1, vdso_pages, vdso_start, 1L, vdso_data); /* Allocate the vDSO pagelist, plus a page for the data. */ vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *), @@ -232,6 +298,9 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->wtm_clock_sec = tk->wall_to_monotonic.tv_sec; vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec; + /* Read without the seqlock held by clock_getres() */ + WRITE_ONCE(vdso_data->hrtimer_res, hrtimer_resolution); + if (!use_syscall) { /* tkr_mono.cycle_last == tkr_raw.cycle_last */ vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index b215c712d89704e3247951fde01bcd4224b0c190..744b9dbaba036a3822f05b86825c81fd9246d2da 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -12,17 +12,12 @@ obj-vdso := gettimeofday.o note.o sigreturn.o targets := $(obj-vdso) vdso.so vdso.so.dbg obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) -ccflags-y := -shared -fno-common -fno-builtin -ccflags-y += -nostdlib -Wl,-soname=linux-vdso.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) +ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 \ + $(call ld-option, --hash-style=sysv) -n -T # Disable gcov profiling for VDSO code GCOV_PROFILE := n -# Workaround for bare-metal (ELF) toolchains that neglect to pass -shared -# down to collect2, resulting in silent corruption of the vDSO image. 
-ccflags-y += -Wl,-shared - obj-y += vdso.o extra-y += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) @@ -31,8 +26,8 @@ CPPFLAGS_vdso.lds += -P -C -U$(ARCH) $(obj)/vdso.o : $(obj)/vdso.so # Link rule for the .so file, .lds has to be first -$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) - $(call if_changed,vdsold) +$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE + $(call if_changed,ld) # Strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S @@ -42,9 +37,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE # Generate VDSO offsets using helper script gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ -define cmd_vdsosym - $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ -endef + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE $(call if_changed,vdsosym) @@ -54,8 +47,6 @@ $(obj-vdso): %.o: %.S FORCE $(call if_changed_dep,vdsoas) # Actual build commands -quiet_cmd_vdsold = VDSOL $@ - cmd_vdsold = $(CC) $(c_flags) -Wl,-n -Wl,-T $^ -o $@ quiet_cmd_vdsoas = VDSOA $@ cmd_vdsoas = $(CC) $(a_flags) -c -o $@ $< diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S index c39872a7b03c3e152315781c753f3e6b186524ed..856fee6d3512977d502c86f265bd406245f3b039 100644 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ b/arch/arm64/kernel/vdso/gettimeofday.S @@ -73,6 +73,13 @@ x_tmp .req x8 movn x_tmp, #0xff00, lsl #48 and \res, x_tmp, \res mul \res, \res, \mult + /* + * Fake address dependency from the value computed from the counter + * register to subsequent data page accesses so that the sequence + * locking also orders the read of the counter. + */ + and x_tmp, \res, xzr + add vdso_data, vdso_data, x_tmp .endm /* @@ -147,12 +154,12 @@ ENTRY(__kernel_gettimeofday) /* w11 = cs_mono_mult, w12 = cs_shift */ ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - seqcnt_check fail=1b get_nsec_per_sec res=x9 lsl x9, x9, x12 get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + seqcnt_check fail=1b get_ts_realtime res_sec=x10, res_nsec=x11, \ clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 @@ -211,13 +218,13 @@ realtime: /* w11 = cs_mono_mult, w12 = cs_shift */ ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - seqcnt_check fail=realtime /* All computations are done with left-shifted nsecs. */ get_nsec_per_sec res=x9 lsl x9, x9, x12 get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + seqcnt_check fail=realtime get_ts_realtime res_sec=x10, res_nsec=x11, \ clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 clock_gettime_return, shift=1 @@ -231,7 +238,6 @@ monotonic: ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] ldp x3, x4, [vdso_data, #VDSO_WTM_CLK_SEC] - seqcnt_check fail=monotonic /* All computations are done with left-shifted nsecs. */ lsl x4, x4, x12 @@ -239,6 +245,7 @@ monotonic: lsl x9, x9, x12 get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + seqcnt_check fail=monotonic get_ts_realtime res_sec=x10, res_nsec=x11, \ clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 @@ -253,13 +260,13 @@ monotonic_raw: /* w11 = cs_raw_mult, w12 = cs_shift */ ldp w12, w11, [vdso_data, #VDSO_CS_SHIFT] ldp x13, x14, [vdso_data, #VDSO_RAW_TIME_SEC] - seqcnt_check fail=monotonic_raw /* All computations are done with left-shifted nsecs. 
*/ get_nsec_per_sec res=x9 lsl x9, x9, x12 get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + seqcnt_check fail=monotonic_raw get_ts_clock_raw res_sec=x10, res_nsec=x11, \ clock_nsec=x15, nsec_to_sec=x9 @@ -301,13 +308,14 @@ ENTRY(__kernel_clock_getres) ccmp w0, #CLOCK_MONOTONIC_RAW, #0x4, ne b.ne 1f - ldr x2, 5f + adr vdso_data, _vdso_data + ldr w2, [vdso_data, #CLOCK_REALTIME_RES] b 2f 1: cmp w0, #CLOCK_REALTIME_COARSE ccmp w0, #CLOCK_MONOTONIC_COARSE, #0x4, ne b.ne 4f - ldr x2, 6f + ldr x2, 5f 2: cbz x1, 3f stp xzr, x2, [x1] @@ -321,8 +329,6 @@ ENTRY(__kernel_clock_getres) svc #0 ret 5: - .quad CLOCK_REALTIME_RES -6: .quad CLOCK_COARSE_RES .cfi_endproc ENDPROC(__kernel_clock_getres) diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 5540a1638baf714f9ef4e2fc518d9d4962d1888c..33c2a4abda046532c733b1fcc55f90ff8fa68d7c 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -24,7 +24,7 @@ CFLAGS_atomic_ll_sc.o := -ffixed-x1 -ffixed-x2 \ -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12 \ -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15 \ -fcall-saved-x18 -fomit-frame-pointer -CFLAGS_REMOVE_atomic_ll_sc.o := -pg +CFLAGS_REMOVE_atomic_ll_sc.o := $(CC_FLAGS_FTRACE) GCOV_PROFILE_atomic_ll_sc.o := n KASAN_SANITIZE_atomic_ll_sc.o := n KCOV_INSTRUMENT_atomic_ll_sc.o := n diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 1a7e92ab69ebbff7defeaf4eca01c8094c6ac6e9..0cb0e09995e110ef075c0f834ce22f3493f70d2e 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -148,7 +148,7 @@ static inline bool is_ttbr1_addr(unsigned long addr) /* * Dump out the page tables associated with 'addr' in the currently active mm. */ -void show_pte(unsigned long addr) +static void show_pte(unsigned long addr) { struct mm_struct *mm; pgd_t *pgdp; @@ -810,13 +810,12 @@ void __init hook_debug_fault_code(int nr, debug_fault_info[nr].name = name; } -asmlinkage int __exception do_debug_exception(unsigned long addr_if_watchpoint, - unsigned int esr, - struct pt_regs *regs) +asmlinkage void __exception do_debug_exception(unsigned long addr_if_watchpoint, + unsigned int esr, + struct pt_regs *regs) { const struct fault_info *inf = esr_to_debug_fault_info(esr); unsigned long pc = instruction_pointer(regs); - int rv; /* * Tell lockdep we disabled irqs in entry.S. 
Do nothing if they were @@ -828,17 +827,12 @@ asmlinkage int __exception do_debug_exception(unsigned long addr_if_watchpoint, if (user_mode(regs) && !is_ttbr0_addr(pc)) arm64_apply_bp_hardening(); - if (!inf->fn(addr_if_watchpoint, esr, regs)) { - rv = 1; - } else { + if (inf->fn(addr_if_watchpoint, esr, regs)) { arm64_notify_die(inf->name, regs, inf->sig, inf->code, (void __user *)pc, esr); - rv = 0; } if (interrupts_enabled(regs)) trace_hardirqs_on(); - - return rv; } NOKPROBE_SYMBOL(do_debug_exception); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 6bc135042f5e4dc244dbf14e8ea953121931ad2b..40e2d7e5efcb1e10c34b4abf83352ca4ec4c0d99 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -363,7 +363,7 @@ void __init arm64_memblock_init(void) * Otherwise, this is a no-op */ u64 base = phys_initrd_start & PAGE_MASK; - u64 size = PAGE_ALIGN(phys_initrd_size); + u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; /* * We can only add back the initrd memory if we don't end up @@ -377,7 +377,7 @@ void __init arm64_memblock_init(void) base + size > memblock_start_of_DRAM() + linear_region_size, "initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) { - initrd_start = 0; + phys_initrd_size = 0; } else { memblock_remove(base, size); /* clear MEMBLOCK_ flags */ memblock_add(base, size); @@ -440,6 +440,7 @@ void __init bootmem_init(void) early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT); max_pfn = max_low_pfn = max; + min_low_pfn = min; arm64_numa_init(); /* @@ -535,7 +536,7 @@ void __init mem_init(void) else swiotlb_force = SWIOTLB_NO_FORCE; - set_max_mapnr(pfn_to_page(max_pfn) - mem_map); + set_max_mapnr(max_pfn - PHYS_PFN_OFFSET); #ifndef CONFIG_SPARSEMEM_VMEMMAP free_unused_memmap(); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e97f018ff740f8a1cea437aa88a652882412928b..ef82312860ac3ee8e8568b229d1cd92b24c6ddd6 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -97,7 +97,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static phys_addr_t __init early_pgtable_alloc(void) +static phys_addr_t __init early_pgtable_alloc(int shift) { phys_addr_t phys; void *ptr; @@ -174,7 +174,7 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), + phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; @@ -184,7 +184,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, if (pmd_none(pmd)) { phys_addr_t pte_phys; BUG_ON(!pgtable_alloc); - pte_phys = pgtable_alloc(); + pte_phys = pgtable_alloc(PAGE_SHIFT); __pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE); pmd = READ_ONCE(*pmdp); } @@ -208,7 +208,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), int flags) + phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; pmd_t *pmdp; @@ -246,7 +246,7 @@ static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), int flags) + phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; pud_t pud = READ_ONCE(*pudp); 
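/*
 * Illustrative sketch, not a hunk of the patch itself: the pgtable_alloc()
 * callback threaded through the mmu.c hunks around this point now takes the
 * shift of the page-table level being allocated, so each call site passes
 * PAGE_SHIFT when populating a pte table, PMD_SHIFT for a pmd table and
 * PUD_SHIFT for a pud table. A later hunk uses that shift to run the
 * matching constructor, essentially:
 *
 *	static phys_addr_t pgd_pgtable_alloc(int shift)
 *	{
 *		phys_addr_t pa = __pgd_pgtable_alloc(shift);	// zeroed page
 *
 *		if (shift == PAGE_SHIFT)			// pte table
 *			BUG_ON(!pgtable_page_ctor(phys_to_page(pa)));
 *		else if (shift == PMD_SHIFT)			// pmd table
 *			BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa)));
 *
 *		return pa;
 *	}
 *
 * Callers that do not want the ctor, such as the trampoline and memory
 * hotplug mappings in the later hunks, switch to __pgd_pgtable_alloc()
 * directly.
 */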
@@ -258,7 +258,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, if (pud_none(pud)) { phys_addr_t pmd_phys; BUG_ON(!pgtable_alloc); - pmd_phys = pgtable_alloc(); + pmd_phys = pgtable_alloc(PMD_SHIFT); __pud_populate(pudp, pmd_phys, PUD_TYPE_TABLE); pud = READ_ONCE(*pudp); } @@ -294,7 +294,7 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), + phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; @@ -304,7 +304,7 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, if (pgd_none(pgd)) { phys_addr_t pud_phys; BUG_ON(!pgtable_alloc); - pud_phys = pgtable_alloc(); + pud_phys = pgtable_alloc(PUD_SHIFT); __pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE); pgd = READ_ONCE(*pgdp); } @@ -345,7 +345,7 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), + phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long addr, length, end, next; @@ -371,17 +371,36 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, } while (pgdp++, addr = next, addr != end); } -static phys_addr_t pgd_pgtable_alloc(void) +static phys_addr_t __pgd_pgtable_alloc(int shift) { void *ptr = (void *)__get_free_page(PGALLOC_GFP); - if (!ptr || !pgtable_page_ctor(virt_to_page(ptr))) - BUG(); + BUG_ON(!ptr); /* Ensure the zeroed page is visible to the page table walker */ dsb(ishst); return __pa(ptr); } +static phys_addr_t pgd_pgtable_alloc(int shift) +{ + phys_addr_t pa = __pgd_pgtable_alloc(shift); + + /* + * Call proper page table ctor in case later we need to + * call core mm functions like apply_to_page_range() on + * this pre-allocated page table. + * + * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is + * folded, and if so pgtable_pmd_page_ctor() becomes nop. + */ + if (shift == PAGE_SHIFT) + BUG_ON(!pgtable_page_ctor(phys_to_page(pa))); + else if (shift == PMD_SHIFT) + BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa))); + + return pa; +} + /* * This function can only be used to modify existing table entries, * without allocating new levels of table. 
Note that this permits the @@ -583,7 +602,7 @@ static int __init map_entry_trampoline(void) /* Map only the text into the trampoline page table */ memset(tramp_pg_dir, 0, PGD_SIZE); __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE, - prot, pgd_pgtable_alloc, 0); + prot, __pgd_pgtable_alloc, 0); /* Map both the text and data into the kernel page table */ __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot); @@ -1055,7 +1074,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), - size, PAGE_KERNEL, pgd_pgtable_alloc, flags); + size, PAGE_KERNEL, __pgd_pgtable_alloc, flags); return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap, want_memblock); diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index 06a6f264f2ddf27f1274802882f9cedb77e043d4..5202f63c29c9d627b21b1fead460dc69787c10eb 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -124,7 +124,7 @@ static void __init setup_node_to_cpumask_map(void) } /* - * Set the cpu to node and mem mapping + * Set the cpu to node and mem mapping */ void numa_store_cpu_info(unsigned int cpu) { @@ -200,7 +200,7 @@ void __init setup_per_cpu_areas(void) #endif /** - * numa_add_memblk - Set node id to memblk + * numa_add_memblk() - Set node id to memblk * @nid: NUMA node ID of the new memblk * @start: Start address of the new memblk * @end: End address of the new memblk @@ -223,7 +223,7 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) return ret; } -/** +/* * Initialize NODE_DATA for a node on the local memory */ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) @@ -257,7 +257,7 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; } -/** +/* * numa_free_distance * * The current table is freed. @@ -277,10 +277,8 @@ void __init numa_free_distance(void) numa_distance = NULL; } -/** - * +/* * Create a new NUMA distance table. - * */ static int __init numa_alloc_distance(void) { @@ -311,7 +309,7 @@ static int __init numa_alloc_distance(void) } /** - * numa_set_distance - Set inter node NUMA distance from node to node. + * numa_set_distance() - Set inter node NUMA distance from node to node. * @from: the 'from' node to set distance * @to: the 'to' node to set distance * @distance: NUMA distance @@ -321,7 +319,6 @@ static int __init numa_alloc_distance(void) * * If @from or @to is higher than the highest known node or lower than zero * or @distance doesn't make sense, the call is ignored. - * */ void __init numa_set_distance(int from, int to, int distance) { @@ -347,7 +344,7 @@ void __init numa_set_distance(int from, int to, int distance) numa_distance[from * numa_distance_cnt + to] = distance; } -/** +/* * Return NUMA distance @from to @to */ int __node_distance(int from, int to) @@ -422,13 +419,15 @@ out_free_distance: } /** - * dummy_numa_init - Fallback dummy NUMA init + * dummy_numa_init() - Fallback dummy NUMA init * * Used if there's no underlying NUMA architecture, NUMA initialization * fails, or NUMA is disabled on the command line. * * Must online at least one node (node 0) and add memory blocks that cover all * allowed memory. It is unlikely that this function fails. + * + * Return: 0 on success, -errno on failure. 
*/ static int __init dummy_numa_init(void) { @@ -454,9 +453,9 @@ static int __init dummy_numa_init(void) } /** - * arm64_numa_init - Initialize NUMA + * arm64_numa_init() - Initialize NUMA * - * Try each configured NUMA initialization method until one succeeds. The + * Try each configured NUMA initialization method until one succeeds. The * last fallback is dummy single node config encomapssing whole memory. */ void __init arm64_numa_init(void) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index aa0817c9c4c362df8e80570748a642adb96c4616..fdd626d34274fda47041318e832918ff6f9746ad 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -65,24 +65,25 @@ ENTRY(cpu_do_suspend) mrs x2, tpidr_el0 mrs x3, tpidrro_el0 mrs x4, contextidr_el1 - mrs x5, cpacr_el1 - mrs x6, tcr_el1 - mrs x7, vbar_el1 - mrs x8, mdscr_el1 - mrs x9, oslsr_el1 - mrs x10, sctlr_el1 + mrs x5, osdlr_el1 + mrs x6, cpacr_el1 + mrs x7, tcr_el1 + mrs x8, vbar_el1 + mrs x9, mdscr_el1 + mrs x10, oslsr_el1 + mrs x11, sctlr_el1 alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - mrs x11, tpidr_el1 + mrs x12, tpidr_el1 alternative_else - mrs x11, tpidr_el2 + mrs x12, tpidr_el2 alternative_endif - mrs x12, sp_el0 + mrs x13, sp_el0 stp x2, x3, [x0] - stp x4, xzr, [x0, #16] - stp x5, x6, [x0, #32] - stp x7, x8, [x0, #48] - stp x9, x10, [x0, #64] - stp x11, x12, [x0, #80] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + stp x10, x11, [x0, #64] + stp x12, x13, [x0, #80] ret ENDPROC(cpu_do_suspend) @@ -105,8 +106,8 @@ ENTRY(cpu_do_resume) msr cpacr_el1, x6 /* Don't change t0sz here, mask those bits when restoring */ - mrs x5, tcr_el1 - bfi x8, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH + mrs x7, tcr_el1 + bfi x8, x7, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH msr tcr_el1, x8 msr vbar_el1, x9 @@ -130,6 +131,7 @@ alternative_endif /* * Restore oslsr_el1 by writing oslar_el1 */ + msr osdlr_el1, x5 ubfx x11, x11, #1, #1 msr oslar_el1, x11 reset_pmuserenr_el0 x0 // Disable PMU access from EL0 diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index e5cd3c5f8399ce1cb055315083db5b2f4873a1fe..eeb0471268a079994b843310de718f745624acb2 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -20,6 +20,7 @@ config C6X select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA select ARCH_NO_COHERENT_DMA_MMAP + select MMU_GATHER_NO_RANGE if MMU config MMU def_bool n @@ -27,9 +28,6 @@ config MMU config FPU def_bool n -config RWSEM_GENERIC_SPINLOCK - def_bool y - config GENERIC_CALIBRATE_DELAY def_bool y diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index 249c9f6f26dce7c2dd2a43f9a9f20bd0d29478f3..6b168d32fbffe63a1acf325352d5fc21ad2c4ab2 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -23,6 +23,7 @@ generic-y += kvm_para.h generic-y += local.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += mmu.h generic-y += mmu_context.h generic-y += pci.h diff --git a/arch/c6x/include/asm/syscall.h b/arch/c6x/include/asm/syscall.h index ae2be315ee9c98d8440f4409be8b67cc261f8858..15ba8599858e6be5a860e23f338c8bd292da7207 100644 --- a/arch/c6x/include/asm/syscall.h +++ b/arch/c6x/include/asm/syscall.h @@ -46,78 +46,27 @@ static inline void syscall_set_return_value(struct task_struct *task, } static inline void syscall_get_arguments(struct task_struct *task, - struct pt_regs *regs, unsigned int i, - unsigned int n, unsigned long *args) + struct pt_regs *regs, + unsigned long *args) { - switch (i) { - case 0: - if (!n--) - break; - *args++ = regs->a4; - case 1: - if 
(!n--) - break; - *args++ = regs->b4; - case 2: - if (!n--) - break; - *args++ = regs->a6; - case 3: - if (!n--) - break; - *args++ = regs->b6; - case 4: - if (!n--) - break; - *args++ = regs->a8; - case 5: - if (!n--) - break; - *args++ = regs->b8; - case 6: - if (!n--) - break; - default: - BUG(); - } + *args++ = regs->a4; + *args++ = regs->b4; + *args++ = regs->a6; + *args++ = regs->b6; + *args++ = regs->a8; + *args = regs->b8; } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) { - switch (i) { - case 0: - if (!n--) - break; - regs->a4 = *args++; - case 1: - if (!n--) - break; - regs->b4 = *args++; - case 2: - if (!n--) - break; - regs->a6 = *args++; - case 3: - if (!n--) - break; - regs->b6 = *args++; - case 4: - if (!n--) - break; - regs->a8 = *args++; - case 5: - if (!n--) - break; - regs->a9 = *args++; - case 6: - if (!n) - break; - default: - BUG(); - } + regs->a4 = *args++; + regs->b4 = *args++; + regs->a6 = *args++; + regs->b6 = *args++; + regs->a8 = *args++; + regs->a9 = *args; } #endif /* __ASM_C6X_SYSCALLS_H */ diff --git a/arch/c6x/include/asm/tlb.h b/arch/c6x/include/asm/tlb.h index 34525dea1356645c12a4c3c9b6d1a5f57eec3851..240ba0febb57b7407070df4e52f6ae2c346a42c4 100644 --- a/arch/c6x/include/asm/tlb.h +++ b/arch/c6x/include/asm/tlb.h @@ -2,8 +2,6 @@ #ifndef _ASM_C6X_TLB_H #define _ASM_C6X_TLB_H -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #endif /* _ASM_C6X_TLB_H */ diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 725a115759c97695eec204f2c15ca399eab234ee..6555d178113221412b1c0f3dbd0ff09c63bb99a6 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -92,9 +92,6 @@ config GENERIC_HWEIGHT config MMU def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - config STACKTRACE_SUPPORT def_bool y diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index 2a0abe8f2a352e9209abdd83382cf3e411b768ff..95f4e550db8a162e896410be3d3ab411857687c6 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -28,6 +28,7 @@ generic-y += linkage.h generic-y += local.h generic-y += local64.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += mutex.h generic-y += pci.h diff --git a/arch/csky/include/asm/syscall.h b/arch/csky/include/asm/syscall.h index d637445737b78fd5c78c9994173a1e7c73eb3d1f..bda0a446c63ead759d9360d769fe68647a28f83d 100644 --- a/arch/csky/include/asm/syscall.h +++ b/arch/csky/include/asm/syscall.h @@ -43,30 +43,20 @@ syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) + unsigned long *args) { - BUG_ON(i + n > 6); - if (i == 0) { - args[0] = regs->orig_a0; - args++; - i++; - n--; - } - memcpy(args, &regs->a1 + i * sizeof(regs->a1), n * sizeof(args[0])); + args[0] = regs->orig_a0; + args++; + memcpy(args, &regs->a1, 5 * sizeof(args[0])); } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) + const unsigned long *args) { - BUG_ON(i + n > 6); - if (i == 0) { - regs->orig_a0 = args[0]; - args++; - i++; - n--; - } - memcpy(&regs->a1 + i * sizeof(regs->a1), args, n * sizeof(regs->a0)); + regs->orig_a0 = args[0]; + args++; + memcpy(&regs->a1, args, 5 * sizeof(regs->a1)); } static inline int diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index c071da34e0817be181e82ac3d7a9ca7eea85450c..61c01db6c29230ca8b60ffc64d00179ccc579b24 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -27,9 +27,6 @@ config H8300 config CPU_BIG_ENDIAN def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - config GENERIC_HWEIGHT def_bool y diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index e3dead402e5fbe94ebe53063968801c8f51360b5..123d8f54be4a5a5a58ca6fc8b0b4a1145fb157e1 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -29,6 +29,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += mmu.h generic-y += mmu_context.h generic-y += module.h diff --git a/arch/h8300/include/asm/syscall.h b/arch/h8300/include/asm/syscall.h index 924990401237126585ea8fd105e4b57e8f9e5b24..ddd483c6ca95c9df50e9ed7b8c820c9884afcbeb 100644 --- a/arch/h8300/include/asm/syscall.h +++ b/arch/h8300/include/asm/syscall.h @@ -17,34 +17,14 @@ syscall_get_nr(struct task_struct *task, struct pt_regs *regs) static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) + unsigned long *args) { - BUG_ON(i + n > 6); - - while (n > 0) { - switch (i) { - case 0: - *args++ = regs->er1; - break; - case 1: - *args++ = regs->er2; - break; - case 2: - *args++ = regs->er3; - break; - case 3: - *args++ = regs->er4; - break; - case 4: - *args++ = regs->er5; - break; - case 5: - *args++ = regs->er6; - break; - } - i++; - n--; - } + *args++ = regs->er1; + *args++ = regs->er2; + *args++ = regs->er3; + *args++ = regs->er4; + *args++ = regs->er5; + *args = regs->er6; } diff --git a/arch/h8300/include/asm/tlb.h b/arch/h8300/include/asm/tlb.h index 98f344279904a684d367c4d55556c06bab3e74ab..d8201ca312061d96d9409fd27efc87cddf384196 100644 --- a/arch/h8300/include/asm/tlb.h +++ b/arch/h8300/include/asm/tlb.h @@ -2,8 +2,6 @@ #ifndef __H8300_TLB_H__ #define __H8300_TLB_H__ -#define tlb_flush(tlb) do { } while (0) - #include <asm-generic/tlb.h> #endif diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index ac441680dcc06acddcfdf717b1828c2cdc1962d8..3e54a53208d58ad7c7587dcc88920a692c5d2634 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -65,12 +65,6 @@ config GENERIC_CSUM config GENERIC_IRQ_PROBE def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool n - -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config GENERIC_HWEIGHT def_bool y diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index d046e8ccdf786be5029237ad722d819de88d6124..6234a303d2a3bb75cb0870aa9e92e82ae06bc108 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -24,10 +24,10 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += pci.h generic-y += percpu.h generic-y += preempt.h -generic-y += rwsem.h generic-y += sections.h generic-y += segment.h generic-y += serial.h diff --git a/arch/hexagon/include/asm/io.h b/arch/hexagon/include/asm/io.h index e17262ad125e7ee0c48be297dfff08dc50f2d5e6..3d0ae09c2b8e152df0f404083ed91b87e9f126a8 100644 --- a/arch/hexagon/include/asm/io.h +++ b/arch/hexagon/include/asm/io.h @@ -184,8 +184,6 @@ static inline void writel(u32 data, volatile void __iomem *addr) #define writew_relaxed __raw_writew #define writel_relaxed __raw_writel -#define mmiowb() - /* * Need an mtype somewhere in here, for cache type deals?
* This is probably too long for an inline. diff --git a/arch/hexagon/include/asm/syscall.h b/arch/hexagon/include/asm/syscall.h index 4af9c7b6f13af9490e4bee7b9f608c7d467cecb3..ae3a1e24fabd7193ff7d3dc142c4ca7d123f56e6 100644 --- a/arch/hexagon/include/asm/syscall.h +++ b/arch/hexagon/include/asm/syscall.h @@ -37,10 +37,8 @@ static inline long syscall_get_nr(struct task_struct *task, static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { - BUG_ON(i + n > 6); - memcpy(args, &(&regs->r00)[i], n * sizeof(args[0])); + memcpy(args, &(&regs->r00)[0], 6 * sizeof(args[0])); } #endif diff --git a/arch/hexagon/include/asm/tlb.h b/arch/hexagon/include/asm/tlb.h index 2f00772cc08a551df873985b29647ac388fb1e55..f71c4ba83614c38187fd5ca0a5e24bb5ff71749d 100644 --- a/arch/hexagon/include/asm/tlb.h +++ b/arch/hexagon/include/asm/tlb.h @@ -22,18 +22,6 @@ #include <linux/pagemap.h> #include <asm/tlbflush.h> -/* - * We don't need any special per-pte or per-vma handling... - */ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) - -/* - * .. because we flush the whole mm when it fills up - */ -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #endif diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 8d7396bd1790319eb7fa9a10b671d2b922081b36..73a26f04644e3e27a17c1fc1ab4145f22f6d520c 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -83,10 +83,6 @@ config STACKTRACE_SUPPORT config GENERIC_LOCKBREAK def_bool n -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config HUGETLB_PAGE_SIZE_VARIABLE bool depends on HUGETLB_PAGE diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h index 1e6fef69bb01c26286df9ea7963d092b442e66ab..a511d62d447ac1b3071a64cb2ee50c20bfaa76a9 100644 --- a/arch/ia64/include/asm/io.h +++ b/arch/ia64/include/asm/io.h @@ -113,20 +113,6 @@ extern int valid_mmap_phys_addr_range (unsigned long pfn, size_t count); */ #define __ia64_mf_a() ia64_mfa() -/** - * ___ia64_mmiowb - I/O write barrier - * - * Ensure ordering of I/O space writes. This will make sure that writes - * following the barrier will arrive after all previous writes. For most - * ia64 platforms, this is a simple 'mf.a' instruction. - * - * See Documentation/driver-api/device-io.rst for more information. - */ -static inline void ___ia64_mmiowb(void) -{ - ia64_mfa(); -} - static inline void* __ia64_mk_io_addr (unsigned long port) { @@ -161,7 +147,6 @@ __ia64_mk_io_addr (unsigned long port) #define __ia64_writew ___ia64_writew #define __ia64_writel ___ia64_writel #define __ia64_writeq ___ia64_writeq -#define __ia64_mmiowb ___ia64_mmiowb /* * For the in/out routines, we need to do "mf.a" _after_ doing the I/O access to ensure @@ -296,7 +281,6 @@ __outsl (unsigned long port, const void *src, unsigned long count) #define __outb platform_outb #define __outw platform_outw #define __outl platform_outl -#define __mmiowb platform_mmiowb #define inb(p) __inb(p) #define inw(p) __inw(p) @@ -310,7 +294,6 @@ __outsl (unsigned long port, const void *src, unsigned long count) #define outsb(p,s,c) __outsb(p,s,c) #define outsw(p,s,c) __outsw(p,s,c) #define outsl(p,s,c) __outsl(p,s,c) -#define mmiowb() __mmiowb() /* * The addresses passed to these functions are ioremap()ped already.
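The io.h hunk above removes ia64's private mmiowb() definition; the barrier itself reappears in the new asm/mmiowb.h added further below, and the series arranges for spin_unlock() to issue it (for ia64, directly in the ticket-spinlock hunk that follows). As a minimal sketch of the driver-side ordering pattern this barrier exists for; the foo_dev structure, the register offsets, and foo_send() are hypothetical, made up for illustration, and are not code from this series:

	#include <linux/io.h>
	#include <linux/spinlock.h>

	/* Hypothetical device; only the lock/MMIO ordering pattern matters. */
	struct foo_dev {
		spinlock_t lock;
		void __iomem *regs;
	};

	#define FOO_CMD	0x00	/* made-up register offsets */
	#define FOO_LEN	0x04

	static void foo_send(struct foo_dev *foo, u32 cmd, u32 len)
	{
		spin_lock(&foo->lock);
		writel(cmd, foo->regs + FOO_CMD);
		writel(len, foo->regs + FOO_LEN);
		/*
		 * Guarantee that MMIO writes issued under the lock by
		 * different CPUs reach the device in lock-acquisition
		 * order. Once mmiowb() is folded into the arch unlock
		 * path, as this series does, the explicit call here
		 * becomes redundant and can be dropped.
		 */
		mmiowb();
		spin_unlock(&foo->lock);
	}
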
diff --git a/arch/ia64/include/asm/machvec.h b/arch/ia64/include/asm/machvec.h index 5133739966bcfa00570aca667c88d96fe71e771a..beae261fbcb415b2321af6c333bd9ce5e304896f 100644 --- a/arch/ia64/include/asm/machvec.h +++ b/arch/ia64/include/asm/machvec.h @@ -30,7 +30,6 @@ typedef void ia64_mv_irq_init_t (void); typedef void ia64_mv_send_ipi_t (int, int, int, int); typedef void ia64_mv_timer_interrupt_t (int, void *); typedef void ia64_mv_global_tlb_purge_t (struct mm_struct *, unsigned long, unsigned long, unsigned long); -typedef void ia64_mv_tlb_migrate_finish_t (struct mm_struct *); typedef u8 ia64_mv_irq_to_vector (int); typedef unsigned int ia64_mv_local_vector_to_irq (u8); typedef char *ia64_mv_pci_get_legacy_mem_t (struct pci_bus *); @@ -79,11 +78,6 @@ machvec_noop (void) { } -static inline void -machvec_noop_mm (struct mm_struct *mm) -{ -} - static inline void machvec_noop_task (struct task_struct *task) { @@ -96,7 +90,6 @@ machvec_noop_bus (struct pci_bus *bus) extern void machvec_setup (char **); extern void machvec_timer_interrupt (int, void *); -extern void machvec_tlb_migrate_finish (struct mm_struct *); # if defined (CONFIG_IA64_HP_SIM) # include @@ -124,7 +117,6 @@ extern void machvec_tlb_migrate_finish (struct mm_struct *); # define platform_send_ipi ia64_mv.send_ipi # define platform_timer_interrupt ia64_mv.timer_interrupt # define platform_global_tlb_purge ia64_mv.global_tlb_purge -# define platform_tlb_migrate_finish ia64_mv.tlb_migrate_finish # define platform_dma_init ia64_mv.dma_init # define platform_dma_get_ops ia64_mv.dma_get_ops # define platform_irq_to_vector ia64_mv.irq_to_vector @@ -167,7 +159,6 @@ struct ia64_machine_vector { ia64_mv_send_ipi_t *send_ipi; ia64_mv_timer_interrupt_t *timer_interrupt; ia64_mv_global_tlb_purge_t *global_tlb_purge; - ia64_mv_tlb_migrate_finish_t *tlb_migrate_finish; ia64_mv_dma_init *dma_init; ia64_mv_dma_get_ops *dma_get_ops; ia64_mv_irq_to_vector *irq_to_vector; @@ -206,7 +197,6 @@ struct ia64_machine_vector { platform_send_ipi, \ platform_timer_interrupt, \ platform_global_tlb_purge, \ - platform_tlb_migrate_finish, \ platform_dma_init, \ platform_dma_get_ops, \ platform_irq_to_vector, \ @@ -270,9 +260,6 @@ extern const struct dma_map_ops *dma_get_ops(struct device *); #ifndef platform_global_tlb_purge # define platform_global_tlb_purge ia64_global_tlb_purge /* default to architected version */ #endif -#ifndef platform_tlb_migrate_finish -# define platform_tlb_migrate_finish machvec_noop_mm -#endif #ifndef platform_kernel_launch_event # define platform_kernel_launch_event machvec_noop #endif diff --git a/arch/ia64/include/asm/machvec_sn2.h b/arch/ia64/include/asm/machvec_sn2.h index b5153d300289724622ae936d560b40a94e471500..a243e4fb4877d7416949359ae6b52d59d25d803d 100644 --- a/arch/ia64/include/asm/machvec_sn2.h +++ b/arch/ia64/include/asm/machvec_sn2.h @@ -34,7 +34,6 @@ extern ia64_mv_irq_init_t sn_irq_init; extern ia64_mv_send_ipi_t sn2_send_IPI; extern ia64_mv_timer_interrupt_t sn_timer_interrupt; extern ia64_mv_global_tlb_purge_t sn2_global_tlb_purge; -extern ia64_mv_tlb_migrate_finish_t sn_tlb_migrate_finish; extern ia64_mv_irq_to_vector sn_irq_to_vector; extern ia64_mv_local_vector_to_irq sn_local_vector_to_irq; extern ia64_mv_pci_get_legacy_mem_t sn_pci_get_legacy_mem; @@ -77,7 +76,6 @@ extern ia64_mv_pci_fixup_bus_t sn_pci_fixup_bus; #define platform_send_ipi sn2_send_IPI #define platform_timer_interrupt sn_timer_interrupt #define platform_global_tlb_purge sn2_global_tlb_purge -#define platform_tlb_migrate_finish 
sn_tlb_migrate_finish #define platform_pci_fixup sn_pci_fixup #define platform_inb __sn_inb #define platform_inw __sn_inw diff --git a/arch/ia64/include/asm/mmiowb.h b/arch/ia64/include/asm/mmiowb.h new file mode 100644 index 0000000000000000000000000000000000000000..297b85ac84a01b04f779e919b2257515934f005b --- /dev/null +++ b/arch/ia64/include/asm/mmiowb.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_IA64_MMIOWB_H +#define _ASM_IA64_MMIOWB_H + +#include <asm/machvec.h> + +/** + * ___ia64_mmiowb - I/O write barrier + * + * Ensure ordering of I/O space writes. This will make sure that writes + * following the barrier will arrive after all previous writes. For most + * ia64 platforms, this is a simple 'mf.a' instruction. + */ +static inline void ___ia64_mmiowb(void) +{ + ia64_mfa(); +} + +#define __ia64_mmiowb ___ia64_mmiowb +#define mmiowb() platform_mmiowb() + +#include <asm-generic/mmiowb.h> + +#endif /* _ASM_IA64_MMIOWB_H */ diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h deleted file mode 100644 index 917910607e0ea94a5bb349540d5d6cdbaec79de8..0000000000000000000000000000000000000000 --- a/arch/ia64/include/asm/rwsem.h +++ /dev/null @@ -1,172 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * R/W semaphores for ia64 - * - * Copyright (C) 2003 Ken Chen - * Copyright (C) 2003 Asit Mallick - * Copyright (C) 2005 Christoph Lameter - * - * Based on asm-i386/rwsem.h and other architecture implementation. - * - * The MSW of the count is the negated number of active writers and - * waiting lockers, and the LSW is the total number of active locks. - * - * The lock count is initialized to 0 (no active and no waiting lockers). - * - * When a writer subtracts WRITE_BIAS, it'll get 0xffffffff00000001 for - * the case of an uncontended lock. Readers increment by 1 and see a positive - * value when uncontended, negative if there are writers (and maybe) readers - * waiting (in which case it goes to sleep). - */ - -#ifndef _ASM_IA64_RWSEM_H -#define _ASM_IA64_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead."
-#endif - -#include - -#define RWSEM_UNLOCKED_VALUE __IA64_UL_CONST(0x0000000000000000) -#define RWSEM_ACTIVE_BIAS (1L) -#define RWSEM_ACTIVE_MASK (0xffffffffL) -#define RWSEM_WAITING_BIAS (-0x100000000L) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline int -___down_read (struct rw_semaphore *sem) -{ - long result = ia64_fetchadd8_acq((unsigned long *)&sem->count.counter, 1); - - return (result < 0); -} - -static inline void -__down_read (struct rw_semaphore *sem) -{ - if (___down_read(sem)) - rwsem_down_read_failed(sem); -} - -static inline int -__down_read_killable (struct rw_semaphore *sem) -{ - if (___down_read(sem)) - if (IS_ERR(rwsem_down_read_failed_killable(sem))) - return -EINTR; - - return 0; -} - -/* - * lock for writing - */ -static inline long -___down_write (struct rw_semaphore *sem) -{ - long old, new; - - do { - old = atomic_long_read(&sem->count); - new = old + RWSEM_ACTIVE_WRITE_BIAS; - } while (atomic_long_cmpxchg_acquire(&sem->count, old, new) != old); - - return old; -} - -static inline void -__down_write (struct rw_semaphore *sem) -{ - if (___down_write(sem)) - rwsem_down_write_failed(sem); -} - -static inline int -__down_write_killable (struct rw_semaphore *sem) -{ - if (___down_write(sem)) { - if (IS_ERR(rwsem_down_write_failed_killable(sem))) - return -EINTR; - } - - return 0; -} - -/* - * unlock after reading - */ -static inline void -__up_read (struct rw_semaphore *sem) -{ - long result = ia64_fetchadd8_rel((unsigned long *)&sem->count.counter, -1); - - if (result < 0 && (--result & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void -__up_write (struct rw_semaphore *sem) -{ - long old, new; - - do { - old = atomic_long_read(&sem->count); - new = old - RWSEM_ACTIVE_WRITE_BIAS; - } while (atomic_long_cmpxchg_release(&sem->count, old, new) != old); - - if (new < 0 && (new & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -static inline int -__down_read_trylock (struct rw_semaphore *sem) -{ - long tmp; - while ((tmp = atomic_long_read(&sem->count)) >= 0) { - if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, tmp+1)) { - return 1; - } - } - return 0; -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -static inline int -__down_write_trylock (struct rw_semaphore *sem) -{ - long tmp = atomic_long_cmpxchg_acquire(&sem->count, - RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * downgrade write lock to read lock - */ -static inline void -__downgrade_write (struct rw_semaphore *sem) -{ - long old, new; - - do { - old = atomic_long_read(&sem->count); - new = old - RWSEM_WAITING_BIAS; - } while (atomic_long_cmpxchg_release(&sem->count, old, new) != old); - - if (old < 0) - rwsem_downgrade_wake(sem); -} - -#endif /* _ASM_IA64_RWSEM_H */ diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index afd0b3121b4c9565cd5c20d85e813e92fea50039..5f620e66384e37079396f0f177d49d19c5724c94 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -73,6 +73,8 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { unsigned short *p = (unsigned short *)&lock->lock + 1, tmp; + /* This could be optimised with ARCH_HAS_MMIOWB */ + mmiowb(); asm volatile ("ld2.bias %0=[%1]" : "=r"(tmp) : 
"r"(p)); WRITE_ONCE(*p, (tmp + 2) & ~1); } diff --git a/arch/ia64/include/asm/syscall.h b/arch/ia64/include/asm/syscall.h index 1d0b875fec44fc0fd8a70876e8ee6191f66f6005..0d9e7fab4a79fddcc63d24c30f002e73d2b91db0 100644 --- a/arch/ia64/include/asm/syscall.h +++ b/arch/ia64/include/asm/syscall.h @@ -59,26 +59,19 @@ static inline void syscall_set_return_value(struct task_struct *task, } extern void ia64_syscall_get_set_arguments(struct task_struct *task, - struct pt_regs *regs, unsigned int i, unsigned int n, - unsigned long *args, int rw); + struct pt_regs *regs, unsigned long *args, int rw); static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { - BUG_ON(i + n > 6); - - ia64_syscall_get_set_arguments(task, regs, i, n, args, 0); + ia64_syscall_get_set_arguments(task, regs, args, 0); } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { - BUG_ON(i + n > 6); - - ia64_syscall_get_set_arguments(task, regs, i, n, args, 1); + ia64_syscall_get_set_arguments(task, regs, args, 1); } static inline int syscall_get_arch(void) diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index 516355a774bfe89b2dc8ce6413aa0f3a8e1e71c0..86ec034ba49917bcc2b71b8425ffdafd82ed1cae 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -47,263 +47,6 @@ #include #include -/* - * If we can't allocate a page to make a big batch of page pointers - * to work on, then just handle a few from the on-stack structure. - */ -#define IA64_GATHER_BUNDLE 8 - -struct mmu_gather { - struct mm_struct *mm; - unsigned int nr; - unsigned int max; - unsigned char fullmm; /* non-zero means full mm flush */ - unsigned char need_flush; /* really unmapped some PTEs? */ - unsigned long start, end; - unsigned long start_addr; - unsigned long end_addr; - struct page **pages; - struct page *local[IA64_GATHER_BUNDLE]; -}; - -struct ia64_tr_entry { - u64 ifa; - u64 itir; - u64 pte; - u64 rr; -}; /*Record for tr entry!*/ - -extern int ia64_itr_entry(u64 target_mask, u64 va, u64 pte, u64 log_size); -extern void ia64_ptr_entry(u64 target_mask, int slot); - -extern struct ia64_tr_entry *ia64_idtrs[NR_CPUS]; - -/* - region register macros -*/ -#define RR_TO_VE(val) (((val) >> 0) & 0x0000000000000001) -#define RR_VE(val) (((val) & 0x0000000000000001) << 0) -#define RR_VE_MASK 0x0000000000000001L -#define RR_VE_SHIFT 0 -#define RR_TO_PS(val) (((val) >> 2) & 0x000000000000003f) -#define RR_PS(val) (((val) & 0x000000000000003f) << 2) -#define RR_PS_MASK 0x00000000000000fcL -#define RR_PS_SHIFT 2 -#define RR_RID_MASK 0x00000000ffffff00L -#define RR_TO_RID(val) ((val >> 8) & 0xffffff) - -static inline void -ia64_tlb_flush_mmu_tlbonly(struct mmu_gather *tlb, unsigned long start, unsigned long end) -{ - tlb->need_flush = 0; - - if (tlb->fullmm) { - /* - * Tearing down the entire address space. This happens both as a result - * of exit() and execve(). The latter case necessitates the call to - * flush_tlb_mm() here. - */ - flush_tlb_mm(tlb->mm); - } else if (unlikely (end - start >= 1024*1024*1024*1024UL - || REGION_NUMBER(start) != REGION_NUMBER(end - 1))) - { - /* - * If we flush more than a tera-byte or across regions, we're probably - * better off just flushing the entire TLB(s). This should be very rare - * and is not worth optimizing for. 
- */ - flush_tlb_all(); - } else { - /* - * flush_tlb_range() takes a vma instead of a mm pointer because - * some architectures want the vm_flags for ITLB/DTLB flush. - */ - struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0); - - /* flush the address range from the tlb: */ - flush_tlb_range(&vma, start, end); - /* now flush the virt. page-table area mapping the address range: */ - flush_tlb_range(&vma, ia64_thash(start), ia64_thash(end)); - } - -} - -static inline void -ia64_tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - unsigned long i; - unsigned int nr; - - /* lastly, release the freed pages */ - nr = tlb->nr; - - tlb->nr = 0; - tlb->start_addr = ~0UL; - for (i = 0; i < nr; ++i) - free_page_and_swap_cache(tlb->pages[i]); -} - -/* - * Flush the TLB for address range START to END and, if not in fast mode, release the - * freed pages that where gathered up to this point. - */ -static inline void -ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end) -{ - if (!tlb->need_flush) - return; - ia64_tlb_flush_mmu_tlbonly(tlb, start, end); - ia64_tlb_flush_mmu_free(tlb); -} - -static inline void __tlb_alloc_page(struct mmu_gather *tlb) -{ - unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); - - if (addr) { - tlb->pages = (void *)addr; - tlb->max = PAGE_SIZE / sizeof(void *); - } -} - - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->max = ARRAY_SIZE(tlb->local); - tlb->pages = tlb->local; - tlb->nr = 0; - tlb->fullmm = !(start | (end+1)); - tlb->start = start; - tlb->end = end; - tlb->start_addr = ~0UL; -} - -/* - * Called at the end of the shootdown operation to free up any resources that were - * collected. - */ -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (force) - tlb->need_flush = 1; - /* - * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and - * tlb->end_addr. - */ - ia64_tlb_flush_mmu(tlb, start, end); - - /* keep the page table cache within bounds */ - check_pgt_cache(); - - if (tlb->pages != tlb->local) - free_pages((unsigned long)tlb->pages, 0); -} - -/* - * Logically, this routine frees PAGE. On MP machines, the actual freeing of the page - * must be delayed until after the TLB has been flushed (see comments at the beginning of - * this file). 
- */ -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - tlb->need_flush = 1; - - if (!tlb->nr && tlb->pages == tlb->local) - __tlb_alloc_page(tlb); - - tlb->pages[tlb->nr++] = page; - VM_WARN_ON(tlb->nr > tlb->max); - if (tlb->nr == tlb->max) - return true; - return false; -} - -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ - ia64_tlb_flush_mmu_tlbonly(tlb, tlb->start_addr, tlb->end_addr); -} - -static inline void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - ia64_tlb_flush_mmu_free(tlb); -} - -static inline void tlb_flush_mmu(struct mmu_gather *tlb) -{ - ia64_tlb_flush_mmu(tlb, tlb->start_addr, tlb->end_addr); -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - if (__tlb_remove_page(tlb, page)) - tlb_flush_mmu(tlb); -} - -static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return __tlb_remove_page(tlb, page); -} - -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return tlb_remove_page(tlb, page); -} - -/* - * Remove TLB entry for PTE mapped at virtual address ADDRESS. This is called for any - * PTE, not just those pointing to (normal) physical memory. - */ -static inline void -__tlb_remove_tlb_entry (struct mmu_gather *tlb, pte_t *ptep, unsigned long address) -{ - if (tlb->start_addr == ~0UL) - tlb->start_addr = address; - tlb->end_addr = address + PAGE_SIZE; -} - -#define tlb_migrate_finish(mm) platform_tlb_migrate_finish(mm) - -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) - -#define tlb_remove_tlb_entry(tlb, ptep, addr) \ -do { \ - tlb->need_flush = 1; \ - __tlb_remove_tlb_entry(tlb, ptep, addr); \ -} while (0) - -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ -} - -#define pte_free_tlb(tlb, ptep, address) \ -do { \ - tlb->need_flush = 1; \ - __pte_free_tlb(tlb, ptep, address); \ -} while (0) - -#define pmd_free_tlb(tlb, ptep, address) \ -do { \ - tlb->need_flush = 1; \ - __pmd_free_tlb(tlb, ptep, address); \ -} while (0) - -#define pud_free_tlb(tlb, pudp, address) \ -do { \ - tlb->need_flush = 1; \ - __pud_free_tlb(tlb, pudp, address); \ -} while (0) +#include <asm-generic/tlb.h> #endif /* _ASM_IA64_TLB_H */ diff --git a/arch/ia64/include/asm/tlbflush.h b/arch/ia64/include/asm/tlbflush.h index 25e280810f6c423700e4f13a52a936c45dc6682b..ceac10c4d6e2f3e11fd4a7c06fdb47c71dcaf876 100644 --- a/arch/ia64/include/asm/tlbflush.h +++ b/arch/ia64/include/asm/tlbflush.h @@ -14,6 +14,31 @@ #include #include +struct ia64_tr_entry { + u64 ifa; + u64 itir; + u64 pte; + u64 rr; +}; /*Record for tr entry!*/ + +extern int ia64_itr_entry(u64 target_mask, u64 va, u64 pte, u64 log_size); +extern void ia64_ptr_entry(u64 target_mask, int slot); +extern struct ia64_tr_entry *ia64_idtrs[NR_CPUS]; + +/* + region register macros +*/ +#define RR_TO_VE(val) (((val) >> 0) & 0x0000000000000001) +#define RR_VE(val) (((val) & 0x0000000000000001) << 0) +#define RR_VE_MASK 0x0000000000000001L +#define RR_VE_SHIFT 0 +#define RR_TO_PS(val) (((val) >> 2) & 0x000000000000003f) +#define RR_PS(val) (((val) & 0x000000000000003f) << 2) +#define RR_PS_MASK 0x00000000000000fcL +#define RR_PS_SHIFT 2 +#define RR_RID_MASK 0x00000000ffffff00L +#define
RR_TO_RID(val) ((val >> 8) & 0xffffff) + /* * Now for some TLB flushing routines. This is the kind of stuff that * can be very expensive, so try to avoid them whenever possible. diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 6d50ede0ed691ca1899540722e65edb3cf896510..bf9c24d9ce84e66d1519ce7e5aa65330628d221b 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -2179,12 +2179,11 @@ static void syscall_get_set_args_cb(struct unw_frame_info *info, void *data) } void ia64_syscall_get_set_arguments(struct task_struct *task, - struct pt_regs *regs, unsigned int i, unsigned int n, - unsigned long *args, int rw) + struct pt_regs *regs, unsigned long *args, int rw) { struct syscall_get_set_args data = { - .i = i, - .n = n, + .i = 0, + .n = 6, .args = args, .regs = regs, .rw = rw, diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 583a3746d70be85de588b3dc64355cb85b389e61..c9cfa760cd57bfc4c00ce275708e6723422d9769 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -1058,9 +1058,7 @@ check_bugs (void) static int __init run_dmi_scan(void) { - dmi_scan_machine(); - dmi_memdev_walk(); - dmi_set_dump_stack_arch_desc(); + dmi_setup(); return 0; } core_initcall(run_dmi_scan); diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index ab9cda5f6136ad60753de5e725f6a6271ad88e9c..56e3d0b685e19119afc0a3e244ca64c3752aca4e 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -344,3 +344,7 @@ 332 common pkey_free sys_pkey_free 333 common rseq sys_rseq # 334 through 423 are reserved to sync up with other architectures +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index 5fc89aabdce1f8be105e8cb1e1805218cf9f77d9..5158bd28de0551588b29ab9ca2f7a76e0a18d409 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -305,8 +305,8 @@ local_flush_tlb_all (void) ia64_srlz_i(); /* srlz.i implies srlz.d */ } -void -flush_tlb_range (struct vm_area_struct *vma, unsigned long start, +static void +__flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; @@ -343,6 +343,25 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, preempt_enable(); ia64_srlz_i(); /* srlz.i implies srlz.d */ } + +void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (unlikely(end - start >= 1024*1024*1024*1024UL + || REGION_NUMBER(start) != REGION_NUMBER(end - 1))) { + /* + * If we flush more than a tera-byte or across regions, we're + * probably better off just flushing the entire TLB(s). This + * should be very rare and is not worth optimizing for. + */ + flush_tlb_all(); + } else { + /* flush the address range from the tlb */ + __flush_tlb_range(vma, start, end); + /* flush the virt. 
page-table area mapping the addr range */ + __flush_tlb_range(vma, ia64_thash(start), ia64_thash(end)); + } +} EXPORT_SYMBOL(flush_tlb_range); void ia64_tlb_init(void) diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index b73b0ebf82148eac5442a55eeb5f40a3e35897f9..b510f4f17fd4679abf2e0de1fd5191f6f56d5a8f 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -120,13 +120,6 @@ void sn_migrate(struct task_struct *task) cpu_relax(); } -void sn_tlb_migrate_finish(struct mm_struct *mm) -{ - /* flush_tlb_mm is inefficient if more than 1 users of mm */ - if (mm == current->mm && mm && atomic_read(&mm->mm_users) == 1) - flush_tlb_mm(mm); -} - static void sn2_ipi_flush_all_tlb(struct mm_struct *mm) { diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index b54206408f91b9693581c9b6a139324a659cf361..fe5cc2da6d108ac0dd74feb5c709a3000eeb2e7b 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -20,7 +20,6 @@ config M68K select GENERIC_STRNCPY_FROM_USER if MMU select GENERIC_STRNLEN_USER if MMU select ARCH_WANT_IPC_PARSE_VERSION - select ARCH_USES_GETTIMEOFFSET if MMU && !COLDFIRE select HAVE_FUTEX_CMPXCHG if MMU && FUTEX select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_REL @@ -28,17 +27,11 @@ config M68K select OLD_SIGSUSPEND3 select OLD_SIGACTION select ARCH_DISCARD_MEMBLOCK + select MMU_GATHER_NO_RANGE if MMU config CPU_BIG_ENDIAN def_bool y -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config ARCH_HAS_ILOG2_U32 bool diff --git a/arch/m68k/amiga/cia.c b/arch/m68k/amiga/cia.c index 2081b8cd5591c6d77385de8594e737c9d421f7be..b9aee983e6f4cda91fcc709abb85471d2f5f4b63 100644 --- a/arch/m68k/amiga/cia.c +++ b/arch/m68k/amiga/cia.c @@ -88,10 +88,19 @@ static irqreturn_t cia_handler(int irq, void *dev_id) struct ciabase *base = dev_id; int mach_irq; unsigned char ints; + unsigned long flags; + /* Interrupts get disabled while the timer irq flag is cleared and + * the timer interrupt serviced. + */ mach_irq = base->cia_irq; + local_irq_save(flags); ints = cia_set_irq(base, CIA_ICR_ALL); amiga_custom.intreq = base->int_mask; + if (ints & 1) + generic_handle_irq(mach_irq); + local_irq_restore(flags); + mach_irq++, ints >>= 1; for (; ints; mach_irq++, ints >>= 1) { if (ints & 1) generic_handle_irq(mach_irq); diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c index 65f63a4571300085a2a90838c6259f6b84249dc5..c32ab8041cf6b8dca70652ded719cae362c6c773 100644 --- a/arch/m68k/amiga/config.c +++ b/arch/m68k/amiga/config.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -95,8 +96,6 @@ static char amiga_model_name[13] = "Amiga "; static void amiga_sched_init(irq_handler_t handler); static void amiga_get_model(char *model); static void amiga_get_hardware_list(struct seq_file *m); -/* amiga specific timer functions */ -static u32 amiga_gettimeoffset(void); extern void amiga_mksound(unsigned int count, unsigned int ticks); static void amiga_reset(void); extern void amiga_init_sound(void); @@ -386,7 +385,6 @@ void __init config_amiga(void) mach_init_IRQ = amiga_init_IRQ; mach_get_model = amiga_get_model; mach_get_hardware_list = amiga_get_hardware_list; - arch_gettimeoffset = amiga_gettimeoffset; /* * default MAX_DMA=0xffffffff on all machines. 
If we don't do so, the SCSI @@ -464,7 +462,29 @@ void __init config_amiga(void) *(unsigned char *)ZTWO_VADDR(0xde0002) |= 0x80; } +static u64 amiga_read_clk(struct clocksource *cs); + +static struct clocksource amiga_clk = { + .name = "ciab", + .rating = 250, + .read = amiga_read_clk, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + static unsigned short jiffy_ticks; +static u32 clk_total, clk_offset; + +static irqreturn_t ciab_timer_handler(int irq, void *dev_id) +{ + irq_handler_t timer_routine = dev_id; + + clk_total += jiffy_ticks; + clk_offset = 0; + timer_routine(0, NULL); + + return IRQ_HANDLED; +} static void __init amiga_sched_init(irq_handler_t timer_routine) { @@ -484,19 +504,22 @@ static void __init amiga_sched_init(irq_handler_t timer_routine) * Please don't change this to use ciaa, as it interferes with the * SCSI code. We'll have to take a look at this later */ - if (request_irq(IRQ_AMIGA_CIAB_TA, timer_routine, 0, "timer", NULL)) + if (request_irq(IRQ_AMIGA_CIAB_TA, ciab_timer_handler, IRQF_TIMER, + "timer", timer_routine)) pr_err("Couldn't register timer interrupt\n"); /* start timer */ ciab.cra |= 0x11; -} -#define TICK_SIZE 10000 + clocksource_register_hz(&amiga_clk, amiga_eclock); +} -/* This is always executed with interrupts disabled. */ -static u32 amiga_gettimeoffset(void) +static u64 amiga_read_clk(struct clocksource *cs) { unsigned short hi, lo, hi2; - u32 ticks, offset = 0; + unsigned long flags; + u32 ticks; + + local_irq_save(flags); /* read CIA B timer A current value */ hi = ciab.tahi; @@ -513,12 +536,14 @@ static u32 amiga_gettimeoffset(void) if (ticks > jiffy_ticks / 2) /* check for pending interrupt */ if (cia_set_irq(&ciab_base, 0) & CIA_ICR_TA) - offset = 10000; + clk_offset = jiffy_ticks; ticks = jiffy_ticks - ticks; - ticks = (10000 * ticks) / jiffy_ticks; + ticks += clk_offset + clk_total; + + local_irq_restore(flags); - return (ticks + offset) * 1000; + return ticks; } static void amiga_reset(void) __noreturn; diff --git a/arch/m68k/apollo/config.c b/arch/m68k/apollo/config.c index aef8d42e078ddffe50f9557702bd493810a0ef13..7d168e6dfb014b1fe9c0f3ac95e3fe763c6d5357 100644 --- a/arch/m68k/apollo/config.c +++ b/arch/m68k/apollo/config.c @@ -29,7 +29,6 @@ u_long apollo_model; extern void dn_sched_init(irq_handler_t handler); extern void dn_init_IRQ(void); -extern u32 dn_gettimeoffset(void); extern int dn_dummy_hwclk(int, struct rtc_time *); extern void dn_dummy_reset(void); #ifdef CONFIG_HEARTBEAT @@ -152,7 +151,6 @@ void __init config_apollo(void) mach_sched_init=dn_sched_init; /* */ mach_init_IRQ=dn_init_IRQ; - arch_gettimeoffset = dn_gettimeoffset; mach_max_dma_address = 0xffffffff; mach_hwclk = dn_dummy_hwclk; /* */ mach_reset = dn_dummy_reset; /* */ @@ -205,11 +203,6 @@ void dn_sched_init(irq_handler_t timer_routine) pr_err("Couldn't register timer interrupt\n"); } -u32 dn_gettimeoffset(void) -{ - return 0xdeadbeef; -} - int dn_dummy_hwclk(int op, struct rtc_time *t) { diff --git a/arch/m68k/atari/ataints.c b/arch/m68k/atari/ataints.c index 3d2b63bedf0589c67cdf0471007157bb913f18ac..56f02ea2c248d844e43fb88a4e3742315d741400 100644 --- a/arch/m68k/atari/ataints.c +++ b/arch/m68k/atari/ataints.c @@ -142,7 +142,7 @@ struct mfptimerbase { .name = "MFP Timer D" }; -static irqreturn_t mfptimer_handler(int irq, void *dev_id) +static irqreturn_t mfp_timer_d_handler(int irq, void *dev_id) { struct mfptimerbase *base = dev_id; int mach_irq; @@ -344,7 +344,7 @@ void __init atari_init_IRQ(void) st_mfp.tim_ct_cd = (st_mfp.tim_ct_cd & 
0xf0) | 0x6; /* request timer D dispatch handler */ - if (request_irq(IRQ_MFP_TIMD, mfptimer_handler, IRQF_SHARED, + if (request_irq(IRQ_MFP_TIMD, mfp_timer_d_handler, IRQF_SHARED, stmfp_base.name, &stmfp_base)) pr_err("Couldn't register %s interrupt\n", stmfp_base.name); diff --git a/arch/m68k/atari/config.c b/arch/m68k/atari/config.c index 4fcc4b1df1c0f2b4d417a9ce96d29dc740755126..902255e7b5b2ad56840561a112433e38d72ccb5c 100644 --- a/arch/m68k/atari/config.c +++ b/arch/m68k/atari/config.c @@ -78,7 +78,6 @@ static void atari_heartbeat(int on); /* atari specific timer functions (in time.c) */ extern void atari_sched_init(irq_handler_t); -extern u32 atari_gettimeoffset(void); extern int atari_mste_hwclk (int, struct rtc_time *); extern int atari_tt_hwclk (int, struct rtc_time *); @@ -205,7 +204,6 @@ void __init config_atari(void) mach_init_IRQ = atari_init_IRQ; mach_get_model = atari_get_model; mach_get_hardware_list = atari_get_hardware_list; - arch_gettimeoffset = atari_gettimeoffset; mach_reset = atari_reset; mach_max_dma_address = 0xffffff; #if IS_ENABLED(CONFIG_INPUT_M68K_BEEP) diff --git a/arch/m68k/atari/time.c b/arch/m68k/atari/time.c index 9cca64286464c31ae9b961609f67c3afbbb25d73..ce923a523695ae1ae0cf172c55a87fe2c7f6179b 100644 --- a/arch/m68k/atari/time.c +++ b/arch/m68k/atari/time.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -24,6 +25,35 @@ DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL_GPL(rtc_lock); +static u64 atari_read_clk(struct clocksource *cs); + +static struct clocksource atari_clk = { + .name = "mfp", + .rating = 100, + .read = atari_read_clk, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static u32 clk_total; +static u8 last_timer_count; + +static irqreturn_t mfp_timer_c_handler(int irq, void *dev_id) +{ + irq_handler_t timer_routine = dev_id; + unsigned long flags; + + local_irq_save(flags); + do { + last_timer_count = st_mfp.tim_dt_c; + } while (last_timer_count == 1); + clk_total += INT_TICKS; + timer_routine(0, NULL); + local_irq_restore(flags); + + return IRQ_HANDLED; +} + void __init atari_sched_init(irq_handler_t timer_routine) { @@ -32,31 +62,33 @@ atari_sched_init(irq_handler_t timer_routine) /* start timer C, div = 1:100 */ st_mfp.tim_ct_cd = (st_mfp.tim_ct_cd & 15) | 0x60; /* install interrupt service routine for MFP Timer C */ - if (request_irq(IRQ_MFP_TIMC, timer_routine, 0, "timer", timer_routine)) + if (request_irq(IRQ_MFP_TIMC, mfp_timer_c_handler, IRQF_TIMER, "timer", + timer_routine)) pr_err("Couldn't register timer interrupt\n"); + + clocksource_register_hz(&atari_clk, INT_CLK); } /* ++andreas: gettimeoffset fixed to check for pending interrupt */ -#define TICK_SIZE 10000 - -/* This is always executed with interrupts disabled. */ -u32 atari_gettimeoffset(void) +static u64 atari_read_clk(struct clocksource *cs) { - u32 ticks, offset = 0; - - /* read MFP timer C current value */ - ticks = st_mfp.tim_dt_c; - /* The probability of underflow is less than 2% */ - if (ticks > INT_TICKS - INT_TICKS / 50) - /* Check for pending timer interrupt */ - if (st_mfp.int_pn_b & (1 << 5)) - offset = TICK_SIZE; - - ticks = INT_TICKS - ticks; - ticks = ticks * 10000L / INT_TICKS; - - return (ticks + offset) * 1000; + unsigned long flags; + u8 count; + u32 ticks; + + local_irq_save(flags); + /* Ensure that the count is monotonically decreasing, even though + * the result may briefly stop changing after counter wrap-around. 
+ */ + count = min(st_mfp.tim_dt_c, last_timer_count); + last_timer_count = count; + + ticks = INT_TICKS - count; + ticks += clk_total; + local_irq_restore(flags); + + return ticks; } diff --git a/arch/m68k/bvme6000/config.c b/arch/m68k/bvme6000/config.c index 143ee9fa3893ecce16e13c87309354a6d5c6211e..8ebaabc931cd094bf5ffd8f552bb0220b1448509 100644 --- a/arch/m68k/bvme6000/config.c +++ b/arch/m68k/bvme6000/config.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -39,16 +40,10 @@ static void bvme6000_get_model(char *model); extern void bvme6000_sched_init(irq_handler_t handler); -extern u32 bvme6000_gettimeoffset(void); extern int bvme6000_hwclk (int, struct rtc_time *); extern void bvme6000_reset (void); void bvme6000_set_vectors (void); -/* Save tick handler routine pointer, will point to xtime_update() in - * kernel/timer/timekeeping.c, called via bvme6000_process_int() */ - -static irq_handler_t tick_handler; - int __init bvme6000_parse_bootinfo(const struct bi_record *bi) { @@ -110,7 +105,6 @@ void __init config_bvme6000(void) mach_max_dma_address = 0xffffffff; mach_sched_init = bvme6000_sched_init; mach_init_IRQ = bvme6000_init_IRQ; - arch_gettimeoffset = bvme6000_gettimeoffset; mach_hwclk = bvme6000_hwclk; mach_reset = bvme6000_reset; mach_get_model = bvme6000_get_model; @@ -154,15 +148,38 @@ irqreturn_t bvme6000_abort_int (int irq, void *dev_id) return IRQ_HANDLED; } +static u64 bvme6000_read_clk(struct clocksource *cs); + +static struct clocksource bvme6000_clk = { + .name = "rtc", + .rating = 250, + .read = bvme6000_read_clk, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static u32 clk_total, clk_offset; + +#define RTC_TIMER_CLOCK_FREQ 8000000 +#define RTC_TIMER_CYCLES (RTC_TIMER_CLOCK_FREQ / HZ) +#define RTC_TIMER_COUNT ((RTC_TIMER_CYCLES / 2) - 1) static irqreturn_t bvme6000_timer_int (int irq, void *dev_id) { + irq_handler_t timer_routine = dev_id; + unsigned long flags; volatile RtcPtr_t rtc = (RtcPtr_t)BVME_RTC_BASE; - unsigned char msr = rtc->msr & 0xc0; + unsigned char msr; + local_irq_save(flags); + msr = rtc->msr & 0xc0; rtc->msr = msr | 0x20; /* Ack the interrupt */ + clk_total += RTC_TIMER_CYCLES; + clk_offset = 0; + timer_routine(0, NULL); + local_irq_restore(flags); - return tick_handler(irq, dev_id); + return IRQ_HANDLED; } /* @@ -181,14 +198,13 @@ void bvme6000_sched_init (irq_handler_t timer_routine) rtc->msr = 0; /* Ensure timer registers accessible */ - tick_handler = timer_routine; - if (request_irq(BVME_IRQ_RTC, bvme6000_timer_int, 0, - "timer", bvme6000_timer_int)) + if (request_irq(BVME_IRQ_RTC, bvme6000_timer_int, IRQF_TIMER, "timer", + timer_routine)) panic ("Couldn't register timer int"); rtc->t1cr_omr = 0x04; /* Mode 2, ext clk */ - rtc->t1msb = 39999 >> 8; - rtc->t1lsb = 39999 & 0xff; + rtc->t1msb = RTC_TIMER_COUNT >> 8; + rtc->t1lsb = RTC_TIMER_COUNT & 0xff; rtc->irr_icr1 &= 0xef; /* Route timer 1 to INTR pin */ rtc->msr = 0x40; /* Access int.cntrl, etc */ rtc->pfr_icr0 = 0x80; /* Just timer 1 ints enabled */ @@ -200,14 +216,14 @@ void bvme6000_sched_init (irq_handler_t timer_routine) rtc->msr = msr; + clocksource_register_hz(&bvme6000_clk, RTC_TIMER_CLOCK_FREQ); + if (request_irq(BVME_IRQ_ABORT, bvme6000_abort_int, 0, "abort", bvme6000_abort_int)) panic ("Couldn't register abort int"); } -/* This is always executed with interrupts disabled. */ - /* * NOTE: Don't accept any readings within 5us of rollover, as * the T1INT bit may be a little slow getting set. 
There is also @@ -215,14 +231,18 @@ void bvme6000_sched_init (irq_handler_t timer_routine) * results... */ -u32 bvme6000_gettimeoffset(void) +static u64 bvme6000_read_clk(struct clocksource *cs) { + unsigned long flags; volatile RtcPtr_t rtc = (RtcPtr_t)BVME_RTC_BASE; volatile PitRegsPtr pit = (PitRegsPtr)BVME_PIT_BASE; - unsigned char msr = rtc->msr & 0xc0; + unsigned char msr, msb; unsigned char t1int, t1op; u32 v = 800000, ov; + local_irq_save(flags); + + msr = rtc->msr & 0xc0; rtc->msr = 0; /* Ensure timer registers accessible */ do { @@ -230,22 +250,25 @@ u32 bvme6000_gettimeoffset(void) t1int = rtc->msr & 0x20; t1op = pit->pcdr & 0x04; rtc->t1cr_omr |= 0x40; /* Latch timer1 */ - v = rtc->t1msb << 8; /* Read timer1 */ - v |= rtc->t1lsb; /* Read timer1 */ + msb = rtc->t1msb; /* Read timer1 */ + v = (msb << 8) | rtc->t1lsb; /* Read timer1 */ } while (t1int != (rtc->msr & 0x20) || t1op != (pit->pcdr & 0x04) || abs(ov-v) > 80 || - v > 39960); + v > RTC_TIMER_COUNT - (RTC_TIMER_COUNT / 100)); - v = 39999 - v; + v = RTC_TIMER_COUNT - v; if (!t1op) /* If in second half cycle.. */ - v += 40000; - v /= 8; /* Convert ticks to microseconds */ - if (t1int) - v += 10000; /* Int pending, + 10ms */ + v += RTC_TIMER_CYCLES / 2; + if (msb > 0 && t1int) + clk_offset = RTC_TIMER_CYCLES; rtc->msr = msr; - return v * 1000; + v += clk_offset + clk_total; + + local_irq_restore(flags); + + return v; } /* diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 525421ae277d6ebc73554dcfac28a9ea361b1310..fea392cfcf1be142d8425477d8f16807e41afdb9 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -56,6 +56,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -210,9 +211,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -234,9 +232,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -313,7 +308,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -460,12 +454,12 @@ CONFIG_RTC_DRV_RP5C01=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -573,9 +567,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -640,6 +636,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -649,4 +646,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m 
CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index db0e654a88d537d0d1c8a33925663243d0682389..2474d267460e9a629df5458d8385c42f8001d5e1 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -52,6 +52,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -206,9 +207,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -230,9 +228,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -309,7 +304,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -420,12 +414,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -533,9 +527,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -600,6 +596,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -609,4 +606,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 1451168eb789b00c5a4bafd3e4f020531bd87e73..0fc7d2992fe030ab2e2ed6d680a5fcb0bd8fb12e 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -59,6 +59,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -213,9 +214,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -237,9 +235,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -316,7 +311,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -442,12 +436,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set 
CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -555,9 +549,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -622,6 +618,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -631,4 +628,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index b0d3609f5bb3c77f7bfe18aa38c4e9868ceff30d..699df9fdf8668d5a2e09279c40b43981718e2eb4 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -49,6 +49,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -203,9 +204,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -227,9 +225,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -306,7 +301,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -413,12 +407,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -526,9 +520,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -593,6 +589,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -602,4 +599,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 4ed7c151347cdc1394d28288f1b84dfbbdf71fa2..b508022553247e829ee378bc988a7225331ab25c 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -51,6 +51,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y 
CONFIG_IP_PNP_DHCP=y @@ -205,9 +206,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -229,9 +227,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -308,7 +303,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -422,12 +416,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -535,9 +529,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -602,6 +598,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -611,4 +608,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 0dc544e1ce1fb02aa9f81c142ab61c7d9b13d936..04e7d70f6030e77ccdcd5a169f86a2ee1b7ae283 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -50,6 +50,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -204,9 +205,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -228,9 +226,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -310,7 +305,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -444,12 +438,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -557,9 +551,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m 
+CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -624,6 +620,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -633,4 +630,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 5a7b7b0d6e72e293bf00420f4fe56590a0c4fe9e..5e1cc4c17852259d593c910ba9fff704146ba222 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -70,6 +70,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -224,9 +225,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -248,9 +246,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -330,7 +325,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -526,12 +520,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -639,9 +633,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -706,6 +702,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -715,4 +712,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 71eb9be1803b10067d2033b74387693f6b436dc8..170ac8792c2df7545a2f63d05e1b98722fcb473e 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -48,6 +48,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -202,9 +203,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -226,9 +224,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m 
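The CONFIG_CRYPTO_CTS and CONFIG_CRYPTO_XTS additions scattered through these defconfigs enable the "cts(...)" and "xts(...)" cipher-mode templates in the crypto API. In-kernel users instantiate them by name through the skcipher interface; a hedged sketch of such a lookup (the function name is illustrative):

	#include <linux/err.h>
	#include <crypto/skcipher.h>

	static int probe_xts_aes(void)
	{
		struct crypto_skcipher *tfm;

		/* resolves only if the XTS template and an AES cipher exist */
		tfm = crypto_alloc_skcipher("xts(aes)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		crypto_free_skcipher(tfm);
		return 0;
	}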
CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -305,7 +300,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -412,12 +406,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -525,9 +519,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -592,6 +588,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -601,4 +598,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index ea2ebd4241c039928e023aafde463e62fdb3dd06..d865592a423e353fff9bee5e58337607b92c060f 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -49,6 +49,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -203,9 +204,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -227,9 +225,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -306,7 +301,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -413,12 +407,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -526,9 +520,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -593,6 +589,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -602,4 +599,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git 
a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index cef6dc47c7250d327afff05607221ffb112a69d8..034a9de904846371e73b400571600c7ae6b6403a 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -50,6 +50,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -204,9 +205,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -228,9 +226,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -307,7 +302,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -431,12 +425,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -544,9 +538,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -611,6 +607,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -620,4 +617,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 69f2282dc4e9cafa34aa646629d0f18f2a44f228..49be0f9fcd8dd68af757fb0f4648c2b5c2ceadce 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -46,6 +46,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -200,9 +201,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -224,9 +222,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -303,7 +298,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -415,12 +409,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m 
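The CONFIG_TEST_* entries (TEST_VMALLOC, TEST_STACKINIT and friends) build self-contained selftest modules that exercise one subsystem when loaded. They all share the standard module skeleton; a minimal sketch of that shape (names illustrative, not one of the actual tests):

	#include <linux/module.h>

	static int __init demo_test_init(void)
	{
		pr_info("demo_test: running\n");
		return 0;	/* real tests return a negative errno on failure */
	}

	static void __exit demo_test_exit(void)
	{
	}

	module_init(demo_test_init);
	module_exit(demo_test_exit);
	MODULE_LICENSE("GPL");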
CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -528,9 +522,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -595,6 +591,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -604,3 +601,4 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index e91267e868b2dd37b065afc7ed11d7853daa38b6..a71acf4a6004c0d92945bb911dd77254d3f4a324 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -46,6 +46,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -200,9 +201,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -224,9 +222,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -303,7 +298,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -414,12 +408,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -527,9 +521,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -594,6 +590,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -603,4 +600,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/hp300/config.c b/arch/m68k/hp300/config.c index a19bcd23f80b30820694cb0d2787319ae5384444..a161d44fd20bb5dc20c6d9efd4ffef3bdb98ce74 100644 --- a/arch/m68k/hp300/config.c +++ b/arch/m68k/hp300/config.c @@ -254,7 +254,6 @@ void __init config_hp300(void) mach_sched_init = hp300_sched_init; mach_init_IRQ = hp300_init_IRQ; mach_get_model = hp300_get_model; - arch_gettimeoffset = hp300_gettimeoffset; mach_hwclk = hp300_hwclk; mach_get_ss = hp300_get_ss; mach_reset = hp300_reset; diff --git a/arch/m68k/hp300/time.c 
b/arch/m68k/hp300/time.c
index 289d928a46cbe1c2cbcce1f5e94dc9fbd43db331..bfee13e1d0fe1432e1b843763995341230855b41 100644
--- a/arch/m68k/hp300/time.c
+++ b/arch/m68k/hp300/time.c
@@ -8,6 +8,7 @@
  */
 #include
+#include
 #include
 #include
 #include
@@ -19,6 +20,18 @@
 #include
 #include
+static u64 hp300_read_clk(struct clocksource *cs);
+
+static struct clocksource hp300_clk = {
+	.name   = "timer",
+	.rating = 250,
+	.read   = hp300_read_clk,
+	.mask   = CLOCKSOURCE_MASK(32),
+	.flags  = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static u32 clk_total, clk_offset;
+
 /* Clock hardware definitions */
 #define CLOCKBASE	0xf05f8000
@@ -28,39 +41,61 @@
 #define CLKCR3		CLKCR1
 #define CLKSR		CLKCR2
 #define CLKMSB1		0x5
+#define CLKLSB1		0x7
 #define CLKMSB2		0x9
 #define CLKMSB3		0xD
+#define CLKSR_INT1	BIT(0)
+
 /* This is for machines which generate the exact clock. */
-#define USECS_PER_JIFFY	(1000000/HZ)
-#define INTVAL	((10000 / 4) - 1)
+#define HP300_TIMER_CLOCK_FREQ	250000
+#define HP300_TIMER_CYCLES	(HP300_TIMER_CLOCK_FREQ / HZ)
+#define INTVAL	(HP300_TIMER_CYCLES - 1)
 
 static irqreturn_t hp300_tick(int irq, void *dev_id)
 {
+	irq_handler_t timer_routine = dev_id;
+	unsigned long flags;
 	unsigned long tmp;
-	irq_handler_t vector = dev_id;
+
+	local_irq_save(flags);
 	in_8(CLOCKBASE + CLKSR);
 	asm volatile ("movpw %1@(5),%0" : "=d" (tmp) : "a" (CLOCKBASE));
+	clk_total += INTVAL;
+	clk_offset = 0;
+	timer_routine(0, NULL);
+	local_irq_restore(flags);
+
 	/* Turn off the network and SCSI leds */
 	blinken_leds(0, 0xe0);
-	return vector(irq, NULL);
+	return IRQ_HANDLED;
 }
 
-u32 hp300_gettimeoffset(void)
+static u64 hp300_read_clk(struct clocksource *cs)
 {
-	/* Read current timer 1 value */
-	unsigned char lsb, msb1, msb2;
-	unsigned short ticks;
-
-	msb1 = in_8(CLOCKBASE + 5);
-	lsb = in_8(CLOCKBASE + 7);
-	msb2 = in_8(CLOCKBASE + 5);
-	if (msb1 != msb2)
-		/* A carry happened while we were reading.
Read it again */ - lsb = in_8(CLOCKBASE + 7); - ticks = INTVAL - ((msb2 << 8) | lsb); - return ((USECS_PER_JIFFY * ticks) / INTVAL) * 1000; + unsigned long flags; + unsigned char lsb, msb, msb_new; + u32 ticks; + + local_irq_save(flags); + /* Read current timer 1 value */ + msb = in_8(CLOCKBASE + CLKMSB1); +again: + if ((in_8(CLOCKBASE + CLKSR) & CLKSR_INT1) && msb > 0) + clk_offset = INTVAL; + lsb = in_8(CLOCKBASE + CLKLSB1); + msb_new = in_8(CLOCKBASE + CLKMSB1); + if (msb_new != msb) { + msb = msb_new; + goto again; + } + + ticks = INTVAL - ((msb << 8) | lsb); + ticks += clk_offset + clk_total; + local_irq_restore(flags); + + return ticks; } void __init hp300_sched_init(irq_handler_t vector) @@ -70,9 +105,11 @@ void __init hp300_sched_init(irq_handler_t vector) asm volatile(" movpw %0,%1@(5)" : : "d" (INTVAL), "a" (CLOCKBASE)); - if (request_irq(IRQ_AUTO_6, hp300_tick, 0, "timer tick", vector)) + if (request_irq(IRQ_AUTO_6, hp300_tick, IRQF_TIMER, "timer tick", vector)) pr_err("Couldn't register timer interrupt\n"); out_8(CLOCKBASE + CLKCR2, 0x1); /* select CR1 */ out_8(CLOCKBASE + CLKCR1, 0x40); /* enable irq */ + + clocksource_register_hz(&hp300_clk, HP300_TIMER_CLOCK_FREQ); } diff --git a/arch/m68k/hp300/time.h b/arch/m68k/hp300/time.h index f5583ec4033d46c076d91e8e017fbb996ad0fcab..1d77b55cc72a0c24bc7de05e1e80add0ca0e3b36 100644 --- a/arch/m68k/hp300/time.h +++ b/arch/m68k/hp300/time.h @@ -1,2 +1 @@ extern void hp300_sched_init(irq_handler_t vector); -extern u32 hp300_gettimeoffset(void); diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 2c359d9e80f63fe44468c29b7a48bb4033a9c31a..0ddae4a74adb2a5d889fd2c88f2bdb3a5578c68c 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -18,6 +18,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += percpu.h generic-y += preempt.h generic-y += sections.h diff --git a/arch/m68k/include/asm/io_mm.h b/arch/m68k/include/asm/io_mm.h index 782b78f8a04890b315685b0ea78cab16a2d219af..6c03ca5bc436551e16cf9ec7019cf09414607ba0 100644 --- a/arch/m68k/include/asm/io_mm.h +++ b/arch/m68k/include/asm/io_mm.h @@ -377,8 +377,6 @@ static inline void isa_delay(void) #define writesw(port, buf, nr) raw_outsw((port), (u16 *)(buf), (nr)) #define writesl(port, buf, nr) raw_outsl((port), (u32 *)(buf), (nr)) -#define mmiowb() - #ifndef CONFIG_SUN3 #define IO_SPACE_LIMIT 0xffff #else diff --git a/arch/m68k/include/asm/mvme147hw.h b/arch/m68k/include/asm/mvme147hw.h index 9c7ff67c5ffd616d20f13d0d08cf60aded9845af..257b29184af913f8613b3cbcacb91a08e14ef924 100644 --- a/arch/m68k/include/asm/mvme147hw.h +++ b/arch/m68k/include/asm/mvme147hw.h @@ -66,7 +66,7 @@ struct pcc_regs { #define PCC_INT_ENAB 0x08 #define PCC_TIMER_INT_CLR 0x80 -#define PCC_TIMER_PRELOAD 63936l +#define PCC_TIMER_CLR_OVF 0x04 #define PCC_LEVEL_ABORT 0x07 #define PCC_LEVEL_SERIAL 0x04 diff --git a/arch/m68k/include/asm/tlb.h b/arch/m68k/include/asm/tlb.h index b4b9efb6f963be8761e0cdc830be3cb5fb3bf3ab..3c81f6adfc8b36b26f53fea65941072707c9778f 100644 --- a/arch/m68k/include/asm/tlb.h +++ b/arch/m68k/include/asm/tlb.h @@ -2,20 +2,6 @@ #ifndef _M68K_TLB_H #define _M68K_TLB_H -/* - * m68k doesn't need any special per-pte or - * per-vma handling.. - */ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) - -/* - * .. 
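The hp300 conversion above establishes the pattern repeated for the other m68k platforms below: the tick interrupt accumulates elapsed cycles in clk_total, and the clocksource ->read() callback returns that running total plus whatever the hardware counter currently shows. Stripped to its bones, the shape is roughly this (a sketch under those assumptions, not code from this series):

	#include <linux/init.h>
	#include <linux/clocksource.h>

	static u32 demo_cycles;	/* advanced by the platform tick interrupt */

	static u64 demo_read_clk(struct clocksource *cs)
	{
		/* must be monotonic; the real drivers also fold in the
		 * live counter value and a pending-interrupt offset */
		return demo_cycles;
	}

	static struct clocksource demo_clk = {
		.name	= "demo",
		.rating	= 250,
		.read	= demo_read_clk,
		.mask	= CLOCKSOURCE_MASK(32),
		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	};

	void __init demo_clk_init(void)
	{
		/* the core derives the cycles->ns mult/shift from this rate */
		clocksource_register_hz(&demo_clk, 250000);
	}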
because we flush the whole mm when it - * fills up. - */ -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include #endif /* _M68K_TLB_H */ diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 125c14178979c010648895bd7925f85e5e54b10d..df4ec3ec71d1518bfac752044f7a1eae9291535a 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -423,3 +423,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index cd9317d5327694e3761f066abf99fa3f493cb675..11be08f4f750aa93068d922d1b7f496800dc6655 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -54,8 +54,6 @@ struct mac_booter_data mac_bi_data; /* The phys. video addr. - might be bogus on some machines */ static unsigned long mac_orig_videoaddr; -/* Mac specific timer functions */ -extern u32 mac_gettimeoffset(void); extern int mac_hwclk(int, struct rtc_time *); extern void iop_preinit(void); extern void iop_init(void); @@ -155,7 +153,6 @@ void __init config_mac(void) mach_sched_init = mac_sched_init; mach_init_IRQ = mac_init_IRQ; mach_get_model = mac_get_model; - arch_gettimeoffset = mac_gettimeoffset; mach_hwclk = mac_hwclk; mach_reset = mac_reset; mach_halt = mac_poweroff; diff --git a/arch/m68k/mac/via.c b/arch/m68k/mac/via.c index 0b02894591738d6e8a26453435ee59f3d72ac8eb..3c2cfcb749825f183b28c77a48c64c6ada9ff03b 100644 --- a/arch/m68k/mac/via.c +++ b/arch/m68k/mac/via.c @@ -23,6 +23,7 @@ * */ +#include #include #include #include @@ -54,16 +55,6 @@ static __u8 rbv_clear; static int gIER,gIFR,gBufA,gBufB; -/* - * Timer defs. - */ - -#define TICK_SIZE 10000 -#define MAC_CLOCK_TICK (783300/HZ) /* ticks per HZ */ -#define MAC_CLOCK_LOW (MAC_CLOCK_TICK&0xFF) -#define MAC_CLOCK_HIGH (MAC_CLOCK_TICK>>8) - - /* * On Macs with a genuine VIA chip there is no way to mask an individual slot * interrupt. This limitation also seems to apply to VIA clone logic cores in @@ -271,22 +262,6 @@ void __init via_init(void) } } -/* - * Start the 100 Hz clock - */ - -void __init via_init_clock(irq_handler_t func) -{ - via1[vACR] |= 0x40; - via1[vT1LL] = MAC_CLOCK_LOW; - via1[vT1LH] = MAC_CLOCK_HIGH; - via1[vT1CL] = MAC_CLOCK_LOW; - via1[vT1CH] = MAC_CLOCK_HIGH; - - if (request_irq(IRQ_MAC_TIMER_1, func, 0, "timer", func)) - pr_err("Couldn't register %s interrupt\n", "timer"); -} - /* * Debugging dump, used in various places to see what's going on. */ @@ -314,29 +289,6 @@ void via_debug_dump(void) } } -/* - * This is always executed with interrupts disabled. - * - * TBI: get time offset between scheduling timer ticks - */ - -u32 mac_gettimeoffset(void) -{ - unsigned long ticks, offset = 0; - - /* read VIA1 timer 2 current value */ - ticks = via1[vT1CL] | (via1[vT1CH] << 8); - /* The probability of underflow is less than 2% */ - if (ticks > MAC_CLOCK_TICK - MAC_CLOCK_TICK / 50) - /* Check for pending timer interrupt in VIA1 IFR */ - if (via1[vIFR] & 0x40) offset = TICK_SIZE; - - ticks = MAC_CLOCK_TICK - ticks; - ticks = ticks * 10000L / MAC_CLOCK_TICK; - - return (ticks + offset) * 1000; -} - /* * Flush the L2 cache on Macs that have it by flipping * the system into 24-bit mode for an instant. 
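The syscall.tbl hunks in this series wire up numbers 424-427 (pidfd_send_signal plus the three io_uring entry points) for each ABI. Until libc grows wrappers, userspace reaches them through syscall(2); a sketch for pidfd_send_signal, using the raw number added above (the wrapper name is illustrative, and flags must currently be zero):

	#include <signal.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef __NR_pidfd_send_signal
	#define __NR_pidfd_send_signal 424
	#endif

	static int my_pidfd_send_signal(int pidfd, int sig,
					siginfo_t *info, unsigned int flags)
	{
		return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
	}

	/* e.g.: my_pidfd_send_signal(pidfd, SIGTERM, NULL, 0); */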
@@ -440,6 +392,8 @@ void via_nubus_irq_shutdown(int irq) * via6522.c :-), disable/pending masks added. */ +#define VIA_TIMER_1_INT BIT(6) + void via1_irq(struct irq_desc *desc) { int irq_num; @@ -449,6 +403,21 @@ void via1_irq(struct irq_desc *desc) if (!events) return; + irq_num = IRQ_MAC_TIMER_1; + irq_bit = VIA_TIMER_1_INT; + if (events & irq_bit) { + unsigned long flags; + + local_irq_save(flags); + via1[vIFR] = irq_bit; + generic_handle_irq(irq_num); + local_irq_restore(flags); + + events &= ~irq_bit; + if (!events) + return; + } + irq_num = VIA1_SOURCE_BASE; irq_bit = 1; do { @@ -605,3 +574,82 @@ int via2_scsi_drq_pending(void) return via2[gIFR] & (1 << IRQ_IDX(IRQ_MAC_SCSIDRQ)); } EXPORT_SYMBOL(via2_scsi_drq_pending); + +/* timer and clock source */ + +#define VIA_CLOCK_FREQ 783360 /* VIA "phase 2" clock in Hz */ +#define VIA_TIMER_CYCLES (VIA_CLOCK_FREQ / HZ) /* clock cycles per jiffy */ + +#define VIA_TC (VIA_TIMER_CYCLES - 2) /* including 0 and -1 */ +#define VIA_TC_LOW (VIA_TC & 0xFF) +#define VIA_TC_HIGH (VIA_TC >> 8) + +static u64 mac_read_clk(struct clocksource *cs); + +static struct clocksource mac_clk = { + .name = "via1", + .rating = 250, + .read = mac_read_clk, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static u32 clk_total, clk_offset; + +static irqreturn_t via_timer_handler(int irq, void *dev_id) +{ + irq_handler_t timer_routine = dev_id; + + clk_total += VIA_TIMER_CYCLES; + clk_offset = 0; + timer_routine(0, NULL); + + return IRQ_HANDLED; +} + +void __init via_init_clock(irq_handler_t timer_routine) +{ + if (request_irq(IRQ_MAC_TIMER_1, via_timer_handler, IRQF_TIMER, "timer", + timer_routine)) { + pr_err("Couldn't register %s interrupt\n", "timer"); + return; + } + + via1[vT1LL] = VIA_TC_LOW; + via1[vT1LH] = VIA_TC_HIGH; + via1[vT1CL] = VIA_TC_LOW; + via1[vT1CH] = VIA_TC_HIGH; + via1[vACR] |= 0x40; + + clocksource_register_hz(&mac_clk, VIA_CLOCK_FREQ); +} + +static u64 mac_read_clk(struct clocksource *cs) +{ + unsigned long flags; + u8 count_high; + u16 count; + u32 ticks; + + /* + * Timer counter wrap-around is detected with the timer interrupt flag + * but reading the counter low byte (vT1CL) would reset the flag. + * Also, accessing both counter registers is essentially a data race. + * These problems are avoided by ignoring the low byte. Clock accuracy + * is 256 times worse (error can reach 0.327 ms) but CPU overhead is + * reduced by avoiding slow VIA register accesses. 
+ */ + + local_irq_save(flags); + count_high = via1[vT1CH]; + if (count_high == 0xFF) + count_high = 0; + if (count_high > 0 && (via1[vIFR] & VIA_TIMER_1_INT)) + clk_offset = VIA_TIMER_CYCLES; + count = count_high << 8; + ticks = VIA_TIMER_CYCLES - count; + ticks += clk_offset + clk_total; + local_irq_restore(flags); + + return ticks; +} diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c index adea549d240e9eb555200c7ec7db49582373e2b7..545a1fe0e1194695953b066ebd0837d4bbd2d26d 100644 --- a/arch/m68k/mvme147/config.c +++ b/arch/m68k/mvme147/config.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -38,18 +39,12 @@ static void mvme147_get_model(char *model); extern void mvme147_sched_init(irq_handler_t handler); -extern u32 mvme147_gettimeoffset(void); extern int mvme147_hwclk (int, struct rtc_time *); extern void mvme147_reset (void); static int bcd2int (unsigned char b); -/* Save tick handler routine pointer, will point to xtime_update() in - * kernel/time/timekeeping.c, called via mvme147_process_int() */ - -irq_handler_t tick_handler; - int __init mvme147_parse_bootinfo(const struct bi_record *bi) { @@ -89,7 +84,6 @@ void __init config_mvme147(void) mach_max_dma_address = 0x01000000; mach_sched_init = mvme147_sched_init; mach_init_IRQ = mvme147_init_IRQ; - arch_gettimeoffset = mvme147_gettimeoffset; mach_hwclk = mvme147_hwclk; mach_reset = mvme147_reset; mach_get_model = mvme147_get_model; @@ -99,45 +93,76 @@ void __init config_mvme147(void) vme_brdtype = VME_TYPE_MVME147; } +static u64 mvme147_read_clk(struct clocksource *cs); + +static struct clocksource mvme147_clk = { + .name = "pcc", + .rating = 250, + .read = mvme147_read_clk, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static u32 clk_total; + +#define PCC_TIMER_CLOCK_FREQ 160000 +#define PCC_TIMER_CYCLES (PCC_TIMER_CLOCK_FREQ / HZ) +#define PCC_TIMER_PRELOAD (0x10000 - PCC_TIMER_CYCLES) /* Using pcc tick timer 1 */ static irqreturn_t mvme147_timer_int (int irq, void *dev_id) { + irq_handler_t timer_routine = dev_id; + unsigned long flags; + + local_irq_save(flags); m147_pcc->t1_int_cntrl = PCC_TIMER_INT_CLR; - m147_pcc->t1_int_cntrl = PCC_INT_ENAB|PCC_LEVEL_TIMER1; - return tick_handler(irq, dev_id); + m147_pcc->t1_cntrl = PCC_TIMER_CLR_OVF; + clk_total += PCC_TIMER_CYCLES; + timer_routine(0, NULL); + local_irq_restore(flags); + + return IRQ_HANDLED; } void mvme147_sched_init (irq_handler_t timer_routine) { - tick_handler = timer_routine; - if (request_irq(PCC_IRQ_TIMER1, mvme147_timer_int, 0, "timer 1", NULL)) + if (request_irq(PCC_IRQ_TIMER1, mvme147_timer_int, IRQF_TIMER, + "timer 1", timer_routine)) pr_err("Couldn't register timer interrupt\n"); /* Init the clock with a value */ - /* our clock goes off every 6.25us */ + /* The clock counter increments until 0xFFFF then reloads */ m147_pcc->t1_preload = PCC_TIMER_PRELOAD; m147_pcc->t1_cntrl = 0x0; /* clear timer */ m147_pcc->t1_cntrl = 0x3; /* start timer */ m147_pcc->t1_int_cntrl = PCC_TIMER_INT_CLR; /* clear pending ints */ m147_pcc->t1_int_cntrl = PCC_INT_ENAB|PCC_LEVEL_TIMER1; + + clocksource_register_hz(&mvme147_clk, PCC_TIMER_CLOCK_FREQ); } -/* This is always executed with interrupts disabled. 
*/ -/* XXX There are race hazards in this code XXX */ -u32 mvme147_gettimeoffset(void) +static u64 mvme147_read_clk(struct clocksource *cs) { - volatile unsigned short *cp = (volatile unsigned short *)0xfffe1012; - unsigned short n; - - n = *cp; - while (n != *cp) - n = *cp; - - n -= PCC_TIMER_PRELOAD; - return ((unsigned long)n * 25 / 4) * 1000; + unsigned long flags; + u8 overflow, tmp; + u16 count; + u32 ticks; + + local_irq_save(flags); + tmp = m147_pcc->t1_cntrl >> 4; + count = m147_pcc->t1_count; + overflow = m147_pcc->t1_cntrl >> 4; + if (overflow != tmp) + count = m147_pcc->t1_count; + count -= PCC_TIMER_PRELOAD; + ticks = count + overflow * PCC_TIMER_CYCLES; + ticks += clk_total; + local_irq_restore(flags); + + return ticks; } static int bcd2int (unsigned char b) diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c index 6ee36a5b528d80b9b519c8ea97775be38114704f..9bc2da69f80cba4742cec68e8d1e6faa5c32ff9f 100644 --- a/arch/m68k/mvme16x/config.c +++ b/arch/m68k/mvme16x/config.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -44,17 +45,11 @@ static MK48T08ptr_t volatile rtc = (MK48T08ptr_t)MVME_RTC_BASE; static void mvme16x_get_model(char *model); extern void mvme16x_sched_init(irq_handler_t handler); -extern u32 mvme16x_gettimeoffset(void); extern int mvme16x_hwclk (int, struct rtc_time *); extern void mvme16x_reset (void); int bcd2int (unsigned char b); -/* Save tick handler routine pointer, will point to xtime_update() in - * kernel/time/timekeeping.c, called via mvme16x_process_int() */ - -static irq_handler_t tick_handler; - unsigned short mvme16x_config; EXPORT_SYMBOL(mvme16x_config); @@ -120,11 +115,11 @@ static void __init mvme16x_init_IRQ (void) m68k_setup_user_interrupt(VEC_USER, 192); } -#define pcc2chip ((volatile u_char *)0xfff42000) -#define PccSCCMICR 0x1d -#define PccSCCTICR 0x1e -#define PccSCCRICR 0x1f -#define PccTPIACKR 0x25 +#define PCC2CHIP (0xfff42000) +#define PCCSCCMICR (PCC2CHIP + 0x1d) +#define PCCSCCTICR (PCC2CHIP + 0x1e) +#define PCCSCCRICR (PCC2CHIP + 0x1f) +#define PCCTPIACKR (PCC2CHIP + 0x25) #ifdef CONFIG_EARLY_PRINTK @@ -232,10 +227,10 @@ void mvme16x_cons_write(struct console *co, const char *str, unsigned count) base_addr[CyIER] = CyTxMpty; while (1) { - if (pcc2chip[PccSCCTICR] & 0x20) + if (in_8(PCCSCCTICR) & 0x20) { /* We have a Tx int. 
Acknowledge it */ - sink = pcc2chip[PccTPIACKR]; + sink = in_8(PCCTPIACKR); if ((base_addr[CyLICR] >> 2) == port) { if (i == count) { /* Last char of string is now output */ @@ -277,7 +272,6 @@ void __init config_mvme16x(void) mach_max_dma_address = 0xffffffff; mach_sched_init = mvme16x_sched_init; mach_init_IRQ = mvme16x_init_IRQ; - arch_gettimeoffset = mvme16x_gettimeoffset; mach_hwclk = mvme16x_hwclk; mach_reset = mvme16x_reset; mach_get_model = mvme16x_get_model; @@ -350,10 +344,46 @@ static irqreturn_t mvme16x_abort_int (int irq, void *dev_id) return IRQ_HANDLED; } +static u64 mvme16x_read_clk(struct clocksource *cs); + +static struct clocksource mvme16x_clk = { + .name = "pcc", + .rating = 250, + .read = mvme16x_read_clk, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static u32 clk_total; + +#define PCC_TIMER_CLOCK_FREQ 1000000 +#define PCC_TIMER_CYCLES (PCC_TIMER_CLOCK_FREQ / HZ) + +#define PCCTCMP1 (PCC2CHIP + 0x04) +#define PCCTCNT1 (PCC2CHIP + 0x08) +#define PCCTOVR1 (PCC2CHIP + 0x17) +#define PCCTIC1 (PCC2CHIP + 0x1b) + +#define PCCTOVR1_TIC_EN 0x01 +#define PCCTOVR1_COC_EN 0x02 +#define PCCTOVR1_OVR_CLR 0x04 + +#define PCCTIC1_INT_CLR 0x08 +#define PCCTIC1_INT_EN 0x10 + static irqreturn_t mvme16x_timer_int (int irq, void *dev_id) { - *(volatile unsigned char *)0xfff4201b |= 8; - return tick_handler(irq, dev_id); + irq_handler_t timer_routine = dev_id; + unsigned long flags; + + local_irq_save(flags); + out_8(PCCTIC1, in_8(PCCTIC1) | PCCTIC1_INT_CLR); + out_8(PCCTOVR1, PCCTOVR1_OVR_CLR); + clk_total += PCC_TIMER_CYCLES; + timer_routine(0, NULL); + local_irq_restore(flags); + + return IRQ_HANDLED; } void mvme16x_sched_init (irq_handler_t timer_routine) @@ -361,16 +391,17 @@ void mvme16x_sched_init (irq_handler_t timer_routine) uint16_t brdno = be16_to_cpu(mvme_bdid.brdno); int irq; - tick_handler = timer_routine; /* Using PCCchip2 or MC2 chip tick timer 1 */ - *(volatile unsigned long *)0xfff42008 = 0; - *(volatile unsigned long *)0xfff42004 = 10000; /* 10ms */ - *(volatile unsigned char *)0xfff42017 |= 3; - *(volatile unsigned char *)0xfff4201b = 0x16; - if (request_irq(MVME16x_IRQ_TIMER, mvme16x_timer_int, 0, - "timer", mvme16x_timer_int)) + out_be32(PCCTCNT1, 0); + out_be32(PCCTCMP1, PCC_TIMER_CYCLES); + out_8(PCCTOVR1, in_8(PCCTOVR1) | PCCTOVR1_TIC_EN | PCCTOVR1_COC_EN); + out_8(PCCTIC1, PCCTIC1_INT_EN | 6); + if (request_irq(MVME16x_IRQ_TIMER, mvme16x_timer_int, IRQF_TIMER, "timer", + timer_routine)) panic ("Couldn't register timer int"); + clocksource_register_hz(&mvme16x_clk, PCC_TIMER_CLOCK_FREQ); + if (brdno == 0x0162 || brdno == 0x172) irq = MVME162_IRQ_ABORT; else @@ -380,11 +411,23 @@ void mvme16x_sched_init (irq_handler_t timer_routine) panic ("Couldn't register abort int"); } - -/* This is always executed with interrupts disabled. 
*/ -u32 mvme16x_gettimeoffset(void) +static u64 mvme16x_read_clk(struct clocksource *cs) { - return (*(volatile u32 *)0xfff42008) * 1000; + unsigned long flags; + u8 overflow, tmp; + u32 ticks; + + local_irq_save(flags); + tmp = in_8(PCCTOVR1) >> 4; + ticks = in_be32(PCCTCNT1); + overflow = in_8(PCCTOVR1) >> 4; + if (overflow != tmp) + ticks = in_be32(PCCTCNT1); + ticks += overflow * PCC_TIMER_CYCLES; + ticks += clk_total; + local_irq_restore(flags); + + return ticks; } int bcd2int (unsigned char b) diff --git a/arch/m68k/q40/config.c b/arch/m68k/q40/config.c index 96810d91da2bd9cf2318e0dda4b40c092c3ea7d7..e63eb5f069995e199eeb0af60629ef8c4c246a62 100644 --- a/arch/m68k/q40/config.c +++ b/arch/m68k/q40/config.c @@ -40,7 +40,6 @@ extern void q40_init_IRQ(void); static void q40_get_model(char *model); extern void q40_sched_init(irq_handler_t handler); -static u32 q40_gettimeoffset(void); static int q40_hwclk(int, struct rtc_time *); static unsigned int q40_get_ss(void); static int q40_get_rtc_pll(struct rtc_pll_info *pll); @@ -169,7 +168,6 @@ void __init config_q40(void) mach_sched_init = q40_sched_init; mach_init_IRQ = q40_init_IRQ; - arch_gettimeoffset = q40_gettimeoffset; mach_hwclk = q40_hwclk; mach_get_ss = q40_get_ss; mach_get_rtc_pll = q40_get_rtc_pll; @@ -201,13 +199,6 @@ int __init q40_parse_bootinfo(const struct bi_record *rec) return 1; } - -static u32 q40_gettimeoffset(void) -{ - return 5000 * (ql_ticks != 0) * 1000; -} - - /* * Looks like op is non-zero for setting the clock, and zero for * reading the clock. diff --git a/arch/m68k/q40/q40ints.c b/arch/m68k/q40/q40ints.c index 3e7603202977e715a6afc07e4ca0f3d713e0c3ed..1c696906c159f5acb5d39f773324d93bfa5508b5 100644 --- a/arch/m68k/q40/q40ints.c +++ b/arch/m68k/q40/q40ints.c @@ -127,10 +127,10 @@ void q40_mksound(unsigned int hz, unsigned int ticks) sound_ticks = ticks << 1; } -static irq_handler_t q40_timer_routine; - -static irqreturn_t q40_timer_int (int irq, void * dev) +static irqreturn_t q40_timer_int(int irq, void *dev_id) { + irq_handler_t timer_routine = dev_id; + ql_ticks = ql_ticks ? 0 : 1; if (sound_ticks) { unsigned char sval=(sound_ticks & 1) ? 
128-SVOL : 128+SVOL; @@ -139,8 +139,13 @@ static irqreturn_t q40_timer_int (int irq, void * dev) *DAC_RIGHT=sval; } - if (!ql_ticks) - q40_timer_routine(irq, dev); + if (!ql_ticks) { + unsigned long flags; + + local_irq_save(flags); + timer_routine(0, NULL); + local_irq_restore(flags); + } return IRQ_HANDLED; } @@ -148,11 +153,9 @@ void q40_sched_init (irq_handler_t timer_routine) { int timer_irq; - q40_timer_routine = timer_routine; timer_irq = Q40_IRQ_FRAME; - if (request_irq(timer_irq, q40_timer_int, 0, - "timer", q40_timer_int)) + if (request_irq(timer_irq, q40_timer_int, 0, "timer", timer_routine)) panic("Couldn't register timer int"); master_outb(-1, FRAME_CLEAR_REG); diff --git a/arch/m68k/sun3/config.c b/arch/m68k/sun3/config.c index 542c4404861c3827fc76821bfa25fbc7bb063528..229ea37dfe1b161cbe9cc2032399ab8d92f125f8 100644 --- a/arch/m68k/sun3/config.c +++ b/arch/m68k/sun3/config.c @@ -37,7 +37,6 @@ char sun3_reserved_pmeg[SUN3_PMEGS_NUM]; -extern u32 sun3_gettimeoffset(void); static void sun3_sched_init(irq_handler_t handler); extern void sun3_get_model (char* model); extern int sun3_hwclk(int set, struct rtc_time *t); @@ -138,7 +137,6 @@ void __init config_sun3(void) mach_sched_init = sun3_sched_init; mach_init_IRQ = sun3_init_IRQ; mach_reset = sun3_reboot; - arch_gettimeoffset = sun3_gettimeoffset; mach_get_model = sun3_get_model; mach_hwclk = sun3_hwclk; mach_halt = sun3_halt; diff --git a/arch/m68k/sun3/intersil.c b/arch/m68k/sun3/intersil.c index d911070af02a5d238d4ea28010cf488fe6b80f02..8fc74864de81962b1be7ab03acc56c0c7fb53498 100644 --- a/arch/m68k/sun3/intersil.c +++ b/arch/m68k/sun3/intersil.c @@ -22,13 +22,6 @@ #define STOP_VAL (INTERSIL_STOP | INTERSIL_INT_ENABLE | INTERSIL_24H_MODE) #define START_VAL (INTERSIL_RUN | INTERSIL_INT_ENABLE | INTERSIL_24H_MODE) -/* does this need to be implemented? 
*/ -u32 sun3_gettimeoffset(void) -{ - return 1000; -} - - /* get/set hwclock */ int sun3_hwclk(int set, struct rtc_time *t) diff --git a/arch/m68k/sun3/sun3ints.c b/arch/m68k/sun3/sun3ints.c index 6bbca30c91884850e45cc07db24b4b17a00b06e5..a5824abb4a39c2f75a42d3ae8a8a93ee038da465 100644 --- a/arch/m68k/sun3/sun3ints.c +++ b/arch/m68k/sun3/sun3ints.c @@ -61,8 +61,10 @@ static irqreturn_t sun3_int7(int irq, void *dev_id) static irqreturn_t sun3_int5(int irq, void *dev_id) { + unsigned long flags; unsigned int cnt; + local_irq_save(flags); #ifdef CONFIG_SUN3 intersil_clear(); #endif @@ -76,6 +78,7 @@ static irqreturn_t sun3_int5(int irq, void *dev_id) cnt = kstat_irqs_cpu(irq, 0); if (!(cnt % 20)) sun3_leds(led_pattern[cnt % 160 / 20]); + local_irq_restore(flags); return IRQ_HANDLED; } diff --git a/arch/m68k/sun3x/config.c b/arch/m68k/sun3x/config.c index 33d3a1c6fba042db36988b571645bcb422f9d06b..03ce7f9facfe57d0b68b0530b0bf0cabfe82e977 100644 --- a/arch/m68k/sun3x/config.c +++ b/arch/m68k/sun3x/config.c @@ -49,7 +49,6 @@ void __init config_sun3x(void) mach_sched_init = sun3x_sched_init; mach_init_IRQ = sun3_init_IRQ; - arch_gettimeoffset = sun3x_gettimeoffset; mach_reset = sun3x_reboot; mach_hwclk = sun3x_hwclk; diff --git a/arch/m68k/sun3x/time.c b/arch/m68k/sun3x/time.c index 047e2bcee3d7a5710b42e561b0c1a9cc03d3a97e..9163294b0fb62254e1c2f36fe6019cdb89d936b4 100644 --- a/arch/m68k/sun3x/time.c +++ b/arch/m68k/sun3x/time.c @@ -73,22 +73,21 @@ int sun3x_hwclk(int set, struct rtc_time *t) return 0; } -/* Not much we can do here */ -u32 sun3x_gettimeoffset(void) -{ - return 0L; -} #if 0 -static void sun3x_timer_tick(int irq, void *dev_id, struct pt_regs *regs) +static irqreturn_t sun3x_timer_tick(int irq, void *dev_id) { - void (*vector)(int, void *, struct pt_regs *) = dev_id; + irq_handler_t timer_routine = dev_id; + unsigned long flags; - /* Clear the pending interrupt - pulse the enable line low */ - disable_irq(5); - enable_irq(5); + local_irq_save(flags); + /* Clear the pending interrupt - pulse the enable line low */ + disable_irq(5); + enable_irq(5); + timer_routine(0, NULL); + local_irq_restore(flags); - vector(irq, NULL, regs); + return IRQ_HANDLED; } #endif diff --git a/arch/m68k/sun3x/time.h b/arch/m68k/sun3x/time.h index 496f406412adc1df3b94efe3d98cdf29ad5d8688..86ce78bb3c28d1018353940f45fa7ce65039b8a1 100644 --- a/arch/m68k/sun3x/time.h +++ b/arch/m68k/sun3x/time.h @@ -3,7 +3,6 @@ #define SUN3X_TIME_H extern int sun3x_hwclk(int set, struct rtc_time *t); -u32 sun3x_gettimeoffset(void); void sun3x_sched_init(irq_handler_t vector); struct mostek_dt { diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index a51b965b3b82359f5feb15aaef1529cdbaa0f32a..adb179f519f950ee79dc1b7a9b2268a8fa2abf97 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -41,6 +41,7 @@ config MICROBLAZE select TRACING_SUPPORT select VIRT_TO_BUS select CPU_NO_EFFICIENT_FFS + select MMU_GATHER_NO_RANGE if MMU # Endianness selection choice @@ -58,15 +59,9 @@ config CPU_LITTLE_ENDIAN endchoice -config RWSEM_GENERIC_SPINLOCK - def_bool y - config ZONE_DMA def_bool y -config RWSEM_XCHGADD_ALGORITHM - bool - config ARCH_HAS_ILOG2_U32 def_bool n diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index 1a8285c3f693990c8a8f3f7d5d37d240b17c4a80..17a8d0a62038be272999a7320347297cb214acc6 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -23,6 +23,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h 
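Selecting MMU_GATHER_NO_RANGE (as microblaze does above, and nios2 further down) is what allows an architecture to delete its private tlb_start_vma()/tlb_end_vma()/tlb_flush() macros: the generic mmu_gather code then falls back to flushing the whole mm. The asm-generic default behaves approximately like this sketch (simplified, so treat the details as an assumption rather than the exact header contents):

	/* approximate effect of MMU_GATHER_NO_RANGE */
	static inline void tlb_flush(struct mmu_gather *tlb)
	{
		if (tlb->end)			/* was anything unmapped? */
			flush_tlb_mm(tlb->mm);	/* no range info: flush it all */
	}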
generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += parport.h generic-y += percpu.h generic-y += preempt.h diff --git a/arch/microblaze/include/asm/syscall.h b/arch/microblaze/include/asm/syscall.h index 220decd605a4aded46a99b445e54bf27c4adc821..833d3a53dab30182b586dd364cd323d1db07835a 100644 --- a/arch/microblaze/include/asm/syscall.h +++ b/arch/microblaze/include/asm/syscall.h @@ -82,18 +82,22 @@ static inline void microblaze_set_syscall_arg(struct pt_regs *regs, static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { + unsigned int i = 0; + unsigned int n = 6; + while (n--) *args++ = microblaze_get_syscall_arg(regs, i++); } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) { + unsigned int i = 0; + unsigned int n = 6; + while (n--) microblaze_set_syscall_arg(regs, i++, *args++); } diff --git a/arch/microblaze/include/asm/tlb.h b/arch/microblaze/include/asm/tlb.h index 99b6ded54849e2327e7593592c14aa73adee1429..628a78ee0a720975f6413cba6d32a9e342812345 100644 --- a/arch/microblaze/include/asm/tlb.h +++ b/arch/microblaze/include/asm/tlb.h @@ -11,16 +11,7 @@ #ifndef _ASM_MICROBLAZE_TLB_H #define _ASM_MICROBLAZE_TLB_H -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include - -#ifdef CONFIG_MMU -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while (0) -#endif - #include #endif /* _ASM_MICROBLAZE_TLB_H */ diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 8ee3a8c18498eb591ab9d1fc2b2044d43afa1cd4..4964947732af3e37bd5d651aaad9a3f3ccd39056 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -429,3 +429,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 4a5f5b0ee9a9e7d9988321e452bf87a7e13be9ab..b9c48b27162dc111aa6fee50d057f2cc4bee368d 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1037,13 +1037,6 @@ source "arch/mips/paravirt/Kconfig" endmenu -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_HWEIGHT bool default y diff --git a/arch/mips/ath79/setup.c b/arch/mips/ath79/setup.c index 4a70c5de8c929bad778788db2f6c46b3cc2633a2..25a57895a3a359f6f7ded09785bf6e4cc15ea1db 100644 --- a/arch/mips/ath79/setup.c +++ b/arch/mips/ath79/setup.c @@ -210,12 +210,6 @@ const char *get_system_type(void) return ath79_sys_type; } -int get_c0_perfcount_int(void) -{ - return ATH79_MISC_IRQ(5); -} -EXPORT_SYMBOL_GPL(get_c0_perfcount_int); - unsigned int get_c0_compare_int(void) { return CP0_LEGACY_COMPARE_IRQ; diff --git a/arch/mips/configs/generic/board-ocelot.config b/arch/mips/configs/generic/board-ocelot.config index f607888d24838be1c7d1ecd0fe7c20736b7d2ed1..184eb65a6ba71a5bea1e6b39cbe5d389a9764416 100644 --- a/arch/mips/configs/generic/board-ocelot.config +++ b/arch/mips/configs/generic/board-ocelot.config @@ -1,6 +1,10 @@ # require CONFIG_CPU_MIPS32_R2=y 
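A second recurring theme starts here: retiring the per-architecture mmiowb() definitions. Architectures either pull in the asm-generic stub via Kbuild (as in the hunk above) or, as in the mips hunks below, fold the barrier into the spinlock release itself. The driver-side pattern mmiowb() existed to serve looks like this (a sketch; demo_lock and demo_base are illustrative):

	#include <linux/io.h>
	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(demo_lock);
	static void __iomem *demo_base;

	static void demo_post_command(u32 val)
	{
		spin_lock(&demo_lock);
		writel(val, demo_base + 0x10);
		mmiowb();	/* keep the MMIO write ordered before the unlock */
		spin_unlock(&demo_lock);
	}

	/*
	 * Once the barrier lives in spin_unlock() itself (see the mips
	 * queued_spin_unlock() hunk below), the explicit mmiowb() call
	 * becomes redundant in drivers.
	 */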
CONFIG_LEGACY_BOARD_OCELOT=y +CONFIG_FIT_IMAGE_FDT_OCELOT=y + +CONFIG_BRIDGE=y +CONFIG_GENERIC_PHY=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y @@ -19,6 +23,8 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_OF_PLATFORM=y CONFIG_NETDEVICES=y +CONFIG_NET_SWITCHDEV=y +CONFIG_NET_DSA=y CONFIG_MSCC_OCELOT_SWITCH=y CONFIG_MSCC_OCELOT_SWITCH_OCELOT=y CONFIG_MDIO_MSCC_MIIM=y @@ -35,6 +41,8 @@ CONFIG_SPI_DESIGNWARE=y CONFIG_SPI_DW_MMIO=y CONFIG_SPI_SPIDEV=y +CONFIG_PINCTRL_OCELOT=y + CONFIG_GPIO_SYSFS=y CONFIG_POWER_RESET=y diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h index 845fbbc7a2e34993db0b5fb4dc853ef9bee313f9..29997e42480e916b7075c46a7b74a62d7641fda4 100644 --- a/arch/mips/include/asm/io.h +++ b/arch/mips/include/asm/io.h @@ -102,9 +102,6 @@ static inline void set_io_port_base(unsigned long base) #define iobarrier_w() wmb() #define iobarrier_sync() iob() -/* Some callers use this older API instead. */ -#define mmiowb() iobarrier_w() - /* * virt_to_phys - map virtual addresses to physical * @address: address to remap diff --git a/arch/mips/include/asm/mmiowb.h b/arch/mips/include/asm/mmiowb.h new file mode 100644 index 0000000000000000000000000000000000000000..a40824e3ef8e342c80590d4b6a6b79f72a3570ee --- /dev/null +++ b/arch/mips/include/asm/mmiowb.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_MMIOWB_H +#define _ASM_MMIOWB_H + +#include + +#define mmiowb() iobarrier_w() + +#include + +#endif /* _ASM_MMIOWB_H */ diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h index ee81297d9117ef0b2e26e40619a2f4391e4ca80f..8a88eb2655160da1a5055af4249825046026ab4b 100644 --- a/arch/mips/include/asm/spinlock.h +++ b/arch/mips/include/asm/spinlock.h @@ -11,6 +11,21 @@ #include #include + +#include + +#define queued_spin_unlock queued_spin_unlock +/** + * queued_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + */ +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + /* This could be optimised with ARCH_HAS_MMIOWB */ + mmiowb(); + smp_store_release(&lock->locked, 0); +} + #include #endif /* _ASM_SPINLOCK_H */ diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h index 6cf8ffb5367ec3fb725aac26c701d0ae5d81923c..a2b4748655df4d1466d037b971855c8046c589e8 100644 --- a/arch/mips/include/asm/syscall.h +++ b/arch/mips/include/asm/syscall.h @@ -116,9 +116,10 @@ static inline void syscall_set_return_value(struct task_struct *task, static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { + unsigned int i = 0; + unsigned int n = 6; int ret; /* O32 ABI syscall() */ diff --git a/arch/mips/include/asm/tlb.h b/arch/mips/include/asm/tlb.h index b6823b9e94dad0c2f3b13ad867e1153d4f95c789..90f3ad76d9e0b03761ceec49eca71605065d1465 100644 --- a/arch/mips/include/asm/tlb.h +++ b/arch/mips/include/asm/tlb.h @@ -5,23 +5,6 @@ #include #include -/* - * MIPS doesn't need any special per-pte or per-vma handling, except - * we need to flush cache for area to be unmapped. - */ -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) - -/* - * .. because we flush the whole mm when it fills up. 
- */ -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #define _UNIQUE_ENTRYHI(base, idx) \ (((base) + ((idx) << (PAGE_SHIFT + 1))) | \ (cpu_has_tlbinv ? MIPS_ENTRYHI_EHINV : 0)) diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c index 6e574c02e4c3b81137618c97fe9bc176c5a40d52..ea781b29f7f17291d90391c87a8a250772d17a37 100644 --- a/arch/mips/kernel/kgdb.c +++ b/arch/mips/kernel/kgdb.c @@ -33,6 +33,7 @@ #include #include #include +#include static struct hard_trap_info { unsigned char tt; /* Trap type code for MIPS R3xxx and R4xxx */ @@ -214,7 +215,7 @@ void kgdb_call_nmi_hook(void *ignored) old_fs = get_fs(); set_fs(KERNEL_DS); - kgdb_nmicallback(raw_smp_processor_id(), NULL); + kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); set_fs(old_fs); } diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 0057c910bc2f34de0f518c43d2e234c845db0da1..3a62f80958e170527a93f4058d60f5372d5773ee 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -1419,7 +1419,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) sd.nr = syscall; sd.arch = syscall_get_arch(); - syscall_get_arguments(current, regs, 0, 6, args); + syscall_get_arguments(current, regs, args); for (i = 0; i < 6; i++) sd.args[i] = args[i]; sd.instruction_pointer = KSTK_EIP(current); diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index f158c5894a9a8760d3c1ec3430617bad976fac97..feb2653490dfe7a744b0eaa41d913297d9392ed5 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -125,7 +125,7 @@ trace_a_syscall: subu t1, v0, __NR_O32_Linux move a1, v0 bnez t1, 1f /* __NR_syscall at offset 0 */ - lw a1, PT_R4(sp) /* Arg1 for __NR_syscall case */ + ld a1, PT_R4(sp) /* Arg1 for __NR_syscall case */ .set pop 1: jal syscall_trace_enter diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 15f4117900ee8d8c9285b61dd332b3c1832558ea..9392dfe33f97ec48a74014d3bc49940dafdc944d 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -362,3 +362,7 @@ 421 n32 rt_sigtimedwait_time64 compat_sys_rt_sigtimedwait_time64 422 n32 futex_time64 sys_futex 423 n32 sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 n32 pidfd_send_signal sys_pidfd_send_signal +425 n32 io_uring_setup sys_io_uring_setup +426 n32 io_uring_enter sys_io_uring_enter +427 n32 io_uring_register sys_io_uring_register diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index c85502e67b44145420d6638489300aae4388cefe..cd0c8aa21fbacfb7563c39123f0880d2b753a7c2 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -338,3 +338,7 @@ 327 n64 rseq sys_rseq 328 n64 io_pgetevents sys_io_pgetevents # 329 through 423 are reserved to sync up with other architectures +424 n64 pidfd_send_signal sys_pidfd_send_signal +425 n64 io_uring_setup sys_io_uring_setup +426 n64 io_uring_enter sys_io_uring_enter +427 n64 io_uring_register sys_io_uring_register diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 2e063d0f837e78c3cb566d68374c33e7bcdd9a8e..e849e8ffe4a25b4516cdc748abfa96bb2c918ebe 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -411,3 +411,7 @@ 421 o32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 o32 futex_time64 sys_futex sys_futex 423 
o32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +424 o32 pidfd_send_signal sys_pidfd_send_signal +425 o32 io_uring_setup sys_io_uring_setup +426 o32 io_uring_enter sys_io_uring_enter +427 o32 io_uring_register sys_io_uring_register diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 0effd3cba9a731907920c6f47a1b52321ac7225f..98bf0c222b5fe84c2086a8707172392323829d57 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -186,8 +186,9 @@ enum which_ebpf_reg { * separate frame pointer, so BPF_REG_10 relative accesses are * adjusted to be $sp relative. */ -int ebpf_to_mips_reg(struct jit_ctx *ctx, const struct bpf_insn *insn, - enum which_ebpf_reg w) +static int ebpf_to_mips_reg(struct jit_ctx *ctx, + const struct bpf_insn *insn, + enum which_ebpf_reg w) { int ebpf_reg = (w == src_reg || w == src_reg_no_fp) ? insn->src_reg : insn->dst_reg; diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c index 710a59764b01c164d3ffae92f18a394224bdc153..a32f843cdbe02299e34bf7f0897ad61f6e23dce5 100644 --- a/arch/mips/sgi-ip27/ip27-irq.c +++ b/arch/mips/sgi-ip27/ip27-irq.c @@ -118,7 +118,6 @@ static void shutdown_bridge_irq(struct irq_data *d) { struct hub_irq_data *hd = irq_data_get_irq_chip_data(d); struct bridge_controller *bc; - int pin = hd->pin; if (!hd) return; @@ -126,7 +125,7 @@ static void shutdown_bridge_irq(struct irq_data *d) disable_hub_irq(d); bc = hd->bc; - bridge_clr(bc, b_int_enable, (1 << pin)); + bridge_clr(bc, b_int_enable, (1 << hd->pin)); bridge_read(bc, b_wid_tflush); } diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index addb7f5f52645c75e83025f075d2878efd186d02..55559ca0efe404ce78d7039f9fa09a72df7c0ad3 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -60,9 +60,6 @@ config GENERIC_LOCKBREAK def_bool y depends on PREEMPT -config RWSEM_GENERIC_SPINLOCK - def_bool y - config TRACE_IRQFLAGS_SUPPORT def_bool y diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild index 64ceff7ab99b790974fc8f7073c893a2d9f1691d..688b6ed262278e9536e9fc2a18f11a07f68583e0 100644 --- a/arch/nds32/include/asm/Kbuild +++ b/arch/nds32/include/asm/Kbuild @@ -31,6 +31,7 @@ generic-y += limits.h generic-y += local.h generic-y += local64.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += parport.h generic-y += pci.h generic-y += percpu.h diff --git a/arch/nds32/include/asm/io.h b/arch/nds32/include/asm/io.h index 71cd226d6863ee4a218e657dda216535f2e6fcf1..5ef8ae5ba83368af59be986582fb7e5f34a266f9 100644 --- a/arch/nds32/include/asm/io.h +++ b/arch/nds32/include/asm/io.h @@ -55,8 +55,6 @@ static inline u32 __raw_readl(const volatile void __iomem *addr) #define __iormb() rmb() #define __iowmb() wmb() -#define mmiowb() __asm__ __volatile__ ("msync all" : : : "memory"); - /* * {read,write}{b,w,l,q}_relaxed() are like the regular version, but * are not guaranteed to provide ordering against spinlocks or memory diff --git a/arch/nds32/include/asm/syscall.h b/arch/nds32/include/asm/syscall.h index f7e5e86765fe8efe51283d7c350a6a4ab8b73863..671ebd357496c4e1608b240d2ba23c4d3d9efa42 100644 --- a/arch/nds32/include/asm/syscall.h +++ b/arch/nds32/include/asm/syscall.h @@ -108,81 +108,41 @@ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, * syscall_get_arguments - extract system call parameter values * @task: task of interest, must be blocked * @regs: task_pt_regs() of @task - * @i: argument index [0,5] - * @n: number of arguments; n+i must be [1,6]. 
* @args: array filled with argument values * - * Fetches @n arguments to the system call starting with the @i'th argument - * (from 0 through 5). Argument @i is stored in @args[0], and so on. - * An arch inline version is probably optimal when @i and @n are constants. + * Fetches 6 arguments to the system call (from 0 through 5). The first + * argument is stored in @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. - * It's invalid to call this with @i + @n > 6; we only support system calls - * taking up to 6 arguments. */ #define SYSCALL_MAX_ARGS 6 void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) + unsigned long *args) { - if (n == 0) - return; - if (i + n > SYSCALL_MAX_ARGS) { - unsigned long *args_bad = args + SYSCALL_MAX_ARGS - i; - unsigned int n_bad = n + i - SYSCALL_MAX_ARGS; - pr_warning("%s called with max args %d, handling only %d\n", - __func__, i + n, SYSCALL_MAX_ARGS); - memset(args_bad, 0, n_bad * sizeof(args[0])); - memset(args_bad, 0, n_bad * sizeof(args[0])); - } - - if (i == 0) { - args[0] = regs->orig_r0; - args++; - i++; - n--; - } - - memcpy(args, ®s->uregs[0] + i, n * sizeof(args[0])); + args[0] = regs->orig_r0; + args++; + memcpy(args, ®s->uregs[0] + 1, 5 * sizeof(args[0])); } /** * syscall_set_arguments - change system call parameter value * @task: task of interest, must be in system call entry tracing * @regs: task_pt_regs() of @task - * @i: argument index [0,5] - * @n: number of arguments; n+i must be [1,6]. * @args: array of argument values to store * - * Changes @n arguments to the system call starting with the @i'th argument. - * Argument @i gets value @args[0], and so on. - * An arch inline version is probably optimal when @i and @n are constants. + * Changes 6 arguments to the system call. The first argument gets value + * @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. - * It's invalid to call this with @i + @n > 6; we only support system calls - * taking up to 6 arguments. 
*/ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) { - if (n == 0) - return; - - if (i + n > SYSCALL_MAX_ARGS) { - pr_warn("%s called with max args %d, handling only %d\n", - __func__, i + n, SYSCALL_MAX_ARGS); - n = SYSCALL_MAX_ARGS - i; - } - - if (i == 0) { - regs->orig_r0 = args[0]; - args++; - i++; - n--; - } + regs->orig_r0 = args[0]; + args++; - memcpy(®s->uregs[0] + i, args, n * sizeof(args[0])); + memcpy(®s->uregs[0] + 1, args, 5 * sizeof(args[0])); } #endif /* _ASM_NDS32_SYSCALL_H */ diff --git a/arch/nds32/include/asm/tlb.h b/arch/nds32/include/asm/tlb.h index b35ae5eae3ab3384cbfa7032e7a5345f351d1b36..d5ae571c8d303f4e87350e3a9644df4469ec4da2 100644 --- a/arch/nds32/include/asm/tlb.h +++ b/arch/nds32/include/asm/tlb.h @@ -4,22 +4,6 @@ #ifndef __ASMNDS32_TLB_H #define __ASMNDS32_TLB_H -#define tlb_start_vma(tlb,vma) \ - do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) - -#define tlb_end_vma(tlb,vma) \ - do { \ - if(!tlb->fullmm) \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) - -#define __tlb_remove_tlb_entry(tlb, pte, addr) do { } while (0) - -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include #define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte) diff --git a/arch/nds32/include/asm/tlbflush.h b/arch/nds32/include/asm/tlbflush.h index 9b411f401903630fff9f66498e09a6f1f1f7ce3e..38ee769b18d8ad344e09819dff0fb93c24af94bc 100644 --- a/arch/nds32/include/asm/tlbflush.h +++ b/arch/nds32/include/asm/tlbflush.h @@ -42,6 +42,5 @@ void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long addr); void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * pte); -void tlb_migrate_finish(struct mm_struct *mm); #endif diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 4ef15a61b7bc33ee199a84fb6c8ef36be2a9deac..ea37394ff3eab2ad16cb5d7f1f08fe9626431080 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -24,6 +24,7 @@ config NIOS2 select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS select ARCH_DISCARD_MEMBLOCK + select MMU_GATHER_NO_RANGE if MMU config GENERIC_CSUM def_bool y @@ -40,9 +41,6 @@ config NO_IOPORT_MAP config FPU def_bool n -config RWSEM_GENERIC_SPINLOCK - def_bool y - config TRACE_IRQFLAGS_SUPPORT def_bool n diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index 88a667d12aaa9cefafad5260f03e073fefeb1fed..d7ef3512504a6be516b7fe81c2a74d49f81e9cbe 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -27,6 +27,7 @@ generic-y += kvm_para.h generic-y += local.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += pci.h generic-y += percpu.h diff --git a/arch/nios2/include/asm/syscall.h b/arch/nios2/include/asm/syscall.h index 9de220854c4ad88f43ea579cbcf51c250cb6e688..d7624ed06efb6c9ea2e616c23cd20b030a53b1c8 100644 --- a/arch/nios2/include/asm/syscall.h +++ b/arch/nios2/include/asm/syscall.h @@ -58,81 +58,25 @@ static inline void syscall_set_return_value(struct task_struct *task, } static inline void syscall_get_arguments(struct task_struct *task, - struct pt_regs *regs, unsigned int i, unsigned int n, - unsigned long *args) + struct pt_regs *regs, unsigned long *args) { - BUG_ON(i + n > 6); - - switch (i) { - case 0: - if (!n--) - break; - *args++ = regs->r4; - case 1: - if (!n--) - break; - *args++ = regs->r5; - case 2: - if (!n--) - break; - 
*args++ = regs->r6; - case 3: - if (!n--) - break; - *args++ = regs->r7; - case 4: - if (!n--) - break; - *args++ = regs->r8; - case 5: - if (!n--) - break; - *args++ = regs->r9; - case 6: - if (!n--) - break; - default: - BUG(); - } + *args++ = regs->r4; + *args++ = regs->r5; + *args++ = regs->r6; + *args++ = regs->r7; + *args++ = regs->r8; + *args = regs->r9; } static inline void syscall_set_arguments(struct task_struct *task, - struct pt_regs *regs, unsigned int i, unsigned int n, - const unsigned long *args) + struct pt_regs *regs, const unsigned long *args) { - BUG_ON(i + n > 6); - - switch (i) { - case 0: - if (!n--) - break; - regs->r4 = *args++; - case 1: - if (!n--) - break; - regs->r5 = *args++; - case 2: - if (!n--) - break; - regs->r6 = *args++; - case 3: - if (!n--) - break; - regs->r7 = *args++; - case 4: - if (!n--) - break; - regs->r8 = *args++; - case 5: - if (!n--) - break; - regs->r9 = *args++; - case 6: - if (!n) - break; - default: - BUG(); - } + regs->r4 = *args++; + regs->r5 = *args++; + regs->r6 = *args++; + regs->r7 = *args++; + regs->r8 = *args++; + regs->r9 = *args; } #endif diff --git a/arch/nios2/include/asm/tlb.h b/arch/nios2/include/asm/tlb.h index d3bc648e08b5dad86e5e9c449e655fe291381c91..f9f2e27e32dd5e7768ba9a63faac98b18516f41d 100644 --- a/arch/nios2/include/asm/tlb.h +++ b/arch/nios2/include/asm/tlb.h @@ -11,22 +11,12 @@ #ifndef _ASM_NIOS2_TLB_H #define _ASM_NIOS2_TLB_H -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - extern void set_mmu_pid(unsigned long pid); /* - * NiosII doesn't need any special per-pte or per-vma handling, except - * we need to flush cache for the area to be unmapped. + * NIOS32 does have flush_tlb_range(), but it lacks a limit and fallback to + * full mm invalidation. So use flush_tlb_mm() for everything. 
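/*
 * Illustrative sketch, not from the patch: selecting MMU_GATHER_NO_RANGE, as
 * the nios2 and openrisc Kconfig hunks in this series do, makes the generic
 * mmu_gather code give up on range tracking entirely; its tlb_flush()
 * reduces to a whole-address-space flush, roughly:
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        if (tlb->end)                   /* was anything unmapped at all? */
                flush_tlb_mm(tlb->mm);  /* no range info, flush the whole mm */
}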
*/ -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) - -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) #include #include diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index a5e361fbb75a01400681b6238b21a7eea18b9607..7cfb20555b100508527be57172e99f0431fcb72f 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -36,6 +36,7 @@ config OPENRISC select OMPIC if SMP select ARCH_WANT_FRAME_POINTERS select GENERIC_IRQ_MULTI_HANDLER + select MMU_GATHER_NO_RANGE if MMU config CPU_BIG_ENDIAN def_bool y @@ -43,12 +44,6 @@ config CPU_BIG_ENDIAN config MMU def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - -config RWSEM_XCHGADD_ALGORITHM - def_bool n - config GENERIC_HWEIGHT def_bool y diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 22aa97136c0195ae2b687c0793c42e43f22888ec..1919cc5e0f11d4af523998bf2fac993a4e932547 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -24,6 +24,7 @@ generic-y += kvm_para.h generic-y += local.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += pci.h generic-y += percpu.h diff --git a/arch/openrisc/include/asm/syscall.h b/arch/openrisc/include/asm/syscall.h index 2db9f1cf0694c0f2c6bdaec77953f62fb4fe6372..b4ff07c1baed5d13c9abb0d025a1ece11ee78ad0 100644 --- a/arch/openrisc/include/asm/syscall.h +++ b/arch/openrisc/include/asm/syscall.h @@ -56,20 +56,16 @@ syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) + unsigned long *args) { - BUG_ON(i + n > 6); - - memcpy(args, ®s->gpr[3 + i], n * sizeof(args[0])); + memcpy(args, ®s->gpr[3], 6 * sizeof(args[0])); } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) + const unsigned long *args) { - BUG_ON(i + n > 6); - - memcpy(®s->gpr[3 + i], args, n * sizeof(args[0])); + memcpy(®s->gpr[3], args, 6 * sizeof(args[0])); } static inline int syscall_get_arch(void) diff --git a/arch/openrisc/include/asm/tlb.h b/arch/openrisc/include/asm/tlb.h index fa4376a4515d14a8921ae7c3e382c0ee6398952e..92d8a42098849dcffcc5ceb07dc8c15b00f992c9 100644 --- a/arch/openrisc/include/asm/tlb.h +++ b/arch/openrisc/include/asm/tlb.h @@ -20,14 +20,10 @@ #define __ASM_OPENRISC_TLB_H__ /* - * or32 doesn't need any special per-pte or - * per-vma handling.. + * OpenRISC doesn't have an efficient flush_tlb_range() so use flush_tlb_mm() + * for everything. 
*/ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) #include #include diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index c8e621296092d83751ecdec774fb46aa430c6c04..f1ed8ddfe48697b0fa02810fbcb22763d1bd1302 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -75,12 +75,6 @@ config GENERIC_LOCKBREAK default y depends on SMP && PREEMPT -config RWSEM_GENERIC_SPINLOCK - def_bool y - -config RWSEM_XCHGADD_ALGORITHM - bool - config ARCH_HAS_ILOG2_U32 bool default n diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 9bcd0c903dbbef2aee61ade11844c5091e4bfd0c..b8c7db777144ced1531e4f443038ad2eb2240d94 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += percpu.h generic-y += preempt.h generic-y += seccomp.h diff --git a/arch/parisc/include/asm/io.h b/arch/parisc/include/asm/io.h index 30a8315d5c0751fe448c39fc98be501e1d295271..93d37010b375c7441953cfc69ada30f1eb40979e 100644 --- a/arch/parisc/include/asm/io.h +++ b/arch/parisc/include/asm/io.h @@ -229,8 +229,6 @@ static inline void writeq(unsigned long long q, volatile void __iomem *addr) #define writel_relaxed(l, addr) writel(l, addr) #define writeq_relaxed(q, addr) writeq(q, addr) -#define mmiowb() do { } while (0) - void memset_io(volatile void __iomem *addr, unsigned char val, int count); void memcpy_fromio(void *dst, const volatile void __iomem *src, int count); void memcpy_toio(volatile void __iomem *dst, const void *src, int count); diff --git a/arch/parisc/include/asm/ptrace.h b/arch/parisc/include/asm/ptrace.h index 2a27b275ab092cc60b3d003250aaaf647aa9c916..9ff033d261ab381c9e356fea458d768170f9effc 100644 --- a/arch/parisc/include/asm/ptrace.h +++ b/arch/parisc/include/asm/ptrace.h @@ -22,13 +22,14 @@ unsigned long profile_pc(struct pt_regs *); static inline unsigned long regs_return_value(struct pt_regs *regs) { - return regs->gr[20]; + return regs->gr[28]; } static inline void instruction_pointer_set(struct pt_regs *regs, unsigned long val) { - regs->iaoq[0] = val; + regs->iaoq[0] = val; + regs->iaoq[1] = val + 4; } /* Query offset/name of register from its name/offset */ diff --git a/arch/parisc/include/asm/syscall.h b/arch/parisc/include/asm/syscall.h index 8bff1a58c97f1b107dabf79e172f5ecb56c5db2d..62a6d477fae0197cdba9d62044e31104f1b05192 100644 --- a/arch/parisc/include/asm/syscall.h +++ b/arch/parisc/include/asm/syscall.h @@ -18,29 +18,15 @@ static inline long syscall_get_nr(struct task_struct *tsk, } static inline void syscall_get_arguments(struct task_struct *tsk, - struct pt_regs *regs, unsigned int i, - unsigned int n, unsigned long *args) + struct pt_regs *regs, + unsigned long *args) { - BUG_ON(i); - - switch (n) { - case 6: - args[5] = regs->gr[21]; - case 5: - args[4] = regs->gr[22]; - case 4: - args[3] = regs->gr[23]; - case 3: - args[2] = regs->gr[24]; - case 2: - args[1] = regs->gr[25]; - case 1: - args[0] = regs->gr[26]; - case 0: - break; - default: - BUG(); - } + args[5] = regs->gr[21]; + args[4] = regs->gr[22]; + args[3] = regs->gr[23]; + args[2] = regs->gr[24]; + args[1] = regs->gr[25]; + args[0] = regs->gr[26]; } static inline long syscall_get_return_value(struct task_struct *task, diff --git a/arch/parisc/include/asm/tlb.h 
b/arch/parisc/include/asm/tlb.h index 0c881e74d8a62cd6a4e6082178299a118b58a5d9..8c0446b04c9e17f593cac1d4fa3671658766038d 100644 --- a/arch/parisc/include/asm/tlb.h +++ b/arch/parisc/include/asm/tlb.h @@ -2,24 +2,6 @@ #ifndef _PARISC_TLB_H #define _PARISC_TLB_H -#define tlb_flush(tlb) \ -do { if ((tlb)->fullmm) \ - flush_tlb_mm((tlb)->mm);\ -} while (0) - -#define tlb_start_vma(tlb, vma) \ -do { if (!(tlb)->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define tlb_end_vma(tlb, vma) \ -do { if (!(tlb)->fullmm) \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define __tlb_remove_tlb_entry(tlb, pte, address) \ - do { } while (0) - #include #define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd) diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index eb39e7e380d7e27b24f6bae39ae0e6c3583511e3..841db71958cdb50dff183dd058a9b09a5ec81421 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -210,12 +210,6 @@ void __cpuidle arch_cpu_idle(void) static int __init parisc_idle_init(void) { - const char *marker; - - /* check QEMU/SeaBIOS marker in PAGE0 */ - marker = (char *) &PAGE0->pad0; - running_on_qemu = (memcmp(marker, "SeaBIOS", 8) == 0); - if (!running_on_qemu) cpu_idle_poll_ctrl(1); diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index 15dd9e21be7eac6d1fcf37d67f72de9b227bfa75..d908058d05c10bf4880e361070c30a42b668fd7e 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -397,6 +397,9 @@ void __init start_parisc(void) int ret, cpunum; struct pdc_coproc_cfg coproc_cfg; + /* check QEMU/SeaBIOS marker in PAGE0 */ + running_on_qemu = (memcmp(&PAGE0->pad0, "SeaBIOS", 8) == 0); + cpunum = smp_processor_id(); init_cpu_topology(); diff --git a/arch/parisc/kernel/stacktrace.c b/arch/parisc/kernel/stacktrace.c index ec5835e83a7a756c9fecda603f57f42bec6ed870..6f0b9c8d80523682f85ab3f927ecbe3f9e96f188 100644 --- a/arch/parisc/kernel/stacktrace.c +++ b/arch/parisc/kernel/stacktrace.c @@ -29,22 +29,17 @@ static void dump_trace(struct task_struct *task, struct stack_trace *trace) } } - /* * Save stack-backtrace addresses into a stack_trace buffer. 
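/*
 * Context sketch, not part of the patch: the stacktrace core now bounds a
 * trace purely by trace->nr_entries, which is why the ULONG_MAX end-of-trace
 * sentinels are deleted below (and again in the riscv hunk later on). A
 * consumer iterates like this (hypothetical helper):
 */
#include <linux/stacktrace.h>
#include <linux/printk.h>

static void print_trace(const struct stack_trace *trace)
{
        unsigned int i;

        for (i = 0; i < trace->nr_entries; i++) /* no sentinel to scan for */
                pr_info("  %pS\n", (void *)trace->entries[i]);
}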
*/ void save_stack_trace(struct stack_trace *trace) { dump_trace(current, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { dump_trace(tsk, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index b26766c6647dc7a40fd3235460902112c20cd3d4..fe8ca623add89a627710b697f7886fc879589ac2 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -420,3 +420,7 @@ 421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 sys_futex sys_futex 423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2d0be82c30619bd405949b64c2847603caf5a67d..fa7219ffeadc96b0d11d625b87cd0b7f3dc5b1d4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -103,13 +103,6 @@ config LOCKDEP_SUPPORT bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config GENERIC_LOCKBREAK bool default y @@ -132,6 +125,7 @@ config PPC select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV + select ARCH_HAS_MMIOWB if PPC64 select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 select ARCH_HAS_PTE_SPECIAL @@ -218,6 +212,8 @@ config PPC select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP + select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE + select HAVE_MMU_GATHER_PAGE_SIZE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN select HAVE_SYSCALL_TRACEPOINTS @@ -318,6 +314,10 @@ config ARCH_SUSPEND_POSSIBLE (PPC_85xx && !PPC_E500MC) || PPC_86xx || PPC_PSERIES \ || 44x || 40x +config ARCH_SUSPEND_NONZERO_CPU + def_bool y + depends on PPC_POWERNV || PPC_PSERIES + config PPC_DCR_NATIVE bool diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 5ba131c30f6bcded4e65ccc40bb8aa2595e44ff1..1bcd468ab422dc100b120607b03d5d587850b453 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -266,6 +266,7 @@ CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_PROC_KCORE=y +CONFIG_HUGETLBFS=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_NLS=y diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index a0c132bedfae86965c2f7c850098b65420c2c5fc..36bda391e549f87dc477a9c0997e93b855e556b9 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -8,6 +8,5 @@ generic-y += irq_regs.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += preempt.h -generic-y += rwsem.h generic-y += vtime.h generic-y += msi.h diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 4b73847e9b955872248aa0b659fff2f4abfaf897..1fad67b4640926ad6660ffce74d84d0b050d3f0e 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ 
-34,14 +34,11 @@ extern struct pci_dev *isa_bridge_pcidev; #include #include #include +#include #include #include #include -#ifdef CONFIG_PPC64 -#include -#endif - #define SIO_CONFIG_RA 0x398 #define SIO_CONFIG_RD 0x399 @@ -107,12 +104,6 @@ extern bool isa_io_special; * */ -#ifdef CONFIG_PPC64 -#define IO_SET_SYNC_FLAG() do { local_paca->io_sync = 1; } while(0) -#else -#define IO_SET_SYNC_FLAG() -#endif - #define DEF_MMIO_IN_X(name, size, insn) \ static inline u##size name(const volatile u##size __iomem *addr) \ { \ @@ -127,7 +118,7 @@ static inline void name(volatile u##size __iomem *addr, u##size val) \ { \ __asm__ __volatile__("sync;"#insn" %1,%y0" \ : "=Z" (*addr) : "r" (val) : "memory"); \ - IO_SET_SYNC_FLAG(); \ + mmiowb_set_pending(); \ } #define DEF_MMIO_IN_D(name, size, insn) \ @@ -144,7 +135,7 @@ static inline void name(volatile u##size __iomem *addr, u##size val) \ { \ __asm__ __volatile__("sync;"#insn"%U0%X0 %1,%0" \ : "=m" (*addr) : "r" (val) : "memory"); \ - IO_SET_SYNC_FLAG(); \ + mmiowb_set_pending(); \ } DEF_MMIO_IN_D(in_8, 8, lbz); @@ -652,24 +643,6 @@ static inline void name at \ #include -#ifdef CONFIG_PPC32 -#define mmiowb() -#else -/* - * Enforce synchronisation of stores vs. spin_unlock - * (this does it explicitly, though our implementation of spin_unlock - * does it implicitely too) - */ -static inline void mmiowb(void) -{ - unsigned long tmp; - - __asm__ __volatile__("sync; li %0,0; stb %0,%1(13)" - : "=&r" (tmp) : "i" (offsetof(struct paca_struct, io_sync)) - : "memory"); -} -#endif /* !CONFIG_PPC32 */ - static inline void iosync(void) { __asm__ __volatile__ ("sync" : : : "memory"); diff --git a/arch/powerpc/include/asm/mmiowb.h b/arch/powerpc/include/asm/mmiowb.h new file mode 100644 index 0000000000000000000000000000000000000000..74a00127eb20416e95ec4761c088ec83bab7ae39 --- /dev/null +++ b/arch/powerpc/include/asm/mmiowb.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_MMIOWB_H +#define _ASM_POWERPC_MMIOWB_H + +#ifdef CONFIG_MMIOWB + +#include +#include +#include + +#define arch_mmiowb_state() (&local_paca->mmiowb_state) +#define mmiowb() mb() + +#endif /* CONFIG_MMIOWB */ + +#include + +#endif /* _ASM_POWERPC_MMIOWB_H */ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 598cdcdd13553dea4a80a9b72196dbee8987cd61..8ddd4a91bdc1e2fe9a2e4a617b32cc0e6e15e572 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -352,7 +352,7 @@ static inline bool strict_kernel_rwx_enabled(void) #if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \ defined (CONFIG_PPC_64K_PAGES) #define MAX_PHYSMEM_BITS 51 -#elif defined(CONFIG_SPARSEMEM) +#elif defined(CONFIG_PPC64) #define MAX_PHYSMEM_BITS 46 #endif diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index e843bc5d1a0f25e69be674bb1f634da6dbb46ffd..134e912d403fb417b67e088516aecbb8695139e3 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -34,6 +34,8 @@ #include #include +#include + register struct paca_struct *local_paca asm("r13"); #if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_SMP) @@ -171,7 +173,6 @@ struct paca_struct { u16 trap_save; /* Used when bad stack is encountered */ u8 irq_soft_mask; /* mask for irq soft masking */ u8 irq_happened; /* irq happened while soft-disabled */ - u8 io_sync; /* writel() needs spin_unlock sync */ u8 irq_work_pending; /* IRQ_WORK interrupt while soft-disable */ u8 nap_state_lost; /* NV GPR values lost in 
power7_idle */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -264,6 +265,9 @@ struct paca_struct { #ifdef CONFIG_STACKPROTECTOR unsigned long canary; #endif +#ifdef CONFIG_MMIOWB + struct mmiowb_state mmiowb_state; +#endif } ____cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 685c72310f5d0e5506d5973d6250449376b7ef46..15b39c407c4e99a46455b862eb551dcd502fb4fc 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -39,19 +39,6 @@ #define LOCK_TOKEN 1 #endif -#if defined(CONFIG_PPC64) && defined(CONFIG_SMP) -#define CLEAR_IO_SYNC (get_paca()->io_sync = 0) -#define SYNC_IO do { \ - if (unlikely(get_paca()->io_sync)) { \ - mb(); \ - get_paca()->io_sync = 0; \ - } \ - } while (0) -#else -#define CLEAR_IO_SYNC -#define SYNC_IO -#endif - #ifdef CONFIG_PPC_PSERIES #define vcpu_is_preempted vcpu_is_preempted static inline bool vcpu_is_preempted(int cpu) @@ -99,7 +86,6 @@ static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock) static inline int arch_spin_trylock(arch_spinlock_t *lock) { - CLEAR_IO_SYNC; return __arch_spin_trylock(lock) == 0; } @@ -130,7 +116,6 @@ extern void __rw_yield(arch_rwlock_t *lock); static inline void arch_spin_lock(arch_spinlock_t *lock) { - CLEAR_IO_SYNC; while (1) { if (likely(__arch_spin_trylock(lock) == 0)) break; @@ -148,7 +133,6 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { unsigned long flags_dis; - CLEAR_IO_SYNC; while (1) { if (likely(__arch_spin_trylock(lock) == 0)) break; @@ -167,7 +151,6 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) static inline void arch_spin_unlock(arch_spinlock_t *lock) { - SYNC_IO; __asm__ __volatile__("# arch_spin_unlock\n\t" PPC_RELEASE_BARRIER: : :"memory"); lock->slock = 0; diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index 1a0e7a8b1c811cf5d089c5ac68eb96d189ad702d..1243045bad2d633d4bd2df3d3e086bd2988987d0 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -65,22 +65,20 @@ static inline void syscall_set_return_value(struct task_struct *task, static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { unsigned long val, mask = -1UL; - - BUG_ON(i + n > 6); + unsigned int n = 6; #ifdef CONFIG_COMPAT if (test_tsk_thread_flag(task, TIF_32BIT)) mask = 0xffffffff; #endif while (n--) { - if (n == 0 && i == 0) + if (n == 0) val = regs->orig_gpr3; else - val = regs->gpr[3 + i + n]; + val = regs->gpr[3 + n]; args[n] = val & mask; } @@ -88,15 +86,12 @@ static inline void syscall_get_arguments(struct task_struct *task, static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) { - BUG_ON(i + n > 6); - memcpy(®s->gpr[3 + i], args, n * sizeof(args[0])); + memcpy(®s->gpr[3], args, 6 * sizeof(args[0])); /* Also copy the first argument into orig_gpr3 */ - if (i == 0 && n > 0) - regs->orig_gpr3 = args[0]; + regs->orig_gpr3 = args[0]; } static inline int syscall_get_arch(void) diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index e24c67d5ba75a2a18bc68d6cdfdb7399b07ba460..34fba1ce27f7c6f1ea34afb82f1cc7fcef750edd 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -27,8 +27,8 @@ #define tlb_start_vma(tlb, vma) do { } while (0) 
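/*
 * Simplified sketch of the generic mmiowb scheme that the powerpc and riscv
 * hunks in this series adopt; names follow the new
 * include/asm-generic/mmiowb.h, details are trimmed. Each MMIO write marks
 * per-CPU state as pending, and the spinlock core flushes it on unlock, so
 * the ordering that drivers used to get from hand-placed mmiowb() calls is
 * now automatic:
 */
#include <linux/types.h>

struct mmiowb_state {
        u16 nesting_count;      /* current spinlock nesting depth */
        u16 mmiowb_pending;     /* nonzero if an MMIO write happened locked */
};

static inline void mmiowb_set_pending(void)     /* called by writel() etc. */
{
        struct mmiowb_state *ms = __mmiowb_state();

        ms->mmiowb_pending = ms->nesting_count; /* only "sticks" under a lock */
}

static inline void mmiowb_spin_lock(void)       /* from the spinlock core */
{
        __mmiowb_state()->nesting_count++;
}

static inline void mmiowb_spin_unlock(void)
{
        struct mmiowb_state *ms = __mmiowb_state();

        if (unlikely(ms->mmiowb_pending)) {
                ms->mmiowb_pending = 0;
                mmiowb();       /* "sync" on ppc64, "fence o,w" on riscv */
        }
        ms->nesting_count--;
}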
#define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +#define tlb_flush tlb_flush extern void tlb_flush(struct mmu_gather *tlb); /* Get the generic bits... */ @@ -46,22 +46,6 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, #endif } -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ - if (!tlb->page_size) - tlb->page_size = page_size; - else if (tlb->page_size != page_size) { - if (!tlb->fullmm) - tlb_flush_mmu(tlb); - /* - * update the page size after flush for the new - * mmu_gather. - */ - tlb->page_size = page_size; - } -} - #ifdef CONFIG_SMP static inline int mm_is_core_local(struct mm_struct *mm) { diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index a5b8fbae56a03b491f0982562f3d590cff16ca5f..9481a117e24255173231ac687c9e99b730bff420 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -656,11 +656,17 @@ EXC_COMMON_BEGIN(data_access_slb_common) ld r4,PACA_EXSLB+EX_DAR(r13) std r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD +BEGIN_MMU_FTR_SECTION + /* HPT case, do SLB fault */ bl do_slb_fault cmpdi r3,0 bne- 1f b fast_exception_return 1: /* Error case */ +MMU_FTR_SECTION_ELSE + /* Radix case, access is outside page table range */ + li r3,-EFAULT +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) std r3,RESULT(r1) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) @@ -705,11 +711,17 @@ EXC_COMMON_BEGIN(instruction_access_slb_common) EXCEPTION_PROLOG_COMMON(0x480, PACA_EXSLB) ld r4,_NIP(r1) addi r3,r1,STACK_FRAME_OVERHEAD +BEGIN_MMU_FTR_SECTION + /* HPT case, do SLB fault */ bl do_slb_fault cmpdi r3,0 bne- 1f b fast_exception_return 1: /* Error case */ +MMU_FTR_SECTION_ELSE + /* Radix case, access is outside page table range */ + li r3,-EFAULT +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) std r3,RESULT(r1) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 48051c8977c5603a1ac9f8b730c0283ab04497d8..e25b615e9f9e642d34e9387aac7db652131a466f 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -851,10 +851,6 @@ __secondary_start: tophys(r4,r2) addi r4,r4,THREAD /* phys address of our thread_struct */ mtspr SPRN_SPRG_THREAD,r4 -#ifdef CONFIG_PPC_RTAS - li r3,0 - stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ -#endif lis r4, (swapper_pg_dir - PAGE_OFFSET)@h ori r4, r4, (swapper_pg_dir - PAGE_OFFSET)@l mtspr SPRN_SPRG_PGDIR, r4 @@ -941,10 +937,6 @@ start_here: tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ mtspr SPRN_SPRG_THREAD,r4 -#ifdef CONFIG_PPC_RTAS - li r3,0 - stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ -#endif lis r4, (swapper_pg_dir - PAGE_OFFSET)@h ori r4, r4, (swapper_pg_dir - PAGE_OFFSET)@l mtspr SPRN_SPRG_PGDIR, r4 diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 683b5b3805bd17493d97c261afc19279ac76b69f..cd381e2291dfeb38a569fed214778838cef42a2e 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -712,6 +713,12 @@ static void kvm_use_magic_page(void) static __init void kvm_free_tmp(void) { + /* + * Inform kmemleak about the hole in the .bss section since the + * corresponding pages will be unmapped with DEBUG_PAGEALLOC=y. 
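/*
 * Aside, not from the patch: general shape of the kmemleak annotation used in
 * this kvm_free_tmp() hunk. When part of an object that kmemleak scans is
 * freed or unmapped, carve it out of the scanner's view first, or the scan
 * may fault under DEBUG_PAGEALLOC. Hypothetical helper:
 */
#include <linux/kmemleak.h>
#include <linux/mm.h>

static void release_scanned_tail(void *start, void *end)
{
        kmemleak_free_part(start, end - start);         /* unregister first */
        free_reserved_area(start, end, -1, NULL);       /* then free pages */
}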
+ */ + kmemleak_free_part(&kvm_tmp[kvm_tmp_index], + ARRAY_SIZE(kvm_tmp) - kvm_tmp_index); free_reserved_area(&kvm_tmp[kvm_tmp_index], &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL); } diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index b33bafb8fcea1f7a964ad99e203ee0a2cf3103cb..70568ccbd9fd5eae17014473aa415d9b472b7d86 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -57,7 +57,7 @@ void setup_barrier_nospec(void) enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR); - if (!no_nospec) + if (!no_nospec && !cpu_mitigations_off()) enable_barrier_nospec(enable); } @@ -116,7 +116,7 @@ static int __init handle_nospectre_v2(char *p) early_param("nospectre_v2", handle_nospectre_v2); void setup_spectre_v2(void) { - if (no_spectrev2) + if (no_spectrev2 || cpu_mitigations_off()) do_btb_flush_fixups(); else btb_flush_enabled = true; @@ -300,7 +300,7 @@ void setup_stf_barrier(void) stf_enabled_flush_types = type; - if (!no_stf_barrier) + if (!no_stf_barrier && !cpu_mitigations_off()) stf_barrier_enable(enable); } diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index ba404dd9ce1d88809e0a6e70f0decc286caf576a..4f49e1a3594c2d3423ae232152cda2b53e730483 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -932,7 +932,7 @@ void setup_rfi_flush(enum l1d_flush_type types, bool enable) enabled_flush_types = types; - if (!no_rfi_flush) + if (!no_rfi_flush && !cpu_mitigations_off()) rfi_flush_enable(enable); } diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index b18abb0c3dae6248cfd697b1b9ee2343c39e1ba3..00f5a63c8d9a65aefd60df95b75d9cfae1fe8493 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -505,3 +505,7 @@ 421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 sys_futex sys_futex 423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 1e0bc5955a400601b106949f14c7a0ca64d1a6a6..afd516b572f8637447315ec882c08189bcf2fb4d 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -98,7 +98,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) * can be used, r7 contains NSEC_PER_SEC. */ - lwz r5,WTOM_CLOCK_SEC(r9) + lwz r5,(WTOM_CLOCK_SEC+LOPART)(r9) lwz r6,WTOM_CLOCK_NSEC(r9) /* We now have our offset in r5,r6. 
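/*
 * Illustrative sketch, not from the patch: the vdso32 one-liner above adds
 * LOPART because wtom_clock_sec is a 64-bit field on ppc64, and on a
 * big-endian 64-bit kernel the 32-bit vDSO must load its low-order word,
 * which sits 4 bytes in. Hypothetical stand-alone equivalent of the lwz:
 */
#include <stdint.h>
#include <string.h>

#define LOPART 4        /* big endian: low word of a u64 is the second word */

struct vdso_data_sketch { uint64_t wtom_clock_sec; };   /* hypothetical */

static uint32_t wtom_sec_low(const struct vdso_data_sketch *vd)
{
        uint32_t lo;

        memcpy(&lo, (const char *)&vd->wtom_clock_sec + LOPART, sizeof(lo));
        return lo;
}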
We create a fake dependency diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index f02b049737109c670b1af440f9f5704bbdf0afc0..f100e331e69b6ad37f5f6323219ada40fe8c8641 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -543,14 +543,14 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; + idx = srcu_read_lock(&vcpu->kvm->srcu); + ret = kvmppc_tce_validate(stt, tce); if (ret != H_SUCCESS) - return ret; + goto unlock_exit; dir = iommu_tce_direction(tce); - idx = srcu_read_lock(&vcpu->kvm->srcu); - if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { ret = H_PARAMETER; goto unlock_exit; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 06964350b97a94118d065d90a257c882b5280136..b2b29d4f9842877db15addda76a19eb061f7c858 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3423,7 +3423,9 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2); vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3); - mtspr(SPRN_PSSCR, host_psscr); + /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */ + mtspr(SPRN_PSSCR, host_psscr | + (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); mtspr(SPRN_HFSCR, host_hfscr); mtspr(SPRN_CIABR, host_ciabr); mtspr(SPRN_DAWR, host_dawr); diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index e7a9c4f6bfca49585beffcb6fc3dc755eb054e8f..8330f135294f48ecfff9bb5d3555f6fa3e3514c3 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -95,28 +95,15 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, unsigned long entries, unsigned long dev_hpa, struct mm_iommu_table_group_mem_t **pmem) { - struct mm_iommu_table_group_mem_t *mem; - long i, ret, locked_entries = 0; + struct mm_iommu_table_group_mem_t *mem, *mem2; + long i, ret, locked_entries = 0, pinned = 0; unsigned int pageshift; - - mutex_lock(&mem_list_mutex); - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, - next) { - /* Overlap? 
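/*
 * Aside, not from the patch: the list walk in the mm_iommu_do_alloc()
 * rewrite below keeps the standard half-open interval test: ranges [s1, e1)
 * and [s2, e2) overlap iff s1 < e2 && s2 < e1. As a stand-alone helper
 * (hypothetical name):
 */
#include <linux/types.h>

static inline bool ranges_overlap(unsigned long s1, unsigned long e1,
                                  unsigned long s2, unsigned long e2)
{
        return s1 < e2 && s2 < e1;
}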
*/ - if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && - (ua < (mem->ua + - (mem->entries << PAGE_SHIFT)))) { - ret = -EINVAL; - goto unlock_exit; - } - - } + unsigned long entry, chunk; if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { ret = mm_iommu_adjust_locked_vm(mm, entries, true); if (ret) - goto unlock_exit; + return ret; locked_entries = entries; } @@ -148,17 +135,27 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); + chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) / + sizeof(struct vm_area_struct *); + chunk = min(chunk, entries); + for (entry = 0; entry < entries; entry += chunk) { + unsigned long n = min(entries - entry, chunk); + + ret = get_user_pages_longterm(ua + (entry << PAGE_SHIFT), n, + FOLL_WRITE, mem->hpages + entry, NULL); + if (ret == n) { + pinned += n; + continue; + } + if (ret > 0) + pinned += ret; + break; + } up_read(&mm->mmap_sem); - if (ret != entries) { - /* free the reference taken */ - for (i = 0; i < ret; i++) - put_page(mem->hpages[i]); - - vfree(mem->hpas); - kfree(mem); - ret = -EFAULT; - goto unlock_exit; + if (pinned != entries) { + if (!ret) + ret = -EFAULT; + goto free_exit; } pageshift = PAGE_SHIFT; @@ -183,21 +180,43 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } good_exit: - ret = 0; atomic64_set(&mem->mapped, 1); mem->used = 1; mem->ua = ua; mem->entries = entries; - *pmem = mem; - list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); + mutex_lock(&mem_list_mutex); -unlock_exit: - if (locked_entries && ret) - mm_iommu_adjust_locked_vm(mm, locked_entries, false); + list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next) { + /* Overlap? */ + if ((mem2->ua < (ua + (entries << PAGE_SHIFT))) && + (ua < (mem2->ua + + (mem2->entries << PAGE_SHIFT)))) { + ret = -EINVAL; + mutex_unlock(&mem_list_mutex); + goto free_exit; + } + } + + list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); mutex_unlock(&mem_list_mutex); + *pmem = mem; + + return 0; + +free_exit: + /* free the reference taken */ + for (i = 0; i < pinned; i++) + put_page(mem->hpages[i]); + + vfree(mem->hpas); + kfree(mem); + +unlock_exit: + mm_iommu_adjust_locked_vm(mm, locked_entries, false); + return ret; } @@ -266,7 +285,7 @@ static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) { long ret = 0; - unsigned long entries, dev_hpa; + unsigned long unlock_entries = 0; mutex_lock(&mem_list_mutex); @@ -287,17 +306,17 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) goto unlock_exit; } + if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) + unlock_entries = mem->entries; + /* @mapped became 0 so now mappings are disabled, release the region */ - entries = mem->entries; - dev_hpa = mem->dev_hpa; mm_iommu_release(mem); - if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) - mm_iommu_adjust_locked_vm(mm, entries, false); - unlock_exit: mutex_unlock(&mem_list_mutex); + mm_iommu_adjust_locked_vm(mm, unlock_entries, false); + return ret; } EXPORT_SYMBOL_GPL(mm_iommu_put); diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index f29d2f118b444aa6b060bcfa6fab6fb0bf321949..5d9c3ff728c9a96cfffd232466096473e091be65 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -98,10 +98,20 @@ static int find_free_bat(void) return -1; } +/* + * This function calculates the size of 
the larger block usable to map the + * beginning of an area based on the start address and size of that area: + * - max block size is 8M on 601 and 256 on other 6xx. + * - base address must be aligned to the block size. So the maximum block size + * is identified by the lowest bit set to 1 in the base address (for instance + * if base is 0x16000000, max size is 0x02000000). + * - block size has to be a power of two. This is calculated by finding the + * highest bit set to 1. + */ static unsigned int block_size(unsigned long base, unsigned long top) { unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20; - unsigned int base_shift = (fls(base) - 1) & 31; + unsigned int base_shift = (ffs(base) - 1) & 31; unsigned int block_shift = (fls(top - base) - 1) & 31; return min3(max_size, 1U << base_shift, 1U << block_shift); @@ -157,7 +167,7 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { - int done; + unsigned long done; unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; if (__map_without_bats) { @@ -169,10 +179,10 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) return __mmu_mapin_ram(base, top); done = __mmu_mapin_ram(base, border); - if (done != border - base) + if (done != border) return done; - return done + __mmu_mapin_ram(border, top); + return __mmu_mapin_ram(border, top); } void mmu_mark_initmem_nx(void) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 842b2c7e156aba4cb2a04d8897fb7aa6128c3b4d..50cd09b4e05d51a9d9c46722065f9ebf5e55295c 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -324,7 +324,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config PPC_RADIX_MMU bool "Radix MMU Support" - depends on PPC_BOOK3S_64 + depends on PPC_BOOK3S_64 && HUGETLB_PAGE select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA default y help diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index a0f44f9923608929d352596e43af8311698c3467..13c6a47e61505632a7e2fab533a03a912e5d08a2 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2429,7 +2429,10 @@ static void dump_one_paca(int cpu) DUMP(p, trap_save, "%#-*x"); DUMP(p, irq_soft_mask, "%#-*x"); DUMP(p, irq_happened, "%#-*x"); - DUMP(p, io_sync, "%#-*x"); +#ifdef CONFIG_MMIOWB + DUMP(p, mmiowb_state.nesting_count, "%#-*x"); + DUMP(p, mmiowb_state.mmiowb_pending, "%#-*x"); +#endif DUMP(p, irq_work_pending, "%#-*x"); DUMP(p, nap_state_lost, "%#-*x"); DUMP(p, sprg_vdso, "%#-*llx"); diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index eb56c82d8aa14caf0e6f5507102dd22fb208b9ac..e66745decea1734388b48f3395404e523a9a7888 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -48,6 +48,7 @@ config RISCV select RISCV_TIMER select GENERIC_IRQ_MULTI_HANDLER select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_MMIOWB select HAVE_EBPF_JIT if 64BIT config MMU @@ -69,9 +70,6 @@ config STACKTRACE_SUPPORT config TRACE_IRQFLAGS_SUPPORT def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - config GENERIC_BUG def_bool y depends on BUG diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig new file mode 100644 index 0000000000000000000000000000000000000000..1a911ed8e772890f862655b55f1131a77f45d51c --- /dev/null +++ b/arch/riscv/configs/rv32_defconfig @@ -0,0 +1,84 @@ +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_IKCONFIG=y 
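/*
 * Stand-alone demo, not part of the patch, of why the block_size() hunk
 * above swaps fls() for ffs(): a BAT block must be naturally aligned, so the
 * usable size is bounded by the lowest set bit of the base address, not the
 * highest. fls_bit() below is a hypothetical stand-in for the kernel's fls().
 */
#include <stdio.h>
#include <strings.h>    /* ffs() */

static unsigned int fls_bit(unsigned int x)     /* highest set bit, 1-based */
{
        return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
        unsigned int base = 0x16000000; /* example from the comment above */
        unsigned int ok  = 1u << ((ffs(base) - 1) & 31);
        unsigned int bug = 1u << ((fls_bit(base) - 1) & 31);

        /* prints 0x2000000 (32M, correctly aligned) vs 0x10000000 (256M) */
        printf("ffs() block %#x, old fls() block %#x\n", ok, bug);
        return 0;
}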
+CONFIG_IKCONFIG_PROC=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_BPF=y +CONFIG_NAMESPACES=y +CONFIG_USER_NS=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_EXPERT=y +CONFIG_BPF_SYSCALL=y +CONFIG_ARCH_RV32I=y +CONFIG_SMP=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_NETLINK_DIAG=y +CONFIG_PCI=y +CONFIG_PCIEPORTBUS=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_PCIE_XILINX=y +CONFIG_DEVTMPFS=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_AHCI_PLATFORM=y +CONFIG_NETDEVICES=y +CONFIG_VIRTIO_NET=y +CONFIG_MACB=y +CONFIG_E1000E=y +CONFIG_R8169=y +CONFIG_MICROSEMI_PHY=y +CONFIG_INPUT_MOUSEDEV=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_SERIAL_EARLYCON_RISCV_SBI=y +CONFIG_HVC_RISCV_SBI=y +# CONFIG_PTP_1588_CLOCK is not set +CONFIG_DRM=y +CONFIG_DRM_RADEON=y +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_USB=y +CONFIG_USB_XHCI_HCD=y +CONFIG_USB_XHCI_PLATFORM=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_HCD_PLATFORM=y +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_OHCI_HCD_PLATFORM=y +CONFIG_USB_STORAGE=y +CONFIG_USB_UAS=y +CONFIG_VIRTIO_MMIO=y +CONFIG_SIFIVE_PLIC=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_AUTOFS4_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_NFS_FS=y +CONFIG_NFS_V4=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_ROOT_NFS=y +CONFIG_CRYPTO_USER_API_HASH=y +CONFIG_CRYPTO_DEV_VIRTIO=y +CONFIG_PRINTK_TIME=y +# CONFIG_RCU_TRACE is not set diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index 57afe604b495bef44894b5088517c103376684d4..c207f6634b91c4ecc8f60b759c82056dd5624ed4 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -26,7 +26,7 @@ enum fixed_addresses { }; #define FIXADDR_SIZE (__end_of_fixed_addresses * PAGE_SIZE) -#define FIXADDR_TOP (PAGE_OFFSET) +#define FIXADDR_TOP (VMALLOC_START) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) #define FIXMAP_PAGE_IO PAGE_KERNEL diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index 1d9c1376dc642fb3fffb2ef764a6e88d9e8bc914..744fd92e77bc726f7a67aa7d68cb1beba1235f53 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -20,6 +20,7 @@ #define _ASM_RISCV_IO_H #include +#include extern void __iomem *ioremap(phys_addr_t offset, unsigned long size); @@ -99,18 +100,6 @@ static inline u64 __raw_readq(const volatile void __iomem *addr) } #endif -/* - * FIXME: I'm flip-flopping on whether or not we should keep this or enforce - * the ordering with I/O on spinlocks like PowerPC does. The worry is that - * drivers won't get this correct, but I also don't want to introduce a fence - * into the lock code that otherwise only uses AMOs (and is essentially defined - * by the ISA to be correct). For now I'm leaving this here: "o,w" is - * sufficient to ensure that all writes to the device have completed before the - * write to the spinlock is allowed to commit. I surmised this from reading - * "ACQUIRES VS I/O ACCESSES" in memory-barriers.txt. - */ -#define mmiowb() __asm__ __volatile__ ("fence o,w" : : : "memory"); - /* * Unordered I/O memory access primitives. 
These are even more relaxed than * the relaxed versions, as they don't even order accesses between successive @@ -165,7 +154,7 @@ static inline u64 __raw_readq(const volatile void __iomem *addr) #define __io_br() do {} while (0) #define __io_ar(v) __asm__ __volatile__ ("fence i,r" : : : "memory"); #define __io_bw() __asm__ __volatile__ ("fence w,o" : : : "memory"); -#define __io_aw() do {} while (0) +#define __io_aw() mmiowb_set_pending() #define readb(c) ({ u8 __v; __io_br(); __v = readb_cpu(c); __io_ar(__v); __v; }) #define readw(c) ({ u16 __v; __io_br(); __v = readw_cpu(c); __io_ar(__v); __v; }) diff --git a/arch/riscv/include/asm/mmiowb.h b/arch/riscv/include/asm/mmiowb.h new file mode 100644 index 0000000000000000000000000000000000000000..5d7e3a2b4e3b2ace6cc1d25097ad49bec655e77c --- /dev/null +++ b/arch/riscv/include/asm/mmiowb.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_RISCV_MMIOWB_H +#define _ASM_RISCV_MMIOWB_H + +/* + * "o,w" is sufficient to ensure that all writes to the device have completed + * before the write to the spinlock is allowed to commit. + */ +#define mmiowb() __asm__ __volatile__ ("fence o,w" : : : "memory"); + +#include + +#endif /* ASM_RISCV_MMIOWB_H */ diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h index bba3da6ef1572f41db64e59ca203ae32b9139180..a3d5273ded7c6d0782356f01abf7d5cdca753bc5 100644 --- a/arch/riscv/include/asm/syscall.h +++ b/arch/riscv/include/asm/syscall.h @@ -72,32 +72,20 @@ static inline void syscall_set_return_value(struct task_struct *task, static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { - BUG_ON(i + n > 6); - if (i == 0) { - args[0] = regs->orig_a0; - args++; - i++; - n--; - } - memcpy(args, ®s->a1 + i * sizeof(regs->a1), n * sizeof(args[0])); + args[0] = regs->orig_a0; + args++; + memcpy(args, ®s->a1, 5 * sizeof(args[0])); } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) { - BUG_ON(i + n > 6); - if (i == 0) { - regs->orig_a0 = args[0]; - args++; - i++; - n--; - } - memcpy(®s->a1 + i * sizeof(regs->a1), args, n * sizeof(regs->a0)); + regs->orig_a0 = args[0]; + args++; + memcpy(®s->a1, args, 5 * sizeof(regs->a1)); } static inline int syscall_get_arch(void) diff --git a/arch/riscv/include/asm/tlb.h b/arch/riscv/include/asm/tlb.h index 439dc7072e05bf37a722bb983b69dabed39651fa..1ad8d093c58b89d7308661ea34d1011b7182dec7 100644 --- a/arch/riscv/include/asm/tlb.h +++ b/arch/riscv/include/asm/tlb.h @@ -18,6 +18,7 @@ struct mmu_gather; static void tlb_flush(struct mmu_gather *tlb); +#define tlb_flush tlb_flush #include static inline void tlb_flush(struct mmu_gather *tlb) diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index a00168b980d2e6ca265ae0424045508275fbbe3f..fb53a8089e769473434493d59bc408079dcbb519 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -300,7 +300,7 @@ do { \ " .balign 4\n" \ "4:\n" \ " li %0, %6\n" \ - " jump 2b, %1\n" \ + " jump 3b, %1\n" \ " .previous\n" \ " .section __ex_table,\"a\"\n" \ " .balign " RISCV_SZPTR "\n" \ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index f13f7f276639d504679034a36c53edc15f25dfe1..598568168d3511406fea38b23360c7e28a50a41f 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -4,7 +4,6 @@ ifdef CONFIG_FTRACE CFLAGS_REMOVE_ftrace.o 
= -pg -CFLAGS_REMOVE_setup.o = -pg endif extra-y += head.o @@ -29,8 +28,6 @@ obj-y += vdso.o obj-y += cacheinfo.o obj-y += vdso/ -CFLAGS_setup.o := -mcmodel=medany - obj-$(CONFIG_FPU) += fpu.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 7dd308129b40f1862ab04dc1e12c790bf7c111fe..2872edce894d1e0b79d58a4ed735649dd8261408 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -141,7 +141,7 @@ static int apply_r_riscv_hi20_rela(struct module *me, u32 *location, { s32 hi20; - if (IS_ENABLED(CMODEL_MEDLOW)) { + if (IS_ENABLED(CONFIG_CMODEL_MEDLOW)) { pr_err( "%s: target %016llx can not be addressed by the 32-bit offset from PC = %p\n", me->name, (long long)v, location); diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index ecb654f6a79ef105931a51950d520c1af845edff..540a331d1376922c62ba17bf0d9c786714d89948 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -48,14 +48,6 @@ struct screen_info screen_info = { }; #endif -unsigned long va_pa_offset; -EXPORT_SYMBOL(va_pa_offset); -unsigned long pfn_base; -EXPORT_SYMBOL(pfn_base); - -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - /* The lucky hart to first increment this variable will boot the other cores */ atomic_t hart_lottery; unsigned long boot_cpu_hartid; diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index a4b1d94371a0dbf6937bc0c8add512618ef1c5c6..4d403274c2e8d0436f2c74e3719cbc75fc057db8 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -169,8 +169,6 @@ static bool save_trace(unsigned long pc, void *arg) void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { walk_stackframe(tsk, NULL, save_trace, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index eb22ab49b3e008ec4ab677778302d5dbbea358b1..b68aac7018031cd5afe4ebb293051cbcc814969e 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -1,3 +1,9 @@ + +CFLAGS_init.o := -mcmodel=medany +ifdef CONFIG_FTRACE +CFLAGS_REMOVE_init.o = -pg +endif + obj-y += init.o obj-y += fault.o obj-y += extable.o diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index b379a75ac6a6778052b9161612357ba5df620648..bc7b77e34d0920f2190c7e8c4edd18658c526703 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -25,6 +25,10 @@ #include #include +unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] + __page_aligned_bss; +EXPORT_SYMBOL(empty_zero_page); + static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0, }; @@ -117,6 +121,14 @@ void __init setup_bootmem(void) */ memblock_reserve(reg->base, vmlinux_end - reg->base); mem_size = min(reg->size, (phys_addr_t)-PAGE_OFFSET); + + /* + * Remove memblock from the end of usable area to the + * end of region + */ + if (reg->base + mem_size < end) + memblock_remove(reg->base + mem_size, + end - reg->base - mem_size); } } BUG_ON(mem_size == 0); @@ -143,6 +155,11 @@ void __init setup_bootmem(void) } } +unsigned long va_pa_offset; +EXPORT_SYMBOL(va_pa_offset); +unsigned long pfn_base; +EXPORT_SYMBOL(pfn_base); + pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pgd_t trampoline_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); @@ -172,6 
+189,25 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) } } +/* + * setup_vm() is called from head.S with MMU-off. + * + * Following requirements should be honoured for setup_vm() to work + * correctly: + * 1) It should use PC-relative addressing for accessing kernel symbols. + * To achieve this we always use GCC cmodel=medany. + * 2) The compiler instrumentation for FTRACE will not work for setup_vm() + * so disable compiler instrumentation when FTRACE is enabled. + * + * Currently, the above requirements are honoured by using custom CFLAGS + * for init.o in mm/Makefile. + */ + +#ifndef __riscv_cmodel_medany +#error "setup_vm() is called from head.S before relocate so it should " + "not use absolute addressing." +#endif + asmlinkage void __init setup_vm(void) { extern char _start; diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b6e3d0653002af7a8eb20b5a0f3e0d8b27ba0b16..07485582d0272a610b74e59f24bc4770bde20c5f 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -14,12 +14,6 @@ config LOCKDEP_SUPPORT config STACKTRACE_SUPPORT def_bool y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config ARCH_HAS_ILOG2_U32 def_bool n @@ -149,6 +143,7 @@ config S390 select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_GCC_PLUGINS + select HAVE_GENERIC_GUP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 @@ -164,11 +159,13 @@ config S390 select HAVE_PERF_USER_STACK_DUMP select HAVE_MEMBLOCK_NODE_MAP select HAVE_MEMBLOCK_PHYS_MAP + select HAVE_MMU_GATHER_NO_GATHER select HAVE_MOD_ARCH_SPECIFIC select HAVE_NOP_MCOUNT select HAVE_OPROFILE select HAVE_PCI select HAVE_PERF_EVENTS + select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS @@ -188,7 +185,6 @@ config S390 select TTY select VIRT_CPU_ACCOUNTING select ARCH_HAS_SCALED_CPUTIME - select VIRT_TO_BUS select HAVE_NMI @@ -240,6 +236,7 @@ choice config MARCH_Z900 bool "IBM zSeries model z800 and z900" + depends on !CC_IS_CLANG select HAVE_MARCH_Z900_FEATURES help Select this to enable optimizations for model z800/z900 (2064 and @@ -248,6 +245,7 @@ config MARCH_Z900 config MARCH_Z990 bool "IBM zSeries model z890 and z990" + depends on !CC_IS_CLANG select HAVE_MARCH_Z990_FEATURES help Select this to enable optimizations for model z890/z990 (2084 and @@ -256,6 +254,7 @@ config MARCH_Z990 config MARCH_Z9_109 bool "IBM System z9" + depends on !CC_IS_CLANG select HAVE_MARCH_Z9_109_FEATURES help Select this to enable optimizations for IBM System z9 (2094 and @@ -347,12 +346,15 @@ config TUNE_DEFAULT config TUNE_Z900 bool "IBM zSeries model z800 and z900" + depends on !CC_IS_CLANG config TUNE_Z990 bool "IBM zSeries model z890 and z990" + depends on !CC_IS_CLANG config TUNE_Z9_109 bool "IBM System z9" + depends on !CC_IS_CLANG config TUNE_Z10 bool "IBM System z10" @@ -388,6 +390,9 @@ config COMPAT (and some other stuff like libraries and such) is needed for executing 31 bit applications. It is safe to say "Y". +config COMPAT_VDSO + def_bool COMPAT && !CC_IS_CLANG + config SYSVIPC_COMPAT def_bool y if COMPAT && SYSVIPC @@ -549,6 +554,17 @@ config ARCH_HAS_KEXEC_PURGATORY def_bool y depends on KEXEC_FILE +config KEXEC_VERIFY_SIG + bool "Verify kernel signature during kexec_file_load() syscall" + depends on KEXEC_FILE && SYSTEM_DATA_VERIFICATION + help + This option makes kernel signature verification mandatory for + the kexec_file_load() syscall. 
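/*
 * Aside, not part of the patch: the setup_vm() comment above is backed by a
 * preprocessor guard on a compiler predefine; the same pattern suits any
 * translation unit that must stay PC-relative before relocation
 * (hypothetical stand-alone form):
 */
#if defined(__riscv) && !defined(__riscv_cmodel_medany)
#error "build with -mcmodel=medany: this code runs before relocation"
#endif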
+ + In addition to that option, you need to enable signature + verification for the corresponding kernel image type being + loaded in order for this to work. + config ARCH_RANDOM def_bool y prompt "s390 architectural random number generation API" @@ -609,6 +625,29 @@ config EXPOLINE_FULL endchoice +config RELOCATABLE + bool "Build a relocatable kernel" + select MODULE_REL_CRCS if MODVERSIONS + default y + help + This builds a kernel image that retains relocation information + so it can be loaded at an arbitrary address. + The kernel is linked as a position-independent executable (PIE) + and contains dynamic relocations which are processed early in the + bootup process. + The relocations make the kernel image about 15% larger (compressed + 10%), but are discarded at runtime. + +config RANDOMIZE_BASE + bool "Randomize the address of the kernel image (KASLR)" + depends on RELOCATABLE + default y + help + In support of Kernel Address Space Layout Randomization (KASLR), + this randomizes the address at which the kernel image is loaded, + as a security feature that deters exploit attempts relying on + knowledge of the location of kernel internals. + endmenu menu "Memory setup" @@ -837,6 +876,17 @@ config HAVE_PNETID menu "Virtualization" +config PROTECTED_VIRTUALIZATION_GUEST + def_bool n + prompt "Protected virtualization guest support" + help + Select this option, if you want to be able to run this + kernel as a protected virtualization KVM guest. + Protected virtualization capable machines have a mini hypervisor + located at machine level (an ultravisor). With help of the + Ultravisor, KVM will be able to run "protected" VMs, special + VMs whose memory and management data are unavailable to KVM. + config PFAULT def_bool y prompt "Pseudo page fault support" diff --git a/arch/s390/Makefile b/arch/s390/Makefile index e21053e5e0da2a06c3ba78e9967e55837ecaddc0..df1d6a150f3007a0cc6cfab410c9637ea0221170 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -16,10 +16,14 @@ KBUILD_AFLAGS_MODULE += -fPIC KBUILD_CFLAGS_MODULE += -fPIC KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 +ifeq ($(CONFIG_RELOCATABLE),y) +KBUILD_CFLAGS += -fPIE +LDFLAGS_vmlinux := -pie +endif aflags_dwarf := -Wa,-gdwarf-2 -KBUILD_AFLAGS_DECOMPRESSOR := -m64 -D__ASSEMBLY__ +KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__ KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf)) -KBUILD_CFLAGS_DECOMPRESSOR := -m64 -O2 +KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float KBUILD_CFLAGS_DECOMPRESSOR += -fno-asynchronous-unwind-tables @@ -111,7 +115,7 @@ endif cfi := $(call as-instr,.cfi_startproc\n.cfi_val_offset 15$(comma)-160\n.cfi_endproc,-DCONFIG_AS_CFI_VAL_OFFSET=1) KBUILD_CFLAGS += -mbackchain -msoft-float $(cflags-y) -KBUILD_CFLAGS += -pipe -fno-strength-reduce -Wno-sign-compare +KBUILD_CFLAGS += -pipe -Wno-sign-compare KBUILD_CFLAGS += -fno-asynchronous-unwind-tables $(cfi) KBUILD_AFLAGS += $(aflags-y) $(cfi) export KBUILD_AFLAGS_DECOMPRESSOR diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index c844eaf24ed739dc2b818daef401b523fc788643..c51496bbac1906c721485be0b2abac71106f6631 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -12,25 +12,35 @@ KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR) KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR) # -# Use -march=z900 for als.c to be able to print an error +# Use minimum 
architecture for als.c to be able to print an error # message if the kernel is started on a machine which is too old # -ifneq ($(CC_FLAGS_MARCH),-march=z900) +ifndef CONFIG_CC_IS_CLANG +CC_FLAGS_MARCH_MINIMUM := -march=z900 +else +CC_FLAGS_MARCH_MINIMUM := -march=z10 +endif + +ifneq ($(CC_FLAGS_MARCH),$(CC_FLAGS_MARCH_MINIMUM)) AFLAGS_REMOVE_head.o += $(CC_FLAGS_MARCH) -AFLAGS_head.o += -march=z900 +AFLAGS_head.o += $(CC_FLAGS_MARCH_MINIMUM) AFLAGS_REMOVE_mem.o += $(CC_FLAGS_MARCH) -AFLAGS_mem.o += -march=z900 +AFLAGS_mem.o += $(CC_FLAGS_MARCH_MINIMUM) CFLAGS_REMOVE_als.o += $(CC_FLAGS_MARCH) -CFLAGS_als.o += -march=z900 +CFLAGS_als.o += $(CC_FLAGS_MARCH_MINIMUM) CFLAGS_REMOVE_sclp_early_core.o += $(CC_FLAGS_MARCH) -CFLAGS_sclp_early_core.o += -march=z900 +CFLAGS_sclp_early_core.o += $(CC_FLAGS_MARCH_MINIMUM) endif CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char -obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o string.o ebcdic.o -obj-y += sclp_early_core.o mem.o ipl_vmparm.o cmdline.o ctype.o -targets := bzImage startup.a section_cmp.boot.data $(obj-y) +obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o +obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o +obj-y += ctype.o text_dma.o +obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) += uv.o +obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) subdir- := compressed OBJECTS := $(addprefix $(obj)/,$(obj-y)) @@ -48,7 +58,8 @@ define cmd_section_cmp touch $@ endef -$(obj)/bzImage: $(obj)/compressed/vmlinux $(obj)/section_cmp.boot.data FORCE +OBJCOPYFLAGS_bzImage := --pad-to $$(readelf -s $(obj)/compressed/vmlinux | awk '/\<_end\>/ {print or(strtonum("0x"$$2),4095)+1}') +$(obj)/bzImage: $(obj)/compressed/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE $(call if_changed,objcopy) $(obj)/section_cmp%: vmlinux $(obj)/compressed/vmlinux FORCE diff --git a/arch/s390/boot/als.c b/arch/s390/boot/als.c index f902215e9cd9b60d4f8ee1fc4dd1f05707c9ea6c..ff6801d401c44f7f9f8e55e98d73561ada2ed22a 100644 --- a/arch/s390/boot/als.c +++ b/arch/s390/boot/als.c @@ -99,7 +99,7 @@ static void facility_mismatch(void) print_machine_type(); print_missing_facilities(); sclp_early_printk("See Principles of Operations for facility bits\n"); - disabled_wait(0x8badcccc); + disabled_wait(); } void verify_facilities(void) diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 82bc06346e05846b2c6e2c01f06a1b20ecb1771c..ad57c2205a71bd9d7a1d854c2e4ff37885fa8f63 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -9,5 +9,10 @@ void setup_boot_command_line(void); void parse_boot_command_line(void); void setup_memory_end(void); void print_missing_facilities(void); +unsigned long get_random_base(unsigned long safe_addr); + +extern int kaslr_enabled; + +unsigned long read_ipl_report(unsigned long safe_offset); #endif /* BOOT_BOOT_H */ diff --git a/arch/s390/boot/compressed/decompressor.h b/arch/s390/boot/compressed/decompressor.h index e1c1f2ec60f4a88d14d0c4b28732df790e2ff6bd..c15eb7114d8363594b6673fba58aa4dbec3c468e 100644 --- a/arch/s390/boot/compressed/decompressor.h +++ b/arch/s390/boot/compressed/decompressor.h @@ -17,6 +17,11 @@ struct vmlinux_info { unsigned long bss_size; /* uncompressed image .bss size */ unsigned long bootdata_off; unsigned long bootdata_size; + unsigned long bootdata_preserved_off; + unsigned long 
bootdata_preserved_size; + unsigned long dynsym_start; + unsigned long rela_dyn_start; + unsigned long rela_dyn_end; }; extern char _vmlinux_info[]; diff --git a/arch/s390/boot/compressed/vmlinux.lds.S b/arch/s390/boot/compressed/vmlinux.lds.S index 7efc3938f5955dae79720d5323636def4138d9df..112b8d9f1e4cd9752e70eff11d3a164f5ac72c31 100644 --- a/arch/s390/boot/compressed/vmlinux.lds.S +++ b/arch/s390/boot/compressed/vmlinux.lds.S @@ -33,7 +33,29 @@ SECTIONS *(.data.*) _edata = . ; } + /* + * .dma section for code, data, ex_table that need to stay below 2 GB, + * even when the kernel is relocated above 2 GB. + */ + _sdma = .; + .dma.text : { + . = ALIGN(PAGE_SIZE); + _stext_dma = .; + *(.dma.text) + . = ALIGN(PAGE_SIZE); + _etext_dma = .; + } + . = ALIGN(16); + .dma.ex_table : { + _start_dma_ex_table = .; + KEEP(*(.dma.ex_table)) + _stop_dma_ex_table = .; + } + .dma.data : { *(.dma.data) } + _edma = .; + BOOT_DATA + BOOT_DATA_PRESERVED /* * uncompressed image info used by the decompressor it should match diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index ce2cbbc417428f6a0a813c04d53cc4037f493cf6..028aab03a9e7782ceb5b9b5104091399c39d6e10 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -305,7 +305,7 @@ ENTRY(startup_kdump) xc 0x300(256),0x300 xc 0xe00(256),0xe00 xc 0xf00(256),0xf00 - lctlg %c0,%c15,0x200(%r0) # initialize control registers + lctlg %c0,%c15,.Lctl-.LPG0(%r13) # load control registers stcke __LC_BOOT_CLOCK mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 spt 6f-.LPG0(%r13) @@ -319,20 +319,54 @@ ENTRY(startup_kdump) .align 8 6: .long 0x7fffffff,0xffffffff +.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space + .quad 0 # cr1: primary space segment table + .quad .Lduct # cr2: dispatchable unit control table + .quad 0 # cr3: instruction authorization + .quad 0xffff # cr4: instruction authorization + .quad .Lduct # cr5: primary-aste origin + .quad 0 # cr6: I/O interrupts + .quad 0 # cr7: secondary space segment table + .quad 0 # cr8: access registers translation + .quad 0 # cr9: tracing off + .quad 0 # cr10: tracing off + .quad 0 # cr11: tracing off + .quad 0 # cr12: tracing off + .quad 0 # cr13: home space segment table + .quad 0xc0000000 # cr14: machine check handling off + .quad .Llinkage_stack # cr15: linkage stack operations + + .section .dma.data,"aw",@progbits +.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0 + .long 0,0,0,0,0,0,0,0 +.Llinkage_stack: + .long 0,0,0x89000000,0,0,0,0x8a000000,0 + .align 64 +.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0 + .align 128 +.Lduald:.rept 8 + .long 0x80000000,0,0,0 # invalid access-list entries + .endr + .previous + #include "head_kdump.S" # # params at 10400 (setup.h) +# Must be kept in sync with struct parmarea in setup.h # .org PARMAREA - .long 0,0 # IPL_DEVICE - .long 0,0 # INITRD_START - .long 0,0 # INITRD_SIZE - .long 0,0 # OLDMEM_BASE - .long 0,0 # OLDMEM_SIZE + .quad 0 # IPL_DEVICE + .quad 0 # INITRD_START + .quad 0 # INITRD_SIZE + .quad 0 # OLDMEM_BASE + .quad 0 # OLDMEM_SIZE .org COMMAND_LINE .byte "root=/dev/ram0 ro" .byte 0 - .org 0x11000 + .org EARLY_SCCB_OFFSET + .fill 4096 + + .org HEAD_END diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 36beb56de02175dc5baec20ee8e0a64b34ae8431..3c49bde8aa5e3d6f82d3e1c2157b35421e9cd7e4 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -7,16 +7,19 @@ #include #include #include +#include #include "boot.h" char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; -struct ipl_parameter_block
__bootdata(early_ipl_block); -int __bootdata(early_ipl_block_valid); +struct ipl_parameter_block __bootdata_preserved(ipl_block); +int __bootdata_preserved(ipl_block_valid); unsigned long __bootdata(memory_end); int __bootdata(memory_end_set); int __bootdata(noexec_disabled); +int kaslr_enabled __section(.data); + static inline int __diag308(unsigned long subcode, void *addr) { register unsigned long _addr asm("0") = (unsigned long)addr; @@ -45,13 +48,15 @@ void store_ipl_parmblock(void) { int rc; - rc = __diag308(DIAG308_STORE, &early_ipl_block); + uv_set_shared(__pa(&ipl_block)); + rc = __diag308(DIAG308_STORE, &ipl_block); + uv_remove_shared(__pa(&ipl_block)); if (rc == DIAG308_RC_OK && - early_ipl_block.hdr.version <= IPL_MAX_SUPPORTED_VERSION) - early_ipl_block_valid = 1; + ipl_block.hdr.version <= IPL_MAX_SUPPORTED_VERSION) + ipl_block_valid = 1; } -static size_t scpdata_length(const char *buf, size_t count) +static size_t scpdata_length(const u8 *buf, size_t count) { while (count) { if (buf[count - 1] != '\0' && buf[count - 1] != ' ') @@ -68,26 +73,26 @@ static size_t ipl_block_get_ascii_scpdata(char *dest, size_t size, size_t i; int has_lowercase; - count = min(size - 1, scpdata_length(ipb->ipl_info.fcp.scp_data, - ipb->ipl_info.fcp.scp_data_len)); + count = min(size - 1, scpdata_length(ipb->fcp.scp_data, + ipb->fcp.scp_data_len)); if (!count) goto out; has_lowercase = 0; for (i = 0; i < count; i++) { - if (!isascii(ipb->ipl_info.fcp.scp_data[i])) { + if (!isascii(ipb->fcp.scp_data[i])) { count = 0; goto out; } - if (!has_lowercase && islower(ipb->ipl_info.fcp.scp_data[i])) + if (!has_lowercase && islower(ipb->fcp.scp_data[i])) has_lowercase = 1; } if (has_lowercase) - memcpy(dest, ipb->ipl_info.fcp.scp_data, count); + memcpy(dest, ipb->fcp.scp_data, count); else for (i = 0; i < count; i++) - dest[i] = tolower(ipb->ipl_info.fcp.scp_data[i]); + dest[i] = tolower(ipb->fcp.scp_data[i]); out: dest[count] = '\0'; return count; @@ -103,14 +108,14 @@ static void append_ipl_block_parm(void) delim = early_command_line + len; /* '\0' character position */ parm = early_command_line + len + 1; /* append right after '\0' */ - switch (early_ipl_block.hdr.pbt) { - case DIAG308_IPL_TYPE_CCW: + switch (ipl_block.pb0_hdr.pbt) { + case IPL_PBT_CCW: rc = ipl_block_get_ascii_vmparm( - parm, COMMAND_LINE_SIZE - len - 1, &early_ipl_block); + parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; - case DIAG308_IPL_TYPE_FCP: + case IPL_PBT_FCP: rc = ipl_block_get_ascii_scpdata( - parm, COMMAND_LINE_SIZE - len - 1, &early_ipl_block); + parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; } if (rc) { @@ -141,7 +146,7 @@ void setup_boot_command_line(void) strcpy(early_command_line, strim(COMMAND_LINE)); /* append IPL PARM data to the boot command line */ - if (early_ipl_block_valid) + if (!is_prot_virt_guest() && ipl_block_valid) append_ipl_block_parm(); } @@ -211,6 +216,7 @@ void parse_boot_command_line(void) char *args; int rc; + kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE); args = strcpy(command_line_buf, early_command_line); while (*args) { args = next_arg(args, ¶m, &val); @@ -228,15 +234,21 @@ void parse_boot_command_line(void) if (!strcmp(param, "facilities")) modify_fac_list(val); + + if (!strcmp(param, "nokaslr")) + kaslr_enabled = 0; } } void setup_memory_end(void) { #ifdef CONFIG_CRASH_DUMP - if (!OLDMEM_BASE && early_ipl_block_valid && - early_ipl_block.hdr.pbt == DIAG308_IPL_TYPE_FCP && - early_ipl_block.ipl_info.fcp.opt == DIAG308_IPL_OPT_DUMP) { + if (OLDMEM_BASE) { + kaslr_enabled = 0; 
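+		/*
+		 * Illustrative note: OLDMEM_BASE is only set when this kernel
+		 * was started as a kdump kernel; its load address is dictated
+		 * by the crash kernel reservation, so randomizing the base
+		 * would serve no purpose here.
+		 */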
+ } else if (ipl_block_valid && + ipl_block.pb0_hdr.pbt == IPL_PBT_FCP && + ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) { + kaslr_enabled = 0; if (!sclp_early_get_hsa_size(&memory_end) && memory_end) memory_end_set = 1; } diff --git a/arch/s390/boot/ipl_report.c b/arch/s390/boot/ipl_report.c new file mode 100644 index 0000000000000000000000000000000000000000..0b4965573656f668fe84e56f0df27e2a536414d5 --- /dev/null +++ b/arch/s390/boot/ipl_report.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include "boot.h" + +int __bootdata_preserved(ipl_secure_flag); + +unsigned long __bootdata_preserved(ipl_cert_list_addr); +unsigned long __bootdata_preserved(ipl_cert_list_size); + +unsigned long __bootdata(early_ipl_comp_list_addr); +unsigned long __bootdata(early_ipl_comp_list_size); + +#define for_each_rb_entry(entry, rb) \ + for (entry = rb->entries; \ + (void *) entry + sizeof(*entry) <= (void *) rb + rb->len; \ + entry++) + +static inline bool intersects(unsigned long addr0, unsigned long size0, + unsigned long addr1, unsigned long size1) +{ + return addr0 + size0 > addr1 && addr1 + size1 > addr0; +} + +static unsigned long find_bootdata_space(struct ipl_rb_components *comps, + struct ipl_rb_certificates *certs, + unsigned long safe_addr) +{ + struct ipl_rb_certificate_entry *cert; + struct ipl_rb_component_entry *comp; + size_t size; + + /* + * Find the length for the IPL report boot data + */ + early_ipl_comp_list_size = 0; + for_each_rb_entry(comp, comps) + early_ipl_comp_list_size += sizeof(*comp); + ipl_cert_list_size = 0; + for_each_rb_entry(cert, certs) + ipl_cert_list_size += sizeof(unsigned int) + cert->len; + size = ipl_cert_list_size + early_ipl_comp_list_size; + + /* + * Start from safe_addr to find a free memory area large + * enough for the IPL report boot data. This area is used + * for ipl_cert_list_addr/ipl_cert_list_size and + * early_ipl_comp_list_addr/early_ipl_comp_list_size. It must + * not overlap with any component or any certificate. 
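+	 * (Example: if the interval starting at safe_addr would overlap a
+	 * component, safe_addr is bumped past that component and the whole
+	 * scan restarts from the top, until one contiguous interval clears
+	 * the initrd, all components and all certificates.)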
+ */ +repeat: + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && + intersects(INITRD_START, INITRD_SIZE, safe_addr, size)) + safe_addr = INITRD_START + INITRD_SIZE; + for_each_rb_entry(comp, comps) + if (intersects(safe_addr, size, comp->addr, comp->len)) { + safe_addr = comp->addr + comp->len; + goto repeat; + } + for_each_rb_entry(cert, certs) + if (intersects(safe_addr, size, cert->addr, cert->len)) { + safe_addr = cert->addr + cert->len; + goto repeat; + } + early_ipl_comp_list_addr = safe_addr; + ipl_cert_list_addr = safe_addr + early_ipl_comp_list_size; + + return safe_addr + size; +} + +static void copy_components_bootdata(struct ipl_rb_components *comps) +{ + struct ipl_rb_component_entry *comp, *ptr; + + ptr = (struct ipl_rb_component_entry *) early_ipl_comp_list_addr; + for_each_rb_entry(comp, comps) + memcpy(ptr++, comp, sizeof(*ptr)); +} + +static void copy_certificates_bootdata(struct ipl_rb_certificates *certs) +{ + struct ipl_rb_certificate_entry *cert; + void *ptr; + + ptr = (void *) ipl_cert_list_addr; + for_each_rb_entry(cert, certs) { + *(unsigned int *) ptr = cert->len; + ptr += sizeof(unsigned int); + memcpy(ptr, (void *) cert->addr, cert->len); + ptr += cert->len; + } +} + +unsigned long read_ipl_report(unsigned long safe_addr) +{ + struct ipl_rb_certificates *certs; + struct ipl_rb_components *comps; + struct ipl_pl_hdr *pl_hdr; + struct ipl_rl_hdr *rl_hdr; + struct ipl_rb_hdr *rb_hdr; + unsigned long tmp; + void *rl_end; + + /* + * Check if there is an IPL report by looking at the copy + * of the IPL parameter information block. + */ + if (!ipl_block_valid || + !(ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR)) + return safe_addr; + ipl_secure_flag = !!(ipl_block.hdr.flags & IPL_PL_FLAG_SIPL); + /* + * There is an IPL report; to find it, load the pointer to the + * IPL parameter information block from lowcore and skip past + * the IPL parameter list, then align the address to a double + * word boundary. + */ + tmp = (unsigned long) S390_lowcore.ipl_parmblock_ptr; + pl_hdr = (struct ipl_pl_hdr *) tmp; + tmp = (tmp + pl_hdr->len + 7) & -8UL; + rl_hdr = (struct ipl_rl_hdr *) tmp; + /* Walk through the IPL report blocks in the IPL Report list */ + certs = NULL; + comps = NULL; + rl_end = (void *) rl_hdr + rl_hdr->len; + rb_hdr = (void *) rl_hdr + sizeof(*rl_hdr); + while ((void *) rb_hdr + sizeof(*rb_hdr) < rl_end && + (void *) rb_hdr + rb_hdr->len <= rl_end) { + + switch (rb_hdr->rbt) { + case IPL_RBT_CERTIFICATES: + certs = (struct ipl_rb_certificates *) rb_hdr; + break; + case IPL_RBT_COMPONENTS: + comps = (struct ipl_rb_components *) rb_hdr; + break; + default: + break; + } + + rb_hdr = (void *) rb_hdr + rb_hdr->len; + } + + /* + * With either the component list or the certificate list + * missing, the kernel will stay ignorant of secure IPL. + */ + if (!comps || !certs) + return safe_addr; + + /* + * Copy component and certificate list to a safe area + * where the decompressed kernel can find them. + */ + safe_addr = find_bootdata_space(comps, certs, safe_addr); + copy_components_bootdata(comps); + copy_certificates_bootdata(certs); + + return safe_addr; +} diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c new file mode 100644 index 0000000000000000000000000000000000000000..3bdd8132e56bcb1230638e3480c49a2023863018 --- /dev/null +++ b/arch/s390/boot/kaslr.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp.
2019 + */ +#include +#include +#include +#include +#include "compressed/decompressor.h" + +#define PRNG_MODE_TDES 1 +#define PRNG_MODE_SHA512 2 +#define PRNG_MODE_TRNG 3 + +struct prno_parm { + u32 res; + u32 reseed_counter; + u64 stream_bytes; + u8 V[112]; + u8 C[112]; +}; + +struct prng_parm { + u8 parm_block[32]; + u32 reseed_counter; + u64 byte_counter; +}; + +static int check_prng(void) +{ + if (!cpacf_query_func(CPACF_KMC, CPACF_KMC_PRNG)) { + sclp_early_printk("KASLR disabled: CPU has no PRNG\n"); + return 0; + } + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + return PRNG_MODE_TRNG; + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_SHA512_DRNG_GEN)) + return PRNG_MODE_SHA512; + else + return PRNG_MODE_TDES; +} + +static unsigned long get_random(unsigned long limit) +{ + struct prng_parm prng = { + /* initial parameter block for tdes mode, copied from libica */ + .parm_block = { + 0x0F, 0x2B, 0x8E, 0x63, 0x8C, 0x8E, 0xD2, 0x52, + 0x64, 0xB7, 0xA0, 0x7B, 0x75, 0x28, 0xB8, 0xF4, + 0x75, 0x5F, 0xD2, 0xA6, 0x8D, 0x97, 0x11, 0xFF, + 0x49, 0xD8, 0x23, 0xF3, 0x7E, 0x21, 0xEC, 0xA0 + }, + }; + unsigned long seed, random; + struct prno_parm prno; + __u64 entropy[4]; + int mode, i; + + mode = check_prng(); + seed = get_tod_clock_fast(); + switch (mode) { + case PRNG_MODE_TRNG: + cpacf_trng(NULL, 0, (u8 *) &random, sizeof(random)); + break; + case PRNG_MODE_SHA512: + cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, &prno, NULL, 0, + (u8 *) &seed, sizeof(seed)); + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, &prno, (u8 *) &random, + sizeof(random), NULL, 0); + break; + case PRNG_MODE_TDES: + /* add entropy */ + *(unsigned long *) prng.parm_block ^= seed; + for (i = 0; i < 16; i++) { + cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block, + (char *) entropy, (char *) entropy, + sizeof(entropy)); + memcpy(prng.parm_block, entropy, sizeof(entropy)); + } + random = seed; + cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block, (u8 *) &random, + (u8 *) &random, sizeof(random)); + break; + default: + random = 0; + } + return random % limit; +} + +unsigned long get_random_base(unsigned long safe_addr) +{ + unsigned long base, start, end, kernel_size; + unsigned long block_sum, offset; + int i; + + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE) { + if (safe_addr < INITRD_START + INITRD_SIZE) + safe_addr = INITRD_START + INITRD_SIZE; + } + safe_addr = ALIGN(safe_addr, THREAD_SIZE); + + kernel_size = vmlinux.image_size + vmlinux.bss_size; + block_sum = 0; + for_each_mem_detect_block(i, &start, &end) { + if (memory_end_set) { + if (start >= memory_end) + break; + if (end > memory_end) + end = memory_end; + } + if (end - start < kernel_size) + continue; + block_sum += end - start - kernel_size; + } + if (!block_sum) { + sclp_early_printk("KASLR disabled: not enough memory\n"); + return 0; + } + + base = get_random(block_sum); + if (base == 0) + return 0; + if (base < safe_addr) + base = safe_addr; + block_sum = offset = 0; + for_each_mem_detect_block(i, &start, &end) { + if (memory_end_set) { + if (start >= memory_end) + break; + if (end > memory_end) + end = memory_end; + } + if (end - start < kernel_size) + continue; + block_sum += end - start - kernel_size; + if (base <= block_sum) { + base = start + base - offset; + base = ALIGN_DOWN(base, THREAD_SIZE); + break; + } + offset = block_sum; + } + return base; +} diff --git a/arch/s390/boot/machine_kexec_reloc.c b/arch/s390/boot/machine_kexec_reloc.c new file mode 100644 index 0000000000000000000000000000000000000000..b7a5d0f72097e243dce53177c12aae352bd5d9d1 --- 
/dev/null +++ b/arch/s390/boot/machine_kexec_reloc.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../kernel/machine_kexec_reloc.c" diff --git a/arch/s390/boot/mem_detect.c b/arch/s390/boot/mem_detect.c index 4cb771ba13fa7fb39da31ac6a8428744c8026d16..5d316fe40480446b9dd5f90fc0bb4f3bba6d3b55 100644 --- a/arch/s390/boot/mem_detect.c +++ b/arch/s390/boot/mem_detect.c @@ -25,7 +25,7 @@ static void *mem_detect_alloc_extended(void) { unsigned long offset = ALIGN(mem_safe_offset(), sizeof(u64)); - if (IS_ENABLED(BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && INITRD_START < offset + ENTRIES_EXTENDED_MAX) offset = ALIGN(INITRD_START + INITRD_SIZE, sizeof(u64)); diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index bdfc5549a2998e649392896dd1fe53a03db9c3f6..7b0d05414618be9e992f96298b1b92b38fea898c 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -1,11 +1,55 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include +#include #include +#include #include +#include +#include #include "compressed/decompressor.h" #include "boot.h" extern char __boot_data_start[], __boot_data_end[]; +extern char __boot_data_preserved_start[], __boot_data_preserved_end[]; +unsigned long __bootdata_preserved(__kaslr_offset); + +/* + * Some code and data needs to stay below 2 GB, even when the kernel would be + * relocated above 2 GB, because it has to use 31 bit addresses. + * Such code and data is part of the .dma section, and its location is passed + * over to the decompressed / relocated kernel via the .boot.preserved.data + * section. + */ +extern char _sdma[], _edma[]; +extern char _stext_dma[], _etext_dma[]; +extern struct exception_table_entry _start_dma_ex_table[]; +extern struct exception_table_entry _stop_dma_ex_table[]; +unsigned long __bootdata_preserved(__sdma) = __pa(&_sdma); +unsigned long __bootdata_preserved(__edma) = __pa(&_edma); +unsigned long __bootdata_preserved(__stext_dma) = __pa(&_stext_dma); +unsigned long __bootdata_preserved(__etext_dma) = __pa(&_etext_dma); +struct exception_table_entry * + __bootdata_preserved(__start_dma_ex_table) = _start_dma_ex_table; +struct exception_table_entry * + __bootdata_preserved(__stop_dma_ex_table) = _stop_dma_ex_table; + +int _diag210_dma(struct diag210 *addr); +int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode); +int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode); +void _diag0c_dma(struct hypfs_diag0c_entry *entry); +void _diag308_reset_dma(void); +struct diag_ops __bootdata_preserved(diag_dma_ops) = { + .diag210 = _diag210_dma, + .diag26c = _diag26c_dma, + .diag14 = _diag14_dma, + .diag0c = _diag0c_dma, + .diag308_reset = _diag308_reset_dma +}; +static struct diag210 _diag210_tmp_dma __section(".dma.data"); +struct diag210 *__bootdata_preserved(__diag210_tmp_dma) = &_diag210_tmp_dma; +void _swsusp_reset_dma(void); +unsigned long __bootdata_preserved(__swsusp_reset_dma) = __pa(_swsusp_reset_dma); void error(char *x) { @@ -13,7 +57,7 @@ void error(char *x) sclp_early_printk(x); sclp_early_printk("\n\n -- System halted"); - disabled_wait(0xdeadbeef); + disabled_wait(); } #ifdef CONFIG_KERNEL_UNCOMPRESSED @@ -23,19 +67,16 @@ unsigned long mem_safe_offset(void) } #endif -static void rescue_initrd(void) +static void rescue_initrd(unsigned long addr) { - unsigned long min_initrd_addr; - if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD)) return; if (!INITRD_START || !INITRD_SIZE) return; - min_initrd_addr = 
mem_safe_offset(); - if (min_initrd_addr <= INITRD_START) + if (addr <= INITRD_START) return; - memmove((void *)min_initrd_addr, (void *)INITRD_START, INITRD_SIZE); - INITRD_START = min_initrd_addr; + memmove((void *)addr, (void *)INITRD_START, INITRD_SIZE); + INITRD_START = addr; } static void copy_bootdata(void) @@ -43,23 +84,81 @@ static void copy_bootdata(void) if (__boot_data_end - __boot_data_start != vmlinux.bootdata_size) error(".boot.data section size mismatch"); memcpy((void *)vmlinux.bootdata_off, __boot_data_start, vmlinux.bootdata_size); + if (__boot_data_preserved_end - __boot_data_preserved_start != vmlinux.bootdata_preserved_size) + error(".boot.preserved.data section size mismatch"); + memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size); +} + +static void handle_relocs(unsigned long offset) +{ + Elf64_Rela *rela_start, *rela_end, *rela; + int r_type, r_sym, rc; + Elf64_Addr loc, val; + Elf64_Sym *dynsym; + + rela_start = (Elf64_Rela *) vmlinux.rela_dyn_start; + rela_end = (Elf64_Rela *) vmlinux.rela_dyn_end; + dynsym = (Elf64_Sym *) vmlinux.dynsym_start; + for (rela = rela_start; rela < rela_end; rela++) { + loc = rela->r_offset + offset; + val = rela->r_addend + offset; + r_sym = ELF64_R_SYM(rela->r_info); + if (r_sym) + val += dynsym[r_sym].st_value; + r_type = ELF64_R_TYPE(rela->r_info); + rc = arch_kexec_do_relocs(r_type, (void *) loc, val, 0); + if (rc) + error("Unknown relocation type"); + } } void startup_kernel(void) { + unsigned long random_lma; + unsigned long safe_addr; void *img; - rescue_initrd(); - sclp_early_read_info(); store_ipl_parmblock(); + safe_addr = mem_safe_offset(); + safe_addr = read_ipl_report(safe_addr); + uv_query_info(); + rescue_initrd(safe_addr); + sclp_early_read_info(); setup_boot_command_line(); parse_boot_command_line(); setup_memory_end(); detect_memory(); + + random_lma = __kaslr_offset = 0; + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) { + random_lma = get_random_base(safe_addr); + if (random_lma) { + __kaslr_offset = random_lma - vmlinux.default_lma; + img = (void *)vmlinux.default_lma; + vmlinux.default_lma += __kaslr_offset; + vmlinux.entry += __kaslr_offset; + vmlinux.bootdata_off += __kaslr_offset; + vmlinux.bootdata_preserved_off += __kaslr_offset; + vmlinux.rela_dyn_start += __kaslr_offset; + vmlinux.rela_dyn_end += __kaslr_offset; + vmlinux.dynsym_start += __kaslr_offset; + } + } + if (!IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) { img = decompress_kernel(); memmove((void *)vmlinux.default_lma, img, vmlinux.image_size); - } + } else if (__kaslr_offset) + memcpy((void *)vmlinux.default_lma, img, vmlinux.image_size); + copy_bootdata(); + if (IS_ENABLED(CONFIG_RELOCATABLE)) + handle_relocs(__kaslr_offset); + + if (__kaslr_offset) { + /* Clear non-relocated kernel */ + if (IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) + memset(img, 0, vmlinux.image_size); + } vmlinux.entry(); } diff --git a/arch/s390/boot/text_dma.S b/arch/s390/boot/text_dma.S new file mode 100644 index 0000000000000000000000000000000000000000..9715715c4c28d2d976b404167ed1b35dfc823ab1 --- /dev/null +++ b/arch/s390/boot/text_dma.S @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code that needs to run below 2 GB. + * + * Copyright IBM Corp. 2019 + */ + +#include +#include +#include + +#ifdef CC_USING_EXPOLINE + .pushsection .dma.text.__s390_indirect_jump_r14,"axG" +__dma__s390_indirect_jump_r14: + larl %r1,0f + ex 0,0(%r1) + j . 
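+# Note: the EXECUTE (ex) above runs the "br %r14" at label 0 from out of
+# line, while the unconditional "j ." parks any speculatively executed
+# straight-line path in a tight loop - the usual expoline pattern.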
+0: br %r14 + .popsection +#endif + + .section .dma.text,"ax" +/* + * Simplified version of expoline thunk. The normal thunks can not be used here, + * because they might be more than 2 GB away, and not reachable by the relative + * branch. No comdat, exrl, etc. optimizations used here, because it only + * affects a few functions that are not performance-relevant. + */ + .macro BR_EX_DMA_r14 +#ifdef CC_USING_EXPOLINE + jg __dma__s390_indirect_jump_r14 +#else + br %r14 +#endif + .endm + +/* + * int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode) + */ +ENTRY(_diag14_dma) + lgr %r1,%r2 + lgr %r2,%r3 + lgr %r3,%r4 + lhi %r5,-EIO + sam31 + diag %r1,%r2,0x14 +.Ldiag14_ex: + ipm %r5 + srl %r5,28 +.Ldiag14_fault: + sam64 + lgfr %r2,%r5 + BR_EX_DMA_r14 + EX_TABLE_DMA(.Ldiag14_ex, .Ldiag14_fault) +ENDPROC(_diag14_dma) + +/* + * int _diag210_dma(struct diag210 *addr) + */ +ENTRY(_diag210_dma) + lgr %r1,%r2 + lhi %r2,-1 + sam31 + diag %r1,%r0,0x210 +.Ldiag210_ex: + ipm %r2 + srl %r2,28 +.Ldiag210_fault: + sam64 + lgfr %r2,%r2 + BR_EX_DMA_r14 + EX_TABLE_DMA(.Ldiag210_ex, .Ldiag210_fault) +ENDPROC(_diag210_dma) + +/* + * int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode) + */ +ENTRY(_diag26c_dma) + lghi %r5,-EOPNOTSUPP + sam31 + diag %r2,%r4,0x26c +.Ldiag26c_ex: + sam64 + lgfr %r2,%r5 + BR_EX_DMA_r14 + EX_TABLE_DMA(.Ldiag26c_ex, .Ldiag26c_ex) +ENDPROC(_diag26c_dma) + +/* + * void _diag0c_dma(struct hypfs_diag0c_entry *entry) + */ +ENTRY(_diag0c_dma) + sam31 + diag %r2,%r2,0x0c + sam64 + BR_EX_DMA_r14 +ENDPROC(_diag0c_dma) + +/* + * void _swsusp_reset_dma(void) + */ +ENTRY(_swsusp_reset_dma) + larl %r1,restart_entry + larl %r2,.Lrestart_diag308_psw + og %r1,0(%r2) + stg %r1,0(%r0) + lghi %r0,0 + diag %r0,%r0,0x308 +restart_entry: + lhi %r1,1 + sigp %r1,%r0,SIGP_SET_ARCHITECTURE + sam64 + BR_EX_DMA_r14 +ENDPROC(_swsusp_reset_dma) + +/* + * void _diag308_reset_dma(void) + * + * Calls diag 308 subcode 1 and continues execution + */ +ENTRY(_diag308_reset_dma) + larl %r4,.Lctlregs # Save control registers + stctg %c0,%c15,0(%r4) + lg %r2,0(%r4) # Disable lowcore protection + nilh %r2,0xefff + larl %r4,.Lctlreg0 + stg %r2,0(%r4) + lctlg %c0,%c0,0(%r4) + larl %r4,.Lfpctl # Floating point control register + stfpc 0(%r4) + larl %r4,.Lprefix # Save prefix register + stpx 0(%r4) + larl %r4,.Lprefix_zero # Set prefix register to 0 + spx 0(%r4) + larl %r4,.Lcontinue_psw # Save PSW flags + epsw %r2,%r3 + stm %r2,%r3,0(%r4) + larl %r4,restart_part2 # Setup restart PSW at absolute 0 + larl %r3,.Lrestart_diag308_psw + og %r4,0(%r3) # Save PSW + lghi %r3,0 + sturg %r4,%r3 # Use sturg, because of large pages + lghi %r1,1 + lghi %r0,0 + diag %r0,%r1,0x308 +restart_part2: + lhi %r0,0 # Load r0 with zero + lhi %r1,2 # Use mode 2 = ESAME (dump) + sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode + sam64 # Switch to 64 bit addressing mode + larl %r4,.Lctlregs # Restore control registers + lctlg %c0,%c15,0(%r4) + larl %r4,.Lfpctl # Restore floating point ctl register + lfpc 0(%r4) + larl %r4,.Lprefix # Restore prefix register + spx 0(%r4) + larl %r4,.Lcontinue_psw # Restore PSW flags + lpswe 0(%r4) +.Lcontinue: + BR_EX_DMA_r14 +ENDPROC(_diag308_reset_dma) + + .section .dma.data,"aw",@progbits +.align 8 +.Lrestart_diag308_psw: + .long 0x00080000,0x80000000 + +.align 8 +.Lcontinue_psw: + .quad 0,.Lcontinue + +.align 8 +.Lctlreg0: + .quad 0 +.Lctlregs: + .rept 16 + .quad 0 + .endr +.Lfpctl: + .long 0 +.Lprefix: + .long 0 +.Lprefix_zero: + .long 0 diff --git a/arch/s390/boot/uv.c 
b/arch/s390/boot/uv.c new file mode 100644 index 0000000000000000000000000000000000000000..ed007f4a6444ff84b7c7abac01b442459e619458 --- /dev/null +++ b/arch/s390/boot/uv.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +int __bootdata_preserved(prot_virt_guest); + +void uv_query_info(void) +{ + struct uv_cb_qui uvcb = { + .header.cmd = UVC_CMD_QUI, + .header.len = sizeof(uvcb) + }; + + if (!test_facility(158)) + return; + + if (uv_call(0, (uint64_t)&uvcb)) + return; + + if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) && + test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list)) + prot_virt_guest = 1; +} diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 9824c7bad9d48592afa1b7a530fd02f30858eecb..b0920b35f87b8b5a64a983921393d1d21ff75ba3 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -64,6 +64,7 @@ CONFIG_NUMA=y CONFIG_PREEMPT=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_VERIFY_SIG=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_MEMORY_HOTPLUG=y diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 4fcbe5792744b4d59f4fc731b082c80d05b6990c..09aa5cb148737eac85000af34e52a38efd1b1a69 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -65,6 +65,7 @@ CONFIG_NR_CPUS=512 CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_VERIFY_SIG=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_MEMORY_HOTPLUG=y diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.S index 2bf01ba44107cd678e1fd38b31e5be18080f80b0..0099044e2c8605ce58d19c68bd887c78956c03d0 100644 --- a/arch/s390/crypto/crc32be-vx.S +++ b/arch/s390/crypto/crc32be-vx.S @@ -207,5 +207,6 @@ ENTRY(crc32_be_vgfm_16) .Ldone: VLGVF %r2,%v2,3 BR_EX %r14 +ENDPROC(crc32_be_vgfm_16) .previous diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/crypto/crc32le-vx.S index 7d6f568bd3ad1fe19586e7597ae127b519c7709f..71caf0f4ec08be8a6ecf4614b86230936f9353f0 100644 --- a/arch/s390/crypto/crc32le-vx.S +++ b/arch/s390/crypto/crc32le-vx.S @@ -105,13 +105,14 @@ ENTRY(crc32_le_vgfm_16) larl %r5,.Lconstants_CRC_32_LE j crc32_le_vgfm_generic +ENDPROC(crc32_le_vgfm_16) ENTRY(crc32c_le_vgfm_16) larl %r5,.Lconstants_CRC_32C_LE j crc32_le_vgfm_generic +ENDPROC(crc32c_le_vgfm_16) - -crc32_le_vgfm_generic: +ENTRY(crc32_le_vgfm_generic) /* Load CRC-32 constants */ VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5 @@ -267,5 +268,6 @@ crc32_le_vgfm_generic: .Ldone: VLGVF %r2,%v2,2 BR_EX %r14 +ENDPROC(crc32_le_vgfm_generic) .previous diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index a97a1802cfb4d37d6e0ef5b9a9305aaf45febff7..12cca467af7d101cd53d10b4c54815f7190a6364 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -61,6 +61,7 @@ static unsigned int prng_reseed_limit; module_param_named(reseed_limit, prng_reseed_limit, int, 0); MODULE_PARM_DESC(prng_reseed_limit, "PRNG reseed limit"); +static bool trng_available; /* * Any one who considers arithmetical methods of producing random digits is, @@ -115,46 +116,68 @@ static const u8 initial_parm_block[32] __initconst = { /* * generate_entropy: - * This algorithm produces 64 bytes of entropy data based on 1024 - * individual stckf() invocations assuming that each stckf() value - * contributes 0.25 bits of entropy. 
So the caller gets 256 bit - entropy per 64 byte or 4 bits entropy per byte. + * This function fills a given buffer with random bytes. The entropy within + * the random bytes given back is assumed to be at least 50% - meaning + * a 64 byte buffer has at least 64 * 8 / 2 = 256 bits of entropy. + * Within the function the entropy generation is done in chunks of 64 bytes. + * So the caller should also ask for buffer fill in multiples of 64 bytes. + * The generation of the entropy is based on the assumption that every stckf() + * invocation produces 0.5 bits of entropy. To accumulate 256 bits of entropy + * at least 512 stckf() values are needed. The entropy relevant part of the + * stckf value is bit 51 (counting starts at the left with bit nr 0) so + * here we use the lower 4 bytes and exor the values into 2k of buffer space. + * To be on the safe side, if there is ever a problem with stckf() the + * other half of the page buffer is filled with bytes from urandom via + * get_random_bytes(), so this function consumes 2k of urandom for each + * requested 64 bytes of output data. Finally the buffer page is condensed into + * a 64 byte value by hashing it with SHA-512. */ static int generate_entropy(u8 *ebuf, size_t nbytes) { int n, ret = 0; - u8 *pg, *h, hash[64]; - - /* allocate 2 pages */ - pg = (u8 *) __get_free_pages(GFP_KERNEL, 1); + u8 *pg, pblock[80] = { + /* 8 x 64 bit init values */ + 0x6A, 0x09, 0xE6, 0x67, 0xF3, 0xBC, 0xC9, 0x08, + 0xBB, 0x67, 0xAE, 0x85, 0x84, 0xCA, 0xA7, 0x3B, + 0x3C, 0x6E, 0xF3, 0x72, 0xFE, 0x94, 0xF8, 0x2B, + 0xA5, 0x4F, 0xF5, 0x3A, 0x5F, 0x1D, 0x36, 0xF1, + 0x51, 0x0E, 0x52, 0x7F, 0xAD, 0xE6, 0x82, 0xD1, + 0x9B, 0x05, 0x68, 0x8C, 0x2B, 0x3E, 0x6C, 0x1F, + 0x1F, 0x83, 0xD9, 0xAB, 0xFB, 0x41, 0xBD, 0x6B, + 0x5B, 0xE0, 0xCD, 0x19, 0x13, 0x7E, 0x21, 0x79, + /* 128 bit counter total message bit length */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00 }; + + /* allocate one page stckf buffer */ + pg = (u8 *) __get_free_page(GFP_KERNEL); if (!pg) { prng_errorflag = PRNG_GEN_ENTROPY_FAILED; return -ENOMEM; } + /* fill the ebuf in chunks of 64 byte each */ while (nbytes) { - /* fill pages with urandom bytes */ - get_random_bytes(pg, 2*PAGE_SIZE); - /* exor pages with 1024 stckf values */ - for (n = 0; n < 2 * PAGE_SIZE / sizeof(u64); n++) { - u64 *p = ((u64 *)pg) + n; + /* fill lower 2k with urandom bytes */ + get_random_bytes(pg, PAGE_SIZE / 2); + /* exor upper 2k with 512 stckf values, offset 4 bytes each */ + for (n = 0; n < 512; n++) { + int offset = (PAGE_SIZE / 2) + (n * 4) - 4; + u64 *p = (u64 *)(pg + offset); *p ^= get_tod_clock_fast(); } - n = (nbytes < sizeof(hash)) ? nbytes : sizeof(hash); - if (n < sizeof(hash)) - h = hash; - else - h = ebuf; - /* hash over the filled pages */ - cpacf_kimd(CPACF_KIMD_SHA_512, h, pg, 2*PAGE_SIZE); - if (n < sizeof(hash)) - memcpy(ebuf, hash, n); + /* hash over the filled page */ + cpacf_klmd(CPACF_KLMD_SHA_512, pblock, pg, PAGE_SIZE); + n = (nbytes < 64) ?
nbytes : 64; + memcpy(ebuf, pblock, n); ret += n; ebuf += n; nbytes -= n; } - free_pages((unsigned long)pg, 1); + memzero_explicit(pblock, sizeof(pblock)); + memzero_explicit(pg, PAGE_SIZE); + free_page((unsigned long)pg); return ret; } @@ -344,8 +367,8 @@ static int __init prng_sha512_selftest(void) static int __init prng_sha512_instantiate(void) { - int ret, datalen; - u8 seed[64 + 32 + 16]; + int ret, datalen, seedlen; + u8 seed[128 + 16]; pr_debug("prng runs in SHA-512 mode " "with chunksize=%d and reseed_limit=%u\n", @@ -368,16 +391,36 @@ static int __init prng_sha512_instantiate(void) if (ret) goto outfree; - /* generate initial seed bytestring, with 256 + 128 bits entropy */ - ret = generate_entropy(seed, 64 + 32); - if (ret != 64 + 32) - goto outfree; - /* followed by 16 bytes of unique nonce */ - get_tod_clock_ext(seed + 64 + 32); + /* generate initial seed; we need at least 256 + 128 bits of entropy. */ + if (trng_available) { + /* + * Trng available, so use it. The trng works in chunks of + * 32 bytes and produces 100% entropy. So we pull 64 bytes + * which gives us 512 bits entropy. + */ + seedlen = 2 * 32; + cpacf_trng(NULL, 0, seed, seedlen); + } else { + /* + * No trng available, so use the generate_entropy() function. + * This function works in 64 byte chunks and produces + * 50% entropy. So we pull 2*64 bytes which gives us 512 bits + * of entropy. + */ + seedlen = 2 * 64; + ret = generate_entropy(seed, seedlen); + if (ret != seedlen) + goto outfree; + } + + /* append 16 bytes of unique nonce to the seed */ + get_tod_clock_ext(seed + seedlen); + seedlen += 16; - /* initial seed of the prno drng */ + /* now do the initial seed of the prno drng */ cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, - &prng_data->prnows, NULL, 0, seed, sizeof(seed)); + &prng_data->prnows, NULL, 0, seed, seedlen); + memzero_explicit(seed, sizeof(seed)); /* if fips mode is enabled, generate a first block of random bytes for the FIPS 140-2 Conditional Self Test */ @@ -405,17 +448,26 @@ static void prng_sha512_deinstantiate(void) static int prng_sha512_reseed(void) { - int ret; + int ret, seedlen; u8 seed[64]; - /* fetch 256 bits of fresh entropy */ - ret = generate_entropy(seed, sizeof(seed)); - if (ret != sizeof(seed)) - return ret; + /* We need at least 256 bits of fresh entropy for reseeding */ + if (trng_available) { + /* trng produces 256 bits entropy in 32 bytes */ + seedlen = 32; + cpacf_trng(NULL, 0, seed, seedlen); + } else { + /* generate_entropy() produces 256 bits entropy in 64 bytes */ + seedlen = 64; + ret = generate_entropy(seed, seedlen); + if (ret != sizeof(seed)) + return ret; + } /* do a reseed of the prno drng with this bytestring */ cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, - &prng_data->prnows, NULL, 0, seed, sizeof(seed)); + &prng_data->prnows, NULL, 0, seed, seedlen); + memzero_explicit(seed, sizeof(seed)); return 0; } @@ -592,6 +644,7 @@ static ssize_t prng_sha512_read(struct file *file, char __user *ubuf, ret = -EFAULT; break; } + memzero_explicit(p, n); ubuf += n; nbytes -= n; ret += n; @@ -773,6 +826,10 @@ static int __init prng_init(void) if (!cpacf_query_func(CPACF_KMC, CPACF_KMC_PRNG)) return -EOPNOTSUPP; + /* check if TRNG subfunction is available */ + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + trng_available = true; + /* choose prng mode */ if (prng_mode != PRNG_MODE_TDES) { /* check for MSA5 support for PRNO operations */ diff --git a/arch/s390/defconfig b/arch/s390/defconfig index 4d58a92b5d979f15e3469240c47a8e6f5fc4c189..c59b922cb6c561402134e03a0784654ac1500580 100644 --- 
a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -39,6 +39,7 @@ CONFIG_NR_CPUS=256 CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_VERIFY_SIG=y CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y diff --git a/arch/s390/hypfs/hypfs_diag0c.c b/arch/s390/hypfs/hypfs_diag0c.c index 72e3140fafb50a9b32d347a3b0e1a7807722f5e4..3235e4d82f2d550c209baaa6d94c5800e73c56c3 100644 --- a/arch/s390/hypfs/hypfs_diag0c.c +++ b/arch/s390/hypfs/hypfs_diag0c.c @@ -15,27 +15,13 @@ #define DBFS_D0C_HDR_VERSION 0 -/* - * Execute diagnose 0c in 31 bit mode - */ -static void diag0c(struct hypfs_diag0c_entry *entry) -{ - diag_stat_inc(DIAG_STAT_X00C); - asm volatile ( - " sam31\n" - " diag %0,%0,0x0c\n" - " sam64\n" - : /* no output register */ - : "a" (entry) - : "memory"); -} - /* * Get hypfs_diag0c_entry from CPU vector and store diag0c data */ static void diag0c_fn(void *data) { - diag0c(((void **) data)[smp_processor_id()]); + diag_stat_inc(DIAG_STAT_X00C); + diag_dma_ops.diag0c(((void **) data)[smp_processor_id()]); } /* diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 12d77cb11fe5a96269a7bde5fe6b8c6f11a23ea8..2531f673f099cb6ccdac14ff153f65eb59fcdc46 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -20,7 +20,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h -generic-y += rwsem.h +generic-y += mmiowb.h generic-y += trace_clock.h generic-y += unaligned.h generic-y += word-at-a-time.h diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h index fcf539efb32fb802d7c46030bda0a74c8df77639..c10d2ee2dfda4df8bc717d6800c5cd7f30d9e06d 100644 --- a/arch/s390/include/asm/airq.h +++ b/arch/s390/include/asm/airq.h @@ -14,7 +14,7 @@ struct airq_struct { struct hlist_node list; /* Handler queueing. 
*/ - void (*handler)(struct airq_struct *); /* Thin-interrupt handler */ + void (*handler)(struct airq_struct *airq, bool floating); u8 *lsi_ptr; /* Local-Summary-Indicator pointer */ u8 lsi_mask; /* Local-Summary-Indicator mask */ u8 isc; /* Interrupt-subclass */ @@ -35,13 +35,15 @@ struct airq_iv { unsigned int *data; /* 32 bit value associated with each bit */ unsigned long bits; /* Number of bits in the vector */ unsigned long end; /* Number of highest allocated bit + 1 */ + unsigned long flags; /* Allocation flags */ spinlock_t lock; /* Lock to protect alloc & free */ }; -#define AIRQ_IV_ALLOC 1 /* Use an allocation bit mask */ -#define AIRQ_IV_BITLOCK 2 /* Allocate the lock bit mask */ -#define AIRQ_IV_PTR 4 /* Allocate the ptr array */ -#define AIRQ_IV_DATA 8 /* Allocate the data array */ +#define AIRQ_IV_ALLOC 1 /* Use an allocation bit mask */ +#define AIRQ_IV_BITLOCK 2 /* Allocate the lock bit mask */ +#define AIRQ_IV_PTR 4 /* Allocate the ptr array */ +#define AIRQ_IV_DATA 8 /* Allocate the data array */ +#define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */ struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags); void airq_iv_release(struct airq_iv *iv); diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index d1f8a4d94cca02269841b6b6e0928ac699089b6b..9900d655014cc309559450316a436d67f90c3717 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -73,7 +73,7 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = 1UL << (nr & (BITS_PER_LONG - 1)); - __atomic64_or(mask, addr); + __atomic64_or(mask, (long *)addr); } static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) @@ -94,7 +94,7 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - __atomic64_and(mask, addr); + __atomic64_and(mask, (long *)addr); } static inline void change_bit(unsigned long nr, volatile unsigned long *ptr) @@ -115,7 +115,7 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = 1UL << (nr & (BITS_PER_LONG - 1)); - __atomic64_xor(mask, addr); + __atomic64_xor(mask, (long *)addr); } static inline int @@ -125,7 +125,7 @@ test_and_set_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); - old = __atomic64_or_barrier(mask, addr); + old = __atomic64_or_barrier(mask, (long *)addr); return (old & mask) != 0; } @@ -136,7 +136,7 @@ test_and_clear_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - old = __atomic64_and_barrier(mask, addr); + old = __atomic64_and_barrier(mask, (long *)addr); return (old & ~mask) != 0; } @@ -147,7 +147,7 @@ test_and_change_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); - old = __atomic64_xor_barrier(mask, addr); + old = __atomic64_xor_barrier(mask, (long *)addr); return (old & mask) != 0; } diff --git a/arch/s390/include/asm/boot_data.h b/arch/s390/include/asm/boot_data.h index 2d999ccb977a76b942cd0c3e864795f4777b1850..f7eed27b3220fcc7e3583f2889650468c7066422 100644 --- a/arch/s390/include/asm/boot_data.h +++ b/arch/s390/include/asm/boot_data.h @@ -5,7 +5,14 @@ #include extern char early_command_line[COMMAND_LINE_SIZE]; -extern struct ipl_parameter_block early_ipl_block; -extern int early_ipl_block_valid; +extern struct 
ipl_parameter_block ipl_block; +extern int ipl_block_valid; +extern int ipl_secure_flag; + +extern unsigned long ipl_cert_list_addr; +extern unsigned long ipl_cert_list_size; + +extern unsigned long early_ipl_comp_list_addr; +extern unsigned long early_ipl_comp_list_size; #endif /* _ASM_S390_BOOT_DATA_H */ diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index 429f43a8a8e81fbac2837179ec5cf54ab04a49ba..713fc9735ffb046d2794bd821e6df89e647b6e32 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -15,7 +15,7 @@ ".section .rodata.str,\"aMS\",@progbits,1\n" \ "2: .asciz \""__FILE__"\"\n" \ ".previous\n" \ - ".section __bug_table,\"aw\"\n" \ + ".section __bug_table,\"awM\",@progbits,%2\n" \ "3: .long 1b-3b,2b-3b\n" \ " .short %0,%1\n" \ " .org 3b+%2\n" \ @@ -27,17 +27,17 @@ #else /* CONFIG_DEBUG_BUGVERBOSE */ -#define __EMIT_BUG(x) do { \ - asm volatile( \ - "0: j 0b+2\n" \ - "1:\n" \ - ".section __bug_table,\"aw\"\n" \ - "2: .long 1b-2b\n" \ - " .short %0\n" \ - " .org 2b+%1\n" \ - ".previous\n" \ - : : "i" (x), \ - "i" (sizeof(struct bug_entry))); \ +#define __EMIT_BUG(x) do { \ + asm volatile( \ + "0: j 0b+2\n" \ + "1:\n" \ + ".section __bug_table,\"awM\",@progbits,%1\n" \ + "2: .long 1b-2b\n" \ + " .short %0\n" \ + " .org 2b+%1\n" \ + ".previous\n" \ + : : "i" (x), \ + "i" (sizeof(struct bug_entry))); \ } while (0) #endif /* CONFIG_DEBUG_BUGVERBOSE */ diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 19562be22b7e30c12cfa8c78ada1fb39ebbecd14..0036eab14391d35854a9540c5a1d64033f3eab1c 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -308,4 +308,17 @@ union diag318_info { int diag204(unsigned long subcode, unsigned long size, void *addr); int diag224(void *ptr); int diag26c(void *req, void *resp, enum diag26c_sc subcode); + +struct hypfs_diag0c_entry; + +struct diag_ops { + int (*diag210)(struct diag210 *addr); + int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode); + int (*diag14)(unsigned long rx, unsigned long ry1, unsigned long subcode); + void (*diag0c)(struct hypfs_diag0c_entry *entry); + void (*diag308_reset)(void); +}; + +extern struct diag_ops diag_dma_ops; +extern struct diag210 *__diag210_tmp_dma; #endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/include/asm/ebcdic.h b/arch/s390/include/asm/ebcdic.h index 29441beb92e604ce24d03a035c1063ecc0d0b0f1..efb50fc6866cd0a60362bede8e713693b861d4b9 100644 --- a/arch/s390/include/asm/ebcdic.h +++ b/arch/s390/include/asm/ebcdic.h @@ -20,7 +20,7 @@ extern __u8 _ebc_tolower[256]; /* EBCDIC -> lowercase */ extern __u8 _ebc_toupper[256]; /* EBCDIC -> uppercase */ static inline void -codepage_convert(const __u8 *codepage, volatile __u8 * addr, unsigned long nr) +codepage_convert(const __u8 *codepage, volatile char *addr, unsigned long nr) { if (nr-- <= 0) return; diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index f74639a05f0ffc33f638c264af58c48933e36139..5775fc22f41076621e44d093c531ac633e0271dd 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -107,6 +107,10 @@ #define HWCAP_S390_VXRS_BCD 4096 #define HWCAP_S390_VXRS_EXT 8192 #define HWCAP_S390_GS 16384 +#define HWCAP_S390_VXRS_EXT2 32768 +#define HWCAP_S390_VXRS_PDE 65536 +#define HWCAP_S390_SORT 131072 +#define HWCAP_S390_DFLT 262144 /* Internal bits, not exposed via elf */ #define HWCAP_INT_SIE 1UL diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h index 
80a4e5a9cb46022352f956ebe04c8007564c8879..ae27f756b409f4adddcc8f00dea2629271c8cf7d 100644 --- a/arch/s390/include/asm/extable.h +++ b/arch/s390/include/asm/extable.h @@ -19,6 +19,11 @@ struct exception_table_entry int insn, fixup; }; +extern struct exception_table_entry *__start_dma_ex_table; +extern struct exception_table_entry *__stop_dma_ex_table; + +const struct exception_table_entry *s390_search_extables(unsigned long addr); + static inline unsigned long extable_fixup(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 5a3c95b1195275092c65d554d81c24c47c70b849..68d362f8d6c17bafa112b08837dffd0dc8c47ad3 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -11,9 +11,16 @@ #define MCOUNT_RETURN_FIXUP 18 #endif +#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + #ifndef __ASSEMBLY__ +#ifdef CONFIG_CC_IS_CLANG +/* https://bugs.llvm.org/show_bug.cgi?id=41424 */ +#define ftrace_return_address(n) 0UL +#else #define ftrace_return_address(n) __builtin_return_address(n) +#endif void _mcount(void); void ftrace_caller(void); diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index f34d729347e4c7be32099e16ec79dedfa77438c1..ca421614722f60fccfdb1ab1d9a2f85a310d2250 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -30,14 +30,8 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define ioremap_wc ioremap_nocache #define ioremap_wt ioremap_nocache -static inline void __iomem *ioremap(unsigned long offset, unsigned long size) -{ - return (void __iomem *) offset; -} - -static inline void iounmap(volatile void __iomem *addr) -{ -} +void __iomem *ioremap(unsigned long offset, unsigned long size); +void iounmap(volatile void __iomem *addr); static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { @@ -57,14 +51,17 @@ static inline void ioport_unmap(void __iomem *p) * the corresponding device and create the mapping cookie. 
*/ #define pci_iomap pci_iomap +#define pci_iomap_range pci_iomap_range #define pci_iounmap pci_iounmap -#define pci_iomap_wc pci_iomap -#define pci_iomap_wc_range pci_iomap_range +#define pci_iomap_wc pci_iomap_wc +#define pci_iomap_wc_range pci_iomap_wc_range #define memcpy_fromio(dst, src, count) zpci_memcpy_fromio(dst, src, count) #define memcpy_toio(dst, src, count) zpci_memcpy_toio(dst, src, count) #define memset_io(dst, val, count) zpci_memset_io(dst, val, count) +#define mmiowb() zpci_barrier() + #define __raw_readb zpci_read_u8 #define __raw_readw zpci_read_u16 #define __raw_readl zpci_read_u32 diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index a8389e2d2f034f36af377a8bf04b2dbcad590b50..084e71b7272afd421be5b6b4a9021ede7c9dc907 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -12,74 +12,36 @@ #include #include #include +#include -#define NSS_NAME_SIZE 8 - -#define IPL_PARM_BLK_FCP_LEN (sizeof(struct ipl_list_hdr) + \ - sizeof(struct ipl_block_fcp)) - -#define IPL_PARM_BLK0_FCP_LEN (sizeof(struct ipl_block_fcp) + 16) +struct ipl_parameter_block { + struct ipl_pl_hdr hdr; + union { + struct ipl_pb_hdr pb0_hdr; + struct ipl_pb0_common common; + struct ipl_pb0_fcp fcp; + struct ipl_pb0_ccw ccw; + char raw[PAGE_SIZE - sizeof(struct ipl_pl_hdr)]; + }; +} __packed __aligned(PAGE_SIZE); -#define IPL_PARM_BLK_CCW_LEN (sizeof(struct ipl_list_hdr) + \ - sizeof(struct ipl_block_ccw)) +#define NSS_NAME_SIZE 8 -#define IPL_PARM_BLK0_CCW_LEN (sizeof(struct ipl_block_ccw) + 16) +#define IPL_BP_FCP_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_fcp)) +#define IPL_BP0_FCP_LEN (sizeof(struct ipl_pb0_fcp)) +#define IPL_BP_CCW_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_ccw)) +#define IPL_BP0_CCW_LEN (sizeof(struct ipl_pb0_ccw)) #define IPL_MAX_SUPPORTED_VERSION (0) -struct ipl_list_hdr { - u32 len; - u8 reserved1[3]; - u8 version; - u32 blk0_len; - u8 pbt; - u8 flags; - u16 reserved2; - u8 loadparm[8]; -} __attribute__((packed)); - -struct ipl_block_fcp { - u8 reserved1[305-1]; - u8 opt; - u8 reserved2[3]; - u16 reserved3; - u16 devno; - u8 reserved4[4]; - u64 wwpn; - u64 lun; - u32 bootprog; - u8 reserved5[12]; - u64 br_lba; - u32 scp_data_len; - u8 reserved6[260]; - u8 scp_data[]; -} __attribute__((packed)); - -#define DIAG308_VMPARM_SIZE 64 -#define DIAG308_SCPDATA_SIZE (PAGE_SIZE - (sizeof(struct ipl_list_hdr) + \ - offsetof(struct ipl_block_fcp, scp_data))) - -struct ipl_block_ccw { - u8 reserved1[84]; - u16 reserved2 : 13; - u8 ssid : 3; - u16 devno; - u8 vm_flags; - u8 reserved3[3]; - u32 vm_parm_len; - u8 nss_name[8]; - u8 vm_parm[DIAG308_VMPARM_SIZE]; - u8 reserved4[8]; -} __attribute__((packed)); +#define IPL_RB_CERT_UNKNOWN ((unsigned short)-1) -struct ipl_parameter_block { - struct ipl_list_hdr hdr; - union { - struct ipl_block_fcp fcp; - struct ipl_block_ccw ccw; - char raw[PAGE_SIZE - sizeof(struct ipl_list_hdr)]; - } ipl_info; -} __packed __aligned(PAGE_SIZE); +#define DIAG308_VMPARM_SIZE (64) +#define DIAG308_SCPDATA_OFFSET offsetof(struct ipl_parameter_block, \ + fcp.scp_data) +#define DIAG308_SCPDATA_SIZE (PAGE_SIZE - DIAG308_SCPDATA_OFFSET) struct save_area; struct save_area * __init save_area_alloc(bool is_boot_cpu); @@ -88,7 +50,6 @@ void __init save_area_add_regs(struct save_area *, void *regs); void __init save_area_add_vxrs(struct save_area *, __vector128 *vxrs); extern void s390_reset_system(void); -extern void ipl_store_parameters(void); extern size_t ipl_block_get_ascii_vmparm(char *dest, size_t 
size, const struct ipl_parameter_block *ipb); @@ -122,6 +83,33 @@ extern struct ipl_info ipl_info; extern void setup_ipl(void); extern void set_os_info_reipl_block(void); +struct ipl_report { + struct ipl_parameter_block *ipib; + struct list_head components; + struct list_head certificates; + size_t size; +}; + +struct ipl_report_component { + struct list_head list; + struct ipl_rb_component_entry entry; +}; + +struct ipl_report_certificate { + struct list_head list; + struct ipl_rb_certificate_entry entry; + void *key; +}; + +struct kexec_buf; +struct ipl_report *ipl_report_init(struct ipl_parameter_block *ipib); +void *ipl_report_finish(struct ipl_report *report); +int ipl_report_free(struct ipl_report *report); +int ipl_report_add_component(struct ipl_report *report, struct kexec_buf *kbuf, + unsigned char flags, unsigned short cert); +int ipl_report_add_certificate(struct ipl_report *report, void *key, + unsigned long addr, unsigned long len); + /* * DIAG 308 support */ @@ -133,32 +121,12 @@ enum diag308_subcode { DIAG308_STORE = 6, }; -enum diag308_ipl_type { - DIAG308_IPL_TYPE_FCP = 0, - DIAG308_IPL_TYPE_CCW = 2, -}; - -enum diag308_opt { - DIAG308_IPL_OPT_IPL = 0x10, - DIAG308_IPL_OPT_DUMP = 0x20, -}; - -enum diag308_flags { - DIAG308_FLAGS_LP_VALID = 0x80, -}; - -enum diag308_vm_flags { - DIAG308_VM_FLAGS_NSS_VALID = 0x80, - DIAG308_VM_FLAGS_VP_VALID = 0x40, -}; - enum diag308_rc { DIAG308_RC_OK = 0x0001, DIAG308_RC_NOCONFIG = 0x0102, }; extern int diag308(unsigned long subcode, void *addr); -extern void diag308_reset(void); extern void store_status(void (*fn)(void *), void *data); extern void lgr_info_log(void); diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index afaf5e3c57fd8b66b59b220699131c4ee70018cb..9f75d67b8c20707ed109d4c0ac8da0f368158e76 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -47,7 +47,6 @@ enum interruption_class { IRQEXT_CMC, IRQEXT_FTP, IRQIO_CIO, - IRQIO_QAI, IRQIO_DAS, IRQIO_C15, IRQIO_C70, @@ -55,12 +54,14 @@ enum interruption_class { IRQIO_VMR, IRQIO_LCS, IRQIO_CTC, - IRQIO_APB, IRQIO_ADM, IRQIO_CSC, - IRQIO_PCI, - IRQIO_MSI, IRQIO_VIR, + IRQIO_QAI, + IRQIO_APB, + IRQIO_PCF, + IRQIO_PCD, + IRQIO_MSI, IRQIO_VAI, IRQIO_GAL, NMI_NMI, diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index 825dd0f7f2211863a5b127c193220f4c6556dfa4..ea398a05f6432334154161606771c5b3d52e11a5 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -11,6 +11,7 @@ #include #include +#include /* * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. * I.e. Maximum page that is mapped directly into kernel memory, @@ -42,6 +43,9 @@ /* The native architecture */ #define KEXEC_ARCH KEXEC_ARCH_S390 +/* Allow kexec_file to load a segment to 0 */ +#define KEXEC_BUF_MEM_UNKNOWN -1 + /* Provide a dummy definition to avoid build failures. */ static inline void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs) { } @@ -51,20 +55,24 @@ struct s390_load_data { /* Pointer to the kernel buffer. Used to register cmdline etc.. */ void *kernel_buf; + /* Load address of the kernel_buf. */ + unsigned long kernel_mem; + + /* Parmarea in the kernel buffer. */ + struct parmarea *parm; + /* Total size of loaded segments in memory. Used as an offset. */ size_t memsz; - /* Load address of initrd. Used to register INITRD_START in kernel. 
*/ - unsigned long initrd_load_addr; + struct ipl_report *report; }; -int kexec_file_add_purgatory(struct kimage *image, - struct s390_load_data *data); -int kexec_file_add_initrd(struct kimage *image, - struct s390_load_data *data, - char *initrd, unsigned long initrd_len); -int *kexec_file_update_kernel(struct kimage *iamge, - struct s390_load_data *data); +int s390_verify_sig(const char *kernel, unsigned long kernel_len); +void *kexec_file_add_components(struct kimage *image, + int (*add_kernel)(struct kimage *image, + struct s390_load_data *data)); +int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, + unsigned long addr); extern const struct kexec_file_ops s390_kexec_image_ops; extern const struct kexec_file_ops s390_kexec_elf_ops; diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h index 1b95da3fdd64e0d40d4960f89dfb3768298b5cec..7f22262b0e46ca9c0a6a0a8b4d4ac2683b4862e4 100644 --- a/arch/s390/include/asm/linkage.h +++ b/arch/s390/include/asm/linkage.h @@ -28,5 +28,12 @@ .long (_target) - . ; \ .previous +#define EX_TABLE_DMA(_fault, _target) \ + .section .dma.ex_table, "a" ; \ + .align 4 ; \ + .long (_fault) - . ; \ + .long (_target) - . ; \ + .previous + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 5b9f10b1e55dec03c2878a6ab510cb0d128002e5..237ee0c4169f7c8d68bd58537d9c3982cd8f3a69 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -129,7 +129,7 @@ struct lowcore { /* SMP info area */ __u32 cpu_nr; /* 0x03a0 */ __u32 softirq_pending; /* 0x03a4 */ - __u32 preempt_count; /* 0x03a8 */ + __s32 preempt_count; /* 0x03a8 */ __u32 spinlock_lockval; /* 0x03ac */ __u32 spinlock_index; /* 0x03b0 */ __u32 fpu_flags; /* 0x03b4 */ diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h index 123dac3717b33f35a5810b14f06093d15b30b95e..0033dcd663b1bb12f005e3b3846b38dbd8b048a2 100644 --- a/arch/s390/include/asm/nospec-insn.h +++ b/arch/s390/include/asm/nospec-insn.h @@ -32,23 +32,23 @@ _LC_BR_R1 = __LC_BR_R1 .endm .macro __THUNK_PROLOG_BR r1,r2 - __THUNK_PROLOG_NAME __s390x_indirect_jump_r\r2\()use_r\r1 + __THUNK_PROLOG_NAME __s390_indirect_jump_r\r2\()use_r\r1 .endm .macro __THUNK_PROLOG_BC d0,r1,r2 - __THUNK_PROLOG_NAME __s390x_indirect_branch_\d0\()_\r2\()use_\r1 + __THUNK_PROLOG_NAME __s390_indirect_branch_\d0\()_\r2\()use_\r1 .endm .macro __THUNK_BR r1,r2 - jg __s390x_indirect_jump_r\r2\()use_r\r1 + jg __s390_indirect_jump_r\r2\()use_r\r1 .endm .macro __THUNK_BC d0,r1,r2 - jg __s390x_indirect_branch_\d0\()_\r2\()use_\r1 + jg __s390_indirect_branch_\d0\()_\r2\()use_\r1 .endm .macro __THUNK_BRASL r1,r2,r3 - brasl \r1,__s390x_indirect_jump_r\r3\()use_r\r2 + brasl \r1,__s390_indirect_jump_r\r3\()use_r\r2 .endm .macro __DECODE_RR expand,reg,ruse diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 4e0efebc56a959057b0774949e4bd0b4a2e09e86..305befd553264fb42fff53873e5280e31a47fbd3 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -26,6 +26,9 @@ int pci_proc_domain(struct pci_bus *); #define ZPCI_BUS_NR 0 /* default bus number */ #define ZPCI_DEVFN 0 /* default device number */ +#define ZPCI_NR_DMA_SPACES 1 +#define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS + /* PCI Function Controls */ #define ZPCI_FC_FN_ENABLED 0x80 #define ZPCI_FC_ERROR 0x40 @@ -83,6 +86,8 @@ enum zpci_state { struct zpci_bar_struct { struct resource *res; /* bus resource */ + void __iomem *mio_wb; + void 
__iomem *mio_wt; u32 val; /* bar start & 3 flag bits */ u16 map_idx; /* index into bar mapping array */ u8 size; /* order 2 exponent */ @@ -112,6 +117,8 @@ struct zpci_dev { /* IRQ stuff */ u64 msi_addr; /* MSI address */ unsigned int max_msi; /* maximum number of MSI's */ + unsigned int msi_first_bit; + unsigned int msi_nr_irqs; struct airq_iv *aibv; /* adapter interrupt bit vector */ unsigned long aisb; /* number of the summary bit */ @@ -130,6 +137,7 @@ struct zpci_dev { struct iommu_device iommu_dev; /* IOMMU core handle */ char res_name[16]; + bool mio_capable; struct zpci_bar_struct bars[PCI_BAR_COUNT]; u64 start_dma; /* Start of available DMA addresses */ @@ -158,6 +166,7 @@ static inline bool zdev_enabled(struct zpci_dev *zdev) } extern const struct attribute_group *zpci_attr_groups[]; +extern unsigned int s390_pci_force_floating __initdata; /* ----------------------------------------------------------------------------- Prototypes @@ -219,6 +228,9 @@ struct zpci_dev *get_zdev_by_fid(u32); int zpci_dma_init(void); void zpci_dma_exit(void); +int __init zpci_irq_init(void); +void __init zpci_irq_exit(void); + /* FMB */ int zpci_fmb_enable_device(struct zpci_dev *); int zpci_fmb_disable_device(struct zpci_dev *); diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index b3b31b31f0d33a0f1abdb84f796306fad4dc9ab8..3ec52a05d50061348cd730a953341584d27d642c 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -43,6 +43,8 @@ struct clp_fh_list_entry { #define CLP_SET_ENABLE_PCI_FN 0 /* Yes, 0 enables it */ #define CLP_SET_DISABLE_PCI_FN 1 /* Yes, 1 disables it */ +#define CLP_SET_ENABLE_MIO 2 +#define CLP_SET_DISABLE_MIO 3 #define CLP_UTIL_STR_LEN 64 #define CLP_PFIP_NR_SEGMENTS 4 @@ -80,7 +82,8 @@ struct clp_req_query_pci { struct clp_rsp_query_pci { struct clp_rsp_hdr hdr; u16 vfn; /* virtual fn number */ - u16 : 7; + u16 : 6; + u16 mio_addr_avail : 1; u16 util_str_avail : 1; /* utility string available? 
*/ u16 pfgid : 8; /* pci function group id */ u32 fid; /* pci function id */ @@ -96,6 +99,15 @@ struct clp_rsp_query_pci { u32 reserved[11]; u32 uid; /* user defined id */ u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */ + u32 reserved2[16]; + u32 mio_valid : 6; + u32 : 26; + u32 : 32; + struct { + u64 wb; + u64 wt; + } addr[PCI_BAR_COUNT]; + u32 reserved3[6]; } __packed; /* Query PCI function group request */ @@ -118,7 +130,11 @@ struct clp_rsp_query_pci_grp { u8 refresh : 1; /* TLB refresh mode */ u16 reserved2; u16 mui; - u64 reserved3; + u16 : 16; + u16 maxfaal; + u16 : 4; + u16 dnoi : 12; + u16 maxcpu; u64 dasm; /* dma address space mask */ u64 msia; /* MSI address */ u64 reserved4; diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index ba22a6ea51a144921d0722a35eea5e8265673454..ff81ed19c50656e5ccbf9042704b9980d0911925 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -2,6 +2,8 @@ #ifndef _ASM_S390_PCI_INSN_H #define _ASM_S390_PCI_INSN_H +#include + /* Load/Store status codes */ #define ZPCI_PCI_ST_FUNC_NOT_ENABLED 4 #define ZPCI_PCI_ST_FUNC_IN_ERR 8 @@ -38,6 +40,8 @@ #define ZPCI_MOD_FC_RESET_ERROR 7 #define ZPCI_MOD_FC_RESET_BLOCK 9 #define ZPCI_MOD_FC_SET_MEASURE 10 +#define ZPCI_MOD_FC_REG_INT_D 16 +#define ZPCI_MOD_FC_DEREG_INT_D 17 /* FIB function controls */ #define ZPCI_FIB_FC_ENABLED 0x80 @@ -51,16 +55,7 @@ #define ZPCI_FIB_FC_LS_BLOCKED 0x20 #define ZPCI_FIB_FC_DMAAS_REG 0x10 -/* Function Information Block */ -struct zpci_fib { - u32 fmt : 8; /* format */ - u32 : 24; - u32 : 32; - u8 fc; /* function controls */ - u64 : 56; - u64 pba; /* PCI base address */ - u64 pal; /* PCI address limit */ - u64 iota; /* I/O Translation Anchor */ +struct zpci_fib_fmt0 { u32 : 1; u32 isc : 3; /* Interrupt subclass */ u32 noi : 12; /* Number of interrupts */ @@ -72,16 +67,90 @@ struct zpci_fib { u32 : 32; u64 aibv; /* Adapter int bit vector address */ u64 aisb; /* Adapter int summary bit address */ +}; + +struct zpci_fib_fmt1 { + u32 : 4; + u32 noi : 12; + u32 : 16; + u32 dibvo : 16; + u32 : 16; + u64 : 64; + u64 : 64; +}; + +/* Function Information Block */ +struct zpci_fib { + u32 fmt : 8; /* format */ + u32 : 24; + u32 : 32; + u8 fc; /* function controls */ + u64 : 56; + u64 pba; /* PCI base address */ + u64 pal; /* PCI address limit */ + u64 iota; /* I/O Translation Anchor */ + union { + struct zpci_fib_fmt0 fmt0; + struct zpci_fib_fmt1 fmt1; + }; u64 fmb_addr; /* Function measurement block address and key */ u32 : 32; u32 gd; } __packed __aligned(8); +/* directed interruption information block */ +struct zpci_diib { + u32 : 1; + u32 isc : 3; + u32 : 28; + u16 : 16; + u16 nr_cpus; + u64 disb_addr; + u64 : 64; + u64 : 64; +} __packed __aligned(8); + +/* cpu directed interruption information block */ +struct zpci_cdiib { + u64 : 64; + u64 dibv_addr; + u64 : 64; + u64 : 64; + u64 : 64; +} __packed __aligned(8); + +union zpci_sic_iib { + struct zpci_diib diib; + struct zpci_cdiib cdiib; +}; + +DECLARE_STATIC_KEY_FALSE(have_mio); + u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status); int zpci_refresh_trans(u64 fn, u64 addr, u64 range); -int zpci_load(u64 *data, u64 req, u64 offset); -int zpci_store(u64 data, u64 req, u64 offset); -int zpci_store_block(const u64 *data, u64 req, u64 offset); -int zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc); +int __zpci_load(u64 *data, u64 req, u64 offset); +int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len); +int __zpci_store(u64 data, u64 req, u64 
offset); +int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len); +int __zpci_store_block(const u64 *data, u64 req, u64 offset); +void zpci_barrier(void); +int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); + +static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc) +{ + union zpci_sic_iib iib = {{0}}; + + return __zpci_set_irq_ctrl(ctl, isc, &iib); +} + +#ifdef CONFIG_PCI +static inline void enable_mio_ctl(void) +{ + if (static_branch_likely(&have_mio)) + __ctl_set_bit(2, 5); +} +#else /* CONFIG_PCI */ +static inline void enable_mio_ctl(void) {} +#endif /* CONFIG_PCI */ #endif diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h index cbb9cb9c65476d3268240d3cfe50fd348e9f4e1f..cd060b5dd8fdd1f1595a47e1edeca75ec8651aa4 100644 --- a/arch/s390/include/asm/pci_io.h +++ b/arch/s390/include/asm/pci_io.h @@ -37,12 +37,10 @@ extern struct zpci_iomap_entry *zpci_iomap_start; #define zpci_read(LENGTH, RETTYPE) \ static inline RETTYPE zpci_read_##RETTYPE(const volatile void __iomem *addr) \ { \ - struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; \ - u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, LENGTH); \ u64 data; \ int rc; \ \ - rc = zpci_load(&data, req, ZPCI_OFFSET(addr)); \ + rc = zpci_load(&data, addr, LENGTH); \ if (rc) \ data = -1ULL; \ return (RETTYPE) data; \ @@ -52,11 +50,9 @@ static inline RETTYPE zpci_read_##RETTYPE(const volatile void __iomem *addr) \ static inline void zpci_write_##VALTYPE(VALTYPE val, \ const volatile void __iomem *addr) \ { \ - struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; \ - u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, LENGTH); \ u64 data = (VALTYPE) val; \ \ - zpci_store(data, req, ZPCI_OFFSET(addr)); \ + zpci_store(addr, data, LENGTH); \ } zpci_read(8, u64) @@ -68,36 +64,38 @@ zpci_write(4, u32) zpci_write(2, u16) zpci_write(1, u8) -static inline int zpci_write_single(u64 req, const u64 *data, u64 offset, u8 len) +static inline int zpci_write_single(volatile void __iomem *dst, const void *src, + unsigned long len) { u64 val; switch (len) { case 1: - val = (u64) *((u8 *) data); + val = (u64) *((u8 *) src); break; case 2: - val = (u64) *((u16 *) data); + val = (u64) *((u16 *) src); break; case 4: - val = (u64) *((u32 *) data); + val = (u64) *((u32 *) src); break; case 8: - val = (u64) *((u64 *) data); + val = (u64) *((u64 *) src); break; default: val = 0; /* let FW report error */ break; } - return zpci_store(val, req, offset); + return zpci_store(dst, val, len); } -static inline int zpci_read_single(u64 req, u64 *dst, u64 offset, u8 len) +static inline int zpci_read_single(void *dst, const volatile void __iomem *src, + unsigned long len) { u64 data; int cc; - cc = zpci_load(&data, req, offset); + cc = zpci_load(&data, src, len); if (cc) goto out; @@ -119,10 +117,8 @@ out: return cc; } -static inline int zpci_write_block(u64 req, const u64 *data, u64 offset) -{ - return zpci_store_block(data, req, offset); -} +int zpci_write_block(volatile void __iomem *dst, const void *src, + unsigned long len); static inline u8 zpci_get_max_write_size(u64 src, u64 dst, int len, int max) { @@ -140,18 +136,15 @@ static inline int zpci_memcpy_fromio(void *dst, const volatile void __iomem *src, unsigned long n) { - struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(src)]; - u64 req, offset = ZPCI_OFFSET(src); int size, rc = 0; while (n > 0) { size = zpci_get_max_write_size((u64 __force) src, (u64) dst, n, 8); - req = ZPCI_CREATE_REQ(entry->fh, entry->bar, size); - rc = 
zpci_read_single(req, dst, offset, size); + rc = zpci_read_single(dst, src, size); if (rc) break; - offset += size; + src += size; dst += size; n -= size; } @@ -161,8 +154,6 @@ static inline int zpci_memcpy_fromio(void *dst, static inline int zpci_memcpy_toio(volatile void __iomem *dst, const void *src, unsigned long n) { - struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(dst)]; - u64 req, offset = ZPCI_OFFSET(dst); int size, rc = 0; if (!src) @@ -171,16 +162,14 @@ static inline int zpci_memcpy_toio(volatile void __iomem *dst, while (n > 0) { size = zpci_get_max_write_size((u64 __force) dst, (u64) src, n, 128); - req = ZPCI_CREATE_REQ(entry->fh, entry->bar, size); - if (size > 8) /* main path */ - rc = zpci_write_block(req, src, offset); + rc = zpci_write_block(dst, src, size); else - rc = zpci_write_single(req, src, offset, size); + rc = zpci_write_single(dst, src, size); if (rc) break; - offset += size; src += size; + dst += size; n -= size; } return rc; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 76dc344edb8cfd9c661b456d8c19afa54d0fa530..9f0195d5fa167f5d7817f2a231c6e9f8f31f9b9f 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -238,7 +238,7 @@ static inline int is_module_addr(void *addr) #define _REGION_ENTRY_NOEXEC 0x100 /* region no-execute bit */ #define _REGION_ENTRY_OFFSET 0xc0 /* region table offset */ #define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */ -#define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */ +#define _REGION_ENTRY_TYPE_MASK 0x0c /* region table type mask */ #define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */ #define _REGION_ENTRY_TYPE_R2 0x08 /* region second table type */ #define _REGION_ENTRY_TYPE_R3 0x04 /* region third table type */ @@ -277,6 +277,7 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */ #define _SEGMENT_ENTRY_NOEXEC 0x100 /* segment no-execute bit */ #define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */ +#define _SEGMENT_ENTRY_TYPE_MASK 0x0c /* segment table type mask */ #define _SEGMENT_ENTRY (0) #define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INVALID) @@ -614,15 +615,9 @@ static inline int pgd_none(pgd_t pgd) static inline int pgd_bad(pgd_t pgd) { - /* - * With dynamic page table levels the pgd can be a region table - * entry or a segment table entry. Check for the bit that are - * invalid for either table entry. 
- */ - unsigned long mask = - ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INVALID & - ~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH; - return (pgd_val(pgd) & mask) != 0; + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R1) + return 0; + return (pgd_val(pgd) & ~_REGION_ENTRY_BITS) != 0; } static inline unsigned long pgd_pfn(pgd_t pgd) @@ -703,6 +698,8 @@ static inline int pmd_large(pmd_t pmd) static inline int pmd_bad(pmd_t pmd) { + if ((pmd_val(pmd) & _SEGMENT_ENTRY_TYPE_MASK) > 0) + return 1; if (pmd_large(pmd)) return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0; return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0; @@ -710,8 +707,12 @@ static inline int pmd_bad(pmd_t pmd) static inline int pud_bad(pud_t pud) { - if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3) - return pmd_bad(__pmd(pud_val(pud))); + unsigned long type = pud_val(pud) & _REGION_ENTRY_TYPE_MASK; + + if (type > _REGION_ENTRY_TYPE_R3) + return 1; + if (type < _REGION_ENTRY_TYPE_R3) + return 0; if (pud_large(pud)) return (pud_val(pud) & ~_REGION_ENTRY_BITS_LARGE) != 0; return (pud_val(pud) & ~_REGION_ENTRY_BITS) != 0; @@ -719,8 +720,12 @@ static inline int pud_bad(pud_t pud) static inline int p4d_bad(p4d_t p4d) { - if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2) - return pud_bad(__pud(p4d_val(p4d))); + unsigned long type = p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK; + + if (type > _REGION_ENTRY_TYPE_R2) + return 1; + if (type < _REGION_ENTRY_TYPE_R2) + return 0; return (p4d_val(p4d) & ~_REGION_ENTRY_BITS) != 0; } @@ -1204,41 +1209,78 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address)) -#define pgd_offset_k(address) pgd_offset(&init_mm, address) -#define pgd_offset_raw(pgd, addr) ((pgd) + pgd_index(addr)) - #define pmd_deref(pmd) (pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN) #define pud_deref(pud) (pud_val(pud) & _REGION_ENTRY_ORIGIN) #define p4d_deref(pud) (p4d_val(pud) & _REGION_ENTRY_ORIGIN) #define pgd_deref(pgd) (pgd_val(pgd) & _REGION_ENTRY_ORIGIN) -static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) +/* + * The pgd_offset function *always* adds the index for the top-level + * region/segment table. This is done to get a sequence like the + * following to work: + * pgdp = pgd_offset(current->mm, addr); + * pgd = READ_ONCE(*pgdp); + * p4dp = p4d_offset(&pgd, addr); + * ... + * The subsequent p4d_offset, pud_offset and pmd_offset functions + * only add an index if they dereference the pointer.
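+ *
+ * The shift picked up in pgd_offset_raw below follows directly from
+ * the _REGION_ENTRY_TYPE_* values defined above: a region first table
+ * (type 0x0c) yields ((0x0c >> 2) * 11) + 20 = 53, a region second
+ * table 42, a region third table 31 and a segment table (type 0x00)
+ * 20, i.e. the index shift of the corresponding table level.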
+ */ +static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address) { - p4d_t *p4d = (p4d_t *) pgd; + unsigned long rste; + unsigned int shift; - if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1) - p4d = (p4d_t *) pgd_deref(*pgd); - return p4d + p4d_index(address); + /* Get the first entry of the top level table */ + rste = pgd_val(*pgd); + /* Pick up the shift from the table type of the first entry */ + shift = ((rste & _REGION_ENTRY_TYPE_MASK) >> 2) * 11 + 20; + return pgd + ((address >> shift) & (PTRS_PER_PGD - 1)); } -static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +#define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address) +#define pgd_offset_k(address) pgd_offset(&init_mm, address) + +static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { - pud_t *pud = (pud_t *) p4d; + if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1) + return (p4d_t *) pgd_deref(*pgd) + p4d_index(address); + return (p4d_t *) pgd; +} - if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) - pud = (pud_t *) p4d_deref(*p4d); - return pud + pud_index(address); +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +{ + if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2) + return (pud_t *) p4d_deref(*p4d) + pud_index(address); + return (pud_t *) p4d; } static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { - pmd_t *pmd = (pmd_t *) pud; + if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3) + return (pmd_t *) pud_deref(*pud) + pmd_index(address); + return (pmd_t *) pud; +} - if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - pmd = (pmd_t *) pud_deref(*pud); - return pmd + pmd_index(address); +static inline pte_t *pte_offset(pmd_t *pmd, unsigned long address) +{ + return (pte_t *) pmd_deref(*pmd) + pte_index(address); +} + +#define pte_offset_kernel(pmd, address) pte_offset(pmd, address) +#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) +#define pte_unmap(pte) do { } while (0) + +static inline bool gup_fast_permitted(unsigned long start, int nr_pages) +{ + unsigned long len, end; + + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + if (end < start) + return false; + return end <= current->mm->context.asce_limit; } +#define gup_fast_permitted gup_fast_permitted #define pfn_pte(pfn,pgprot) mk_pte_phys(__pa((pfn) << PAGE_SHIFT),(pgprot)) #define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT) @@ -1249,12 +1291,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) #define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) #define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) -/* Find an entry in the lowest level page table.. */ -#define pte_offset(pmd, addr) ((pte_t *) pmd_deref(*(pmd)) + pte_index(addr)) -#define pte_offset_kernel(pmd, address) pte_offset(pmd,address) -#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) -#define pte_unmap(pte) do { } while (0) - static inline pmd_t pmd_wrprotect(pmd_t pmd) { pmd_val(pmd) &= ~_SEGMENT_ENTRY_WRITE; diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 81038ab357ce955682b713f0c4241611ba5f931f..b0fcbc37b637dfd0bc7d87eb3d07efdd7b97c7db 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -156,25 +156,6 @@ struct thread_struct { typedef struct thread_struct thread_struct; -/* - * Stack layout of a C stack frame. 
- */ -#ifndef __PACK_STACK -struct stack_frame { - unsigned long back_chain; - unsigned long empty1[5]; - unsigned long gprs[10]; - unsigned int empty2[8]; -}; -#else -struct stack_frame { - unsigned long empty1[5]; - unsigned int empty2[8]; - unsigned long gprs[10]; - unsigned long back_chain; -}; -#endif - #define ARCH_MIN_TASKALIGN 8 #define INIT_THREAD { \ @@ -206,11 +187,7 @@ struct mm_struct; struct seq_file; struct pt_regs; -typedef int (*dump_trace_func_t)(void *data, unsigned long address, int reliable); -void dump_trace(dump_trace_func_t func, void *data, - struct task_struct *task, unsigned long sp); void show_registers(struct pt_regs *regs); - void show_cacheinfo(struct seq_file *m); /* Free all resources held by a thread. */ @@ -244,55 +221,6 @@ static __no_kasan_or_inline unsigned short stap(void) return cpu_address; } -#define CALL_ARGS_0() \ - register unsigned long r2 asm("2") -#define CALL_ARGS_1(arg1) \ - register unsigned long r2 asm("2") = (unsigned long)(arg1) -#define CALL_ARGS_2(arg1, arg2) \ - CALL_ARGS_1(arg1); \ - register unsigned long r3 asm("3") = (unsigned long)(arg2) -#define CALL_ARGS_3(arg1, arg2, arg3) \ - CALL_ARGS_2(arg1, arg2); \ - register unsigned long r4 asm("4") = (unsigned long)(arg3) -#define CALL_ARGS_4(arg1, arg2, arg3, arg4) \ - CALL_ARGS_3(arg1, arg2, arg3); \ - register unsigned long r4 asm("5") = (unsigned long)(arg4) -#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5) \ - CALL_ARGS_4(arg1, arg2, arg3, arg4); \ - register unsigned long r4 asm("6") = (unsigned long)(arg5) - -#define CALL_FMT_0 -#define CALL_FMT_1 CALL_FMT_0, "0" (r2) -#define CALL_FMT_2 CALL_FMT_1, "d" (r3) -#define CALL_FMT_3 CALL_FMT_2, "d" (r4) -#define CALL_FMT_4 CALL_FMT_3, "d" (r5) -#define CALL_FMT_5 CALL_FMT_4, "d" (r6) - -#define CALL_CLOBBER_5 "0", "1", "14", "cc", "memory" -#define CALL_CLOBBER_4 CALL_CLOBBER_5 -#define CALL_CLOBBER_3 CALL_CLOBBER_4, "5" -#define CALL_CLOBBER_2 CALL_CLOBBER_3, "4" -#define CALL_CLOBBER_1 CALL_CLOBBER_2, "3" -#define CALL_CLOBBER_0 CALL_CLOBBER_1 - -#define CALL_ON_STACK(fn, stack, nr, args...) \ -({ \ - CALL_ARGS_##nr(args); \ - unsigned long prev; \ - \ - asm volatile( \ - " la %[_prev],0(15)\n" \ - " la 15,0(%[_stack])\n" \ - " stg %[_prev],%[_bc](15)\n" \ - " brasl 14,%[_fn]\n" \ - " la 15,0(%[_prev])\n" \ - : "+&d" (r2), [_prev] "=&a" (prev) \ - : [_stack] "a" (stack), \ - [_bc] "i" (offsetof(struct stack_frame, back_chain)), \ - [_fn] "X" (fn) CALL_FMT_##nr : CALL_CLOBBER_##nr); \ - r2; \ -}) - /* * Give up the time slice of the virtual PU. 
*/ @@ -339,10 +267,10 @@ static __no_kasan_or_inline void __load_psw_mask(unsigned long mask) asm volatile( " larl %0,1f\n" - " stg %0,%O1+8(%R1)\n" - " lpswe %1\n" + " stg %0,%1\n" + " lpswe %2\n" "1:" - : "=&d" (addr), "=Q" (psw) : "Q" (psw) : "memory", "cc"); + : "=&d" (addr), "=Q" (psw.addr) : "Q" (psw) : "memory", "cc"); } /* @@ -387,12 +315,12 @@ void enabled_wait(void); /* * Function to drop a processor into disabled wait state */ -static inline void __noreturn disabled_wait(unsigned long code) +static inline void __noreturn disabled_wait(void) { psw_t psw; psw.mask = PSW_MASK_BASE | PSW_MASK_WAIT | PSW_MASK_BA | PSW_MASK_EA; - psw.addr = code; + psw.addr = _THIS_IP_; __load_psw(psw); while (1); } diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index ef4c9dec06a471addcb316f96fb135b79db06ca3..f577c5f6031adbfb318547575ca82e87848d50e6 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -79,6 +79,9 @@ struct sclp_info { unsigned char has_kss : 1; unsigned char has_gisaf : 1; unsigned char has_diag318 : 1; + unsigned char has_sipl : 1; + unsigned char has_sipl_g2 : 1; + unsigned char has_dirq : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index 7afe4620685c93aac9b4bb9efaf196aacd4b055c..42de04ad9c07bee8967b03c6285e0741702e2e67 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -2,8 +2,20 @@ #ifndef _S390_SECTIONS_H #define _S390_SECTIONS_H +#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed + #include +extern bool initmem_freed; + +static inline int arch_is_kernel_initmem_freed(unsigned long addr) +{ + if (!initmem_freed) + return 0; + return addr >= (unsigned long)__init_begin && + addr < (unsigned long)__init_end; +} + /* * .boot.data section contains variables "shared" between the decompressor and * the decompressed kernel. The decompressor will store values in them, and @@ -16,4 +28,14 @@ */ #define __bootdata(var) __section(.boot.data.var) var +/* + * .boot.preserved.data is similar to .boot.data, but it is not part of the + * .init section and thus will be preserved for later use in the decompressed + * kernel. 
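+ *
+ * Usage mirrors __bootdata, e.g. the declaration
+ *   struct diag_ops __bootdata_preserved(diag_dma_ops);
+ * in arch/s390/kernel/diag.c further down in this patch: the
+ * decompressor stores the value, and the decompressed kernel can
+ * still read it after initmem has been freed.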
+ */ +#define __bootdata_preserved(var) __section(.boot.preserved.data.var) var + +extern unsigned long __sdma, __edma; +extern unsigned long __stext_dma, __etext_dma; + #endif diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index efda97804aa4a5dcedba8a2512c3c890411c587d..925889d360c1dffa32d7242dbcc3bb817e0a21da 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -12,7 +12,10 @@ #define EP_OFFSET 0x10008 #define EP_STRING "S390EP" #define PARMAREA 0x10400 -#define PARMAREA_END 0x11000 +#define EARLY_SCCB_OFFSET 0x11000 +#define HEAD_END 0x12000 + +#define EARLY_SCCB_SIZE PAGE_SIZE /* * Machine features detected in early.c @@ -65,6 +68,16 @@ #define OLDMEM_SIZE (*(unsigned long *) (OLDMEM_SIZE_OFFSET)) #define COMMAND_LINE ((char *) (COMMAND_LINE_OFFSET)) +struct parmarea { + unsigned long ipl_device; /* 0x10400 */ + unsigned long initrd_start; /* 0x10408 */ + unsigned long initrd_size; /* 0x10410 */ + unsigned long oldmem_base; /* 0x10418 */ + unsigned long oldmem_size; /* 0x10420 */ + char pad1[0x10480 - 0x10428]; /* 0x10428 - 0x10480 */ + char command_line[ARCH_COMMAND_LINE_SIZE]; /* 0x10480 */ +}; + extern int noexec_disabled; extern int memory_end_set; extern unsigned long memory_end; @@ -134,6 +147,12 @@ extern void (*_machine_restart)(char *command); extern void (*_machine_halt)(void); extern void (*_machine_power_off)(void); +extern unsigned long __kaslr_offset; +static inline unsigned long kaslr_offset(void) +{ + return __kaslr_offset; +} + #else /* __ASSEMBLY__ */ #define IPL_DEVICE (IPL_DEVICE_OFFSET) diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h new file mode 100644 index 0000000000000000000000000000000000000000..49634bfbecdd084daa1e01e05048f577bb1d26f0 --- /dev/null +++ b/arch/s390/include/asm/stacktrace.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_STACKTRACE_H +#define _ASM_S390_STACKTRACE_H + +#include +#include +#include + +enum stack_type { + STACK_TYPE_UNKNOWN, + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_NODAT, + STACK_TYPE_RESTART, +}; + +struct stack_info { + enum stack_type type; + unsigned long begin, end; +}; + +const char *stack_type_name(enum stack_type type); +int get_stack_info(unsigned long sp, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask); + +static inline bool on_stack(struct stack_info *info, + unsigned long addr, size_t len) +{ + if (info->type == STACK_TYPE_UNKNOWN) + return false; + if (addr + len < addr) + return false; + return addr >= info->begin && addr + len < info->end; +} + +static inline unsigned long get_stack_pointer(struct task_struct *task, + struct pt_regs *regs) +{ + if (regs) + return (unsigned long) kernel_stack_pointer(regs); + if (task == current) + return current_stack_pointer(); + return (unsigned long) task->thread.ksp; +} + +/* + * Stack layout of a C stack frame. 
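+ * The two variants below only differ in member order (selected by
+ * __PACK_STACK); in both, back_chain links to the caller's frame and
+ * gprs[] holds the saved registers r6-r15, so gprs[8] is the saved
+ * return address (r14) that the unwinder reports for a frame.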
+ */ +#ifndef __PACK_STACK +struct stack_frame { + unsigned long back_chain; + unsigned long empty1[5]; + unsigned long gprs[10]; + unsigned int empty2[8]; +}; +#else +struct stack_frame { + unsigned long empty1[5]; + unsigned int empty2[8]; + unsigned long gprs[10]; + unsigned long back_chain; +}; +#endif + +#define CALL_ARGS_0() \ + register unsigned long r2 asm("2") +#define CALL_ARGS_1(arg1) \ + register unsigned long r2 asm("2") = (unsigned long)(arg1) +#define CALL_ARGS_2(arg1, arg2) \ + CALL_ARGS_1(arg1); \ + register unsigned long r3 asm("3") = (unsigned long)(arg2) +#define CALL_ARGS_3(arg1, arg2, arg3) \ + CALL_ARGS_2(arg1, arg2); \ + register unsigned long r4 asm("4") = (unsigned long)(arg3) +#define CALL_ARGS_4(arg1, arg2, arg3, arg4) \ + CALL_ARGS_3(arg1, arg2, arg3); \ + register unsigned long r5 asm("5") = (unsigned long)(arg4) +#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5) \ + CALL_ARGS_4(arg1, arg2, arg3, arg4); \ + register unsigned long r6 asm("6") = (unsigned long)(arg5) + +#define CALL_FMT_0 "=&d" (r2) : +#define CALL_FMT_1 "+&d" (r2) : +#define CALL_FMT_2 CALL_FMT_1 "d" (r3), +#define CALL_FMT_3 CALL_FMT_2 "d" (r4), +#define CALL_FMT_4 CALL_FMT_3 "d" (r5), +#define CALL_FMT_5 CALL_FMT_4 "d" (r6), + +#define CALL_CLOBBER_5 "0", "1", "14", "cc", "memory" +#define CALL_CLOBBER_4 CALL_CLOBBER_5 +#define CALL_CLOBBER_3 CALL_CLOBBER_4, "5" +#define CALL_CLOBBER_2 CALL_CLOBBER_3, "4" +#define CALL_CLOBBER_1 CALL_CLOBBER_2, "3" +#define CALL_CLOBBER_0 CALL_CLOBBER_1 + +#define CALL_ON_STACK(fn, stack, nr, args...) \ +({ \ + CALL_ARGS_##nr(args); \ + unsigned long prev; \ + \ + asm volatile( \ + " la %[_prev],0(15)\n" \ + " la 15,0(%[_stack])\n" \ + " stg %[_prev],%[_bc](15)\n" \ + " brasl 14,%[_fn]\n" \ + " la 15,0(%[_prev])\n" \ + : [_prev] "=&a" (prev), CALL_FMT_##nr \ + [_stack] "a" (stack), \ + [_bc] "i" (offsetof(struct stack_frame, back_chain)), \ + [_fn] "X" (fn) : CALL_CLOBBER_##nr); \ + r2; \ +}) + +#endif /* _ASM_S390_STACKTRACE_H */ diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index 96f9a9151fde02fc6f76633d76d292f47512d364..ab3407aa4fd8232ec32b3ff5e898f75f092b7170 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -14,13 +14,8 @@ #include #include -/* - * The syscall table always contains 32 bit pointers since we know that the - * address of the function to be called is (way) below 4GB. So the "int" - * type here is what we want [need] for both 32 bit and 64 bit systems. - */ -extern const unsigned int sys_call_table[]; -extern const unsigned int sys_call_table_emu[]; +extern const unsigned long sys_call_table[]; +extern const unsigned long sys_call_table_emu[]; static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
- */ - if (!n) - return; - - BUG_ON(i + n > 6); #ifdef CONFIG_COMPAT if (test_tsk_thread_flag(task, TIF_31BIT)) mask = 0xffffffff; #endif while (n-- > 0) - if (i + n > 0) - args[n] = regs->gprs[2 + i + n] & mask; - if (i == 0) - args[0] = regs->orig_gpr2 & mask; + if (n > 0) + args[n] = regs->gprs[2 + n] & mask; + + args[0] = regs->orig_gpr2 & mask; } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) { - BUG_ON(i + n > 6); + unsigned int n = 6; + while (n-- > 0) - if (i + n > 0) - regs->gprs[2 + i + n] = args[n]; - if (i == 0) - regs->orig_gpr2 = args[0]; + if (n > 0) + regs->gprs[2 + n] = args[n]; + regs->orig_gpr2 = args[0]; } static inline int syscall_get_arch(void) diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h index 5596c5c625d26b3ab25c77cb96283a163335a443..3c3d6fe8e2f026e7cf166e9bef6bc1d2ed9fb925 100644 --- a/arch/s390/include/asm/syscall_wrapper.h +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -119,8 +119,8 @@ "Type aliasing is used to sanitize syscall arguments");\ asmlinkage long __s390x_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_sys##name)))); \ - ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ - static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ + long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ __S390_SYS_STUBx(x, name, __VA_ARGS__) \ asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index b31c779cf58176ad3bf91ee816053cbcf40b3476..aa406c05a350589567699bfb304e1dfdbe287d25 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -22,98 +22,39 @@ * Pages used for the page tables is a different story. 
FIXME: more */ -#include -#include -#include -#include -#include -#include - -struct mmu_gather { - struct mm_struct *mm; - struct mmu_table_batch *batch; - unsigned int fullmm; - unsigned long start, end; -}; - -struct mmu_table_batch { - struct rcu_head rcu; - unsigned int nr; - void *tables[0]; -}; - -#define MAX_TABLE_BATCH \ - ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) - -extern void tlb_table_flush(struct mmu_gather *tlb); -extern void tlb_remove_table(struct mmu_gather *tlb, void *table); - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->start = start; - tlb->end = end; - tlb->fullmm = !(start | (end+1)); - tlb->batch = NULL; -} - -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ - __tlb_flush_mm_lazy(tlb->mm); -} - -static inline void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - tlb_table_flush(tlb); -} - +void __tlb_remove_table(void *_table); +static inline void tlb_flush(struct mmu_gather *tlb); +static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size); -static inline void tlb_flush_mmu(struct mmu_gather *tlb) -{ - tlb_flush_mmu_tlbonly(tlb); - tlb_flush_mmu_free(tlb); -} +#define tlb_start_vma(tlb, vma) do { } while (0) +#define tlb_end_vma(tlb, vma) do { } while (0) -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (force) { - tlb->start = start; - tlb->end = end; - } +#define tlb_flush tlb_flush +#define pte_free_tlb pte_free_tlb +#define pmd_free_tlb pmd_free_tlb +#define p4d_free_tlb p4d_free_tlb +#define pud_free_tlb pud_free_tlb - tlb_flush_mmu(tlb); -} +#include +#include +#include /* * Release the page cache reference for a pte removed by * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page * has already been freed, so just do free_page_and_swap_cache. */ -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - free_page_and_swap_cache(page); - return false; /* avoid calling tlb_flush_mmu */ -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - free_page_and_swap_cache(page); -} - static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - return __tlb_remove_page(tlb, page); + free_page_and_swap_cache(page); + return false; } -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) +static inline void tlb_flush(struct mmu_gather *tlb) { - return tlb_remove_page(tlb, page); + __tlb_flush_mm_lazy(tlb->mm); } /* @@ -121,8 +62,17 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, * page table from the tlb. */ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, - unsigned long address) + unsigned long address) { + __tlb_adjust_range(tlb, address, PAGE_SIZE); + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_ptes = 1; + /* + * page_table_free_rcu takes care of the allocation bit masks + * of the 2K table fragments in the 4K page table page, + * then calls tlb_remove_table. 
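+ * The flush_mm/freed_tables/cleared_ptes bits set above record
+ * that a page table was removed, so the TLB gets flushed before
+ * the table page is finally freed.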
+ */ page_table_free_rcu(tlb, (unsigned long *) pte, address); } @@ -139,6 +89,10 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, if (mm_pmd_folded(tlb->mm)) return; pgtable_pmd_page_dtor(virt_to_page(pmd)); + __tlb_adjust_range(tlb, address, PAGE_SIZE); + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_puds = 1; tlb_remove_table(tlb, pmd); } @@ -154,6 +108,10 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, { if (mm_p4d_folded(tlb->mm)) return; + __tlb_adjust_range(tlb, address, PAGE_SIZE); + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_p4ds = 1; tlb_remove_table(tlb, p4d); } @@ -169,21 +127,11 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, { if (mm_pud_folded(tlb->mm)) return; + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_puds = 1; tlb_remove_table(tlb, pud); } -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0) -#define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0) -#define tlb_migrate_finish(mm) do { } while (0) -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ -} #endif /* _S390_TLB_H */ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 007fcb9aeeb83eb99d00a0ea116236f8213e9db1..bd2fd9a7821da55773883e6ede43f42e1518ee3c 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -55,8 +55,10 @@ raw_copy_from_user(void *to, const void __user *from, unsigned long n); unsigned long __must_check raw_copy_to_user(void __user *to, const void *from, unsigned long n); +#ifndef CONFIG_KASAN #define INLINE_COPY_FROM_USER #define INLINE_COPY_TO_USER +#endif #ifdef CONFIG_HAVE_MARCH_Z10_FEATURES diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h new file mode 100644 index 0000000000000000000000000000000000000000..6eb2ef105d8799a4fdd5b206835a390421a1033d --- /dev/null +++ b/arch/s390/include/asm/unwind.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_UNWIND_H +#define _ASM_S390_UNWIND_H + +#include +#include +#include +#include + +/* + * To use the stack unwinder it has to be initialized with unwind_start. + * There are four combinations for task and regs: + * 1) task==NULL, regs==NULL: the unwind starts for the task that is currently + * running, sp/ip picked up from the CPU registers + * 2) task==NULL, regs!=NULL: the unwind starts from the sp/ip found in + * the struct pt_regs of an interrupt frame for the current task + * 3) task!=NULL, regs==NULL: the unwind starts for an inactive task with + * the sp picked up from task->thread.ksp and the ip picked up from the + * return address stored by __switch_to + * 4) task!=NULL, regs!=NULL: the sp/ip are picked up from the interrupt + * frame 'regs' of an inactive task + * If 'first_frame' is not zero, unwind_start skips unwind frames until it + * reaches the specified stack pointer. + * The end of the unwinding is indicated by unwind_done; this can be true + * right after unwind_start, e.g. when a first_frame!=0 cannot be found. + * unwind_next_frame skips to the next frame.
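+ * A typical walk, open-coding what the unwind_for_each_frame helper
+ * below expands to (show_stack in dumpstack.c uses the helper form):
+ *   struct unwind_state state;
+ *   unwind_start(&state, task, NULL, 0);
+ *   while (!unwind_done(&state)) {
+ *           printk("%pSR\n", (void *) state.ip);
+ *           unwind_next_frame(&state);
+ *   }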
+ * Once the unwind is completed, unwind_error() can be used to check if there + * has been a situation where the unwinder could not correctly understand + * the task's call chain. + */ + +struct unwind_state { + struct stack_info stack_info; + unsigned long stack_mask; + struct task_struct *task; + struct pt_regs *regs; + unsigned long sp, ip; + int graph_idx; + bool reliable; + bool error; +}; + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long first_frame); +bool unwind_next_frame(struct unwind_state *state); +unsigned long unwind_get_return_address(struct unwind_state *state); + +static inline bool unwind_done(struct unwind_state *state) +{ + return state->stack_info.type == STACK_TYPE_UNKNOWN; +} + +static inline bool unwind_error(struct unwind_state *state) +{ + return state->error; +} + +static inline void unwind_start(struct unwind_state *state, + struct task_struct *task, + struct pt_regs *regs, + unsigned long sp) +{ + sp = sp ? : get_stack_pointer(task, regs); + __unwind_start(state, task, regs, sp); +} + +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +{ + return unwind_done(state) ? NULL : state->regs; +} + +#define unwind_for_each_frame(state, task, regs, first_frame) \ + for (unwind_start(state, task, regs, first_frame); \ + !unwind_done(state); \ + unwind_next_frame(state)) + +static inline void unwind_init(void) {} +static inline void unwind_module_init(struct module *mod, void *orc_ip, + size_t orc_ip_size, void *orc, + size_t orc_size) {} + +#ifdef CONFIG_KASAN +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) +#else +#define READ_ONCE_TASK_STACK(task, x) READ_ONCE(x) +#endif + +#endif /* _ASM_S390_UNWIND_H */ diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h new file mode 100644 index 0000000000000000000000000000000000000000..ef3c00b049ab45b4cd050d97a30e8210b63f7676 --- /dev/null +++ b/arch/s390/include/asm/uv.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Ultravisor Interfaces + * + * Copyright IBM Corp.
2019 + * + * Author(s): + * Vasily Gorbik + * Janosch Frank + */ +#ifndef _ASM_S390_UV_H +#define _ASM_S390_UV_H + +#include +#include +#include +#include + +#define UVC_RC_EXECUTED 0x0001 +#define UVC_RC_INV_CMD 0x0002 +#define UVC_RC_INV_STATE 0x0003 +#define UVC_RC_INV_LEN 0x0005 +#define UVC_RC_NO_RESUME 0x0007 + +#define UVC_CMD_QUI 0x0001 +#define UVC_CMD_SET_SHARED_ACCESS 0x1000 +#define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001 + +/* Bits in installed uv calls */ +enum uv_cmds_inst { + BIT_UVC_CMD_QUI = 0, + BIT_UVC_CMD_SET_SHARED_ACCESS = 8, + BIT_UVC_CMD_REMOVE_SHARED_ACCESS = 9, +}; + +struct uv_cb_header { + u16 len; + u16 cmd; /* Command Code */ + u16 rc; /* Response Code */ + u16 rrc; /* Return Reason Code */ +} __packed __aligned(8); + +struct uv_cb_qui { + struct uv_cb_header header; + u64 reserved08; + u64 inst_calls_list[4]; + u64 reserved30[15]; +} __packed __aligned(8); + +struct uv_cb_share { + struct uv_cb_header header; + u64 reserved08[3]; + u64 paddr; + u64 reserved28; +} __packed __aligned(8); + +static inline int uv_call(unsigned long r1, unsigned long r2) +{ + int cc; + + asm volatile( + "0: .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n" + " brc 3,0b\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=d" (cc) + : [r1] "a" (r1), [r2] "a" (r2) + : "memory", "cc"); + return cc; +} + +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST +extern int prot_virt_guest; + +static inline int is_prot_virt_guest(void) +{ + return prot_virt_guest; +} + +static inline int share(unsigned long addr, u16 cmd) +{ + struct uv_cb_share uvcb = { + .header.cmd = cmd, + .header.len = sizeof(uvcb), + .paddr = addr + }; + + if (!is_prot_virt_guest()) + return -ENOTSUPP; + /* + * Sharing is page wise; if we encounter addresses that are + * not page aligned, we assume something went wrong. If + * malloced structs are passed to this function, we could leak + * data to the hypervisor. + */ + BUG_ON(addr & ~PAGE_MASK); + + if (!uv_call(0, (u64)&uvcb)) + return 0; + return -EINVAL; +} + +/* + * Guest 2 request to the Ultravisor to make a page shared with the + * hypervisor for IO. + * + * @addr: Real or absolute address of the page to be shared + */ +static inline int uv_set_shared(unsigned long addr) +{ + return share(addr, UVC_CMD_SET_SHARED_ACCESS); +} + +/* + * Guest 2 request to the Ultravisor to make a page unshared. + * + * @addr: Real or absolute address of the page to be unshared + */ +static inline int uv_remove_shared(unsigned long addr) +{ + return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS); +} + +void uv_query_info(void); +#else +#define is_prot_virt_guest() 0 +static inline int uv_set_shared(unsigned long addr) { return 0; } +static inline int uv_remove_shared(unsigned long addr) { return 0; } +static inline void uv_query_info(void) {} +#endif + +#endif /* _ASM_S390_UV_H */ diff --git a/arch/s390/include/asm/vmlinux.lds.h b/arch/s390/include/asm/vmlinux.lds.h index 2d127f900352ddc13b468b11aa9fd457572b47d6..cbe670a6861b004f7696d4b1fe84de6d451efc52 100644 --- a/arch/s390/include/asm/vmlinux.lds.h +++ b/arch/s390/include/asm/vmlinux.lds.h @@ -18,3 +18,16 @@ *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.boot.data*))) \ __boot_data_end = .; \ } + +/* + * .boot.preserved.data is similar to .boot.data, but it is not part of the + * .init section and thus will be preserved for later use in the decompressed + * kernel. + */ +#define BOOT_DATA_PRESERVED \ + .
= ALIGN(PAGE_SIZE); \ + .boot.preserved.data : { \ + __boot_data_preserved_start = .; \ + *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.boot.preserved.data*))) \ + __boot_data_preserved_end = .; \ + } diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h new file mode 100644 index 0000000000000000000000000000000000000000..fd32b1cd80d2944b5376574f85f10db855cacab1 --- /dev/null +++ b/arch/s390/include/uapi/asm/ipl.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_UAPI_IPL_H +#define _ASM_S390_UAPI_IPL_H + +#include + +/* IPL Parameter List header */ +struct ipl_pl_hdr { + __u32 len; + __u8 flags; + __u8 reserved1[2]; + __u8 version; +} __packed; + +#define IPL_PL_FLAG_IPLPS 0x80 +#define IPL_PL_FLAG_SIPL 0x40 +#define IPL_PL_FLAG_IPLSR 0x20 + +/* IPL Parameter Block header */ +struct ipl_pb_hdr { + __u32 len; + __u8 pbt; +} __packed; + +/* IPL Parameter Block types */ +enum ipl_pbt { + IPL_PBT_FCP = 0, + IPL_PBT_SCP_DATA = 1, + IPL_PBT_CCW = 2, +}; + +/* IPL Parameter Block 0 with common fields */ +struct ipl_pb0_common { + __u32 len; + __u8 pbt; + __u8 flags; + __u8 reserved1[2]; + __u8 loadparm[8]; + __u8 reserved2[84]; +} __packed; + +#define IPL_PB0_FLAG_LOADPARM 0x80 + +/* IPL Parameter Block 0 for FCP */ +struct ipl_pb0_fcp { + __u32 len; + __u8 pbt; + __u8 reserved1[3]; + __u8 loadparm[8]; + __u8 reserved2[304]; + __u8 opt; + __u8 reserved3[3]; + __u8 cssid; + __u8 reserved4[1]; + __u16 devno; + __u8 reserved5[4]; + __u64 wwpn; + __u64 lun; + __u32 bootprog; + __u8 reserved6[12]; + __u64 br_lba; + __u32 scp_data_len; + __u8 reserved7[260]; + __u8 scp_data[]; +} __packed; + +#define IPL_PB0_FCP_OPT_IPL 0x10 +#define IPL_PB0_FCP_OPT_DUMP 0x20 + +/* IPL Parameter Block 0 for CCW */ +struct ipl_pb0_ccw { + __u32 len; + __u8 pbt; + __u8 flags; + __u8 reserved1[2]; + __u8 loadparm[8]; + __u8 reserved2[84]; + __u16 reserved3 : 13; + __u8 ssid : 3; + __u16 devno; + __u8 vm_flags; + __u8 reserved4[3]; + __u32 vm_parm_len; + __u8 nss_name[8]; + __u8 vm_parm[64]; + __u8 reserved5[8]; +} __packed; + +#define IPL_PB0_CCW_VM_FLAG_NSS 0x80 +#define IPL_PB0_CCW_VM_FLAG_VP 0x40 + +/* IPL Parameter Block 1 for additional SCP data */ +struct ipl_pb1_scp_data { + __u32 len; + __u8 pbt; + __u8 scp_data[]; +} __packed; + +/* IPL Report List header */ +struct ipl_rl_hdr { + __u32 len; + __u8 flags; + __u8 reserved1[2]; + __u8 version; + __u8 reserved2[8]; +} __packed; + +/* IPL Report Block header */ +struct ipl_rb_hdr { + __u32 len; + __u8 rbt; + __u8 reserved1[11]; +} __packed; + +/* IPL Report Block types */ +enum ipl_rbt { + IPL_RBT_CERTIFICATES = 1, + IPL_RBT_COMPONENTS = 2, +}; + +/* IPL Report Block for the certificate list */ +struct ipl_rb_certificate_entry { + __u64 addr; + __u64 len; +} __packed; + +struct ipl_rb_certificates { + __u32 len; + __u8 rbt; + __u8 reserved1[11]; + struct ipl_rb_certificate_entry entries[]; +} __packed; + +/* IPL Report Block for the component list */ +struct ipl_rb_component_entry { + __u64 addr; + __u64 len; + __u8 flags; + __u8 reserved1[5]; + __u16 certificate_index; + __u8 reserved2[8]; +}; + +#define IPL_RB_COMPONENT_FLAG_SIGNED 0x80 +#define IPL_RB_COMPONENT_FLAG_VERIFIED 0x40 + +struct ipl_rb_components { + __u32 len; + __u8 rbt; + __u8 reserved1[11]; + struct ipl_rb_component_entry entries[]; +} __packed; + +#endif diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 8a62c7f72e1bd9f64c878507ffd26ba50acee79a..b0478d01a0c558bcbb8625eb23c039740d5eab87 100644 --- a/arch/s390/kernel/Makefile +++ 
b/arch/s390/kernel/Makefile @@ -39,6 +39,7 @@ CFLAGS_smp.o := -Wno-nonnull # CFLAGS_stacktrace.o += -fno-optimize-sibling-calls CFLAGS_dumpstack.o += -fno-optimize-sibling-calls +CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls # # Pass UTS_MACHINE for user_regset definition @@ -51,7 +52,7 @@ obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o early_nobss.o obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o -obj-y += nospec-branch.o ipl_vmparm.o +obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o extra-y += head64.o vmlinux.lds @@ -77,6 +78,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o +obj-$(CONFIG_IMA) += ima_arch.o + obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o @@ -86,7 +89,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace.o # vdso obj-y += vdso64/ -obj-$(CONFIG_COMPAT) += vdso32/ +obj-$(CONFIG_COMPAT_VDSO) += vdso32/ chkbss := head64.o early_nobss.o include $(srctree)/arch/s390/scripts/Makefile.chkbss diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 164bec175628ad3356bfa7bc1ab1ca849e5f0935..41ac4ad2131104127009aec4fc022dea5edc7309 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -16,6 +16,7 @@ #include #include #include +#include int main(void) { diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S index f268fca67e822a1e4b9d1547400aa1353c10af5e..2f39ea57f3589440ff7e8f7c00c1dfda4f80edd8 100644 --- a/arch/s390/kernel/base.S +++ b/arch/s390/kernel/base.S @@ -28,6 +28,7 @@ ENTRY(s390_base_mcck_handler) 1: la %r1,4095 lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1) lpswe __LC_MCK_OLD_PSW +ENDPROC(s390_base_mcck_handler) .section .bss .align 8 @@ -48,6 +49,7 @@ ENTRY(s390_base_ext_handler) 1: lmg %r0,%r15,__LC_SAVE_AREA_ASYNC ni __LC_EXT_OLD_PSW+1,0xfd # clear wait state bit lpswe __LC_EXT_OLD_PSW +ENDPROC(s390_base_ext_handler) .section .bss .align 8 @@ -68,6 +70,7 @@ ENTRY(s390_base_pgm_handler) lmg %r0,%r15,__LC_SAVE_AREA_SYNC lpswe __LC_PGM_OLD_PSW 1: lpswe disabled_wait_psw-0b(%r13) +ENDPROC(s390_base_pgm_handler) .align 8 disabled_wait_psw: @@ -79,71 +82,3 @@ disabled_wait_psw: s390_base_pgm_handler_fn: .quad 0 .previous - -# -# Calls diag 308 subcode 1 and continues execution -# -ENTRY(diag308_reset) - larl %r4,.Lctlregs # Save control registers - stctg %c0,%c15,0(%r4) - lg %r2,0(%r4) # Disable lowcore protection - nilh %r2,0xefff - larl %r4,.Lctlreg0 - stg %r2,0(%r4) - lctlg %c0,%c0,0(%r4) - larl %r4,.Lfpctl # Floating point control register - stfpc 0(%r4) - larl %r4,.Lprefix # Save prefix register - stpx 0(%r4) - larl %r4,.Lprefix_zero # Set prefix register to 0 - spx 0(%r4) - larl %r4,.Lcontinue_psw # Save PSW flags - epsw %r2,%r3 - stm %r2,%r3,0(%r4) - larl %r4,.Lrestart_psw # Setup restart PSW at absolute 0 - lghi %r3,0 - lg %r4,0(%r4) # Save PSW - sturg %r4,%r3 # Use sturg, because of large pages - lghi %r1,1 - lghi %r0,0 - diag %r0,%r1,0x308 -.Lrestart_part2: - lhi %r0,0 # Load r0 with zero - lhi %r1,2 # Use mode 2 = ESAME (dump) - sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode - sam64 # Switch to 64 bit addressing mode - larl %r4,.Lctlregs # Restore control registers - lctlg 
%c0,%c15,0(%r4) - larl %r4,.Lfpctl # Restore floating point ctl register - lfpc 0(%r4) - larl %r4,.Lprefix # Restore prefix register - spx 0(%r4) - larl %r4,.Lcontinue_psw # Restore PSW flags - lpswe 0(%r4) -.Lcontinue: - BR_EX %r14 -.align 16 -.Lrestart_psw: - .long 0x00080000,0x80000000 + .Lrestart_part2 - - .section .data..nosave,"aw",@progbits -.align 8 -.Lcontinue_psw: - .quad 0,.Lcontinue - .previous - - .section .bss -.align 8 -.Lctlreg0: - .quad 0 -.Lctlregs: - .rept 16 - .quad 0 - .endr -.Lfpctl: - .long 0 -.Lprefix: - .long 0 -.Lprefix_zero: - .long 0 - .previous diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index 7edaa733a77fdd9286382f745bdf103253fba26d..e9dac9a24d3fc06932397c0a2b0d21747748ea73 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -13,6 +13,7 @@ #include #include #include +#include struct diag_stat { unsigned int counter[NR_DIAG_STAT]; @@ -49,6 +50,9 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" }, }; +struct diag_ops __bootdata_preserved(diag_dma_ops); +struct diag210 *__bootdata_preserved(__diag210_tmp_dma); + static int show_diag_stat(struct seq_file *m, void *v) { struct diag_stat *stat; @@ -139,30 +143,10 @@ EXPORT_SYMBOL(diag_stat_inc_norecursion); /* * Diagnose 14: Input spool file manipulation */ -static inline int __diag14(unsigned long rx, unsigned long ry1, - unsigned long subcode) -{ - register unsigned long _ry1 asm("2") = ry1; - register unsigned long _ry2 asm("3") = subcode; - int rc = 0; - - asm volatile( - " sam31\n" - " diag %2,2,0x14\n" - " sam64\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (rc), "+d" (_ry2) - : "d" (rx), "d" (_ry1) - : "cc"); - - return rc; -} - int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) { diag_stat_inc(DIAG_STAT_X014); - return __diag14(rx, ry1, subcode); + return diag_dma_ops.diag14(rx, ry1, subcode); } EXPORT_SYMBOL(diag14); @@ -195,30 +179,17 @@ EXPORT_SYMBOL(diag204); */ int diag210(struct diag210 *addr) { - /* - * diag 210 needs its data below the 2GB border, so we - * use a static data area to be sure - */ - static struct diag210 diag210_tmp; static DEFINE_SPINLOCK(diag210_lock); unsigned long flags; int ccode; spin_lock_irqsave(&diag210_lock, flags); - diag210_tmp = *addr; + *__diag210_tmp_dma = *addr; diag_stat_inc(DIAG_STAT_X210); - asm volatile( - " lhi %0,-1\n" - " sam31\n" - " diag %1,0,0x210\n" - "0: ipm %0\n" - " srl %0,28\n" - "1: sam64\n" - EX_TABLE(0b, 1b) - : "=&d" (ccode) : "a" (&diag210_tmp) : "cc", "memory"); - - *addr = diag210_tmp; + ccode = diag_dma_ops.diag210(__diag210_tmp_dma); + + *addr = *__diag210_tmp_dma; spin_unlock_irqrestore(&diag210_lock, flags); return ccode; @@ -243,27 +214,9 @@ EXPORT_SYMBOL(diag224); /* * Diagnose 26C: Access Certain System Information */ -static inline int __diag26c(void *req, void *resp, enum diag26c_sc subcode) -{ - register unsigned long _req asm("2") = (addr_t) req; - register unsigned long _resp asm("3") = (addr_t) resp; - register unsigned long _subcode asm("4") = subcode; - register unsigned long _rc asm("5") = -EOPNOTSUPP; - - asm volatile( - " sam31\n" - " diag %[rx],%[ry],0x26c\n" - "0: sam64\n" - EX_TABLE(0b,0b) - : "+d" (_rc) - : [rx] "d" (_req), "d" (_resp), [ry] "d" (_subcode) - : "cc", "memory"); - return _rc; -} - int diag26c(void *req, void *resp, enum diag26c_sc subcode) { diag_stat_inc(DIAG_STAT_X26C); - return __diag26c(req, resp, subcode); + return diag_dma_ops.diag26c(req, resp, subcode); } EXPORT_SYMBOL(diag26c); 
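The diag14/diag210/diag26c wrappers above no longer open-code the sam31/diag/sam64 mode switch; they call through diag_dma_ops, whose function pointers are filled in at boot (both diag_dma_ops and __diag210_tmp_dma are declared __bootdata_preserved above, and diag210 keeps its data below the 2GB line by using the preallocated __diag210_tmp_dma buffer instead of a static variable). The implementations themselves presumably live in the new 31-bit addressable DMA text section this series introduces (see EX_TABLE_DMA and the __stext_dma/__etext_dma symbols earlier in the patch). As a rough sketch, reconstructed only from the call sites in this hunk (the full struct diag_ops definition belongs to asm/diag.h, outside this excerpt, and may carry further members), the ops structure has this shape:

	struct diag_ops {
		int (*diag14)(unsigned long rx, unsigned long ry1,
			      unsigned long subcode);
		int (*diag210)(struct diag210 *addr);
		int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode);
	};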
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index cb7f55bbe06e87eeb16a6e3f38d54fdc9be63824..9e87b68be21c776ffac7a8400c1e74b1a3a55d56 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -21,95 +21,124 @@ #include #include #include +#include -/* - * For dump_trace we have tree different stack to consider: - * - the panic stack which is used if the kernel stack has overflown - * - the asynchronous interrupt stack (cpu related) - * - the synchronous kernel stack (process related) - * The stack trace can start at any of the three stacks and can potentially - * touch all of them. The order is: panic stack, async stack, sync stack. - */ -static unsigned long __no_sanitize_address -__dump_trace(dump_trace_func_t func, void *data, unsigned long sp, - unsigned long low, unsigned long high) +const char *stack_type_name(enum stack_type type) { - struct stack_frame *sf; - struct pt_regs *regs; - - while (1) { - if (sp < low || sp > high - sizeof(*sf)) - return sp; - sf = (struct stack_frame *) sp; - if (func(data, sf->gprs[8], 0)) - return sp; - /* Follow the backchain. */ - while (1) { - low = sp; - sp = sf->back_chain; - if (!sp) - break; - if (sp <= low || sp > high - sizeof(*sf)) - return sp; - sf = (struct stack_frame *) sp; - if (func(data, sf->gprs[8], 1)) - return sp; - } - /* Zero backchain detected, check for interrupt frame. */ - sp = (unsigned long) (sf + 1); - if (sp <= low || sp > high - sizeof(*regs)) - return sp; - regs = (struct pt_regs *) sp; - if (!user_mode(regs)) { - if (func(data, regs->psw.addr, 1)) - return sp; - } - low = sp; - sp = regs->gprs[15]; + switch (type) { + case STACK_TYPE_TASK: + return "task"; + case STACK_TYPE_IRQ: + return "irq"; + case STACK_TYPE_NODAT: + return "nodat"; + case STACK_TYPE_RESTART: + return "restart"; + default: + return "unknown"; } } -void dump_trace(dump_trace_func_t func, void *data, struct task_struct *task, - unsigned long sp) +static inline bool in_stack(unsigned long sp, struct stack_info *info, + enum stack_type type, unsigned long low, + unsigned long high) +{ + if (sp < low || sp >= high) + return false; + info->type = type; + info->begin = low; + info->end = high; + return true; +} + +static bool in_task_stack(unsigned long sp, struct task_struct *task, + struct stack_info *info) +{ + unsigned long stack; + + stack = (unsigned long) task_stack_page(task); + return in_stack(sp, info, STACK_TYPE_TASK, stack, stack + THREAD_SIZE); +} + +static bool in_irq_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size; + unsigned long frame_size, top; frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); -#ifdef CONFIG_CHECK_STACK - sp = __dump_trace(func, data, sp, - S390_lowcore.nodat_stack + frame_size - THREAD_SIZE, - S390_lowcore.nodat_stack + frame_size); -#endif - sp = __dump_trace(func, data, sp, - S390_lowcore.async_stack + frame_size - THREAD_SIZE, - S390_lowcore.async_stack + frame_size); - task = task ?: current; - __dump_trace(func, data, sp, - (unsigned long)task_stack_page(task), - (unsigned long)task_stack_page(task) + THREAD_SIZE); + top = S390_lowcore.async_stack + frame_size; + return in_stack(sp, info, STACK_TYPE_IRQ, top - THREAD_SIZE, top); +} + +static bool in_nodat_stack(unsigned long sp, struct stack_info *info) +{ + unsigned long frame_size, top; + + frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); + top = S390_lowcore.nodat_stack + frame_size; + return in_stack(sp, info, STACK_TYPE_NODAT, top - THREAD_SIZE, top); } 
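For reference while reading the new helpers: they assume the stack_info/stack_type definitions added together with the unwinder. The sketch below is reconstructed from their use in this file (info->type/begin/end, the stack_type_name() switch, and the per-type visit_mask bit in get_stack_info() further down); the authoritative definitions live in asm/stacktrace.h:

	/* sketch, inferred from usage in this file */
	enum stack_type {
		STACK_TYPE_UNKNOWN,
		STACK_TYPE_TASK,
		STACK_TYPE_IRQ,
		STACK_TYPE_NODAT,
		STACK_TYPE_RESTART,
	};

	struct stack_info {
		enum stack_type type;
		unsigned long begin, end;	/* half-open: begin <= sp < end */
	};

Each stack type maps to one bit in visit_mask (1UL << type), so a walk that re-enters a stack it has already visited is cut off instead of looping forever.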
-EXPORT_SYMBOL_GPL(dump_trace); -static int show_address(void *data, unsigned long address, int reliable) +static bool in_restart_stack(unsigned long sp, struct stack_info *info) { - if (reliable) - printk(" [<%016lx>] %pSR \n", address, (void *)address); - else - printk("([<%016lx>] %pSR)\n", address, (void *)address); + unsigned long frame_size, top; + + frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); + top = S390_lowcore.restart_stack + frame_size; + return in_stack(sp, info, STACK_TYPE_RESTART, top - THREAD_SIZE, top); +} + +int get_stack_info(unsigned long sp, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + if (!sp) + goto unknown; + + task = task ? : current; + + /* Check per-task stack */ + if (in_task_stack(sp, task, info)) + goto recursion_check; + + if (task != current) + goto unknown; + + /* Check per-cpu stacks */ + if (!in_irq_stack(sp, info) && + !in_nodat_stack(sp, info) && + !in_restart_stack(sp, info)) + goto unknown; + +recursion_check: + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING + "WARNING: stack recursion on stack type %d\n", + info->type); + goto unknown; + } + *visit_mask |= 1UL << info->type; return 0; +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; } void show_stack(struct task_struct *task, unsigned long *stack) { - unsigned long sp = (unsigned long) stack; + struct unwind_state state; - if (!sp) - sp = task ? task->thread.ksp : current_stack_pointer(); printk("Call Trace:\n"); - dump_trace(show_address, NULL, task, sp); if (!task) task = current; - debug_show_held_locks(task); + unwind_for_each_frame(&state, task, NULL, (unsigned long) stack) + printk(state.reliable ? " [<%016lx>] %pSR \n" : + "([<%016lx>] %pSR)\n", + state.ip, (void *) state.ip); + debug_show_held_locks(task ? : current); } static void show_last_breaking_event(struct pt_regs *regs) diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index d6edf45f93b9ac38620a2175105b3530a584ea0b..629f173f60cd9dd72b2f8955a2007e2f54eb528a 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "entry.h" /* @@ -138,9 +139,9 @@ static void early_pgm_check_handler(void) unsigned long addr; addr = S390_lowcore.program_old_psw.addr; - fixup = search_exception_tables(addr); + fixup = s390_search_extables(addr); if (!fixup) - disabled_wait(0); + disabled_wait(); /* Disable low address protection before storing into lowcore. */ __ctl_store(cr0, 0, 0); cr0_new = cr0 & ~(1UL << 28); @@ -235,6 +236,7 @@ static __init void detect_machine_facilities(void) clock_comparator_max = -1ULL >> 1; __ctl_set_bit(0, 53); } + enable_mio_ctl(); } static inline void save_vector_registers(void) @@ -296,7 +298,7 @@ static void __init check_image_bootable(void) sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n"); sclp_early_printk("This image does not contain all parts necessary for starting up. 
Use\n"); sclp_early_printk("bzImage or arch/s390/boot/compressed/vmlinux instead.\n"); - disabled_wait(0xbadb007); + disabled_wait(); } void __init startup_init(void) @@ -309,7 +311,6 @@ void __init startup_init(void) setup_facility_list(); detect_machine_type(); setup_arch_string(); - ipl_store_parameters(); setup_boot_command_line(); detect_diag9c(); detect_diag44(); diff --git a/arch/s390/kernel/early_nobss.c b/arch/s390/kernel/early_nobss.c index 8d73f7fae16e00422fa4ea9eb54c39ad0901fa89..52a3ef959341a0be9f3f8f414ccfd6882a4eef4c 100644 --- a/arch/s390/kernel/early_nobss.c +++ b/arch/s390/kernel/early_nobss.c @@ -25,7 +25,7 @@ static void __init reset_tod_clock(void) return; /* TOD clock not running. Set the clock to Unix Epoch. */ if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0) - disabled_wait(0); + disabled_wait(); memset(tod_clock_base, 0, 16); *(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH; diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 583d65ef5007a1de02bd61f09e7e22d5658826f3..3f4d272577d343cc41904e10d57800efb1d2a5cc 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -224,6 +224,7 @@ ENTRY(__bpon) .globl __bpon BPON BR_EX %r14 +ENDPROC(__bpon) /* * Scheduler resume function, called by switch_to @@ -248,6 +249,7 @@ ENTRY(__switch_to) lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40 BR_EX %r14 +ENDPROC(__switch_to) .L__critical_start: @@ -324,6 +326,7 @@ sie_exit: EX_TABLE(.Lrewind_pad4,.Lsie_fault) EX_TABLE(.Lrewind_pad2,.Lsie_fault) EX_TABLE(sie_exit,.Lsie_fault) +ENDPROC(sie64a) EXPORT_SYMBOL(sie64a) EXPORT_SYMBOL(sie_exit) #endif @@ -358,19 +361,19 @@ ENTRY(system_call) # load address of system call table lg %r10,__THREAD_sysc_table(%r13,%r12) llgh %r8,__PT_INT_CODE+2(%r11) - slag %r8,%r8,2 # shift and test for svc 0 + slag %r8,%r8,3 # shift and test for svc 0 jnz .Lsysc_nr_ok # svc 0: system call number in %r1 llgfr %r1,%r1 # clear high word in r1 cghi %r1,NR_syscalls jnl .Lsysc_nr_ok sth %r1,__PT_INT_CODE+2(%r11) - slag %r8,%r1,2 + slag %r8,%r1,3 .Lsysc_nr_ok: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stg %r2,__PT_ORIG_GPR2(%r11) stg %r7,STACK_FRAME_OVERHEAD(%r15) - lgf %r9,0(%r8,%r10) # get system call add. + lg %r9,0(%r8,%r10) # get system call add. 
TSTMSK __TI_flags(%r12),_TIF_TRACE jnz .Lsysc_tracesys BASR_EX %r14,%r9 # call sys_xxxx @@ -556,8 +559,8 @@ ENTRY(system_call) lghi %r0,NR_syscalls clgr %r0,%r2 jnh .Lsysc_tracenogo - sllg %r8,%r2,2 - lgf %r9,0(%r8,%r10) + sllg %r8,%r2,3 + lg %r9,0(%r8,%r10) .Lsysc_tracego: lmg %r3,%r7,__PT_R3(%r11) stg %r7,STACK_FRAME_OVERHEAD(%r15) @@ -570,6 +573,7 @@ ENTRY(system_call) lgr %r2,%r11 # pass pointer to pt_regs larl %r14,.Lsysc_return jg do_syscall_trace_exit +ENDPROC(system_call) # # a new process exits the kernel with ret_from_fork @@ -584,10 +588,16 @@ ENTRY(ret_from_fork) jne .Lsysc_tracenogo # it's a kernel thread lmg %r9,%r10,__PT_R9(%r11) # load gprs + la %r2,0(%r10) + BASR_EX %r14,%r9 + j .Lsysc_tracenogo +ENDPROC(ret_from_fork) + ENTRY(kernel_thread_starter) la %r2,0(%r10) BASR_EX %r14,%r9 j .Lsysc_tracenogo +ENDPROC(kernel_thread_starter) /* * Program check handler routine @@ -665,9 +675,9 @@ ENTRY(pgm_check_handler) larl %r1,pgm_check_table llgh %r10,__PT_INT_CODE+2(%r11) nill %r10,0x007f - sll %r10,2 + sll %r10,3 je .Lpgm_return - lgf %r9,0(%r10,%r1) # load address of handler routine + lg %r9,0(%r10,%r1) # load address of handler routine lgr %r2,%r11 # pass pointer to pt_regs BASR_EX %r14,%r9 # branch to interrupt-handler .Lpgm_return: @@ -698,6 +708,7 @@ ENTRY(pgm_check_handler) stg %r14,__LC_RETURN_PSW+8 lghi %r14,_PIF_SYSCALL | _PIF_PER_TRAP lpswe __LC_RETURN_PSW # branch to .Lsysc_per and enable irqs +ENDPROC(pgm_check_handler) /* * IO interrupt handler routine @@ -926,6 +937,7 @@ ENTRY(io_int_handler) ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts TRACE_IRQS_OFF j .Lio_return +ENDPROC(io_int_handler) /* * External interrupt handler routine @@ -965,6 +977,7 @@ ENTRY(ext_int_handler) lghi %r3,EXT_INTERRUPT brasl %r14,do_IRQ j .Lio_return +ENDPROC(ext_int_handler) /* * Load idle PSW. The second "half" of this function is in .Lcleanup_idle. @@ -989,6 +1002,7 @@ ENTRY(psw_idle) lpswe __SF_EMPTY(%r15) BR_EX %r14 .Lpsw_idle_end: +ENDPROC(psw_idle) /* * Store floating-point controls and floating-point or vector register @@ -1031,6 +1045,7 @@ ENTRY(save_fpu_regs) .Lsave_fpu_regs_exit: BR_EX %r14 .Lsave_fpu_regs_end: +ENDPROC(save_fpu_regs) EXPORT_SYMBOL(save_fpu_regs) /* @@ -1077,6 +1092,7 @@ load_fpu_regs: .Lload_fpu_regs_exit: BR_EX %r14 .Lload_fpu_regs_end: +ENDPROC(load_fpu_regs) .L__critical_end: @@ -1206,6 +1222,7 @@ ENTRY(mcck_int_handler) lg %r15,__LC_NODAT_STACK la %r11,STACK_FRAME_OVERHEAD(%r15) j .Lmcck_skip +ENDPROC(mcck_int_handler) # # PSW restart interrupt handler @@ -1232,6 +1249,7 @@ ENTRY(restart_int_handler) 2: sigp %r4,%r3,SIGP_STOP # sigp stop to current cpu brc 2,2b 3: j 3b +ENDPROC(restart_int_handler) .section .kprobes.text, "ax" @@ -1241,7 +1259,7 @@ ENTRY(restart_int_handler) * No need to properly save the registers, we are going to panic anyway. * Setup a pt_regs so that show_trace can provide a good call trace. 
*/ -stack_overflow: +ENTRY(stack_overflow) lg %r15,__LC_NODAT_STACK # change to panic stack la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) @@ -1251,9 +1269,10 @@ stack_overflow: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs jg kernel_stack_overflow +ENDPROC(stack_overflow) #endif -cleanup_critical: +ENTRY(cleanup_critical) #if IS_ENABLED(CONFIG_KVM) clg %r9,BASED(.Lcleanup_table_sie) # .Lsie_gmap jl 0f @@ -1289,6 +1308,7 @@ cleanup_critical: clg %r9,BASED(.Lcleanup_table+104) # .Lload_fpu_regs_end jl .Lcleanup_load_fpu_regs 0: BR_EX %r14,%r11 +ENDPROC(cleanup_critical) .align 8 .Lcleanup_table: @@ -1512,7 +1532,7 @@ cleanup_critical: .quad .Lsie_skip - .Lsie_entry #endif .section .rodata, "a" -#define SYSCALL(esame,emu) .long __s390x_ ## esame +#define SYSCALL(esame,emu) .quad __s390x_ ## esame .globl sys_call_table sys_call_table: #include "asm/syscall_table.h" @@ -1520,7 +1540,7 @@ sys_call_table: #ifdef CONFIG_COMPAT -#define SYSCALL(esame,emu) .long __s390_ ## emu +#define SYSCALL(esame,emu) .quad __s390_ ## emu .globl sys_call_table_emu sys_call_table_emu: #include "asm/syscall_table.h" diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index c3816ae108b085afca4a9326ac2d0eb9c3f3b6a2..20420c2b8a146964e2018edc25fd1393d2823f45 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -65,7 +65,7 @@ int setup_profiling_timer(unsigned int multiplier); void __init time_init(void); int pfn_is_nosave(unsigned long); void s390_early_resume(void); -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip); +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip); struct s390_mmap_arg_struct; struct fadvise64_64_args; diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c index 594464f2129d4706fc4786d2de55d7c73974c97c..0da378e2eb25edcfee1f787b50eb900947b2ffc4 100644 --- a/arch/s390/kernel/fpu.c +++ b/arch/s390/kernel/fpu.c @@ -23,7 +23,7 @@ void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags) if (flags & KERNEL_FPC) /* Save floating point control */ - asm volatile("stfpc %0" : "=m" (state->fpc)); + asm volatile("stfpc %0" : "=Q" (state->fpc)); if (!MACHINE_HAS_VX) { if (flags & KERNEL_VXR_V0V7) { diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 39b13d71a8fe6dc2979e8a8320ae62b675b8ee9b..1bb85f60c0dd515efcda8a3f8812aa5ebc3e3018 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -201,17 +201,18 @@ device_initcall(ftrace_plt_init); * Hook the return address and push it in the stack of return addresses * in current thread info. 
*/ -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) +unsigned long prepare_ftrace_return(unsigned long ra, unsigned long sp, + unsigned long ip) { if (unlikely(ftrace_graph_is_dead())) goto out; if (unlikely(atomic_read(¤t->tracing_graph_pause))) goto out; ip -= MCOUNT_INSN_SIZE; - if (!function_graph_enter(parent, ip, 0, NULL)) - parent = (unsigned long) return_to_handler; + if (!function_graph_enter(ra, ip, 0, (void *) sp)) + ra = (unsigned long) return_to_handler; out: - return parent; + return ra; } NOKPROBE_SYMBOL(prepare_ftrace_return); diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 56491e636eabc5da499de49f109ef4f0d3913f8e..5aea1a527443004f455f56637f017d023077ab19 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -26,7 +26,6 @@ ENTRY(startup_continue) 0: larl %r1,tod_clock_base mvc 0(16,%r1),__LC_BOOT_CLOCK larl %r13,.LPG1 # get base - lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers larl %r0,boot_vdso_data stg %r0,__LC_VDSO_PER_CPU # @@ -61,22 +60,6 @@ ENTRY(startup_continue) .align 16 .LPG1: -.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space - .quad 0 # cr1: primary space segment table - .quad .Lduct # cr2: dispatchable unit control table - .quad 0 # cr3: instruction authorization - .quad 0xffff # cr4: instruction authorization - .quad .Lduct # cr5: primary-aste origin - .quad 0 # cr6: I/O interrupts - .quad 0 # cr7: secondary space segment table - .quad 0 # cr8: access registers translation - .quad 0 # cr9: tracing off - .quad 0 # cr10: tracing off - .quad 0 # cr11: tracing off - .quad 0 # cr12: tracing off - .quad 0 # cr13: home space segment table - .quad 0xc0000000 # cr14: machine check handling off - .quad .Llinkage_stack # cr15: linkage stack operations .Lpcmsk:.quad 0x0000000180000000 .L4malign:.quad 0xffffffffffc00000 .Lscan2g:.quad 0x80000000 + 0x20000 - 8 # 2GB + 128K - 8 @@ -84,14 +67,5 @@ ENTRY(startup_continue) .Lparmaddr: .quad PARMAREA .align 64 -.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0 - .long 0,0,0,0,0,0,0,0 -.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0 - .align 128 -.Lduald:.rept 8 - .long 0x80000000,0,0,0 # invalid access-list entries - .endr -.Llinkage_stack: - .long 0,0,0x89000000,0,0,0,0x8a000000,0 .Ldw: .quad 0x0002000180000000,0x0000000000000000 .Laregs:.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/arch/s390/kernel/ima_arch.c b/arch/s390/kernel/ima_arch.c new file mode 100644 index 0000000000000000000000000000000000000000..f3c3e6e1c5d38c8a2d401700b63ff55600baaece --- /dev/null +++ b/arch/s390/kernel/ima_arch.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +bool arch_ima_get_secureboot(void) +{ + return ipl_secure_flag; +} + +const char * const *arch_get_ima_policy(void) +{ + return NULL; +} diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 18a5d6317accd69f2e0c594bfeceed7433fb31ef..d836af3ccc38208b99428430ec629186281be48d 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "entry.h" #define IPL_PARM_BLOCK_VERSION 0 @@ -119,11 +120,15 @@ static char *dump_type_str(enum dump_type type) } } -struct ipl_parameter_block __bootdata(early_ipl_block); -int __bootdata(early_ipl_block_valid); +int __bootdata_preserved(ipl_block_valid); +struct ipl_parameter_block __bootdata_preserved(ipl_block); +int __bootdata_preserved(ipl_secure_flag); -static int ipl_block_valid; -static struct ipl_parameter_block ipl_block; +unsigned long 
__bootdata_preserved(ipl_cert_list_addr); +unsigned long __bootdata_preserved(ipl_cert_list_size); + +unsigned long __bootdata(early_ipl_comp_list_addr); +unsigned long __bootdata(early_ipl_comp_list_size); static int reipl_capabilities = IPL_TYPE_UNKNOWN; @@ -246,11 +251,11 @@ static __init enum ipl_type get_ipl_type(void) if (!ipl_block_valid) return IPL_TYPE_UNKNOWN; - switch (ipl_block.hdr.pbt) { - case DIAG308_IPL_TYPE_CCW: + switch (ipl_block.pb0_hdr.pbt) { + case IPL_PBT_CCW: return IPL_TYPE_CCW; - case DIAG308_IPL_TYPE_FCP: - if (ipl_block.ipl_info.fcp.opt == DIAG308_IPL_OPT_DUMP) + case IPL_PBT_FCP: + if (ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) return IPL_TYPE_FCP_DUMP; else return IPL_TYPE_FCP; @@ -269,12 +274,35 @@ static ssize_t ipl_type_show(struct kobject *kobj, struct kobj_attribute *attr, static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type); +static ssize_t ipl_secure_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%i\n", !!ipl_secure_flag); +} + +static struct kobj_attribute sys_ipl_secure_attr = + __ATTR(secure, 0444, ipl_secure_show, NULL); + +static ssize_t ipl_has_secure_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + if (MACHINE_IS_LPAR) + return sprintf(page, "%i\n", !!sclp.has_sipl); + else if (MACHINE_IS_VM) + return sprintf(page, "%i\n", !!sclp.has_sipl_g2); + else + return sprintf(page, "%i\n", 0); +} + +static struct kobj_attribute sys_ipl_has_secure_attr = + __ATTR(has_secure, 0444, ipl_has_secure_show, NULL); + static ssize_t ipl_vm_parm_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { char parm[DIAG308_VMPARM_SIZE + 1] = {}; - if (ipl_block_valid && (ipl_block.hdr.pbt == DIAG308_IPL_TYPE_CCW)) + if (ipl_block_valid && (ipl_block.pb0_hdr.pbt == IPL_PBT_CCW)) ipl_block_get_ascii_vmparm(parm, sizeof(parm), &ipl_block); return sprintf(page, "%s\n", parm); } @@ -287,12 +315,11 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj, { switch (ipl_info.type) { case IPL_TYPE_CCW: - return sprintf(page, "0.%x.%04x\n", ipl_block.ipl_info.ccw.ssid, - ipl_block.ipl_info.ccw.devno); + return sprintf(page, "0.%x.%04x\n", ipl_block.ccw.ssid, + ipl_block.ccw.devno); case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: - return sprintf(page, "0.0.%04x\n", - ipl_block.ipl_info.fcp.devno); + return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno); default: return 0; } @@ -316,8 +343,8 @@ static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { - unsigned int size = ipl_block.ipl_info.fcp.scp_data_len; - void *scp_data = &ipl_block.ipl_info.fcp.scp_data; + unsigned int size = ipl_block.fcp.scp_data_len; + void *scp_data = &ipl_block.fcp.scp_data; return memory_read_from_buffer(buf, count, &off, scp_data, size); } @@ -333,13 +360,13 @@ static struct bin_attribute *ipl_fcp_bin_attrs[] = { /* FCP ipl device attributes */ DEFINE_IPL_ATTR_RO(ipl_fcp, wwpn, "0x%016llx\n", - (unsigned long long)ipl_block.ipl_info.fcp.wwpn); + (unsigned long long)ipl_block.fcp.wwpn); DEFINE_IPL_ATTR_RO(ipl_fcp, lun, "0x%016llx\n", - (unsigned long long)ipl_block.ipl_info.fcp.lun); + (unsigned long long)ipl_block.fcp.lun); DEFINE_IPL_ATTR_RO(ipl_fcp, bootprog, "%lld\n", - (unsigned long long)ipl_block.ipl_info.fcp.bootprog); + (unsigned long long)ipl_block.fcp.bootprog); DEFINE_IPL_ATTR_RO(ipl_fcp, br_lba, "%lld\n", - (unsigned long long)ipl_block.ipl_info.fcp.br_lba); + (unsigned long long)ipl_block.fcp.br_lba); 
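For readability, the DEFINE_IPL_ATTR_* helpers used above expand to a show callback plus a kobj_attribute; roughly (a sketch of the RO variant as used in this file — the RW variant additionally generates a store callback that parses its input with the second format string):

	/* sketch of the existing helper in ipl.c, shown for readability */
	#define DEFINE_IPL_ATTR_RO(_prefix, _name, _format, _value)		\
	static ssize_t sys_##_prefix##_##_name##_show(struct kobject *kobj,	\
			struct kobj_attribute *attr, char *page)		\
	{									\
		return sprintf(page, _format, _value);				\
	}									\
	static struct kobj_attribute sys_##_prefix##_##_name##_attr =		\
		__ATTR(_name, 0444, sys_##_prefix##_##_name##_show, NULL)

Because only the _value expression is touched, the ipl_info.fcp -> fcp member renames throughout this patch stay mechanical one-line edits.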
static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) @@ -365,6 +392,8 @@ static struct attribute *ipl_fcp_attrs[] = { &sys_ipl_fcp_bootprog_attr.attr, &sys_ipl_fcp_br_lba_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; @@ -380,6 +409,8 @@ static struct attribute *ipl_ccw_attrs_vm[] = { &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, &sys_ipl_vm_parm_attr.attr, + &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; @@ -387,6 +418,8 @@ static struct attribute *ipl_ccw_attrs_lpar[] = { &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; @@ -495,14 +528,14 @@ static ssize_t reipl_generic_vmparm_store(struct ipl_parameter_block *ipb, if (!(isalnum(buf[i]) || isascii(buf[i]) || isprint(buf[i]))) return -EINVAL; - memset(ipb->ipl_info.ccw.vm_parm, 0, DIAG308_VMPARM_SIZE); - ipb->ipl_info.ccw.vm_parm_len = ip_len; + memset(ipb->ccw.vm_parm, 0, DIAG308_VMPARM_SIZE); + ipb->ccw.vm_parm_len = ip_len; if (ip_len > 0) { - ipb->ipl_info.ccw.vm_flags |= DIAG308_VM_FLAGS_VP_VALID; - memcpy(ipb->ipl_info.ccw.vm_parm, buf, ip_len); - ASCEBC(ipb->ipl_info.ccw.vm_parm, ip_len); + ipb->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_VP; + memcpy(ipb->ccw.vm_parm, buf, ip_len); + ASCEBC(ipb->ccw.vm_parm, ip_len); } else { - ipb->ipl_info.ccw.vm_flags &= ~DIAG308_VM_FLAGS_VP_VALID; + ipb->ccw.vm_flags &= ~IPL_PB0_CCW_VM_FLAG_VP; } return len; @@ -549,8 +582,8 @@ static ssize_t reipl_fcp_scpdata_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { - size_t size = reipl_block_fcp->ipl_info.fcp.scp_data_len; - void *scp_data = reipl_block_fcp->ipl_info.fcp.scp_data; + size_t size = reipl_block_fcp->fcp.scp_data_len; + void *scp_data = reipl_block_fcp->fcp.scp_data; return memory_read_from_buffer(buf, count, &off, scp_data, size); } @@ -566,17 +599,17 @@ static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj, if (off) return -EINVAL; - memcpy(reipl_block_fcp->ipl_info.fcp.scp_data, buf, count); + memcpy(reipl_block_fcp->fcp.scp_data, buf, count); if (scpdata_len % 8) { padding = 8 - (scpdata_len % 8); - memset(reipl_block_fcp->ipl_info.fcp.scp_data + scpdata_len, + memset(reipl_block_fcp->fcp.scp_data + scpdata_len, 0, padding); scpdata_len += padding; } - reipl_block_fcp->ipl_info.fcp.scp_data_len = scpdata_len; - reipl_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN + scpdata_len; - reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN + scpdata_len; + reipl_block_fcp->hdr.len = IPL_BP_FCP_LEN + scpdata_len; + reipl_block_fcp->fcp.len = IPL_BP0_FCP_LEN + scpdata_len; + reipl_block_fcp->fcp.scp_data_len = scpdata_len; return count; } @@ -590,20 +623,20 @@ static struct bin_attribute *reipl_fcp_bin_attrs[] = { }; DEFINE_IPL_ATTR_RW(reipl_fcp, wwpn, "0x%016llx\n", "%llx\n", - reipl_block_fcp->ipl_info.fcp.wwpn); + reipl_block_fcp->fcp.wwpn); DEFINE_IPL_ATTR_RW(reipl_fcp, lun, "0x%016llx\n", "%llx\n", - reipl_block_fcp->ipl_info.fcp.lun); + reipl_block_fcp->fcp.lun); DEFINE_IPL_ATTR_RW(reipl_fcp, bootprog, "%lld\n", "%lld\n", - reipl_block_fcp->ipl_info.fcp.bootprog); + reipl_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_RW(reipl_fcp, br_lba, "%lld\n", "%lld\n", - reipl_block_fcp->ipl_info.fcp.br_lba); + reipl_block_fcp->fcp.br_lba); DEFINE_IPL_ATTR_RW(reipl_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", - 
reipl_block_fcp->ipl_info.fcp.devno); + reipl_block_fcp->fcp.devno); static void reipl_get_ascii_loadparm(char *loadparm, struct ipl_parameter_block *ibp) { - memcpy(loadparm, ibp->hdr.loadparm, LOADPARM_LEN); + memcpy(loadparm, ibp->common.loadparm, LOADPARM_LEN); EBCASC(loadparm, LOADPARM_LEN); loadparm[LOADPARM_LEN] = 0; strim(loadparm); @@ -638,11 +671,11 @@ static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, return -EINVAL; } /* initialize loadparm with blanks */ - memset(ipb->hdr.loadparm, ' ', LOADPARM_LEN); + memset(ipb->common.loadparm, ' ', LOADPARM_LEN); /* copy and convert to ebcdic */ - memcpy(ipb->hdr.loadparm, buf, lp_len); - ASCEBC(ipb->hdr.loadparm, LOADPARM_LEN); - ipb->hdr.flags |= DIAG308_FLAGS_LP_VALID; + memcpy(ipb->common.loadparm, buf, lp_len); + ASCEBC(ipb->common.loadparm, LOADPARM_LEN); + ipb->common.flags |= IPL_PB0_FLAG_LOADPARM; return len; } @@ -680,7 +713,7 @@ static struct attribute_group reipl_fcp_attr_group = { }; /* CCW reipl device attributes */ -DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ipl_info.ccw); +DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); /* NSS wrapper */ static ssize_t reipl_nss_loadparm_show(struct kobject *kobj, @@ -742,7 +775,7 @@ static struct attribute_group reipl_ccw_attr_group_lpar = { static void reipl_get_ascii_nss_name(char *dst, struct ipl_parameter_block *ipb) { - memcpy(dst, ipb->ipl_info.ccw.nss_name, NSS_NAME_SIZE); + memcpy(dst, ipb->ccw.nss_name, NSS_NAME_SIZE); EBCASC(dst, NSS_NAME_SIZE); dst[NSS_NAME_SIZE] = 0; } @@ -770,16 +803,14 @@ static ssize_t reipl_nss_name_store(struct kobject *kobj, if (nss_len > NSS_NAME_SIZE) return -EINVAL; - memset(reipl_block_nss->ipl_info.ccw.nss_name, 0x40, NSS_NAME_SIZE); + memset(reipl_block_nss->ccw.nss_name, 0x40, NSS_NAME_SIZE); if (nss_len > 0) { - reipl_block_nss->ipl_info.ccw.vm_flags |= - DIAG308_VM_FLAGS_NSS_VALID; - memcpy(reipl_block_nss->ipl_info.ccw.nss_name, buf, nss_len); - ASCEBC(reipl_block_nss->ipl_info.ccw.nss_name, nss_len); - EBC_TOUPPER(reipl_block_nss->ipl_info.ccw.nss_name, nss_len); + reipl_block_nss->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_NSS; + memcpy(reipl_block_nss->ccw.nss_name, buf, nss_len); + ASCEBC(reipl_block_nss->ccw.nss_name, nss_len); + EBC_TOUPPER(reipl_block_nss->ccw.nss_name, nss_len); } else { - reipl_block_nss->ipl_info.ccw.vm_flags &= - ~DIAG308_VM_FLAGS_NSS_VALID; + reipl_block_nss->ccw.vm_flags &= ~IPL_PB0_CCW_VM_FLAG_NSS; } return len; @@ -866,15 +897,21 @@ static void __reipl_run(void *unused) { switch (reipl_type) { case IPL_TYPE_CCW: + uv_set_shared(__pa(reipl_block_ccw)); diag308(DIAG308_SET, reipl_block_ccw); + uv_remove_shared(__pa(reipl_block_ccw)); diag308(DIAG308_LOAD_CLEAR, NULL); break; case IPL_TYPE_FCP: + uv_set_shared(__pa(reipl_block_fcp)); diag308(DIAG308_SET, reipl_block_fcp); + uv_remove_shared(__pa(reipl_block_fcp)); diag308(DIAG308_LOAD_CLEAR, NULL); break; case IPL_TYPE_NSS: + uv_set_shared(__pa(reipl_block_nss)); diag308(DIAG308_SET, reipl_block_nss); + uv_remove_shared(__pa(reipl_block_nss)); diag308(DIAG308_LOAD_CLEAR, NULL); break; case IPL_TYPE_UNKNOWN: @@ -883,7 +920,7 @@ static void __reipl_run(void *unused) case IPL_TYPE_FCP_DUMP: break; } - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); } static void reipl_run(struct shutdown_trigger *trigger) @@ -893,10 +930,10 @@ static void reipl_run(struct shutdown_trigger *trigger) static void reipl_block_ccw_init(struct ipl_parameter_block *ipb) { - ipb->hdr.len = IPL_PARM_BLK_CCW_LEN; 
+ ipb->hdr.len = IPL_BP_CCW_LEN; ipb->hdr.version = IPL_PARM_BLOCK_VERSION; - ipb->hdr.blk0_len = IPL_PARM_BLK0_CCW_LEN; - ipb->hdr.pbt = DIAG308_IPL_TYPE_CCW; + ipb->pb0_hdr.len = IPL_BP0_CCW_LEN; + ipb->pb0_hdr.pbt = IPL_PBT_CCW; } static void reipl_block_ccw_fill_parms(struct ipl_parameter_block *ipb) @@ -904,21 +941,20 @@ static void reipl_block_ccw_fill_parms(struct ipl_parameter_block *ipb) /* LOADPARM */ /* check if read scp info worked and set loadparm */ if (sclp_ipl_info.is_valid) - memcpy(ipb->hdr.loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN); + memcpy(ipb->ccw.loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN); else /* read scp info failed: set empty loadparm (EBCDIC blanks) */ - memset(ipb->hdr.loadparm, 0x40, LOADPARM_LEN); - ipb->hdr.flags = DIAG308_FLAGS_LP_VALID; + memset(ipb->ccw.loadparm, 0x40, LOADPARM_LEN); + ipb->ccw.flags = IPL_PB0_FLAG_LOADPARM; /* VM PARM */ if (MACHINE_IS_VM && ipl_block_valid && - (ipl_block.ipl_info.ccw.vm_flags & DIAG308_VM_FLAGS_VP_VALID)) { + (ipl_block.ccw.vm_flags & IPL_PB0_CCW_VM_FLAG_VP)) { - ipb->ipl_info.ccw.vm_flags |= DIAG308_VM_FLAGS_VP_VALID; - ipb->ipl_info.ccw.vm_parm_len = - ipl_block.ipl_info.ccw.vm_parm_len; - memcpy(ipb->ipl_info.ccw.vm_parm, - ipl_block.ipl_info.ccw.vm_parm, DIAG308_VMPARM_SIZE); + ipb->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_VP; + ipb->ccw.vm_parm_len = ipl_block.ccw.vm_parm_len; + memcpy(ipb->ccw.vm_parm, + ipl_block.ccw.vm_parm, DIAG308_VMPARM_SIZE); } } @@ -958,8 +994,8 @@ static int __init reipl_ccw_init(void) reipl_block_ccw_init(reipl_block_ccw); if (ipl_info.type == IPL_TYPE_CCW) { - reipl_block_ccw->ipl_info.ccw.ssid = ipl_block.ipl_info.ccw.ssid; - reipl_block_ccw->ipl_info.ccw.devno = ipl_block.ipl_info.ccw.devno; + reipl_block_ccw->ccw.ssid = ipl_block.ccw.ssid; + reipl_block_ccw->ccw.devno = ipl_block.ccw.devno; reipl_block_ccw_fill_parms(reipl_block_ccw); } @@ -997,14 +1033,14 @@ static int __init reipl_fcp_init(void) * is invalid in the SCSI IPL parameter block, so take it * always from sclp_ipl_info. 
*/ - memcpy(reipl_block_fcp->hdr.loadparm, sclp_ipl_info.loadparm, + memcpy(reipl_block_fcp->fcp.loadparm, sclp_ipl_info.loadparm, LOADPARM_LEN); } else { - reipl_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN; + reipl_block_fcp->hdr.len = IPL_BP_FCP_LEN; reipl_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; - reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN; - reipl_block_fcp->hdr.pbt = DIAG308_IPL_TYPE_FCP; - reipl_block_fcp->ipl_info.fcp.opt = DIAG308_IPL_OPT_IPL; + reipl_block_fcp->fcp.len = IPL_BP0_FCP_LEN; + reipl_block_fcp->fcp.pbt = IPL_PBT_FCP; + reipl_block_fcp->fcp.opt = IPL_PB0_FCP_OPT_IPL; } reipl_capabilities |= IPL_TYPE_FCP; return 0; @@ -1022,10 +1058,10 @@ static int __init reipl_type_init(void) /* * If we have an OS info reipl block, this will be used */ - if (reipl_block->hdr.pbt == DIAG308_IPL_TYPE_FCP) { + if (reipl_block->pb0_hdr.pbt == IPL_PBT_FCP) { memcpy(reipl_block_fcp, reipl_block, size); reipl_type = IPL_TYPE_FCP; - } else if (reipl_block->hdr.pbt == DIAG308_IPL_TYPE_CCW) { + } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_CCW) { memcpy(reipl_block_ccw, reipl_block, size); reipl_type = IPL_TYPE_CCW; } @@ -1070,15 +1106,15 @@ static struct shutdown_action __refdata reipl_action = { /* FCP dump device attributes */ DEFINE_IPL_ATTR_RW(dump_fcp, wwpn, "0x%016llx\n", "%llx\n", - dump_block_fcp->ipl_info.fcp.wwpn); + dump_block_fcp->fcp.wwpn); DEFINE_IPL_ATTR_RW(dump_fcp, lun, "0x%016llx\n", "%llx\n", - dump_block_fcp->ipl_info.fcp.lun); + dump_block_fcp->fcp.lun); DEFINE_IPL_ATTR_RW(dump_fcp, bootprog, "%lld\n", "%lld\n", - dump_block_fcp->ipl_info.fcp.bootprog); + dump_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_RW(dump_fcp, br_lba, "%lld\n", "%lld\n", - dump_block_fcp->ipl_info.fcp.br_lba); + dump_block_fcp->fcp.br_lba); DEFINE_IPL_ATTR_RW(dump_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", - dump_block_fcp->ipl_info.fcp.devno); + dump_block_fcp->fcp.devno); static struct attribute *dump_fcp_attrs[] = { &sys_dump_fcp_device_attr.attr, @@ -1095,7 +1131,7 @@ static struct attribute_group dump_fcp_attr_group = { }; /* CCW dump device attributes */ -DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ipl_info.ccw); +DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ccw); static struct attribute *dump_ccw_attrs[] = { &sys_dump_ccw_device_attr.attr, @@ -1145,7 +1181,9 @@ static struct kset *dump_kset; static void diag308_dump(void *dump_block) { + uv_set_shared(__pa(dump_block)); diag308(DIAG308_SET, dump_block); + uv_remove_shared(__pa(dump_block)); while (1) { if (diag308(DIAG308_LOAD_NORMAL_DUMP, NULL) != 0x302) break; @@ -1187,10 +1225,10 @@ static int __init dump_ccw_init(void) free_page((unsigned long)dump_block_ccw); return rc; } - dump_block_ccw->hdr.len = IPL_PARM_BLK_CCW_LEN; + dump_block_ccw->hdr.len = IPL_BP_CCW_LEN; dump_block_ccw->hdr.version = IPL_PARM_BLOCK_VERSION; - dump_block_ccw->hdr.blk0_len = IPL_PARM_BLK0_CCW_LEN; - dump_block_ccw->hdr.pbt = DIAG308_IPL_TYPE_CCW; + dump_block_ccw->ccw.len = IPL_BP0_CCW_LEN; + dump_block_ccw->ccw.pbt = IPL_PBT_CCW; dump_capabilities |= DUMP_TYPE_CCW; return 0; } @@ -1209,11 +1247,11 @@ static int __init dump_fcp_init(void) free_page((unsigned long)dump_block_fcp); return rc; } - dump_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN; + dump_block_fcp->hdr.len = IPL_BP_FCP_LEN; dump_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; - dump_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN; - dump_block_fcp->hdr.pbt = DIAG308_IPL_TYPE_FCP; - dump_block_fcp->ipl_info.fcp.opt = DIAG308_IPL_OPT_DUMP; + dump_block_fcp->fcp.len 
= IPL_BP0_FCP_LEN; + dump_block_fcp->fcp.pbt = IPL_PBT_FCP; + dump_block_fcp->fcp.opt = IPL_PB0_FCP_OPT_DUMP; dump_capabilities |= DUMP_TYPE_FCP; return 0; } @@ -1337,7 +1375,7 @@ static void stop_run(struct shutdown_trigger *trigger) { if (strcmp(trigger->name, ON_PANIC_STR) == 0 || strcmp(trigger->name, ON_RESTART_STR) == 0) - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); smp_stop_cpu(); } @@ -1572,7 +1610,7 @@ static int __init s390_ipl_init(void) * READ SCP info provides the correct value. */ if (memcmp(sclp_ipl_info.loadparm, str, sizeof(str)) == 0 && ipl_block_valid) - memcpy(sclp_ipl_info.loadparm, ipl_block.hdr.loadparm, LOADPARM_LEN); + memcpy(sclp_ipl_info.loadparm, ipl_block.ccw.loadparm, LOADPARM_LEN); shutdown_actions_init(); shutdown_triggers_init(); return 0; @@ -1657,15 +1695,15 @@ void __init setup_ipl(void) ipl_info.type = get_ipl_type(); switch (ipl_info.type) { case IPL_TYPE_CCW: - ipl_info.data.ccw.dev_id.ssid = ipl_block.ipl_info.ccw.ssid; - ipl_info.data.ccw.dev_id.devno = ipl_block.ipl_info.ccw.devno; + ipl_info.data.ccw.dev_id.ssid = ipl_block.ccw.ssid; + ipl_info.data.ccw.dev_id.devno = ipl_block.ccw.devno; break; case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: ipl_info.data.fcp.dev_id.ssid = 0; - ipl_info.data.fcp.dev_id.devno = ipl_block.ipl_info.fcp.devno; - ipl_info.data.fcp.wwpn = ipl_block.ipl_info.fcp.wwpn; - ipl_info.data.fcp.lun = ipl_block.ipl_info.fcp.lun; + ipl_info.data.fcp.dev_id.devno = ipl_block.fcp.devno; + ipl_info.data.fcp.wwpn = ipl_block.fcp.wwpn; + ipl_info.data.fcp.lun = ipl_block.fcp.lun; break; case IPL_TYPE_NSS: case IPL_TYPE_UNKNOWN: @@ -1675,14 +1713,6 @@ void __init setup_ipl(void) atomic_notifier_chain_register(&panic_notifier_list, &on_panic_nb); } -void __init ipl_store_parameters(void) -{ - if (early_ipl_block_valid) { - memcpy(&ipl_block, &early_ipl_block, sizeof(ipl_block)); - ipl_block_valid = 1; - } -} - void s390_reset_system(void) { /* Disable prefixing */ @@ -1690,5 +1720,139 @@ void s390_reset_system(void) /* Disable lowcore protection */ __ctl_clear_bit(0, 28); - diag308_reset(); + diag_dma_ops.diag308_reset(); +} + +#ifdef CONFIG_KEXEC_FILE + +int ipl_report_add_component(struct ipl_report *report, struct kexec_buf *kbuf, + unsigned char flags, unsigned short cert) +{ + struct ipl_report_component *comp; + + comp = vzalloc(sizeof(*comp)); + if (!comp) + return -ENOMEM; + list_add_tail(&comp->list, &report->components); + + comp->entry.addr = kbuf->mem; + comp->entry.len = kbuf->memsz; + comp->entry.flags = flags; + comp->entry.certificate_index = cert; + + report->size += sizeof(comp->entry); + + return 0; +} + +int ipl_report_add_certificate(struct ipl_report *report, void *key, + unsigned long addr, unsigned long len) +{ + struct ipl_report_certificate *cert; + + cert = vzalloc(sizeof(*cert)); + if (!cert) + return -ENOMEM; + list_add_tail(&cert->list, &report->certificates); + + cert->entry.addr = addr; + cert->entry.len = len; + cert->key = key; + + report->size += sizeof(cert->entry); + report->size += cert->entry.len; + + return 0; +} + +struct ipl_report *ipl_report_init(struct ipl_parameter_block *ipib) +{ + struct ipl_report *report; + + report = vzalloc(sizeof(*report)); + if (!report) + return ERR_PTR(-ENOMEM); + + report->ipib = ipib; + INIT_LIST_HEAD(&report->components); + INIT_LIST_HEAD(&report->certificates); + + report->size = ALIGN(ipib->hdr.len, 8); + report->size += sizeof(struct ipl_rl_hdr); + report->size += sizeof(struct ipl_rb_components); + report->size += sizeof(struct 
ipl_rb_certificates); + + return report; +} + +void *ipl_report_finish(struct ipl_report *report) +{ + struct ipl_report_certificate *cert; + struct ipl_report_component *comp; + struct ipl_rb_certificates *certs; + struct ipl_parameter_block *ipib; + struct ipl_rb_components *comps; + struct ipl_rl_hdr *rl_hdr; + void *buf, *ptr; + + buf = vzalloc(report->size); + if (!buf) + return ERR_PTR(-ENOMEM); + ptr = buf; + + memcpy(ptr, report->ipib, report->ipib->hdr.len); + ipib = ptr; + if (ipl_secure_flag) + ipib->hdr.flags |= IPL_PL_FLAG_SIPL; + ipib->hdr.flags |= IPL_PL_FLAG_IPLSR; + ptr += report->ipib->hdr.len; + ptr = PTR_ALIGN(ptr, 8); + + rl_hdr = ptr; + ptr += sizeof(*rl_hdr); + + comps = ptr; + comps->rbt = IPL_RBT_COMPONENTS; + ptr += sizeof(*comps); + list_for_each_entry(comp, &report->components, list) { + memcpy(ptr, &comp->entry, sizeof(comp->entry)); + ptr += sizeof(comp->entry); + } + comps->len = ptr - (void *)comps; + + certs = ptr; + certs->rbt = IPL_RBT_CERTIFICATES; + ptr += sizeof(*certs); + list_for_each_entry(cert, &report->certificates, list) { + memcpy(ptr, &cert->entry, sizeof(cert->entry)); + ptr += sizeof(cert->entry); + } + certs->len = ptr - (void *)certs; + rl_hdr->len = ptr - (void *)rl_hdr; + + list_for_each_entry(cert, &report->certificates, list) { + memcpy(ptr, cert->key, cert->entry.len); + ptr += cert->entry.len; + } + + BUG_ON(ptr > buf + report->size); + return buf; +} + +int ipl_report_free(struct ipl_report *report) +{ + struct ipl_report_component *comp, *ncomp; + struct ipl_report_certificate *cert, *ncert; + + list_for_each_entry_safe(comp, ncomp, &report->components, list) + vfree(comp); + + list_for_each_entry_safe(cert, ncert, &report->certificates, list) + vfree(cert); + + vfree(report); + + return 0; } + +#endif diff --git a/arch/s390/kernel/ipl_vmparm.c b/arch/s390/kernel/ipl_vmparm.c index 411838c0a0af9962f3a5e43849e349543619cf3b..af43535a976df3647f487475415efbf1d6638c6d 100644 --- a/arch/s390/kernel/ipl_vmparm.c +++ b/arch/s390/kernel/ipl_vmparm.c @@ -11,11 +11,11 @@ size_t ipl_block_get_ascii_vmparm(char *dest, size_t size, char has_lowercase = 0; len = 0; - if ((ipb->ipl_info.ccw.vm_flags & DIAG308_VM_FLAGS_VP_VALID) && - (ipb->ipl_info.ccw.vm_parm_len > 0)) { + if ((ipb->ccw.vm_flags & IPL_PB0_CCW_VM_FLAG_VP) && + (ipb->ccw.vm_parm_len > 0)) { - len = min_t(size_t, size - 1, ipb->ipl_info.ccw.vm_parm_len); - memcpy(dest, ipb->ipl_info.ccw.vm_parm, len); + len = min_t(size_t, size - 1, ipb->ccw.vm_parm_len); + memcpy(dest, ipb->ccw.vm_parm, len); /* If at least one character is lowercase, we assume mixed * case; otherwise we convert everything to lowercase. 
*/ diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 0cd5a5f96729dad40540016bf2d32acc1ff18893..8371855042dc2f07e37e5bf9e66300f645c04e83 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "entry.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat); @@ -73,7 +74,6 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQEXT_CMC, .name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"}, {.irq = IRQEXT_FTP, .name = "FTP", .desc = "[EXT] HMC FTP Service"}, {.irq = IRQIO_CIO, .name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"}, - {.irq = IRQIO_QAI, .name = "QAI", .desc = "[I/O] QDIO Adapter Interrupt"}, {.irq = IRQIO_DAS, .name = "DAS", .desc = "[I/O] DASD"}, {.irq = IRQIO_C15, .name = "C15", .desc = "[I/O] 3215"}, {.irq = IRQIO_C70, .name = "C70", .desc = "[I/O] 3270"}, @@ -81,14 +81,16 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQIO_VMR, .name = "VMR", .desc = "[I/O] Unit Record Devices"}, {.irq = IRQIO_LCS, .name = "LCS", .desc = "[I/O] LCS"}, {.irq = IRQIO_CTC, .name = "CTC", .desc = "[I/O] CTC"}, - {.irq = IRQIO_APB, .name = "APB", .desc = "[I/O] AP Bus"}, {.irq = IRQIO_ADM, .name = "ADM", .desc = "[I/O] EADM Subchannel"}, {.irq = IRQIO_CSC, .name = "CSC", .desc = "[I/O] CHSC Subchannel"}, - {.irq = IRQIO_PCI, .name = "PCI", .desc = "[I/O] PCI Interrupt" }, - {.irq = IRQIO_MSI, .name = "MSI", .desc = "[I/O] MSI Interrupt" }, {.irq = IRQIO_VIR, .name = "VIR", .desc = "[I/O] Virtual I/O Devices"}, - {.irq = IRQIO_VAI, .name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"}, - {.irq = IRQIO_GAL, .name = "GAL", .desc = "[I/O] GIB Alert"}, + {.irq = IRQIO_QAI, .name = "QAI", .desc = "[AIO] QDIO Adapter Interrupt"}, + {.irq = IRQIO_APB, .name = "APB", .desc = "[AIO] AP Bus"}, + {.irq = IRQIO_PCF, .name = "PCF", .desc = "[AIO] PCI Floating Interrupt"}, + {.irq = IRQIO_PCD, .name = "PCD", .desc = "[AIO] PCI Directed Interrupt"}, + {.irq = IRQIO_MSI, .name = "MSI", .desc = "[AIO] MSI Interrupt"}, + {.irq = IRQIO_VAI, .name = "VAI", .desc = "[AIO] Virtual I/O Devices AI"}, + {.irq = IRQIO_GAL, .name = "GAL", .desc = "[AIO] GIB Alert"}, {.irq = NMI_NMI, .name = "NMI", .desc = "[NMI] Machine Check"}, {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, }; @@ -116,6 +118,34 @@ void do_IRQ(struct pt_regs *regs, int irq) set_irq_regs(old_regs); } +static void show_msi_interrupt(struct seq_file *p, int irq) +{ + struct irq_desc *desc; + unsigned long flags; + int cpu; + + irq_lock_sparse(); + desc = irq_to_desc(irq); + if (!desc) + goto out; + + raw_spin_lock_irqsave(&desc->lock, flags); + seq_printf(p, "%3d: ", irq); + for_each_online_cpu(cpu) + seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu)); + + if (desc->irq_data.chip) + seq_printf(p, " %8s", desc->irq_data.chip->name); + + if (desc->action) + seq_printf(p, " %s", desc->action->name); + + seq_putc(p, '\n'); + raw_spin_unlock_irqrestore(&desc->lock, flags); +out: + irq_unlock_sparse(); +} + /* * show_interrupts is needed by /proc/interrupts. 
*/ @@ -128,7 +158,7 @@ int show_interrupts(struct seq_file *p, void *v) if (index == 0) { seq_puts(p, " "); for_each_online_cpu(cpu) - seq_printf(p, "CPU%d ", cpu); + seq_printf(p, "CPU%-8d", cpu); seq_putc(p, '\n'); } if (index < NR_IRQS_BASE) { @@ -139,9 +169,10 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); goto out; } - if (index > NR_IRQS_BASE) + if (index < nr_irqs) { + show_msi_interrupt(p, index); goto out; - + } for (index = 0; index < NR_ARCH_IRQS; index++) { seq_printf(p, "%s: ", irqclass_sub_desc[index].name); irq = irqclass_sub_desc[index].irq; diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 5a286b012043bc2c4437ebb1509241dad0282546..6d0635ceddd0475de217b73cc05e1f0c3187e312 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -10,19 +10,26 @@ #include #include #include +#include #include -static int kexec_file_add_elf_kernel(struct kimage *image, - struct s390_load_data *data, - char *kernel, unsigned long kernel_len) +static int kexec_file_add_kernel_elf(struct kimage *image, + struct s390_load_data *data) { struct kexec_buf buf; const Elf_Ehdr *ehdr; const Elf_Phdr *phdr; + Elf_Addr entry; + void *kernel; int i, ret; + kernel = image->kernel_buf; ehdr = (Elf_Ehdr *)kernel; buf.image = image; + if (image->type == KEXEC_TYPE_CRASH) + entry = STARTUP_KDUMP_OFFSET; + else + entry = ehdr->e_entry; phdr = (void *)ehdr + ehdr->e_phoff; for (i = 0; i < ehdr->e_phnum; i++, phdr++) { @@ -33,30 +40,27 @@ static int kexec_file_add_elf_kernel(struct kimage *image, buf.bufsz = phdr->p_filesz; buf.mem = ALIGN(phdr->p_paddr, phdr->p_align); + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; buf.memsz = phdr->p_memsz; + data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz; - if (phdr->p_paddr == 0) { + if (entry - phdr->p_paddr < phdr->p_memsz) { data->kernel_buf = buf.buffer; - data->memsz += STARTUP_NORMAL_OFFSET; - - buf.buffer += STARTUP_NORMAL_OFFSET; - buf.bufsz -= STARTUP_NORMAL_OFFSET; - - buf.mem += STARTUP_NORMAL_OFFSET; - buf.memsz -= STARTUP_NORMAL_OFFSET; + data->kernel_mem = buf.mem; + data->parm = buf.buffer + PARMAREA; } - if (image->type == KEXEC_TYPE_CRASH) - buf.mem += crashk_res.start; - + ipl_report_add_component(data->report, &buf, + IPL_RB_COMPONENT_FLAG_SIGNED | + IPL_RB_COMPONENT_FLAG_VERIFIED, + IPL_RB_CERT_UNKNOWN); ret = kexec_add_buffer(&buf); if (ret) return ret; - - data->memsz += buf.memsz; } - return 0; + return data->memsz ? 0 : -EINVAL; } static void *s390_elf_load(struct kimage *image, @@ -64,11 +68,10 @@ static void *s390_elf_load(struct kimage *image, char *initrd, unsigned long initrd_len, char *cmdline, unsigned long cmdline_len) { - struct s390_load_data data = {0}; const Elf_Ehdr *ehdr; const Elf_Phdr *phdr; size_t size; - int i, ret; + int i; /* image->fobs->probe already checked for valid ELF magic number. 
*/ ehdr = (Elf_Ehdr *)kernel; @@ -101,24 +104,7 @@ static void *s390_elf_load(struct kimage *image, if (size > kernel_len) return ERR_PTR(-EINVAL); - ret = kexec_file_add_elf_kernel(image, &data, kernel, kernel_len); - if (ret) - return ERR_PTR(ret); - - if (!data.memsz) - return ERR_PTR(-EINVAL); - - if (initrd) { - ret = kexec_file_add_initrd(image, &data, initrd, initrd_len); - if (ret) - return ERR_PTR(ret); - } - - ret = kexec_file_add_purgatory(image, &data); - if (ret) - return ERR_PTR(ret); - - return kexec_file_update_kernel(image, &data); + return kexec_file_add_components(image, kexec_file_add_kernel_elf); } static int s390_elf_probe(const char *buf, unsigned long len) @@ -144,4 +130,7 @@ static int s390_elf_probe(const char *buf, unsigned long len) const struct kexec_file_ops s390_kexec_elf_ops = { .probe = s390_elf_probe, .load = s390_elf_load, +#ifdef CONFIG_KEXEC_VERIFY_SIG + .verify_sig = s390_verify_sig, +#endif /* CONFIG_KEXEC_VERIFY_SIG */ }; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index 3800852595e8a194387d91df4cb767f35a3c7687..58318bf89fd905beb8fab9171be4f06e3c118a8a 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -10,31 +10,34 @@ #include #include #include +#include #include -static int kexec_file_add_image_kernel(struct kimage *image, - struct s390_load_data *data, - char *kernel, unsigned long kernel_len) +static int kexec_file_add_kernel_image(struct kimage *image, + struct s390_load_data *data) { struct kexec_buf buf; - int ret; buf.image = image; - buf.buffer = kernel + STARTUP_NORMAL_OFFSET; - buf.bufsz = kernel_len - STARTUP_NORMAL_OFFSET; + buf.buffer = image->kernel_buf; + buf.bufsz = image->kernel_buf_len; - buf.mem = STARTUP_NORMAL_OFFSET; + buf.mem = 0; if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; buf.memsz = buf.bufsz; - ret = kexec_add_buffer(&buf); + data->kernel_buf = image->kernel_buf; + data->kernel_mem = buf.mem; + data->parm = image->kernel_buf + PARMAREA; + data->memsz += buf.memsz; - data->kernel_buf = kernel; - data->memsz += buf.memsz + STARTUP_NORMAL_OFFSET; - - return ret; + ipl_report_add_component(data->report, &buf, + IPL_RB_COMPONENT_FLAG_SIGNED | + IPL_RB_COMPONENT_FLAG_VERIFIED, + IPL_RB_CERT_UNKNOWN); + return kexec_add_buffer(&buf); } static void *s390_image_load(struct kimage *image, @@ -42,24 +45,7 @@ static void *s390_image_load(struct kimage *image, char *initrd, unsigned long initrd_len, char *cmdline, unsigned long cmdline_len) { - struct s390_load_data data = {0}; - int ret; - - ret = kexec_file_add_image_kernel(image, &data, kernel, kernel_len); - if (ret) - return ERR_PTR(ret); - - if (initrd) { - ret = kexec_file_add_initrd(image, &data, initrd, initrd_len); - if (ret) - return ERR_PTR(ret); - } - - ret = kexec_file_add_purgatory(image, &data); - if (ret) - return ERR_PTR(ret); - - return kexec_file_update_kernel(image, &data); + return kexec_file_add_components(image, kexec_file_add_kernel_image); } static int s390_image_probe(const char *buf, unsigned long len) @@ -73,4 +59,7 @@ static int s390_image_probe(const char *buf, unsigned long len) const struct kexec_file_ops s390_kexec_image_ops = { .probe = s390_image_probe, .load = s390_image_load, +#ifdef CONFIG_KEXEC_VERIFY_SIG + .verify_sig = s390_verify_sig, +#endif /* CONFIG_KEXEC_VERIFY_SIG */ }; diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 7c0a095e9c5f6f3d483ceff698f7d7ffd06e614b..6f1388391620afc84734e526c51ca8dc8319ee14 100644 --- 
a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -27,29 +27,30 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = { }; -DEFINE_INSN_CACHE_OPS(dmainsn); +DEFINE_INSN_CACHE_OPS(s390_insn); -static void *alloc_dmainsn_page(void) -{ - void *page; +static int insn_page_in_use; +static char insn_page[PAGE_SIZE] __aligned(PAGE_SIZE); - page = (void *) __get_free_page(GFP_KERNEL | GFP_DMA); - if (page) - set_memory_x((unsigned long) page, 1); - return page; +static void *alloc_s390_insn_page(void) +{ + if (xchg(&insn_page_in_use, 1) == 1) + return NULL; + set_memory_x((unsigned long) &insn_page, 1); + return &insn_page; } -static void free_dmainsn_page(void *page) +static void free_s390_insn_page(void *page) { set_memory_nx((unsigned long) page, 1); - free_page((unsigned long)page); + xchg(&insn_page_in_use, 0); } -struct kprobe_insn_cache kprobe_dmainsn_slots = { - .mutex = __MUTEX_INITIALIZER(kprobe_dmainsn_slots.mutex), - .alloc = alloc_dmainsn_page, - .free = free_dmainsn_page, - .pages = LIST_HEAD_INIT(kprobe_dmainsn_slots.pages), +struct kprobe_insn_cache kprobe_s390_insn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_s390_insn_slots.mutex), + .alloc = alloc_s390_insn_page, + .free = free_s390_insn_page, + .pages = LIST_HEAD_INIT(kprobe_s390_insn_slots.pages), .insn_size = MAX_INSN_SIZE, }; @@ -102,7 +103,7 @@ static int s390_get_insn_slot(struct kprobe *p) */ p->ainsn.insn = NULL; if (is_kernel_addr(p->addr)) - p->ainsn.insn = get_dmainsn_slot(); + p->ainsn.insn = get_s390_insn_slot(); else if (is_module_addr(p->addr)) p->ainsn.insn = get_insn_slot(); return p->ainsn.insn ? 0 : -ENOMEM; @@ -114,7 +115,7 @@ static void s390_free_insn_slot(struct kprobe *p) if (!p->ainsn.insn) return; if (is_kernel_addr(p->addr)) - free_dmainsn_slot(p->ainsn.insn, 0); + free_s390_insn_slot(p->ainsn.insn, 0); else free_insn_slot(p->ainsn.insn, 0); p->ainsn.insn = NULL; @@ -572,7 +573,7 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) * In case the user-specified fault handler returned * zero, try to fix up. 
*/ - entry = search_exception_tables(regs->psw.addr); + entry = s390_search_extables(regs->psw.addr); if (entry) { regs->psw.addr = extable_fixup(entry); return 1; diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index cb582649aba6b491a687c758a7460de90a23af3b..8a1ae140c5e2c420ef8ea1f53ab485d2edf1f86e 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -95,7 +96,7 @@ static void __do_machine_kdump(void *image) start_kdump(1); /* Die if start_kdump returns */ - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); } /* @@ -253,6 +254,9 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(high_memory); VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); + vmcoreinfo_append_str("SDMA=%lx\n", __sdma); + vmcoreinfo_append_str("EDMA=%lx\n", __edma); + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); } void machine_shutdown(void) @@ -280,7 +284,7 @@ static void __do_machine_kexec(void *data) (*data_mover)(&image->head, image->start); /* Die if kexec returns */ - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); } /* diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 32023b4f9dc00153aba7085e73e151b7156e573a..fbdd3ea73667424e5564ec06dc337e435acca63a 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -8,7 +8,12 @@ */ #include +#include #include +#include +#include +#include +#include #include const struct kexec_file_ops * const kexec_file_loaders[] = { @@ -17,38 +22,78 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { NULL, }; -int *kexec_file_update_kernel(struct kimage *image, - struct s390_load_data *data) -{ - unsigned long *loc; - - if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) - return ERR_PTR(-EINVAL); - - if (image->cmdline_buf_len) - memcpy(data->kernel_buf + COMMAND_LINE_OFFSET, - image->cmdline_buf, image->cmdline_buf_len); - - if (image->type == KEXEC_TYPE_CRASH) { - loc = (unsigned long *)(data->kernel_buf + OLDMEM_BASE_OFFSET); - *loc = crashk_res.start; - - loc = (unsigned long *)(data->kernel_buf + OLDMEM_SIZE_OFFSET); - *loc = crashk_res.end - crashk_res.start + 1; - } +#ifdef CONFIG_KEXEC_VERIFY_SIG +/* + * Module signature information block. + * + * The constituents of the signature section are, in order: + * + * - Signer's name + * - Key identifier + * - Signature data + * - Information block + */ +struct module_signature { + u8 algo; /* Public-key crypto algorithm [0] */ + u8 hash; /* Digest algorithm [0] */ + u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ + u8 signer_len; /* Length of signer's name [0] */ + u8 key_id_len; /* Length of key identifier [0] */ + u8 __pad[3]; + __be32 sig_len; /* Length of signature data */ +}; - if (image->initrd_buf) { - loc = (unsigned long *)(data->kernel_buf + INITRD_START_OFFSET); - *loc = data->initrd_load_addr; +#define PKEY_ID_PKCS7 2 - loc = (unsigned long *)(data->kernel_buf + INITRD_SIZE_OFFSET); - *loc = image->initrd_buf_len; +int s390_verify_sig(const char *kernel, unsigned long kernel_len) +{ + const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1; + struct module_signature *ms; + unsigned long sig_len; + + /* Skip signature verification when not secure IPLed. 
*/ + if (!ipl_secure_flag) + return 0; + + if (marker_len > kernel_len) + return -EKEYREJECTED; + + if (memcmp(kernel + kernel_len - marker_len, MODULE_SIG_STRING, + marker_len)) + return -EKEYREJECTED; + kernel_len -= marker_len; + + ms = (void *)kernel + kernel_len - sizeof(*ms); + kernel_len -= sizeof(*ms); + + sig_len = be32_to_cpu(ms->sig_len); + if (sig_len >= kernel_len) + return -EKEYREJECTED; + kernel_len -= sig_len; + + if (ms->id_type != PKEY_ID_PKCS7) + return -EKEYREJECTED; + + if (ms->algo != 0 || + ms->hash != 0 || + ms->signer_len != 0 || + ms->key_id_len != 0 || + ms->__pad[0] != 0 || + ms->__pad[1] != 0 || + ms->__pad[2] != 0) { + return -EBADMSG; } - return NULL; + return verify_pkcs7_signature(kernel, kernel_len, + kernel + kernel_len, sig_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); } +#endif /* CONFIG_KEXEC_VERIFY_SIG */ -static int kexec_file_update_purgatory(struct kimage *image) +static int kexec_file_update_purgatory(struct kimage *image, + struct s390_load_data *data) { u64 entry, type; int ret; @@ -90,7 +135,8 @@ static int kexec_file_update_purgatory(struct kimage *image) return ret; } -int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data) +static int kexec_file_add_purgatory(struct kimage *image, + struct s390_load_data *data) { struct kexec_buf buf; int ret; @@ -105,21 +151,21 @@ int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data) ret = kexec_load_purgatory(image, &buf); if (ret) return ret; + data->memsz += buf.memsz; - ret = kexec_file_update_purgatory(image); - return ret; + return kexec_file_update_purgatory(image, data); } -int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data, - char *initrd, unsigned long initrd_len) +static int kexec_file_add_initrd(struct kimage *image, + struct s390_load_data *data) { struct kexec_buf buf; int ret; buf.image = image; - buf.buffer = initrd; - buf.bufsz = initrd_len; + buf.buffer = image->initrd_buf; + buf.bufsz = image->initrd_buf_len; data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; @@ -127,11 +173,115 @@ int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data, buf.mem += crashk_res.start; buf.memsz = buf.bufsz; - data->initrd_load_addr = buf.mem; + data->parm->initrd_start = buf.mem; + data->parm->initrd_size = buf.memsz; data->memsz += buf.memsz; ret = kexec_add_buffer(&buf); - return ret; + if (ret) + return ret; + + return ipl_report_add_component(data->report, &buf, 0, 0); +} + +static int kexec_file_add_ipl_report(struct kimage *image, + struct s390_load_data *data) +{ + __u32 *lc_ipl_parmblock_ptr; + unsigned int len, ncerts; + struct kexec_buf buf; + unsigned long addr; + void *ptr, *end; + + buf.image = image; + + data->memsz = ALIGN(data->memsz, PAGE_SIZE); + buf.mem = data->memsz; + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + + ptr = (void *)ipl_cert_list_addr; + end = ptr + ipl_cert_list_size; + ncerts = 0; + while (ptr < end) { + ncerts++; + len = *(unsigned int *)ptr; + ptr += sizeof(len); + ptr += len; + } + + addr = data->memsz + data->report->size; + addr += ncerts * sizeof(struct ipl_rb_certificate_entry); + ptr = (void *)ipl_cert_list_addr; + while (ptr < end) { + len = *(unsigned int *)ptr; + ptr += sizeof(len); + ipl_report_add_certificate(data->report, ptr, addr, len); + addr += len; + ptr += len; + } + + buf.buffer = ipl_report_finish(data->report); + buf.bufsz = data->report->size; + buf.memsz = buf.bufsz; + + 
data->memsz += buf.memsz; + + lc_ipl_parmblock_ptr = + data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr); + *lc_ipl_parmblock_ptr = (__u32)buf.mem; + + return kexec_add_buffer(&buf); +} + +void *kexec_file_add_components(struct kimage *image, + int (*add_kernel)(struct kimage *image, + struct s390_load_data *data)) +{ + struct s390_load_data data = {0}; + int ret; + + data.report = ipl_report_init(&ipl_block); + if (IS_ERR(data.report)) + return data.report; + + ret = add_kernel(image, &data); + if (ret) + goto out; + + if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) { + ret = -EINVAL; + goto out; + } + memcpy(data.parm->command_line, image->cmdline_buf, + image->cmdline_buf_len); + + if (image->type == KEXEC_TYPE_CRASH) { + data.parm->oldmem_base = crashk_res.start; + data.parm->oldmem_size = crashk_res.end - crashk_res.start + 1; + } + + if (image->initrd_buf) { + ret = kexec_file_add_initrd(image, &data); + if (ret) + goto out; + } + + ret = kexec_file_add_purgatory(image, &data); + if (ret) + goto out; + + if (data.kernel_mem == 0) { + unsigned long restart_psw = 0x0008000080000000UL; + restart_psw += image->start; + memcpy(data.kernel_buf, &restart_psw, sizeof(restart_psw)); + image->start = 0; + } + + ret = kexec_file_add_ipl_report(image, &data); +out: + ipl_report_free(data.report); + return ERR_PTR(ret); } int arch_kexec_apply_relocations_add(struct purgatory_info *pi, @@ -140,7 +290,7 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *symtab) { Elf_Rela *relas; - int i; + int i, r_type; relas = (void *)pi->ehdr + relsec->sh_offset; @@ -174,46 +324,8 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, addr = section->sh_addr + relas[i].r_offset; - switch (ELF64_R_TYPE(relas[i].r_info)) { - case R_390_8: /* Direct 8 bit. */ - *(u8 *)loc = val; - break; - case R_390_12: /* Direct 12 bit. */ - *(u16 *)loc &= 0xf000; - *(u16 *)loc |= val & 0xfff; - break; - case R_390_16: /* Direct 16 bit. */ - *(u16 *)loc = val; - break; - case R_390_20: /* Direct 20 bit. */ - *(u32 *)loc &= 0xf00000ff; - *(u32 *)loc |= (val & 0xfff) << 16; /* DL */ - *(u32 *)loc |= (val & 0xff000) >> 4; /* DH */ - break; - case R_390_32: /* Direct 32 bit. */ - *(u32 *)loc = val; - break; - case R_390_64: /* Direct 64 bit. */ - *(u64 *)loc = val; - break; - case R_390_PC16: /* PC relative 16 bit. */ - *(u16 *)loc = (val - addr); - break; - case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */ - *(u16 *)loc = (val - addr) >> 1; - break; - case R_390_PC32DBL: /* PC relative 32 bit shifted by 1. */ - *(u32 *)loc = (val - addr) >> 1; - break; - case R_390_PC32: /* PC relative 32 bit. */ - *(u32 *)loc = (val - addr); - break; - case R_390_PC64: /* PC relative 64 bit. */ - *(u64 *)loc = (val - addr); - break; - default: - break; - } + r_type = ELF64_R_TYPE(relas[i].r_info); + arch_kexec_do_relocs(r_type, loc, val, addr); } return 0; } @@ -225,10 +337,8 @@ int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, * load memory in head.S will be accessed, e.g. to register the next * command line. If the next kernel were smaller the current kernel * will panic at load. 
- * - * 0x11000 = sizeof(head.S) */ - if (buf_len < 0x11000) + if (buf_len < HEAD_END) return -ENOEXEC; return kexec_image_probe_default(image, buf, buf_len); diff --git a/arch/s390/kernel/machine_kexec_reloc.c b/arch/s390/kernel/machine_kexec_reloc.c new file mode 100644 index 0000000000000000000000000000000000000000..1dded39239f86605ef0599159ead4c6143f17978 --- /dev/null +++ b/arch/s390/kernel/machine_kexec_reloc.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, + unsigned long addr) +{ + switch (r_type) { + case R_390_NONE: + break; + case R_390_8: /* Direct 8 bit. */ + *(u8 *)loc = val; + break; + case R_390_12: /* Direct 12 bit. */ + *(u16 *)loc &= 0xf000; + *(u16 *)loc |= val & 0xfff; + break; + case R_390_16: /* Direct 16 bit. */ + *(u16 *)loc = val; + break; + case R_390_20: /* Direct 20 bit. */ + *(u32 *)loc &= 0xf00000ff; + *(u32 *)loc |= (val & 0xfff) << 16; /* DL */ + *(u32 *)loc |= (val & 0xff000) >> 4; /* DH */ + break; + case R_390_32: /* Direct 32 bit. */ + *(u32 *)loc = val; + break; + case R_390_64: /* Direct 64 bit. */ + *(u64 *)loc = val; + break; + case R_390_PC16: /* PC relative 16 bit. */ + *(u16 *)loc = (val - addr); + break; + case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */ + *(u16 *)loc = (val - addr) >> 1; + break; + case R_390_PC32DBL: /* PC relative 32 bit shifted by 1. */ + *(u32 *)loc = (val - addr) >> 1; + break; + case R_390_PC32: /* PC relative 32 bit. */ + *(u32 *)loc = (val - addr); + break; + case R_390_PC64: /* PC relative 64 bit. */ + *(u64 *)loc = (val - addr); + break; + case R_390_RELATIVE: + *(unsigned long *) loc = val; + break; + default: + return 1; + } + return 0; +} diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index e93fbf02490cf2fc24da579d46eaae59e225d9e2..9e1660a6b9db6e49be93e4b145bb639dee52d773 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -20,6 +20,7 @@ ENTRY(ftrace_stub) BR_EX %r14 +ENDPROC(ftrace_stub) #define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) #define STACK_PTREGS (STACK_FRAME_OVERHEAD) @@ -28,7 +29,7 @@ ENTRY(ftrace_stub) ENTRY(_mcount) BR_EX %r14 - +ENDPROC(_mcount) EXPORT_SYMBOL(_mcount) ENTRY(ftrace_caller) @@ -61,10 +62,11 @@ ENTRY(ftrace_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER # The j instruction gets runtime patched to a nop instruction. # See ftrace_enable_ftrace_graph_caller. 
-ENTRY(ftrace_graph_caller) + .globl ftrace_graph_caller +ftrace_graph_caller: j ftrace_graph_caller_end - lg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) - lg %r3,(STACK_PTREGS_PSW+8)(%r15) + lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15) + lg %r4,(STACK_PTREGS_PSW+8)(%r15) brasl %r14,prepare_ftrace_return stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) ftrace_graph_caller_end: @@ -73,6 +75,7 @@ ftrace_graph_caller_end: lg %r1,(STACK_PTREGS_PSW+8)(%r15) lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) BR_EX %r1 +ENDPROC(ftrace_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -86,5 +89,6 @@ ENTRY(return_to_handler) lgr %r14,%r2 lmg %r2,%r5,32(%r15) BR_EX %r14 +ENDPROC(return_to_handler) #endif diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 8c867b43c8ebc5b60469b52700ad6198daf1d874..0a487fae763eeab927bf5faa65e9ebdcf140e766 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -125,7 +125,7 @@ void nmi_free_per_cpu(struct lowcore *lc) static notrace void s390_handle_damage(void) { smp_emergency_stop(); - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); while (1); } NOKPROBE_SYMBOL(s390_handle_damage); diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index bdddaae9655984dfbf59ccee386c7bb3608cb183..29e511f5bf0687d705d8fbde208138571d73e293 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include static int __init nobp_setup_early(char *str) @@ -37,7 +38,7 @@ static int __init nospec_report(void) { if (test_facility(156)) pr_info("Spectre V2 mitigation: etokens\n"); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) pr_info("Spectre V2 mitigation: execute trampolines\n"); if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) pr_info("Spectre V2 mitigation: limited branch prediction\n"); @@ -58,15 +59,15 @@ early_param("nospectre_v2", nospectre_v2_setup_early); void __init nospec_auto_detect(void) { - if (test_facility(156)) { + if (test_facility(156) || cpu_mitigations_off()) { /* * The machine supports etokens. * Disable expolines and disable nobp. */ - if (IS_ENABLED(CC_USING_EXPOLINE)) + if (__is_defined(CC_USING_EXPOLINE)) nospec_disable = 1; __clear_facility(82, S390_lowcore.alt_stfle_fac_list); - } else if (IS_ENABLED(CC_USING_EXPOLINE)) { + } else if (__is_defined(CC_USING_EXPOLINE)) { /* * The kernel has been compiled with expolines. * Keep expolines enabled and disable nobp. 
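[Editor's aside — illustrative, not part of the patch.] The repeated change from IS_ENABLED(CC_USING_EXPOLINE) to __is_defined(CC_USING_EXPOLINE) in this and the following hunks is more than cosmetic: IS_ENABLED() is intended for Kconfig symbols and additionally probes the ##_MODULE variant of its argument, which has no meaning for a compiler-provided define such as CC_USING_EXPOLINE. A minimal sketch of the helpers involved, simplified from include/linux/kconfig.h:

    /* Token-pasting trick: expands to 1 iff 'x' is a macro defined to 1 */
    #define __ARG_PLACEHOLDER_1 0,
    #define __take_second_arg(__ignored, val, ...) val
    #define __is_defined(x)              ___is_defined(x)
    #define ___is_defined(val)           ____is_defined(__ARG_PLACEHOLDER_##val)
    #define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)

The s390 Makefile defines CC_USING_EXPOLINE via -DCC_USING_EXPOLINE (i.e. to 1) when the compiler supports expolines, so __is_defined() collapses to a compile-time 0 or 1 without leaning on Kconfig conventions.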
diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c index e30e580ae36209d3395c2cf25a412ba5b9385480..48f472bf929087c06ad6dc5ed9023e0b43b55a6f 100644 --- a/arch/s390/kernel/nospec-sysfs.c +++ b/arch/s390/kernel/nospec-sysfs.c @@ -15,7 +15,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, { if (test_facility(156)) return sprintf(buf, "Mitigation: etokens\n"); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) return sprintf(buf, "Mitigation: execute trampolines\n"); if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) return sprintf(buf, "Mitigation: limited branch prediction\n"); diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index e1c54d28713a8d1daaf4f416e4e7b16fc427059b..48d48b6187c0d720bff1cbd836d5233455658ed5 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -2,8 +2,8 @@ /* * Performance event support for s390x - CPU-measurement Counter Facility * - * Copyright IBM Corp. 2012, 2017 - * Author(s): Hendrik Brueckner + * Copyright IBM Corp. 2012, 2019 + * Author(s): Hendrik Brueckner */ #define KMSG_COMPONENT "cpum_cf" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt @@ -26,7 +26,7 @@ static enum cpumf_ctr_set get_counter_set(u64 event) set = CPUMF_CTR_SET_USER; else if (event < 128) set = CPUMF_CTR_SET_CRYPTO; - else if (event < 256) + else if (event < 288) set = CPUMF_CTR_SET_EXT; else if (event >= 448 && event < 496) set = CPUMF_CTR_SET_MT_DIAG; @@ -50,12 +50,19 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_CRYPTO: + if ((cpuhw->info.csvn >= 1 && cpuhw->info.csvn <= 5 && + hwc->config > 79) || + (cpuhw->info.csvn >= 6 && hwc->config > 83)) + err = -EOPNOTSUPP; + break; case CPUMF_CTR_SET_EXT: if (cpuhw->info.csvn < 1) err = -EOPNOTSUPP; if ((cpuhw->info.csvn == 1 && hwc->config > 159) || (cpuhw->info.csvn == 2 && hwc->config > 175) || - (cpuhw->info.csvn > 2 && hwc->config > 255)) + (cpuhw->info.csvn >= 3 && cpuhw->info.csvn <= 5 + && hwc->config > 255) || + (cpuhw->info.csvn >= 6 && hwc->config > 287)) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_MT_DIAG: diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c index b6854812d2ed56f11cbd03865c16b26290518611..d4e031f7b9c818bb106616ed47736b6dc80e68df 100644 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ b/arch/s390/kernel/perf_cpum_cf_diag.c @@ -306,15 +306,20 @@ static size_t cf_diag_ctrset_size(enum cpumf_ctr_set ctrset, ctrset_size = 2; break; case CPUMF_CTR_SET_CRYPTO: - ctrset_size = 16; + if (info->csvn >= 1 && info->csvn <= 5) + ctrset_size = 16; + else if (info->csvn == 6) + ctrset_size = 20; break; case CPUMF_CTR_SET_EXT: if (info->csvn == 1) ctrset_size = 32; else if (info->csvn == 2) ctrset_size = 48; - else if (info->csvn >= 3) + else if (info->csvn >= 3 && info->csvn <= 5) ctrset_size = 128; + else if (info->csvn == 6) + ctrset_size = 160; break; case CPUMF_CTR_SET_MT_DIAG: if (info->csvn > 3) diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index b45238c8972870c421c397cf3fb19fe2ea1cbf72..34cc96449b304c1eb98b01bd4a30ef005118cd53 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -31,22 +31,26 @@ CPUMF_EVENT_ATTR(cf_fvn3, PROBLEM_STATE_CPU_CYCLES, 0x0020); CPUMF_EVENT_ATTR(cf_fvn3, PROBLEM_STATE_INSTRUCTIONS, 0x0021); CPUMF_EVENT_ATTR(cf_fvn3, L1D_DIR_WRITES, 0x0004); 
CPUMF_EVENT_ATTR(cf_fvn3, L1D_PENALTY_CYCLES, 0x0005); -CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_FUNCTIONS, 0x0040); -CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_CYCLES, 0x0041); -CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_BLOCKED_FUNCTIONS, 0x0042); -CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_BLOCKED_CYCLES, 0x0043); -CPUMF_EVENT_ATTR(cf_svn_generic, SHA_FUNCTIONS, 0x0044); -CPUMF_EVENT_ATTR(cf_svn_generic, SHA_CYCLES, 0x0045); -CPUMF_EVENT_ATTR(cf_svn_generic, SHA_BLOCKED_FUNCTIONS, 0x0046); -CPUMF_EVENT_ATTR(cf_svn_generic, SHA_BLOCKED_CYCLES, 0x0047); -CPUMF_EVENT_ATTR(cf_svn_generic, DEA_FUNCTIONS, 0x0048); -CPUMF_EVENT_ATTR(cf_svn_generic, DEA_CYCLES, 0x0049); -CPUMF_EVENT_ATTR(cf_svn_generic, DEA_BLOCKED_FUNCTIONS, 0x004a); -CPUMF_EVENT_ATTR(cf_svn_generic, DEA_BLOCKED_CYCLES, 0x004b); -CPUMF_EVENT_ATTR(cf_svn_generic, AES_FUNCTIONS, 0x004c); -CPUMF_EVENT_ATTR(cf_svn_generic, AES_CYCLES, 0x004d); -CPUMF_EVENT_ATTR(cf_svn_generic, AES_BLOCKED_FUNCTIONS, 0x004e); -CPUMF_EVENT_ATTR(cf_svn_generic, AES_BLOCKED_CYCLES, 0x004f); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_FUNCTIONS, 0x0040); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_CYCLES, 0x0041); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS, 0x0042); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_BLOCKED_CYCLES, 0x0043); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_FUNCTIONS, 0x0044); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_CYCLES, 0x0045); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_BLOCKED_FUNCTIONS, 0x0046); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_BLOCKED_CYCLES, 0x0047); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_FUNCTIONS, 0x0048); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_CYCLES, 0x0049); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_BLOCKED_FUNCTIONS, 0x004a); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_BLOCKED_CYCLES, 0x004b); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_FUNCTIONS, 0x004c); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_CYCLES, 0x004d); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_BLOCKED_FUNCTIONS, 0x004e); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_BLOCKED_CYCLES, 0x004f); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_FUNCTION_COUNT, 0x0050); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_CYCLES_COUNT, 0x0051); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_BLOCKED_FUNCTION_COUNT, 0x0052); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_BLOCKED_CYCLES_COUNT, 0x0053); CPUMF_EVENT_ATTR(cf_z10, L1I_L2_SOURCED_WRITES, 0x0080); CPUMF_EVENT_ATTR(cf_z10, L1D_L2_SOURCED_WRITES, 0x0081); CPUMF_EVENT_ATTR(cf_z10, L1I_L3_LOCAL_WRITES, 0x0082); @@ -262,23 +266,47 @@ static struct attribute *cpumcf_fvn3_pmu_event_attr[] __initdata = { NULL, }; -static struct attribute *cpumcf_svn_generic_pmu_event_attr[] __initdata = { - CPUMF_EVENT_PTR(cf_svn_generic, PRNG_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, PRNG_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, PRNG_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, PRNG_BLOCKED_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, SHA_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, SHA_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, SHA_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, SHA_BLOCKED_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, DEA_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, DEA_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, DEA_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, DEA_BLOCKED_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, AES_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, AES_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, AES_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, AES_BLOCKED_CYCLES), +static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, 
PRNG_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_CYCLES), + NULL, +}; + +static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_6, ECC_FUNCTION_COUNT), + CPUMF_EVENT_PTR(cf_svn_6, ECC_CYCLES_COUNT), + CPUMF_EVENT_PTR(cf_svn_6, ECC_BLOCKED_FUNCTION_COUNT), + CPUMF_EVENT_PTR(cf_svn_6, ECC_BLOCKED_CYCLES_COUNT), NULL, }; @@ -562,7 +590,18 @@ __init const struct attribute_group **cpumf_cf_event_group(void) default: cfvn = none; } - csvn = cpumcf_svn_generic_pmu_event_attr; + + /* Determine version specific crypto set */ + switch (ci.csvn) { + case 1 ... 
5: + csvn = cpumcf_svn_12345_pmu_event_attr; + break; + case 6: + csvn = cpumcf_svn_6_pmu_event_attr; + break; + default: + csvn = none; + } /* Determine model-specific counter set(s) */ get_cpu_id(&cpu_id); diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index 0d770e513abf404ff2cea26f7c01931b4d60cf4b..fcb6c2e92b0715d58ec0859b75e15cea0fa14756 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -21,6 +21,7 @@ #include #include #include +#include const char *perf_pmu_name(void) { @@ -219,20 +220,13 @@ static int __init service_level_perf_register(void) } arch_initcall(service_level_perf_register); -static int __perf_callchain_kernel(void *data, unsigned long address, int reliable) -{ - struct perf_callchain_entry_ctx *entry = data; - - perf_callchain_store(entry, address); - return 0; -} - void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - if (user_mode(regs)) - return; - dump_trace(__perf_callchain_kernel, entry, NULL, regs->gprs[15]); + struct unwind_state state; + + unwind_for_each_frame(&state, current, regs, 0) + perf_callchain_store(entry, state.ip); } /* Perf definitions for PMU event attributes in sysfs */ diff --git a/arch/s390/kernel/pgm_check.S b/arch/s390/kernel/pgm_check.S index 3e62aae34ea3f4070ba61b455688f16ca98b7de3..59dee9d3bebf145aba71d6a2b164823294dad8f5 100644 --- a/arch/s390/kernel/pgm_check.S +++ b/arch/s390/kernel/pgm_check.S @@ -7,7 +7,7 @@ #include -#define PGM_CHECK(handler) .long handler +#define PGM_CHECK(handler) .quad handler #define PGM_CHECK_DEFAULT PGM_CHECK(default_trap_handler) /* diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 6e758bb6cd29b70821bec49fd69b6d8d76111d2e..63873aa6693fe0881ad3eeb040bb285674025ca4 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include "entry.h" diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 6fe2e1875058bec89419d680bef0110249ba9ed3..5de13307b7038a591561173594060fc9ea3bede0 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -109,7 +109,8 @@ static void show_cpu_summary(struct seq_file *m, void *v) { static const char *hwcap_str[] = { "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", - "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs" + "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs", + "vxe2", "vxp", "sort", "dflt" }; static const char * const int_hwcap_str[] = { "sie" diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S index 7f14adf512c6d229cd4d68dac2c51c8c3f1fe643..4a22163962eb3870df6dc3818c1e53b87ecf0982 100644 --- a/arch/s390/kernel/reipl.S +++ b/arch/s390/kernel/reipl.S @@ -73,6 +73,7 @@ ENTRY(store_status) lgr %r9,%r2 lgr %r2,%r3 BR_EX %r9 +ENDPROC(store_status) .section .bss .align 8 diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index c97c2d40fe15aae36d446036043cb4cf70fe2dd7..fe396673e8a66c77bf8fb6aec75df18fe4ec8c18 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -58,11 +58,15 @@ ENTRY(relocate_kernel) j .base .done: sgr %r0,%r0 # clear register r0 + cghi %r3,0 + je .diag la %r4,load_psw-.base(%r13) # load psw-address into the register o %r3,4(%r4) # or load address into psw st %r3,4(%r4) mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 + .diag: diag %r0,%r0,0x308 +ENDPROC(relocate_kernel) .align 8 load_psw: diff 
--git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 2c642af526ce83658504e50b7d95700086589bd2..f8544d51743077fae1fa3af96dc9ba67f4e8ec05 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -50,6 +50,7 @@ #include #include +#include #include #include #include @@ -65,11 +66,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include "entry.h" /* @@ -89,12 +92,25 @@ char elf_platform[ELF_PLATFORM_SIZE]; unsigned long int_hwcap = 0; +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST +int __bootdata_preserved(prot_virt_guest); +#endif + int __bootdata(noexec_disabled); int __bootdata(memory_end_set); unsigned long __bootdata(memory_end); unsigned long __bootdata(max_physmem_end); struct mem_detect_info __bootdata(mem_detect); +struct exception_table_entry *__bootdata_preserved(__start_dma_ex_table); +struct exception_table_entry *__bootdata_preserved(__stop_dma_ex_table); +unsigned long __bootdata_preserved(__swsusp_reset_dma); +unsigned long __bootdata_preserved(__stext_dma); +unsigned long __bootdata_preserved(__etext_dma); +unsigned long __bootdata_preserved(__sdma); +unsigned long __bootdata_preserved(__edma); +unsigned long __bootdata_preserved(__kaslr_offset); + unsigned long VMALLOC_START; EXPORT_SYMBOL(VMALLOC_START); @@ -736,6 +752,15 @@ static void __init reserve_initrd(void) #endif } +/* + * Reserve the memory area used to pass the certificate lists + */ +static void __init reserve_certificate_list(void) +{ + if (ipl_cert_list_addr) + memblock_reserve(ipl_cert_list_addr, ipl_cert_list_size); +} + static void __init reserve_mem_detect_info(void) { unsigned long start, size; @@ -814,9 +839,10 @@ static void __init reserve_kernel(void) { unsigned long start_pfn = PFN_UP(__pa(_end)); - memblock_reserve(0, PARMAREA_END); + memblock_reserve(0, HEAD_END); memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn) - (unsigned long)_stext); + memblock_reserve(__sdma, __edma - __sdma); } static void __init setup_memory(void) @@ -914,7 +940,15 @@ static int __init setup_hwcaps(void) elf_hwcap |= HWCAP_S390_VXRS_EXT; if (test_facility(135)) elf_hwcap |= HWCAP_S390_VXRS_BCD; + if (test_facility(148)) + elf_hwcap |= HWCAP_S390_VXRS_EXT2; + if (test_facility(152)) + elf_hwcap |= HWCAP_S390_VXRS_PDE; } + if (test_facility(150)) + elf_hwcap |= HWCAP_S390_SORT; + if (test_facility(151)) + elf_hwcap |= HWCAP_S390_DFLT; /* * Guarded storage support HWCAP_S390_GS is bit 12. 
@@ -1022,6 +1056,38 @@ static void __init setup_control_program_code(void) asm volatile("diag %0,0,0x318\n" : : "d" (diag318_info.val)); } +/* + * Print the component list from the IPL report + */ +static void __init log_component_list(void) +{ + struct ipl_rb_component_entry *ptr, *end; + char *str; + + if (!early_ipl_comp_list_addr) + return; + if (ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR) + pr_info("Linux is running with Secure-IPL enabled\n"); + else + pr_info("Linux is running with Secure-IPL disabled\n"); + ptr = (void *) early_ipl_comp_list_addr; + end = (void *) ptr + early_ipl_comp_list_size; + pr_info("The IPL report contains the following components:\n"); + while (ptr < end) { + if (ptr->flags & IPL_RB_COMPONENT_FLAG_SIGNED) { + if (ptr->flags & IPL_RB_COMPONENT_FLAG_VERIFIED) + str = "signed, verified"; + else + str = "signed, verification failed"; + } else { + str = "not signed"; + } + pr_info("%016llx - %016llx (%s)\n", + ptr->addr, ptr->addr + ptr->len, str); + ptr++; + } +} + /* * Setup function called from init/main.c just after the banner * was printed. @@ -1042,6 +1108,8 @@ void __init setup_arch(char **cmdline_p) else pr_info("Linux is running as a guest in 64-bit mode\n"); + log_component_list(); + /* Have one command line that is parsed and saved in /proc/cmdline */ /* boot_command_line has been already set up in early.c */ *cmdline_p = boot_command_line; @@ -1073,6 +1141,7 @@ void __init setup_arch(char **cmdline_p) reserve_oldmem(); reserve_kernel(); reserve_initrd(); + reserve_certificate_list(); reserve_mem_detect_info(); memblock_allow_resize(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index bd197baf1dc337f018af35eeb19635b1c95998b7..35fafa2b91a80a21fabb3e80500718add7152f6f 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include "entry.h" @@ -689,7 +690,7 @@ void __init smp_save_dump_cpus(void) smp_save_cpu_regs(sa, addr, is_boot_cpu, page); } memblock_free(page, PAGE_SIZE); - diag308_reset(); + diag_dma_ops.diag308_reset(); pcpu_set_smt(0); } #endif /* CONFIG_CRASH_DUMP */ diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 460dcfba7d4ec08db7de61942ea387ef38579a99..f6a620f854e168b733ee73f7f6eeda0910317f7d 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -11,65 +11,52 @@ #include #include #include - -static int __save_address(void *data, unsigned long address, int nosched) -{ - struct stack_trace *trace = data; - - if (nosched && in_sched_functions(address)) - return 0; - if (trace->skip > 0) { - trace->skip--; - return 0; - } - if (trace->nr_entries < trace->max_entries) { - trace->entries[trace->nr_entries++] = address; - return 0; - } - return 1; -} - -static int save_address(void *data, unsigned long address, int reliable) -{ - return __save_address(data, address, 0); -} - -static int save_address_nosched(void *data, unsigned long address, int reliable) -{ - return __save_address(data, address, 1); -} +#include +#include void save_stack_trace(struct stack_trace *trace) { - unsigned long sp; - - sp = current_stack_pointer(); - dump_trace(save_address, trace, NULL, sp); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + struct unwind_state state; + + unwind_for_each_frame(&state, current, NULL, 0) { + if (trace->nr_entries >= trace->max_entries) + break; + if (trace->skip > 0) + trace->skip--; + else + trace->entries[trace->nr_entries++] = state.ip; + } } 
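/*
 * Editor's aside — hedged sketch, not part of the patch: per the new
 * s390 unwind API introduced by this series (see unwind_bc.c further
 * down), unwind_for_each_frame() is assumed to be a plain for-loop
 * wrapper, roughly:
 *
 *	#define unwind_for_each_frame(state, task, regs, first_frame)	\
 *		for (unwind_start(state, task, regs, first_frame);	\
 *		     !unwind_done(state);				\
 *		     unwind_next_frame(state))
 *
 * so each of the three save_stack_trace*() variants in this file just
 * consumes state.ip once per frame instead of passing callbacks to the
 * old dump_trace() interface.
 */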
EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - unsigned long sp; - - sp = tsk->thread.ksp; - if (tsk == current) - sp = current_stack_pointer(); - dump_trace(save_address_nosched, trace, tsk, sp); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + struct unwind_state state; + + unwind_for_each_frame(&state, tsk, NULL, 0) { + if (trace->nr_entries >= trace->max_entries) + break; + if (in_sched_functions(state.ip)) + continue; + if (trace->skip > 0) + trace->skip--; + else + trace->entries[trace->nr_entries++] = state.ip; + } } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { - unsigned long sp; - - sp = kernel_stack_pointer(regs); - dump_trace(save_address, trace, NULL, sp); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + struct unwind_state state; + + unwind_for_each_frame(&state, current, regs, 0) { + if (trace->nr_entries >= trace->max_entries) + break; + if (trace->skip > 0) + trace->skip--; + else + trace->entries[trace->nr_entries++] = state.ip; + } } EXPORT_SYMBOL_GPL(save_stack_trace_regs); diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S index 993100c31d65541be4eacca268c623316bab49d4..19a3c427801a861fd5412eea689d12c20b7c5369 100644 --- a/arch/s390/kernel/swsusp.S +++ b/arch/s390/kernel/swsusp.S @@ -108,6 +108,7 @@ ENTRY(swsusp_arch_suspend) lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) lghi %r2,0 BR_EX %r14 +ENDPROC(swsusp_arch_suspend) /* * Restore saved memory image to correct place and restore register context. @@ -154,20 +155,13 @@ ENTRY(swsusp_arch_resume) ptlb /* flush tlb */ /* Reset System */ - larl %r1,restart_entry - larl %r2,.Lrestart_diag308_psw - og %r1,0(%r2) - stg %r1,0(%r0) larl %r1,.Lnew_pgm_check_psw epsw %r2,%r3 stm %r2,%r3,0(%r1) mvc __LC_PGM_NEW_PSW(16,%r0),0(%r1) - lghi %r0,0 - diag %r0,%r0,0x308 -restart_entry: - lhi %r1,1 - sigp %r1,%r0,SIGP_SET_ARCHITECTURE - sam64 + larl %r1,__swsusp_reset_dma + lg %r1,0(%r1) + BASR_EX %r14,%r1 #ifdef CONFIG_SMP larl %r1,smp_cpu_mt_shift icm %r1,15,0(%r1) @@ -267,6 +261,7 @@ restore_registers: lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) lghi %r2,0 BR_EX %r14 +ENDPROC(swsusp_arch_resume) .section .data..nosave,"aw",@progbits .align 8 @@ -275,8 +270,6 @@ restore_registers: .Lpanic_string: .asciz "Resume not possible because suspend CPU is no longer available\n" .align 8 -.Lrestart_diag308_psw: - .long 0x00080000,0x80000000 .Lrestart_suspend_psw: .quad 0x0000000180000000,restart_suspend .Lnew_pgm_check_psw: diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 02579f95f391b6524ddd28004cc6d6a511be974b..061418f787c3712f4091cfeb94b8dfb5d2b1eb03 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -426,3 +426,7 @@ 421 32 rt_sigtimedwait_time64 - compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 - sys_futex 423 32 sched_rr_get_interval_time64 - sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register sys_io_uring_register diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 
8003b38c1688f00ba02656e3e7a37f75de9d59f4..82e81a9f711269177274a3beeee80ce6ba915da8 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -49,7 +49,7 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) report_user_fault(regs, si_signo, 0); } else { const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->psw.addr); + fixup = s390_search_extables(regs->psw.addr); if (fixup) regs->psw.addr = extable_fixup(fixup); else { @@ -263,5 +263,6 @@ NOKPROBE_SYMBOL(kernel_stack_overflow); void __init trap_init(void) { + sort_extable(__start_dma_ex_table, __stop_dma_ex_table); local_mcck_enable(); } diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c new file mode 100644 index 0000000000000000000000000000000000000000..57fd4e902f1f49eaf3125c29482f0e32ab8cce23 --- /dev/null +++ b/arch/s390/kernel/unwind_bc.c @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +static bool outside_of_stack(struct unwind_state *state, unsigned long sp) +{ + return (sp <= state->sp) || + (sp + sizeof(struct stack_frame) > state->stack_info.end); +} + +static bool update_stack_info(struct unwind_state *state, unsigned long sp) +{ + struct stack_info *info = &state->stack_info; + unsigned long *mask = &state->stack_mask; + + /* New stack pointer leaves the current stack */ + if (get_stack_info(sp, state->task, info, mask) != 0 || + !on_stack(info, sp, sizeof(struct stack_frame))) + /* 'sp' does not point to a valid stack */ + return false; + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + struct stack_info *info = &state->stack_info; + struct stack_frame *sf; + struct pt_regs *regs; + unsigned long sp, ip; + bool reliable; + + regs = state->regs; + if (unlikely(regs)) { + sp = READ_ONCE_TASK_STACK(state->task, regs->gprs[15]); + if (unlikely(outside_of_stack(state, sp))) { + if (!update_stack_info(state, sp)) + goto out_err; + } + sf = (struct stack_frame *) sp; + ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + reliable = false; + regs = NULL; + } else { + sf = (struct stack_frame *) state->sp; + sp = READ_ONCE_TASK_STACK(state->task, sf->back_chain); + if (likely(sp)) { + /* Non-zero back-chain points to the previous frame */ + if (unlikely(outside_of_stack(state, sp))) { + if (!update_stack_info(state, sp)) + goto out_err; + } + sf = (struct stack_frame *) sp; + ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + reliable = true; + } else { + /* No back-chain, look for a pt_regs structure */ + sp = state->sp + STACK_FRAME_OVERHEAD; + if (!on_stack(info, sp, sizeof(struct pt_regs))) + goto out_stop; + regs = (struct pt_regs *) sp; + if (user_mode(regs)) + goto out_stop; + ip = READ_ONCE_TASK_STACK(state->task, regs->psw.addr); + reliable = true; + } + } + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* Decode any ftrace redirection */ + if (ip == (unsigned long) return_to_handler) + ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + ip, (void *) sp); +#endif + + /* Update unwind state */ + state->sp = sp; + state->ip = ip; + state->regs = regs; + state->reliable = reliable; + return true; + +out_err: + state->error = true; +out_stop: + state->stack_info.type = 
STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long sp) +{ + struct stack_info *info = &state->stack_info; + unsigned long *mask = &state->stack_mask; + struct stack_frame *sf; + unsigned long ip; + bool reliable; + + memset(state, 0, sizeof(*state)); + state->task = task; + state->regs = regs; + + /* Don't even attempt to start from user mode regs: */ + if (regs && user_mode(regs)) { + info->type = STACK_TYPE_UNKNOWN; + return; + } + + /* Get current stack pointer and initialize stack info */ + if (get_stack_info(sp, task, info, mask) != 0 || + !on_stack(info, sp, sizeof(struct stack_frame))) { + /* Something is wrong with the stack pointer */ + info->type = STACK_TYPE_UNKNOWN; + state->error = true; + return; + } + + /* Get the instruction pointer from pt_regs or the stack frame */ + if (regs) { + ip = READ_ONCE_TASK_STACK(state->task, regs->psw.addr); + reliable = true; + } else { + sf = (struct stack_frame *) sp; + ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + reliable = false; + } + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* Decode any ftrace redirection */ + if (ip == (unsigned long) return_to_handler) + ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + ip, NULL); +#endif + + /* Update unwind state */ + state->sp = sp; + state->ip = ip; + state->reliable = reliable; +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index e7920a68a12ecf977e73a237a446bf65a5dfbecc..243d8b1185bfcdbf39d0480aa7245f11c7eb89cc 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -29,7 +29,7 @@ #include #include -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO extern char vdso32_start, vdso32_end; static void *vdso32_kbase = &vdso32_start; static unsigned int vdso32_pages; @@ -55,7 +55,7 @@ static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, vdso_pagelist = vdso64_pagelist; vdso_pages = vdso64_pages; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO if (vma->vm_mm->context.compat_mm) { vdso_pagelist = vdso32_pagelist; vdso_pages = vdso32_pages; @@ -76,7 +76,7 @@ static int vdso_mremap(const struct vm_special_mapping *sm, unsigned long vdso_pages; vdso_pages = vdso64_pages; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO if (vma->vm_mm->context.compat_mm) vdso_pages = vdso32_pages; #endif @@ -223,7 +223,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return 0; vdso_pages = vdso64_pages; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO mm->context.compat_mm = is_compat_task(); if (mm->context.compat_mm) vdso_pages = vdso32_pages; @@ -280,7 +280,7 @@ static int __init vdso_init(void) int i; vdso_init_data(vdso_data); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO /* Calculate the size of the 32 bit vDSO */ vdso32_pages = ((&vdso32_end - &vdso32_start + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile index e76309fbbcb3b6e23af21350f98f2b555502b978..aee9ffbccb54852089b3c0bc7f6576931ce87b0b 100644 --- a/arch/s390/kernel/vdso32/Makefile +++ b/arch/s390/kernel/vdso32/Makefile @@ -19,7 +19,7 @@ KBUILD_AFLAGS_31 += -m31 -s KBUILD_CFLAGS_31 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_31 += -m31 -fPIC -shared -fno-common -fno-builtin KBUILD_CFLAGS_31 += -nostdlib -Wl,-soname=linux-vdso32.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=both) + -Wl,--hash-style=both 
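# Editor's aside — hedged sketch, not part of the patch: the dropped
# cc-ldoption wrapper used to emit a linker flag only when $(CC)
# accepted it at link time, roughly:
#
#   cc-ldoption = $(call try-run,\
#           $(CC) $(1) -nostdlib -x c /dev/null -o "$$TMP",$(1),$(2))
#
# Passing -Wl,--hash-style=both unconditionally here and in
# vdso64/Makefile below assumes every supported toolchain understands
# that option, so the build-time probe is unnecessary.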
$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_31) $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_31) diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index f849ac61c5da02ee8b764bc3c01fc44c16137e04..bec19e7e6e1cf6aa718596d4717558101c4ba702 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -19,7 +19,7 @@ KBUILD_AFLAGS_64 += -m64 -s KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin KBUILD_CFLAGS_64 += -nostdlib -Wl,-soname=linux-vdso64.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=both) + -Wl,--hash-style=both $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64) diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 8429ab07971575394622444ea6be40eb85b37f62..49d55327de0bcaa2412d035a58a4f1a4fe3dca79 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -72,6 +72,7 @@ SECTIONS __end_ro_after_init = .; RW_DATA_SECTION(0x100, PAGE_SIZE, THREAD_SIZE) + BOOT_DATA_PRESERVED _edata = .; /* End of data section */ @@ -143,6 +144,18 @@ SECTIONS INIT_DATA_SECTION(0x100) PERCPU_SECTION(0x100) + + .dynsym ALIGN(8) : { + __dynsym_start = .; + *(.dynsym) + __dynsym_end = .; + } + .rela.dyn ALIGN(8) : { + __rela_dyn_start = .; + *(.rela*) + __rela_dyn_end = .; + } + . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ @@ -161,6 +174,12 @@ SECTIONS QUAD(__bss_stop - __bss_start) /* bss_size */ QUAD(__boot_data_start) /* bootdata_off */ QUAD(__boot_data_end - __boot_data_start) /* bootdata_size */ + QUAD(__boot_data_preserved_start) /* bootdata_preserved_off */ + QUAD(__boot_data_preserved_end - + __boot_data_preserved_start) /* bootdata_preserved_size */ + QUAD(__dynsym_start) /* dynsym_start */ + QUAD(__rela_dyn_start) /* rela_dyn_start */ + QUAD(__rela_dyn_end) /* rela_dyn_end */ } :NONE /* Debugging sections. */ diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index a69a0911ed0e82720b10b124d0153681f2c821ea..c475ca49cfc6b43c02ab924e218e541a92b677b8 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -37,7 +37,7 @@ static inline u64 get_vtimer(void) { u64 timer; - asm volatile("stpt %0" : "=m" (timer)); + asm volatile("stpt %0" : "=Q" (timer)); return timer; } @@ -48,7 +48,7 @@ static inline void set_vtimer(u64 expires) asm volatile( " stpt %0\n" /* Store current cpu timer value */ " spt %1" /* Set new value imm. 
afterwards */ - : "=m" (timer) : "m" (expires)); + : "=Q" (timer) : "Q" (expires)); S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer; S390_lowcore.last_update_timer = expires; } @@ -135,8 +135,8 @@ static int do_account_vtime(struct task_struct *tsk) #else " stck %1" /* Store current tod clock value */ #endif - : "=m" (S390_lowcore.last_update_timer), - "=m" (S390_lowcore.last_update_clock)); + : "=Q" (S390_lowcore.last_update_timer), + "=Q" (S390_lowcore.last_update_clock)); clock = S390_lowcore.last_update_clock - clock; timer -= S390_lowcore.last_update_timer; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 82162867f378d225ede29ff32adee1983072a7ac..37503ae62486cab04b418afeb488a93919313d9b 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -3194,7 +3194,7 @@ out: } EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); -static void gib_alert_irq_handler(struct airq_struct *airq) +static void gib_alert_irq_handler(struct airq_struct *airq, bool floating) { inc_irq_stat(IRQIO_GAL); process_gib_alert_list(); diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S index 53008da0519076fd57f32ca057ac21333cdf51c6..dc0874f2e203c79c691ffb5746a3c0650aaa3cbc 100644 --- a/arch/s390/lib/mem.S +++ b/arch/s390/lib/mem.S @@ -178,6 +178,7 @@ ENTRY(__memset\bits) BR_EX %r14 .L__memset_mvc\bits: mvc \bytes(1,%r1),0(%r1) +ENDPROC(__memset\bits) .endm __MEMSET 16,2,sth diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index f5880bfd1b0cb1bb67ae7e8f2ac68c6404c88224..3175413186b9d1e777e3151c9f344b879b755bb9 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -4,7 +4,7 @@ # obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o -obj-y += page-states.o gup.o pageattr.o pgtable.o pgalloc.o +obj-y += page-states.o pageattr.o pgtable.o pgalloc.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 11613362c4e75f9d3668a9618dfed7d7265d9d32..c220399ae196e07ab3d6cf5a2ef5a4931eaff025 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -247,12 +247,24 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code) current); } +const struct exception_table_entry *s390_search_extables(unsigned long addr) +{ + const struct exception_table_entry *fixup; + + fixup = search_extable(__start_dma_ex_table, + __stop_dma_ex_table - __start_dma_ex_table, + addr); + if (!fixup) + fixup = search_exception_tables(addr); + return fixup; +} + static noinline void do_no_context(struct pt_regs *regs) { const struct exception_table_entry *fixup; /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->psw.addr); + fixup = s390_search_extables(regs->psw.addr); if (fixup) { regs->psw.addr = extable_fixup(fixup); return; diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c deleted file mode 100644 index 2809d11c7a283b825dc28b1ce9bde1cc4af28b09..0000000000000000000000000000000000000000 --- a/arch/s390/mm/gup.c +++ /dev/null @@ -1,300 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Lockless get_user_pages_fast for s390 - * - * Copyright IBM Corp. 2010 - * Author(s): Martin Schwidefsky - */ -#include -#include -#include -#include -#include -#include -#include - -/* - * The performance critical leaf functions are made noinline otherwise gcc - * inlines everything into a single function which results in too much - * register pressure. 
- */ -static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - unsigned long mask; - pte_t *ptep, pte; - - mask = (write ? _PAGE_PROTECT : 0) | _PAGE_INVALID | _PAGE_SPECIAL; - - ptep = ((pte_t *) pmd_deref(pmd)) + pte_index(addr); - do { - pte = *ptep; - barrier(); - /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_protnone(pte)) - return 0; - if ((pte_val(pte) & mask) != 0) - return 0; - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - head = compound_head(page); - if (!page_cache_get_speculative(head)) - return 0; - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_page(head); - return 0; - } - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - - return 1; -} - -static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - unsigned long mask; - int refs; - - mask = (write ? _SEGMENT_ENTRY_PROTECT : 0) | _SEGMENT_ENTRY_INVALID; - if ((pmd_val(pmd) & mask) != 0) - return 0; - VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT)); - - refs = 0; - head = pmd_page(pmd); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) { - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} - - -static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp, pmd; - - pmdp = (pmd_t *) pudp; - if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - pmdp = (pmd_t *) pud_deref(pud); - pmdp += pmd_index(addr); - do { - pmd = *pmdp; - barrier(); - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (unlikely(pmd_large(pmd))) { - /* - * NUMA hinting faults need to be handled in the GUP - * slowpath for accounting purposes and so that they - * can be serialised against THP migration. - */ - if (pmd_protnone(pmd)) - return 0; - if (!gup_huge_pmd(pmdp, pmd, addr, next, - write, pages, nr)) - return 0; - } else if (!gup_pte_range(pmdp, pmd, addr, next, - write, pages, nr)) - return 0; - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - unsigned long mask; - int refs; - - mask = (write ? 
_REGION_ENTRY_PROTECT : 0) | _REGION_ENTRY_INVALID; - if ((pud_val(pud) & mask) != 0) - return 0; - VM_BUG_ON(!pfn_valid(pud_pfn(pud))); - - refs = 0; - head = pud_page(pud); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pud_val(pud) != pud_val(*pudp))) { - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} - -static inline int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp, pud; - - pudp = (pud_t *) p4dp; - if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) - pudp = (pud_t *) p4d_deref(p4d); - pudp += pud_index(addr); - do { - pud = *pudp; - barrier(); - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (unlikely(pud_large(pud))) { - if (!gup_huge_pud(pudp, pud, addr, next, write, pages, - nr)) - return 0; - } else if (!gup_pmd_range(pudp, pud, addr, next, write, pages, - nr)) - return 0; - } while (pudp++, addr = next, addr != end); - - return 1; -} - -static inline int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long next; - p4d_t *p4dp, p4d; - - p4dp = (p4d_t *) pgdp; - if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1) - p4dp = (p4d_t *) pgd_deref(pgd); - p4dp += p4d_index(addr); - do { - p4d = *p4dp; - barrier(); - next = p4d_addr_end(addr, end); - if (p4d_none(p4d)) - return 0; - if (!gup_pud_range(p4dp, p4d, addr, next, write, pages, nr)) - return 0; - } while (p4dp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next, flags; - pgd_t *pgdp, pgd; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if ((end <= start) || (end > mm->context.asce_limit)) - return 0; - /* - * local_irq_save() doesn't prevent pagetable teardown, but does - * prevent the pagetables from being freed on s390. - * - * So long as we atomically load page table pointers versus teardown, - * we can follow the address down to the the page and take a ref on it. - */ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd = *pgdp; - barrier(); - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_p4d_range(pgdp, pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. 
- * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - int nr, ret; - - might_sleep(); - start &= PAGE_MASK; - nr = __get_user_pages_fast(start, nr_pages, write, pages); - if (nr == nr_pages) - return nr; - - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - ret = get_user_pages_unlocked(start, nr_pages - nr, pages, - write ? FOLL_WRITE : 0); - /* Have to be a bit careful with return values */ - if (nr > 0) - ret = (ret < 0) ? nr : ret + nr; - return ret; -} diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 3e82f66d5c613e82e051235786f49c5d2e79b31f..7cf48eefec8fc291ee3090c77107327547b85e95 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -49,6 +49,8 @@ unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); +bool initmem_freed; + static void __init setup_zero_pages(void) { unsigned int order; @@ -148,6 +150,7 @@ void __init mem_init(void) void free_initmem(void) { + initmem_freed = true; __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RW | SET_MEMORY_NX); diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 97b3ee53852b36b34fb1653373c57c2ff3cedb24..818deeb1ebc3dc4c7fba9d6e86bef87c082df3c0 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -16,6 +16,7 @@ #include #include #include +#include static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) { diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index db6bb2f97a2c62f9334c1e1cf8ab39460c15acf0..99e06213a22b7c259ba90831c4cf7cbda6d1a301 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -290,7 +290,7 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, tlb_remove_table(tlb, table); } -static void __tlb_remove_table(void *_table) +void __tlb_remove_table(void *_table) { unsigned int mask = (unsigned long) _table & 3; void *table = (void *)((unsigned long) _table ^ mask); @@ -316,67 +316,6 @@ static void __tlb_remove_table(void *_table) } } -static void tlb_remove_table_smp_sync(void *arg) -{ - /* Simply deliver the interrupt */ -} - -static void tlb_remove_table_one(void *table) -{ - /* - * This isn't an RCU grace period and hence the page-tables cannot be - * assumed to be actually RCU-freed. - * - * It is however sufficient for software page-table walkers that rely - * on IRQ disabling. See the comment near struct mmu_table_batch. 
- */ - smp_call_function(tlb_remove_table_smp_sync, NULL, 1); - __tlb_remove_table(table); -} - -static void tlb_remove_table_rcu(struct rcu_head *head) -{ - struct mmu_table_batch *batch; - int i; - - batch = container_of(head, struct mmu_table_batch, rcu); - - for (i = 0; i < batch->nr; i++) - __tlb_remove_table(batch->tables[i]); - - free_page((unsigned long)batch); -} - -void tlb_table_flush(struct mmu_gather *tlb) -{ - struct mmu_table_batch **batch = &tlb->batch; - - if (*batch) { - call_rcu(&(*batch)->rcu, tlb_remove_table_rcu); - *batch = NULL; - } -} - -void tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct mmu_table_batch **batch = &tlb->batch; - - tlb->mm->context.flush_mm = 1; - if (*batch == NULL) { - *batch = (struct mmu_table_batch *) - __get_free_page(GFP_NOWAIT | __GFP_NOWARN); - if (*batch == NULL) { - __tlb_flush_mm_lazy(tlb->mm); - tlb_remove_table_one(table); - return; - } - (*batch)->nr = 0; - } - (*batch)->tables[(*batch)->nr++] = table; - if ((*batch)->nr == MAX_TABLE_BATCH) - tlb_flush_mmu(tlb); -} - /* * Base infrastructure required to generate basic asces, region, segment, * and page tables that do not make use of enhanced features like EDAT1. diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 8485d6dc275496ab69d76cfa29bb2ae66c14e9b5..9ebd01219812cc711055518c50b0dbfe5b454e89 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -410,6 +410,7 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, return old; } +#ifdef CONFIG_PGSTE static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; @@ -427,6 +428,7 @@ static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) pmd = pmd_alloc(mm, pud, addr); return pmd; } +#endif pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t new) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 0472e27febdfaa620cf4cd5849690907486d1136..b403fa14847dce14c7150214fdceb624740d2592 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -413,6 +413,8 @@ void __init vmem_map_init(void) __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); + __set_memory(__stext_dma, (__etext_dma - __stext_dma) >> PAGE_SHIFT, + SET_MEMORY_RO | SET_MEMORY_X); pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 51dd0267d0148142bdac0849bd034c75c0b7c0ff..5e7c6303315906d95163e8ceac5ba7125d196f1f 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -455,7 +455,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) EMIT4(0xb9040000, REG_2, BPF_REG_0); /* Restore registers */ save_restore_regs(jit, REGS_RESTORE, stack_depth); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) { + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { jit->r14_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r14 thunk */ if (test_facility(35)) { @@ -473,7 +473,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) /* br %r14 */ _EMIT2(0x07fe); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable && + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable && (jit->seen & SEEN_FUNC)) { jit->r1_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r1 thunk */ @@ -999,7 +999,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i /* lg %w1,(%l) */ 
EMIT6_DISP_LH(0xe3000000, 0x0004, REG_W1, REG_0, REG_L, EMIT_CONST_U64(func)); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) { + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { /* brasl %r14,__s390_indirect_jump_r1 */ EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); } else { diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c index 43d9525c36fc3e491525acd7d50caf3ae79802a1..7441857df51bac5c3eeecc2b4bcef4d3746bd7c2 100644 --- a/arch/s390/oprofile/init.c +++ b/arch/s390/oprofile/init.c @@ -13,23 +13,17 @@ #include #include #include - -static int __s390_backtrace(void *data, unsigned long address, int reliable) -{ - unsigned int *depth = data; - - if (*depth == 0) - return 1; - (*depth)--; - oprofile_add_trace(address); - return 0; -} +#include static void s390_backtrace(struct pt_regs *regs, unsigned int depth) { - if (user_mode(regs)) - return; - dump_trace(__s390_backtrace, &depth, NULL, regs->gprs[15]); + struct unwind_state state; + + unwind_for_each_frame(&state, current, regs, 0) { + if (depth-- == 0) + break; + oprofile_add_trace(state.ip); + } } int __init oprofile_arch_init(struct oprofile_operations *ops) diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile index 22d0871291eef12438398207cb06e964b14ad73d..748626a3302826b359d89411efaa5f5019b5ca8e 100644 --- a/arch/s390/pci/Makefile +++ b/arch/s390/pci/Makefile @@ -3,5 +3,5 @@ # Makefile for the s390 PCI subsystem. # -obj-$(CONFIG_PCI) += pci.o pci_dma.o pci_clp.o pci_sysfs.o \ +obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \ pci_event.o pci_debug.o pci_insn.o pci_mmio.o diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index dc9bc82c072c6d9309882f84dca095b131a53b90..0ebb7c405a2508d28c66530eaf1b39b4598635b5 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -24,11 +24,9 @@ #include #include #include -#include -#include #include +#include #include -#include #include #include @@ -37,30 +35,13 @@ #include #include -#define DEBUG /* enable pr_debug */ - -#define SIC_IRQ_MODE_ALL 0 -#define SIC_IRQ_MODE_SINGLE 1 - -#define ZPCI_NR_DMA_SPACES 1 -#define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS - /* list of all detected zpci devices */ static LIST_HEAD(zpci_list); static DEFINE_SPINLOCK(zpci_list_lock); -static struct irq_chip zpci_irq_chip = { - .name = "zPCI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, -}; - static DECLARE_BITMAP(zpci_domain, ZPCI_NR_DEVICES); static DEFINE_SPINLOCK(zpci_domain_lock); -static struct airq_iv *zpci_aisb_iv; -static struct airq_iv *zpci_aibv[ZPCI_NR_DEVICES]; - #define ZPCI_IOMAP_ENTRIES \ min(((unsigned long) ZPCI_NR_DEVICES * PCI_BAR_COUNT / 2), \ ZPCI_IOMAP_MAX_ENTRIES) @@ -70,6 +51,8 @@ static unsigned long *zpci_iomap_bitmap; struct zpci_iomap_entry *zpci_iomap_start; EXPORT_SYMBOL_GPL(zpci_iomap_start); +DEFINE_STATIC_KEY_FALSE(have_mio); + static struct kmem_cache *zdev_fmb_cache; struct zpci_dev *get_zdev_by_fid(u32 fid) @@ -123,39 +106,6 @@ int pci_proc_domain(struct pci_bus *bus) } EXPORT_SYMBOL_GPL(pci_proc_domain); -/* Modify PCI: Register adapter interruptions */ -static int zpci_set_airq(struct zpci_dev *zdev) -{ - u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); - struct zpci_fib fib = {0}; - u8 status; - - fib.isc = PCI_ISC; - fib.sum = 1; /* enable summary notifications */ - fib.noi = airq_iv_end(zdev->aibv); - fib.aibv = (unsigned long) zdev->aibv->vector; - fib.aibvo = 0; /* each zdev has its own interrupt vector */ - fib.aisb = (unsigned long) zpci_aisb_iv->vector + 
(zdev->aisb/64)*8; - fib.aisbo = zdev->aisb & 63; - - return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; -} - -/* Modify PCI: Unregister adapter interruptions */ -static int zpci_clear_airq(struct zpci_dev *zdev) -{ - u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); - struct zpci_fib fib = {0}; - u8 cc, status; - - cc = zpci_mod_fc(req, &fib, &status); - if (cc == 3 || (cc == 1 && status == 24)) - /* Function already gone or IRQs already deregistered. */ - cc = 0; - - return cc ? -EIO : 0; -} - /* Modify PCI: Register I/O address translation parameters */ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, u64 base, u64 limit, u64 iota) @@ -241,7 +191,7 @@ static int zpci_cfg_load(struct zpci_dev *zdev, int offset, u32 *val, u8 len) u64 data; int rc; - rc = zpci_load(&data, req, offset); + rc = __zpci_load(&data, req, offset); if (!rc) { data = le64_to_cpu((__force __le64) data); data >>= (8 - len) * 8; @@ -259,7 +209,7 @@ static int zpci_cfg_store(struct zpci_dev *zdev, int offset, u32 val, u8 len) data <<= (8 - len) * 8; data = (__force u64) cpu_to_le64(data); - rc = zpci_store(data, req, offset); + rc = __zpci_store(data, req, offset); return rc; } @@ -276,18 +226,48 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count) zpci_memcpy_toio(to, from, count); } +void __iomem *ioremap(unsigned long ioaddr, unsigned long size) +{ + struct vm_struct *area; + unsigned long offset; + + if (!size) + return NULL; + + if (!static_branch_unlikely(&have_mio)) + return (void __iomem *) ioaddr; + + offset = ioaddr & ~PAGE_MASK; + ioaddr &= PAGE_MASK; + size = PAGE_ALIGN(size + offset); + area = get_vm_area(size, VM_IOREMAP); + if (!area) + return NULL; + + if (ioremap_page_range((unsigned long) area->addr, + (unsigned long) area->addr + size, + ioaddr, PAGE_KERNEL)) { + vunmap(area->addr); + return NULL; + } + return (void __iomem *) ((unsigned long) area->addr + offset); +} +EXPORT_SYMBOL(ioremap); + +void iounmap(volatile void __iomem *addr) +{ + if (static_branch_likely(&have_mio)) + vunmap((__force void *) ((unsigned long) addr & PAGE_MASK)); +} +EXPORT_SYMBOL(iounmap); + /* Create a virtual mapping cookie for a PCI BAR */ -void __iomem *pci_iomap_range(struct pci_dev *pdev, - int bar, - unsigned long offset, - unsigned long max) +static void __iomem *pci_iomap_range_fh(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) { struct zpci_dev *zdev = to_zpci(pdev); int idx; - if (!pci_resource_len(pdev, bar) || bar >= PCI_BAR_COUNT) - return NULL; - idx = zdev->bars[bar].map_idx; spin_lock(&zpci_iomap_lock); /* Detect overrun */ @@ -298,6 +278,30 @@ void __iomem *pci_iomap_range(struct pci_dev *pdev, return (void __iomem *) ZPCI_ADDR(idx) + offset; } + +static void __iomem *pci_iomap_range_mio(struct pci_dev *pdev, int bar, + unsigned long offset, + unsigned long max) +{ + unsigned long barsize = pci_resource_len(pdev, bar); + struct zpci_dev *zdev = to_zpci(pdev); + void __iomem *iova; + + iova = ioremap((unsigned long) zdev->bars[bar].mio_wt, barsize); + return iova ? 
iova + offset : iova; +} + +void __iomem *pci_iomap_range(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) +{ + if (!pci_resource_len(pdev, bar) || bar >= PCI_BAR_COUNT) + return NULL; + + if (static_branch_likely(&have_mio)) + return pci_iomap_range_mio(pdev, bar, offset, max); + else + return pci_iomap_range_fh(pdev, bar, offset, max); +} EXPORT_SYMBOL(pci_iomap_range); void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) @@ -306,7 +310,37 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) } EXPORT_SYMBOL(pci_iomap); -void pci_iounmap(struct pci_dev *pdev, void __iomem *addr) +static void __iomem *pci_iomap_wc_range_mio(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) +{ + unsigned long barsize = pci_resource_len(pdev, bar); + struct zpci_dev *zdev = to_zpci(pdev); + void __iomem *iova; + + iova = ioremap((unsigned long) zdev->bars[bar].mio_wb, barsize); + return iova ? iova + offset : iova; +} + +void __iomem *pci_iomap_wc_range(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) +{ + if (!pci_resource_len(pdev, bar) || bar >= PCI_BAR_COUNT) + return NULL; + + if (static_branch_likely(&have_mio)) + return pci_iomap_wc_range_mio(pdev, bar, offset, max); + else + return pci_iomap_range_fh(pdev, bar, offset, max); +} +EXPORT_SYMBOL(pci_iomap_wc_range); + +void __iomem *pci_iomap_wc(struct pci_dev *dev, int bar, unsigned long maxlen) +{ + return pci_iomap_wc_range(dev, bar, 0, maxlen); +} +EXPORT_SYMBOL(pci_iomap_wc); + +static void pci_iounmap_fh(struct pci_dev *pdev, void __iomem *addr) { unsigned int idx = ZPCI_IDX(addr); @@ -319,6 +353,19 @@ void pci_iounmap(struct pci_dev *pdev, void __iomem *addr) } spin_unlock(&zpci_iomap_lock); } + +static void pci_iounmap_mio(struct pci_dev *pdev, void __iomem *addr) +{ + iounmap(addr); +} + +void pci_iounmap(struct pci_dev *pdev, void __iomem *addr) +{ + if (static_branch_likely(&have_mio)) + pci_iounmap_mio(pdev, addr); + else + pci_iounmap_fh(pdev, addr); +} EXPORT_SYMBOL(pci_iounmap); static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, @@ -354,136 +401,6 @@ static struct pci_ops pci_root_ops = { .write = pci_write, }; -static void zpci_irq_handler(struct airq_struct *airq) -{ - unsigned long si, ai; - struct airq_iv *aibv; - int irqs_on = 0; - - inc_irq_stat(IRQIO_PCI); - for (si = 0;;) { - /* Scan adapter summary indicator bit vector */ - si = airq_iv_scan(zpci_aisb_iv, si, airq_iv_end(zpci_aisb_iv)); - if (si == -1UL) { - if (irqs_on++) - /* End of second scan with interrupts on. */ - break; - /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC)) - break; - si = 0; - continue; - } - - /* Scan the adapter interrupt vector for this device. 
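
The pci_iomap_range() rework above keeps the argument validation in one wrapper and dispatches to either the MIO or the function-handle backend behind the have_mio static key, so the common checks are written once and the selected path never re-tests them. A userspace sketch of that shape follows; the plain bool stands in for the kernel's runtime-patched static branch, and the backend bodies are invented placeholders.

#include <stdbool.h>
#include <stdio.h>

#define PCI_BAR_COUNT 6

/* Stand-in for DEFINE_STATIC_KEY_FALSE(have_mio). */
static bool have_mio;

static const char *iomap_mio(int bar) { return "mio mapping"; }
static const char *iomap_fh(int bar)  { return "function-handle mapping"; }

/* Validation happens once, in the wrapper; each backend stays simple. */
static const char *iomap_range(int bar, unsigned long len)
{
        if (!len || bar >= PCI_BAR_COUNT)
                return NULL;
        return have_mio ? iomap_mio(bar) : iomap_fh(bar);
}

int main(void)
{
        printf("%s\n", iomap_range(0, 4096));   /* function-handle path */
        have_mio = true;                        /* as if facility 153 were set */
        printf("%s\n", iomap_range(0, 4096));   /* mio path */
        return 0;
}
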
*/ - aibv = zpci_aibv[si]; - for (ai = 0;;) { - ai = airq_iv_scan(aibv, ai, airq_iv_end(aibv)); - if (ai == -1UL) - break; - inc_irq_stat(IRQIO_MSI); - airq_iv_lock(aibv, ai); - generic_handle_irq(airq_iv_get_data(aibv, ai)); - airq_iv_unlock(aibv, ai); - } - } -} - -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) -{ - struct zpci_dev *zdev = to_zpci(pdev); - unsigned int hwirq, msi_vecs; - unsigned long aisb; - struct msi_desc *msi; - struct msi_msg msg; - int rc, irq; - - zdev->aisb = -1UL; - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; - msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); - - /* Allocate adapter summary indicator bit */ - aisb = airq_iv_alloc_bit(zpci_aisb_iv); - if (aisb == -1UL) - return -EIO; - zdev->aisb = aisb; - - /* Create adapter interrupt vector */ - zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); - if (!zdev->aibv) - return -ENOMEM; - - /* Wire up shortcut pointer */ - zpci_aibv[aisb] = zdev->aibv; - - /* Request MSI interrupts */ - hwirq = 0; - for_each_pci_msi_entry(msi, pdev) { - if (hwirq >= msi_vecs) - break; - irq = irq_alloc_desc(0); /* Alloc irq on node 0 */ - if (irq < 0) - return -ENOMEM; - rc = irq_set_msi_desc(irq, msi); - if (rc) - return rc; - irq_set_chip_and_handler(irq, &zpci_irq_chip, - handle_simple_irq); - msg.data = hwirq; - msg.address_lo = zdev->msi_addr & 0xffffffff; - msg.address_hi = zdev->msi_addr >> 32; - pci_write_msi_msg(irq, &msg); - airq_iv_set_data(zdev->aibv, hwirq, irq); - hwirq++; - } - - /* Enable adapter interrupts */ - rc = zpci_set_airq(zdev); - if (rc) - return rc; - - return (msi_vecs == nvec) ? 0 : msi_vecs; -} - -void arch_teardown_msi_irqs(struct pci_dev *pdev) -{ - struct zpci_dev *zdev = to_zpci(pdev); - struct msi_desc *msi; - int rc; - - /* Disable adapter interrupts */ - rc = zpci_clear_airq(zdev); - if (rc) - return; - - /* Release MSI interrupts */ - for_each_pci_msi_entry(msi, pdev) { - if (!msi->irq) - continue; - if (msi->msi_attrib.is_msix) - __pci_msix_desc_mask_irq(msi, 1); - else - __pci_msi_desc_mask_irq(msi, 1, 1); - irq_set_msi_desc(msi->irq, NULL); - irq_free_desc(msi->irq); - msi->msg.address_lo = 0; - msi->msg.address_hi = 0; - msi->msg.data = 0; - msi->irq = 0; - } - - if (zdev->aisb != -1UL) { - zpci_aibv[zdev->aisb] = NULL; - airq_iv_free_bit(zpci_aisb_iv, zdev->aisb); - zdev->aisb = -1UL; - } - if (zdev->aibv) { - airq_iv_release(zdev->aibv); - zdev->aibv = NULL; - } -} - #ifdef CONFIG_PCI_IOV static struct resource iov_res = { .name = "PCI IOV res", @@ -495,6 +412,7 @@ static struct resource iov_res = { static void zpci_map_resources(struct pci_dev *pdev) { + struct zpci_dev *zdev = to_zpci(pdev); resource_size_t len; int i; @@ -502,8 +420,13 @@ static void zpci_map_resources(struct pci_dev *pdev) len = pci_resource_len(pdev, i); if (!len) continue; - pdev->resource[i].start = - (resource_size_t __force) pci_iomap(pdev, i, 0); + + if (static_branch_likely(&have_mio)) + pdev->resource[i].start = + (resource_size_t __force) zdev->bars[i].mio_wb; + else + pdev->resource[i].start = + (resource_size_t __force) pci_iomap(pdev, i, 0); pdev->resource[i].end = pdev->resource[i].start + len - 1; } @@ -524,6 +447,9 @@ static void zpci_unmap_resources(struct pci_dev *pdev) resource_size_t len; int i; + if (static_branch_likely(&have_mio)) + return; + for (i = 0; i < PCI_BAR_COUNT; i++) { len = pci_resource_len(pdev, i); if (!len) @@ -533,41 +459,6 @@ static void zpci_unmap_resources(struct pci_dev *pdev) } } -static struct airq_struct zpci_airq = { - 
.handler = zpci_irq_handler, - .isc = PCI_ISC, -}; - -static int __init zpci_irq_init(void) -{ - int rc; - - rc = register_adapter_interrupt(&zpci_airq); - if (rc) - goto out; - /* Set summary to 1 to be called every time for the ISC. */ - *zpci_airq.lsi_ptr = 1; - - rc = -ENOMEM; - zpci_aisb_iv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); - if (!zpci_aisb_iv) - goto out_airq; - - zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC); - return 0; - -out_airq: - unregister_adapter_interrupt(&zpci_airq); -out: - return rc; -} - -static void zpci_irq_exit(void) -{ - airq_iv_release(zpci_aisb_iv); - unregister_adapter_interrupt(&zpci_airq); -} - static int zpci_alloc_iomap(struct zpci_dev *zdev) { unsigned long entry; @@ -958,7 +849,9 @@ static void zpci_mem_exit(void) kmem_cache_destroy(zdev_fmb_cache); } -static unsigned int s390_pci_probe = 1; +static unsigned int s390_pci_probe __initdata = 1; +static unsigned int s390_pci_no_mio __initdata; +unsigned int s390_pci_force_floating __initdata; static unsigned int s390_pci_initialized; char * __init pcibios_setup(char *str) @@ -967,6 +860,14 @@ char * __init pcibios_setup(char *str) s390_pci_probe = 0; return NULL; } + if (!strcmp(str, "nomio")) { + s390_pci_no_mio = 1; + return NULL; + } + if (!strcmp(str, "force_floating")) { + s390_pci_force_floating = 1; + return NULL; + } return str; } @@ -985,6 +886,9 @@ static int __init pci_base_init(void) if (!test_facility(69) || !test_facility(71)) return 0; + if (test_facility(153) && !s390_pci_no_mio) + static_branch_enable(&have_mio); + rc = zpci_debug_init(); if (rc) goto out; diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index eeb7450db18c06b8760cd5e1a577234ebc2c95f7..3a36b07a557134a41e70ce2f2d50b1055360b9ad 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -163,7 +163,14 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, memcpy(zdev->util_str, response->util_str, sizeof(zdev->util_str)); } + zdev->mio_capable = response->mio_addr_avail; + for (i = 0; i < PCI_BAR_COUNT; i++) { + if (!(response->mio_valid & (1 << (PCI_BAR_COUNT - i - 1)))) + continue; + zdev->bars[i].mio_wb = (void __iomem *) response->addr[i].wb; + zdev->bars[i].mio_wt = (void __iomem *) response->addr[i].wt; + } return 0; } @@ -279,11 +286,18 @@ int clp_enable_fh(struct zpci_dev *zdev, u8 nr_dma_as) int rc; rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_PCI_FN); - if (!rc) - /* Success -> store enabled handle in zdev */ - zdev->fh = fh; + zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); + if (rc) + goto out; - zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, zdev->fh, rc); + zdev->fh = fh; + if (zdev->mio_capable) { + rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_MIO); + zpci_dbg(3, "ena mio fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); + if (rc) + clp_disable_fh(zdev); + } +out: return rc; } @@ -296,11 +310,10 @@ int clp_disable_fh(struct zpci_dev *zdev) return 0; rc = clp_set_pci_fn(&fh, 0, CLP_SET_DISABLE_PCI_FN); + zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); if (!rc) - /* Success -> store disabled handle in zdev */ zdev->fh = fh; - zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, zdev->fh, rc); return rc; } diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index f069929e82114004adea2cc0bfc3abc15bdf23da..02f9505c99a83e5d7e11c76eb16eeb4a3b38266d 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -8,9 +8,11 @@ #include #include #include +#include #include #include #include +#include #include 
#define ZPCI_INSN_BUSY_DELAY 1 /* 1 microsecond */ @@ -96,13 +98,15 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range) } /* Set Interruption Controls */ -int zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc) +int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) { if (!test_facility(72)) return -EIO; - asm volatile ( - " .insn rsy,0xeb00000000d1,%[ctl],%[isc],%[u]\n" - : : [ctl] "d" (ctl), [isc] "d" (isc << 27), [u] "Q" (*unused)); + + asm volatile( + ".insn rsy,0xeb00000000d1,%[ctl],%[isc],%[iib]\n" + : : [ctl] "d" (ctl), [isc] "d" (isc << 27), [iib] "Q" (*iib)); + return 0; } @@ -140,7 +144,7 @@ static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status) return cc; } -int zpci_load(u64 *data, u64 req, u64 offset) +int __zpci_load(u64 *data, u64 req, u64 offset) { u8 status; int cc; @@ -156,6 +160,52 @@ int zpci_load(u64 *data, u64 req, u64 offset) return (cc > 0) ? -EIO : cc; } +EXPORT_SYMBOL_GPL(__zpci_load); + +static inline int zpci_load_fh(u64 *data, const volatile void __iomem *addr, + unsigned long len) +{ + struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; + u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + + return __zpci_load(data, req, ZPCI_OFFSET(addr)); +} + +static inline int __pcilg_mio(u64 *data, u64 ioaddr, u64 len, u8 *status) +{ + register u64 addr asm("2") = ioaddr; + register u64 r3 asm("3") = len; + int cc = -ENXIO; + u64 __data; + + asm volatile ( + " .insn rre,0xb9d60000,%[data],%[ioaddr]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [data] "=d" (__data), "+d" (r3) + : [ioaddr] "d" (addr) + : "cc"); + *status = r3 >> 24 & 0xff; + *data = __data; + return cc; +} + +int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len) +{ + u8 status; + int cc; + + if (!static_branch_unlikely(&have_mio)) + return zpci_load_fh(data, addr, len); + + cc = __pcilg_mio(data, (__force u64) addr, len, &status); + if (cc) + zpci_err_insn(cc, status, 0, (__force u64) addr); + + return (cc > 0) ? -EIO : cc; +} EXPORT_SYMBOL_GPL(zpci_load); /* PCI Store */ @@ -178,7 +228,7 @@ static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status) return cc; } -int zpci_store(u64 data, u64 req, u64 offset) +int __zpci_store(u64 data, u64 req, u64 offset) { u8 status; int cc; @@ -194,6 +244,50 @@ int zpci_store(u64 data, u64 req, u64 offset) return (cc > 0) ? -EIO : cc; } +EXPORT_SYMBOL_GPL(__zpci_store); + +static inline int zpci_store_fh(const volatile void __iomem *addr, u64 data, + unsigned long len) +{ + struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; + u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + + return __zpci_store(data, req, ZPCI_OFFSET(addr)); +} + +static inline int __pcistg_mio(u64 data, u64 ioaddr, u64 len, u8 *status) +{ + register u64 addr asm("2") = ioaddr; + register u64 r3 asm("3") = len; + int cc = -ENXIO; + + asm volatile ( + " .insn rre,0xb9d40000,%[data],%[ioaddr]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), "+d" (r3) + : [data] "d" (data), [ioaddr] "d" (addr) + : "cc"); + *status = r3 >> 24 & 0xff; + return cc; +} + +int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len) +{ + u8 status; + int cc; + + if (!static_branch_unlikely(&have_mio)) + return zpci_store_fh(addr, data, len); + + cc = __pcistg_mio(data, (__force u64) addr, len, &status); + if (cc) + zpci_err_insn(cc, status, 0, (__force u64) addr); + + return (cc > 0) ? 
-EIO : cc; +} EXPORT_SYMBOL_GPL(zpci_store); /* PCI Store Block */ @@ -214,7 +308,7 @@ static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status) return cc; } -int zpci_store_block(const u64 *data, u64 req, u64 offset) +int __zpci_store_block(const u64 *data, u64 req, u64 offset) { u8 status; int cc; @@ -230,4 +324,63 @@ int zpci_store_block(const u64 *data, u64 req, u64 offset) return (cc > 0) ? -EIO : cc; } -EXPORT_SYMBOL_GPL(zpci_store_block); +EXPORT_SYMBOL_GPL(__zpci_store_block); + +static inline int zpci_write_block_fh(volatile void __iomem *dst, + const void *src, unsigned long len) +{ + struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(dst)]; + u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + u64 offset = ZPCI_OFFSET(dst); + + return __zpci_store_block(src, req, offset); +} + +static inline int __pcistb_mio(const u64 *data, u64 ioaddr, u64 len, u8 *status) +{ + int cc = -ENXIO; + + asm volatile ( + " .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[data]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [len] "+d" (len) + : [ioaddr] "d" (ioaddr), [data] "Q" (*data) + : "cc"); + *status = len >> 24 & 0xff; + return cc; +} + +int zpci_write_block(volatile void __iomem *dst, + const void *src, unsigned long len) +{ + u8 status; + int cc; + + if (!static_branch_unlikely(&have_mio)) + return zpci_write_block_fh(dst, src, len); + + cc = __pcistb_mio(src, (__force u64) dst, len, &status); + if (cc) + zpci_err_insn(cc, status, 0, (__force u64) dst); + + return (cc > 0) ? -EIO : cc; +} +EXPORT_SYMBOL_GPL(zpci_write_block); + +static inline void __pciwb_mio(void) +{ + unsigned long unused = 0; + + asm volatile (".insn rre,0xb9d50000,%[op],%[op]\n" + : [op] "+d" (unused)); +} + +void zpci_barrier(void) +{ + if (static_branch_likely(&have_mio)) + __pciwb_mio(); +} +EXPORT_SYMBOL_GPL(zpci_barrier); diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c new file mode 100644 index 0000000000000000000000000000000000000000..d80616ae8dd8ade256beae5268cab2924a379deb --- /dev/null +++ b/arch/s390/pci/pci_irq.c @@ -0,0 +1,486 @@ +// SPDX-License-Identifier: GPL-2.0 +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include + +static enum {FLOATING, DIRECTED} irq_delivery; + +#define SIC_IRQ_MODE_ALL 0 +#define SIC_IRQ_MODE_SINGLE 1 +#define SIC_IRQ_MODE_DIRECT 4 +#define SIC_IRQ_MODE_D_ALL 16 +#define SIC_IRQ_MODE_D_SINGLE 17 +#define SIC_IRQ_MODE_SET_CPU 18 + +/* + * summary bit vector + * FLOATING - summary bit per function + * DIRECTED - summary bit per cpu (only used in fallback path) + */ +static struct airq_iv *zpci_sbv; + +/* + * interrupt bit vectors + * FLOATING - interrupt bit vector per function + * DIRECTED - interrupt bit vector per cpu + */ +static struct airq_iv **zpci_ibv; + +/* Modify PCI: Register adapter interruptions */ +static int zpci_set_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); + struct zpci_fib fib = {0}; + u8 status; + + fib.fmt0.isc = PCI_ISC; + fib.fmt0.sum = 1; /* enable summary notifications */ + fib.fmt0.noi = airq_iv_end(zdev->aibv); + fib.fmt0.aibv = (unsigned long) zdev->aibv->vector; + fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */ + fib.fmt0.aisb = (unsigned long) zpci_sbv->vector + (zdev->aisb/64)*8; + fib.fmt0.aisbo = zdev->aisb & 63; + + return zpci_mod_fc(req, &fib, &status) ? 
-EIO : 0; +} + +/* Modify PCI: Unregister adapter interruptions */ +static int zpci_clear_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); + struct zpci_fib fib = {0}; + u8 cc, status; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? -EIO : 0; +} + +/* Modify PCI: Register CPU directed interruptions */ +static int zpci_set_directed_irq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT_D); + struct zpci_fib fib = {0}; + u8 status; + + fib.fmt = 1; + fib.fmt1.noi = zdev->msi_nr_irqs; + fib.fmt1.dibvo = zdev->msi_first_bit; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister CPU directed interruptions */ +static int zpci_clear_directed_irq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT_D); + struct zpci_fib fib = {0}; + u8 cc, status; + + fib.fmt = 1; + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? -EIO : 0; +} + +static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest, + bool force) +{ + struct msi_desc *entry = irq_get_msi_desc(data->irq); + struct msi_msg msg = entry->msg; + + msg.address_lo &= 0xff0000ff; + msg.address_lo |= (cpumask_first(dest) << 8); + pci_write_msi_msg(data->irq, &msg); + + return IRQ_SET_MASK_OK; +} + +static struct irq_chip zpci_irq_chip = { + .name = "PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, + .irq_set_affinity = zpci_set_irq_affinity, +}; + +static void zpci_handle_cpu_local_irq(bool rescan) +{ + struct airq_iv *dibv = zpci_ibv[smp_processor_id()]; + unsigned long bit; + int irqs_on = 0; + + for (bit = 0;;) { + /* Scan the directed IRQ bit vector */ + bit = airq_iv_scan(dibv, bit, airq_iv_end(dibv)); + if (bit == -1UL) { + if (!rescan || irqs_on++) + /* End of second scan with interrupts on. */ + break; + /* First scan complete, reenable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC)) + break; + bit = 0; + continue; + } + inc_irq_stat(IRQIO_MSI); + generic_handle_irq(airq_iv_get_data(dibv, bit)); + } +} + +struct cpu_irq_data { + call_single_data_t csd; + atomic_t scheduled; +}; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_irq_data, irq_data); + +static void zpci_handle_remote_irq(void *data) +{ + atomic_t *scheduled = data; + + do { + zpci_handle_cpu_local_irq(false); + } while (atomic_dec_return(scheduled)); +} + +static void zpci_handle_fallback_irq(void) +{ + struct cpu_irq_data *cpu_data; + unsigned long cpu; + int irqs_on = 0; + + for (cpu = 0;;) { + cpu = airq_iv_scan(zpci_sbv, cpu, airq_iv_end(zpci_sbv)); + if (cpu == -1UL) { + if (irqs_on++) + /* End of second scan with interrupts on. */ + break; + /* First scan complete, reenable interrupts. 
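
All of these handler loops share the same two-pass shape: scan the bit vector while delivery is suppressed, re-arm delivery, then scan once more so that a bit set between the end of the first pass and the re-arm is not lost. A compacted, runnable sketch of that control flow is below; next_bit() and rearm() are illustrative stand-ins for airq_iv_scan() and zpci_set_irq_ctrl().

#include <stdio.h>

#define NBITS 64UL

static unsigned long pending;           /* stand-in for the airq_iv vector */

static int rearm(void) { return 0; }    /* placeholder: zpci_set_irq_ctrl() */

static unsigned long next_bit(unsigned long from)
{
        for (; from < NBITS; from++)
                if (pending & (1UL << from))
                        return from;
        return NBITS;           /* airq_iv_scan() returns -1UL instead */
}

static void scan_all(void)
{
        unsigned long bit;
        int irqs_on = 0;

        for (bit = 0;;) {
                bit = next_bit(bit);
                if (bit == NBITS) {
                        if (irqs_on++)
                                break;  /* second empty pass: done */
                        if (rearm())    /* re-enable delivery, then rescan */
                                break;
                        bit = 0;
                        continue;
                }
                pending &= ~(1UL << bit);       /* consume and handle */
                printf("handled bit %lu\n", bit);
                bit++;
        }
}

int main(void)
{
        pending = 0x11;         /* bits 0 and 4 pending */
        scan_all();
        return 0;
}
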
*/ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + break; + cpu = 0; + continue; + } + cpu_data = &per_cpu(irq_data, cpu); + if (atomic_inc_return(&cpu_data->scheduled) > 1) + continue; + + cpu_data->csd.func = zpci_handle_remote_irq; + cpu_data->csd.info = &cpu_data->scheduled; + cpu_data->csd.flags = 0; + smp_call_function_single_async(cpu, &cpu_data->csd); + } +} + +static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) +{ + if (floating) { + inc_irq_stat(IRQIO_PCF); + zpci_handle_fallback_irq(); + } else { + inc_irq_stat(IRQIO_PCD); + zpci_handle_cpu_local_irq(true); + } +} + +static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) +{ + unsigned long si, ai; + struct airq_iv *aibv; + int irqs_on = 0; + + inc_irq_stat(IRQIO_PCF); + for (si = 0;;) { + /* Scan adapter summary indicator bit vector */ + si = airq_iv_scan(zpci_sbv, si, airq_iv_end(zpci_sbv)); + if (si == -1UL) { + if (irqs_on++) + /* End of second scan with interrupts on. */ + break; + /* First scan complete, reenable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + break; + si = 0; + continue; + } + + /* Scan the adapter interrupt vector for this device. */ + aibv = zpci_ibv[si]; + for (ai = 0;;) { + ai = airq_iv_scan(aibv, ai, airq_iv_end(aibv)); + if (ai == -1UL) + break; + inc_irq_stat(IRQIO_MSI); + airq_iv_lock(aibv, ai); + generic_handle_irq(airq_iv_get_data(aibv, ai)); + airq_iv_unlock(aibv, ai); + } + } +} + +int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +{ + struct zpci_dev *zdev = to_zpci(pdev); + unsigned int hwirq, msi_vecs, cpu; + unsigned long bit; + struct msi_desc *msi; + struct msi_msg msg; + int rc, irq; + + zdev->aisb = -1UL; + zdev->msi_first_bit = -1U; + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); + + if (irq_delivery == DIRECTED) { + /* Allocate cpu vector bits */ + bit = airq_iv_alloc(zpci_ibv[0], msi_vecs); + if (bit == -1UL) + return -EIO; + } else { + /* Allocate adapter summary indicator bit */ + bit = airq_iv_alloc_bit(zpci_sbv); + if (bit == -1UL) + return -EIO; + zdev->aisb = bit; + + /* Create adapter interrupt vector */ + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); + if (!zdev->aibv) + return -ENOMEM; + + /* Wire up shortcut pointer */ + zpci_ibv[bit] = zdev->aibv; + /* Each function has its own interrupt vector */ + bit = 0; + } + + /* Request MSI interrupts */ + hwirq = bit; + for_each_pci_msi_entry(msi, pdev) { + rc = -EIO; + if (hwirq - bit >= msi_vecs) + break; + irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE, msi->affinity); + if (irq < 0) + return -ENOMEM; + rc = irq_set_msi_desc(irq, msi); + if (rc) + return rc; + irq_set_chip_and_handler(irq, &zpci_irq_chip, + handle_percpu_irq); + msg.data = hwirq; + if (irq_delivery == DIRECTED) { + msg.address_lo = zdev->msi_addr & 0xff0000ff; + msg.address_lo |= msi->affinity ? + (cpumask_first(&msi->affinity->mask) << 8) : 0; + for_each_possible_cpu(cpu) { + airq_iv_set_data(zpci_ibv[cpu], hwirq, irq); + } + } else { + msg.address_lo = zdev->msi_addr & 0xffffffff; + airq_iv_set_data(zdev->aibv, hwirq, irq); + } + msg.address_hi = zdev->msi_addr >> 32; + pci_write_msi_msg(irq, &msg); + hwirq++; + } + + zdev->msi_first_bit = bit; + zdev->msi_nr_irqs = msi_vecs; + + if (irq_delivery == DIRECTED) + rc = zpci_set_directed_irq(zdev); + else + rc = zpci_set_airq(zdev); + if (rc) + return rc; + + return (msi_vecs == nvec) ? 
0 : msi_vecs; +} + +void arch_teardown_msi_irqs(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + struct msi_desc *msi; + int rc; + + /* Disable interrupts */ + if (irq_delivery == DIRECTED) + rc = zpci_clear_directed_irq(zdev); + else + rc = zpci_clear_airq(zdev); + if (rc) + return; + + /* Release MSI interrupts */ + for_each_pci_msi_entry(msi, pdev) { + if (!msi->irq) + continue; + if (msi->msi_attrib.is_msix) + __pci_msix_desc_mask_irq(msi, 1); + else + __pci_msi_desc_mask_irq(msi, 1, 1); + irq_set_msi_desc(msi->irq, NULL); + irq_free_desc(msi->irq); + msi->msg.address_lo = 0; + msi->msg.address_hi = 0; + msi->msg.data = 0; + msi->irq = 0; + } + + if (zdev->aisb != -1UL) { + zpci_ibv[zdev->aisb] = NULL; + airq_iv_free_bit(zpci_sbv, zdev->aisb); + zdev->aisb = -1UL; + } + if (zdev->aibv) { + airq_iv_release(zdev->aibv); + zdev->aibv = NULL; + } + + if ((irq_delivery == DIRECTED) && zdev->msi_first_bit != -1U) + airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs); +} + +static struct airq_struct zpci_airq = { + .handler = zpci_floating_irq_handler, + .isc = PCI_ISC, +}; + +static void __init cpu_enable_directed_irq(void *unused) +{ + union zpci_sic_iib iib = {{0}}; + + iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector; + + __zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); + zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC); +} + +static int __init zpci_directed_irq_init(void) +{ + union zpci_sic_iib iib = {{0}}; + unsigned int cpu; + + zpci_sbv = airq_iv_create(num_possible_cpus(), 0); + if (!zpci_sbv) + return -ENOMEM; + + iib.diib.isc = PCI_ISC; + iib.diib.nr_cpus = num_possible_cpus(); + iib.diib.disb_addr = (u64) zpci_sbv->vector; + __zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); + + zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv), + GFP_KERNEL); + if (!zpci_ibv) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + /* + * Per CPU IRQ vectors look the same but bit-allocation + * is only done on the first vector. + */ + zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE, + AIRQ_IV_DATA | + AIRQ_IV_CACHELINE | + (!cpu ? AIRQ_IV_ALLOC : 0)); + if (!zpci_ibv[cpu]) + return -ENOMEM; + } + on_each_cpu(cpu_enable_directed_irq, NULL, 1); + + zpci_irq_chip.irq_set_affinity = zpci_set_irq_affinity; + + return 0; +} + +static int __init zpci_floating_irq_init(void) +{ + zpci_ibv = kcalloc(ZPCI_NR_DEVICES, sizeof(*zpci_ibv), GFP_KERNEL); + if (!zpci_ibv) + return -ENOMEM; + + zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); + if (!zpci_sbv) + goto out_free; + + return 0; + +out_free: + kfree(zpci_ibv); + return -ENOMEM; +} + +int __init zpci_irq_init(void) +{ + int rc; + + irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING; + if (s390_pci_force_floating) + irq_delivery = FLOATING; + + if (irq_delivery == DIRECTED) + zpci_airq.handler = zpci_directed_irq_handler; + + rc = register_adapter_interrupt(&zpci_airq); + if (rc) + goto out; + /* Set summary to 1 to be called every time for the ISC. */ + *zpci_airq.lsi_ptr = 1; + + switch (irq_delivery) { + case FLOATING: + rc = zpci_floating_irq_init(); + break; + case DIRECTED: + rc = zpci_directed_irq_init(); + break; + } + + if (rc) + goto out_airq; + + /* + * Enable floating IRQs (with suppression after one IRQ). When using + * directed IRQs this enables the fallback path. 
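
In the fallback path above, each summary hit bumps a per-CPU counter; only the 0-to-1 transition schedules the remote function, and the remote side keeps rescanning until the counter drains, so repeated triggers coalesce into a single IPI. The following is a sketch of that coalescing idiom in C11 atomics; the kernel's version uses atomic_inc_return()/atomic_dec_return() and smp_call_function_single_async(), and the synchronous call in trigger() here is only a stand-in for the asynchronous IPI.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int scheduled;

static void local_scan(void) { puts("scan pending bits"); }

/* Remote side: keep scanning until the counter drains to zero. */
static void remote_work(void)
{
        do {
                local_scan();
        } while (atomic_fetch_sub(&scheduled, 1) != 1);
}

/* Trigger side: only the first of N concurrent triggers schedules work. */
static void trigger(void)
{
        if (atomic_fetch_add(&scheduled, 1) > 0)
                return;         /* already queued: piggy-back on that run */
        remote_work();          /* kernel: smp_call_function_single_async() */
}

int main(void) { trigger(); trigger(); return 0; }
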
+ */ + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC); + + return 0; +out_airq: + unregister_adapter_interrupt(&zpci_airq); +out: + return rc; +} + +void __init zpci_irq_exit(void) +{ + unsigned int cpu; + + if (irq_delivery == DIRECTED) { + for_each_possible_cpu(cpu) { + airq_iv_release(zpci_ibv[cpu]); + } + } + kfree(zpci_ibv); + if (zpci_sbv) + airq_iv_release(zpci_sbv); + unregister_adapter_interrupt(&zpci_airq); +} diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index ce6a3f75065bf9719eb06a6c3c03f9b8b33f1082..dc1ae4ff79d7aa5fec71d29456437c0d579398f1 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -4,7 +4,7 @@ OBJECT_FILES_NON_STANDARD := y purgatory-y := head.o purgatory.o string.o sha256.o mem.o -targets += $(purgatory-y) purgatory.ro kexec-purgatory.c +targets += $(purgatory-y) purgatory.lds purgatory purgatory.ro PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) $(obj)/sha256.o: $(srctree)/lib/sha256.c FORCE @@ -16,22 +16,26 @@ $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE $(obj)/string.o: $(srctree)/arch/s390/lib/string.c FORCE $(call if_changed_rule,cc_o_c) -LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -LDFLAGS_purgatory.ro += -z nodefaultlib KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding KBUILD_CFLAGS += -c -MD -Os -m64 -msoft-float -fno-common +KBUILD_CFLAGS += $(CLANG_FLAGS) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) -$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE +LDFLAGS_purgatory := -r --no-undefined -nostdlib -z nodefaultlib -T +$(obj)/purgatory: $(obj)/purgatory.lds $(PURGATORY_OBJS) FORCE $(call if_changed,ld) -quiet_cmd_bin2c = BIN2C $@ - cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@ +OBJCOPYFLAGS_purgatory.ro := -O elf64-s390 +OBJCOPYFLAGS_purgatory.ro += --remove-section='*debug*' +OBJCOPYFLAGS_purgatory.ro += --remove-section='.comment' +OBJCOPYFLAGS_purgatory.ro += --remove-section='.note.*' +$(obj)/purgatory.ro: $(obj)/purgatory FORCE + $(call if_changed,objcopy) -$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE - $(call if_changed,bin2c) +$(obj)/kexec-purgatory.o: $(obj)/kexec-purgatory.S $(obj)/purgatory.ro FORCE + $(call if_changed_rule,as_o_S) obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += kexec-purgatory.o diff --git a/arch/s390/purgatory/kexec-purgatory.S b/arch/s390/purgatory/kexec-purgatory.S new file mode 100644 index 0000000000000000000000000000000000000000..8293753100aea9221b4e9b2dd7971b3c40b46391 --- /dev/null +++ b/arch/s390/purgatory/kexec-purgatory.S @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + + .section .rodata, "a" + + .align 8 +kexec_purgatory: + .globl kexec_purgatory + .incbin "arch/s390/purgatory/purgatory.ro" +.Lkexec_purgatroy_end: + + .align 8 +kexec_purgatory_size: + .globl kexec_purgatory_size + .quad .Lkexec_purgatroy_end - kexec_purgatory diff --git a/arch/s390/purgatory/purgatory.lds.S b/arch/s390/purgatory/purgatory.lds.S new file mode 100644 index 0000000000000000000000000000000000000000..482eb4fbcef190d7c97ee7f20e7ae905de79dc3f --- /dev/null +++ b/arch/s390/purgatory/purgatory.lds.S @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include + +OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") +OUTPUT_ARCH(s390:64-bit) + +ENTRY(purgatory_start) + +SECTIONS +{ + . 
= 0; + .head.text : { + _head = . ; + HEAD_TEXT + _ehead = . ; + } + .text : { + _text = .; /* Text */ + *(.text) + *(.text.*) + _etext = . ; + } + .rodata : { + _rodata = . ; + *(.rodata) /* read-only data */ + *(.rodata.*) + _erodata = . ; + } + .data : { + _data = . ; + *(.data) + *(.data.*) + _edata = . ; + } + + . = ALIGN(256); + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(8); /* For convenience during zeroing */ + _ebss = .; + } + _end = .; + + /* Sections to be discarded */ + /DISCARD/ : { + *(.eh_frame) + *(*__ksymtab*) + *(___kcrctab*) + } +} diff --git a/arch/s390/scripts/Makefile.chkbss b/arch/s390/scripts/Makefile.chkbss index cd7e8f4419f51c5ab5426e39ec7dacfb47c22f80..884a9caff5fb8be71c80fce894d78ef1fe5f955d 100644 --- a/arch/s390/scripts/Makefile.chkbss +++ b/arch/s390/scripts/Makefile.chkbss @@ -11,7 +11,8 @@ chkbss: $(addprefix $(obj)/, $(chkbss-files)) quiet_cmd_chkbss = CHKBSS $< cmd_chkbss = \ - if ! $(OBJDUMP) -j .bss -w -h $< | awk 'END { if ($$3) exit 1 }'; then \ + if $(OBJDUMP) -h $< | grep -q "\.bss" && \ + ! $(OBJDUMP) -j .bss -w -h $< | awk 'END { if ($$3) exit 1 }'; then \ echo "error: $< .bss section is not empty" >&2; exit 1; \ fi; \ touch $@; diff --git a/arch/s390/tools/opcodes.txt b/arch/s390/tools/opcodes.txt index 1cbed82cd17b7823bbdd227f2b8d314cc2552b97..64638b764d1cca5885ce0d751b880cacced54fdf 100644 --- a/arch/s390/tools/opcodes.txt +++ b/arch/s390/tools/opcodes.txt @@ -1,3 +1,5 @@ +0000 illegal E +0002 brkpt E 0101 pr E 0102 upt E 0104 ptff E @@ -257,6 +259,7 @@ b258 bsg RRE_RR b25a bsa RRE_RR b25d clst RRE_RR b25e srst RRE_RR +b25f chsc RRE_R0 b263 cmpsc RRE_RR b274 siga S_RD b276 xsch S_00 @@ -277,6 +280,9 @@ b29d lfpc S_RD b2a5 tre RRE_RR b2a6 cu21 RRF_U0RR b2a7 cu12 RRF_U0RR +b2ad nqap RRE_RR +b2ae dqap RRE_RR +b2af pqap RRE_RR b2b0 stfle S_RD b2b1 stfl S_RD b2b2 lpswe S_RD @@ -290,6 +296,7 @@ b2e5 epctr RRE_RR b2e8 ppa RRF_U0RR b2ec etnd RRE_R0 b2ed ecpga RRE_RR +b2f0 iucv RRE_RR b2f8 tend S_00 b2fa niai IE_UU b2fc tabort S_RD @@ -559,12 +566,15 @@ b998 alcr RRE_RR b999 slbr RRE_RR b99a epair RRE_R0 b99b esair RRE_R0 +b99c eqbs RRF_U0RR b99d esea RRE_R0 b99e pti RRE_RR b99f ssair RRE_R0 +b9a0 clp RRF_U0RR b9a1 tpei RRE_RR b9a2 ptf RRE_R0 b9aa lptea RRF_RURR2 +b9ab essa RRF_U0RR b9ac irbm RRE_RR b9ae rrbm RRE_RR b9af pfmf RRE_RR @@ -1039,6 +1049,7 @@ eb7a agsi SIY_IRD eb7e algsi SIY_IRD eb80 icmh RSY_RURD eb81 icmy RSY_RURD +eb8a sqbs RSY_RDRU eb8e mvclu RSY_RRRD eb8f clclu RSY_RRRD eb90 stmy RSY_RRRD diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index b1c91ea9a958e939da0a91d720346f6d229a384c..0be08d586d40c64ee7db194210048a7724480ba6 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -90,12 +90,6 @@ config ARCH_DEFCONFIG default "arch/sh/configs/shx3_defconfig" if SUPERH32 default "arch/sh/configs/cayman_defconfig" if SUPERH64 -config RWSEM_GENERIC_SPINLOCK - def_bool y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_BUG def_bool y depends on BUG && SUPERH32 diff --git a/arch/sh/boards/of-generic.c b/arch/sh/boards/of-generic.c index 958f46da3a7912cfd94a7b517e8e88b88fec3a5a..d91065e81a4e5cffcb2b86463c8dba9190c0d7fa 100644 --- a/arch/sh/boards/of-generic.c +++ b/arch/sh/boards/of-generic.c @@ -164,10 +164,10 @@ static struct sh_machine_vector __initmv sh_of_generic_mv = { struct sh_clk_ops; -void __init arch_init_clk_ops(struct sh_clk_ops **ops, int idx) +void __init __weak arch_init_clk_ops(struct sh_clk_ops **ops, int idx) { } -void __init plat_irq_setup(void) +void __init __weak plat_irq_setup(void) { } diff 
--git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 7bf2cb680d328462c4e621eae24005f1c9f35afc..73fff39a0122f0405f0940036a09cc4283946d44 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -17,7 +17,6 @@ generic-y += mm-arch-hooks.h generic-y += parport.h generic-y += percpu.h generic-y += preempt.h -generic-y += rwsem.h generic-y += serial.h generic-y += sizes.h generic-y += trace_clock.h diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index 4f7f235f15f856775ee2fbdae67e06ad20fdc363..c28e37a344adce0e6785c0d0a9802ff1244aa719 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -229,9 +229,6 @@ __BUILD_IOPORT_STRING(q, u64) #define IO_SPACE_LIMIT 0xffffffff -/* synco on SH-4A, otherwise a nop */ -#define mmiowb() wmb() - /* We really want to try and get these to memcpy etc */ void memcpy_fromio(void *, const volatile void __iomem *, unsigned long); void memcpy_toio(volatile void __iomem *, const void *, unsigned long); diff --git a/arch/sh/include/asm/mmiowb.h b/arch/sh/include/asm/mmiowb.h new file mode 100644 index 0000000000000000000000000000000000000000..535d59735f1d85c1d990b09bccf8ba0a4e54a7b1 --- /dev/null +++ b/arch/sh/include/asm/mmiowb.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_SH_MMIOWB_H +#define __ASM_SH_MMIOWB_H + +#include + +/* synco on SH-4A, otherwise a nop */ +#define mmiowb() wmb() + +#include + +#endif /* __ASM_SH_MMIOWB_H */ diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 8ad73cb311216a0a8801ee3890ed27788a79ad61..b56f908b13950e31335984ec09c2f88973915c90 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -70,6 +70,15 @@ do { \ tlb_remove_page((tlb), (pte)); \ } while (0) +#if CONFIG_PGTABLE_LEVELS > 2 +#define __pmd_free_tlb(tlb, pmdp, addr) \ +do { \ + struct page *page = virt_to_page(pmdp); \ + pgtable_pmd_page_dtor(page); \ + tlb_remove_page((tlb), page); \ +} while (0); +#endif + static inline void check_pgt_cache(void) { quicklist_trim(QUICK_PT, NULL, 25, 16); diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h index 786ee0fde3b010f3bae867220ec9c68d84e895e8..7fd929cd2e7a08f8d102852b23bbe754a5c6145e 100644 --- a/arch/sh/include/asm/spinlock-llsc.h +++ b/arch/sh/include/asm/spinlock-llsc.h @@ -47,6 +47,8 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) { unsigned long tmp; + /* This could be optimised with ARCH_HAS_MMIOWB */ + mmiowb(); __asm__ __volatile__ ( "mov #1, %0 ! arch_spin_unlock \n\t" "mov.l %0, @%1 \n\t" diff --git a/arch/sh/include/asm/syscall_32.h b/arch/sh/include/asm/syscall_32.h index 6e118799831c32dc37b8cf21960d284c5bec3646..8c9d7e5e5dcc02375eeafab25e47878b76239aaa 100644 --- a/arch/sh/include/asm/syscall_32.h +++ b/arch/sh/include/asm/syscall_32.h @@ -48,51 +48,28 @@ static inline void syscall_set_return_value(struct task_struct *task, static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { - /* - * Do this simply for now. If we need to start supporting - * fetching arguments from arbitrary indices, this will need some - * extra logic. Presently there are no in-tree users that depend - * on this behaviour. 
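
Across the sh, sparc, and um hunks in this series, syscall_get_arguments() and syscall_set_arguments() lose their (i, n) partial-range parameters: callers now always transfer all six arguments, so the deliberate switch fall-throughs and their BUG_ON() guards collapse into straight-line copies. A compile-and-run sketch of the post-change shape is below, using the SuperH register order from the hunk that follows; the 16-entry regs array is an illustrative stand-in for the real pt_regs.

#include <assert.h>

struct pt_regs { unsigned long regs[16]; };

/* SuperH argument order: R4, R5, R6, R7, R0, R1 -- no index plumbing. */
static void syscall_get_arguments(struct pt_regs *regs, unsigned long *args)
{
        args[0] = regs->regs[4];
        args[1] = regs->regs[5];
        args[2] = regs->regs[6];
        args[3] = regs->regs[7];
        args[4] = regs->regs[0];
        args[5] = regs->regs[1];
}

int main(void)
{
        struct pt_regs regs = { .regs = { [4] = 1, [5] = 2, [6] = 3,
                                          [7] = 4, [0] = 5, [1] = 6 } };
        unsigned long args[6];

        syscall_get_arguments(&regs, args);
        assert(args[0] == 1 && args[5] == 6);
        return 0;
}
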
- */ - BUG_ON(i); /* Argument pattern is: R4, R5, R6, R7, R0, R1 */ - switch (n) { - case 6: args[5] = regs->regs[1]; - case 5: args[4] = regs->regs[0]; - case 4: args[3] = regs->regs[7]; - case 3: args[2] = regs->regs[6]; - case 2: args[1] = regs->regs[5]; - case 1: args[0] = regs->regs[4]; - case 0: - break; - default: - BUG(); - } + args[5] = regs->regs[1]; + args[4] = regs->regs[0]; + args[3] = regs->regs[7]; + args[2] = regs->regs[6]; + args[1] = regs->regs[5]; + args[0] = regs->regs[4]; } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) { - /* Same note as above applies */ - BUG_ON(i); - - switch (n) { - case 6: regs->regs[1] = args[5]; - case 5: regs->regs[0] = args[4]; - case 4: regs->regs[7] = args[3]; - case 3: regs->regs[6] = args[2]; - case 2: regs->regs[5] = args[1]; - case 1: regs->regs[4] = args[0]; - break; - default: - BUG(); - } + regs->regs[1] = args[5]; + regs->regs[0] = args[4]; + regs->regs[7] = args[3]; + regs->regs[6] = args[2]; + regs->regs[5] = args[1]; + regs->regs[4] = args[0]; } static inline int syscall_get_arch(void) diff --git a/arch/sh/include/asm/syscall_64.h b/arch/sh/include/asm/syscall_64.h index 43882580c7f99bec93e519f1b4182c1daad2fbf0..22fad97da06619a137f6f4cd3e3ca4f6a9bddfcc 100644 --- a/arch/sh/include/asm/syscall_64.h +++ b/arch/sh/include/asm/syscall_64.h @@ -47,20 +47,16 @@ static inline void syscall_set_return_value(struct task_struct *task, static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { - BUG_ON(i + n > 6); - memcpy(args, ®s->regs[2 + i], n * sizeof(args[0])); + memcpy(args, ®s->regs[2], 6 * sizeof(args[0])); } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) { - BUG_ON(i + n > 6); - memcpy(®s->regs[2 + i], args, n * sizeof(args[0])); + memcpy(®s->regs[2], args, 6 * sizeof(args[0])); } static inline int syscall_get_arch(void) diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 77abe192fb43d90cd6d56bfe878b54126188ac34..bc77f3dd4261da2213368c182cc6bc327b785d98 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -11,133 +11,8 @@ #ifdef CONFIG_MMU #include -#include -#include -#include -/* - * TLB handling. This allows us to remove pages from the page - * tables, and efficiently handle the TLB issues. 
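
The arch-private mmu_gather being deleted here tracked little more than a [start, end) range plus a full-mm flag; that bookkeeping is exactly what the generic mmu_gather from asm-generic/tlb.h provides (along with page batching), which is why the entire block can go. A small runnable sketch of the core range-accumulation invariant, simplified from the removed code, with TASK_SIZE reduced to an illustrative constant:

#include <assert.h>

#define TASK_SIZE (1UL << 47)           /* illustrative value only */

struct mmu_gather {
        unsigned long start, end;       /* union of unmapped addresses */
};

static void init_gather(struct mmu_gather *tlb)
{
        tlb->start = TASK_SIZE;         /* empty range: start > end */
        tlb->end = 0;
}

/* Widen the pending-flush range to cover one more unmapped page. */
static void track(struct mmu_gather *tlb, unsigned long addr,
                  unsigned long page_size)
{
        if (tlb->start > addr)
                tlb->start = addr;
        if (tlb->end < addr + page_size)
                tlb->end = addr + page_size;
}

int main(void)
{
        struct mmu_gather tlb;

        init_gather(&tlb);
        track(&tlb, 0x2000, 0x1000);
        track(&tlb, 0x8000, 0x1000);
        assert(tlb.start == 0x2000 && tlb.end == 0x9000);
        return 0;
}
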
- */ -struct mmu_gather { - struct mm_struct *mm; - unsigned int fullmm; - unsigned long start, end; -}; - -static inline void init_tlb_gather(struct mmu_gather *tlb) -{ - tlb->start = TASK_SIZE; - tlb->end = 0; - - if (tlb->fullmm) { - tlb->start = 0; - tlb->end = TASK_SIZE; - } -} - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->start = start; - tlb->end = end; - tlb->fullmm = !(start | (end+1)); - - init_tlb_gather(tlb); -} - -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (tlb->fullmm || force) - flush_tlb_mm(tlb->mm); - - /* keep the page table cache within bounds */ - check_pgt_cache(); -} - -static inline void -tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) -{ - if (tlb->start > address) - tlb->start = address; - if (tlb->end < address + PAGE_SIZE) - tlb->end = address + PAGE_SIZE; -} - -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) - -/* - * In the case of tlb vma handling, we can optimise these away in the - * case where we're doing a full MM flush. When we're doing a munmap, - * the vmas are adjusted to only cover the region to be torn down. - */ -static inline void -tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) -{ - if (!tlb->fullmm) - flush_cache_range(vma, vma->vm_start, vma->vm_end); -} - -static inline void -tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) -{ - if (!tlb->fullmm && tlb->end) { - flush_tlb_range(vma, tlb->start, tlb->end); - init_tlb_gather(tlb); - } -} - -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ -} - -static inline void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ -} - -static inline void tlb_flush_mmu(struct mmu_gather *tlb) -{ -} - -static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - free_page_and_swap_cache(page); - return false; /* avoid calling tlb_flush_mmu */ -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - __tlb_remove_page(tlb, page); -} - -static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return __tlb_remove_page(tlb, page); -} - -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return tlb_remove_page(tlb, page); -} - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ -} - -#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) -#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) -#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) - -#define tlb_migrate_finish(mm) do { } while (0) +#include #if defined(CONFIG_CPU_SH4) || defined(CONFIG_SUPERH64) extern void tlb_wire_entry(struct vm_area_struct *, unsigned long, pte_t); @@ -157,11 +32,6 @@ static inline void tlb_unwire_entry(void) #else /* CONFIG_MMU */ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while (0) -#define tlb_flush(tlb) do { } while (0) - #include #endif /* CONFIG_MMU */ diff --git a/arch/sh/kernel/stacktrace.c b/arch/sh/kernel/stacktrace.c index 
f3cb2cccb2624de9a509082047b7c6f10f1997ee..2950b19ad077208114e13b8547d5f678649ea6fc 100644 --- a/arch/sh/kernel/stacktrace.c +++ b/arch/sh/kernel/stacktrace.c @@ -49,8 +49,6 @@ void save_stack_trace(struct stack_trace *trace) unsigned long *sp = (unsigned long *)current_stack_pointer; unwind_stack(current, NULL, sp, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace); @@ -84,7 +82,5 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) unsigned long *sp = (unsigned long *)tsk->thread.sp; unwind_stack(current, NULL, sp, &save_stack_ops_nosched, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index bfda678576e4335788f844db6ec7632fda5faedf..480b057556ee45a3871485ce7301d2436cca8255 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -426,3 +426,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 40f8f4f73fe8fea23c31b73d9e4441dbd67fabf1..f6421c9ce5d3f0b7590f198c394ea8d761014e81 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -63,6 +63,7 @@ config SPARC64 select HAVE_KRETPROBES select HAVE_KPROBES select HAVE_RCU_TABLE_FREE if SMP + select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE select HAVE_MEMBLOCK_NODE_MAP select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_DYNAMIC_FTRACE @@ -191,14 +192,6 @@ config NR_CPUS source "kernel/Kconfig.hz" -config RWSEM_GENERIC_SPINLOCK - bool - default y if SPARC32 - -config RWSEM_XCHGADD_ALGORITHM - bool - default y if SPARC64 - config GENERIC_HWEIGHT bool default y diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index a22cfd5c0ee8665d96f40dcdfacd2c784a2fad62..95c44380b1d6748a54fd31863b6043c9c6782025 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -15,10 +15,10 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += msi.h generic-y += preempt.h -generic-y += rwsem.h generic-y += serial.h generic-y += trace_clock.h generic-y += word-at-a-time.h diff --git a/arch/sparc/include/asm/io_64.h b/arch/sparc/include/asm/io_64.h index b162c23ae8c2305eea1ce1f8b6bb2a16aadcca24..688911051b4461b458924ab7066994d90304438f 100644 --- a/arch/sparc/include/asm/io_64.h +++ b/arch/sparc/include/asm/io_64.h @@ -396,8 +396,6 @@ static inline void memcpy_toio(volatile void __iomem *dst, const void *src, } } -#define mmiowb() - #ifdef __KERNEL__ /* On sparc64 we have the whole physical IO address space accessible diff --git a/arch/sparc/include/asm/syscall.h b/arch/sparc/include/asm/syscall.h index 053989e3f6a6f1435323873ea010723ac09736bd..4d075434e8164c18e140249d65cbffdb28290dc6 100644 --- a/arch/sparc/include/asm/syscall.h +++ b/arch/sparc/include/asm/syscall.h @@ -96,11 +96,11 @@ static inline void syscall_set_return_value(struct task_struct *task, static inline void 
syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { int zero_extend = 0; unsigned int j; + unsigned int n = 6; #ifdef CONFIG_SPARC64 if (test_tsk_thread_flag(task, TIF_32BIT)) @@ -108,7 +108,7 @@ static inline void syscall_get_arguments(struct task_struct *task, #endif for (j = 0; j < n; j++) { - unsigned long val = regs->u_regs[UREG_I0 + i + j]; + unsigned long val = regs->u_regs[UREG_I0 + j]; if (zero_extend) args[j] = (u32) val; @@ -119,13 +119,12 @@ static inline void syscall_get_arguments(struct task_struct *task, static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) { - unsigned int j; + unsigned int i; - for (j = 0; j < n; j++) - regs->u_regs[UREG_I0 + i + j] = args[j]; + for (i = 0; i < 6; i++) + regs->u_regs[UREG_I0 + i] = args[i]; } static inline int syscall_get_arch(void) diff --git a/arch/sparc/include/asm/tlb_32.h b/arch/sparc/include/asm/tlb_32.h index 343cea19e5735b200eecb8c87c1a51ef54ae9e39..5cd28a8793e3975aef8e7389ec232068fb6feffb 100644 --- a/arch/sparc/include/asm/tlb_32.h +++ b/arch/sparc/include/asm/tlb_32.h @@ -2,24 +2,6 @@ #ifndef _SPARC_TLB_H #define _SPARC_TLB_H -#define tlb_start_vma(tlb, vma) \ -do { \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define tlb_end_vma(tlb, vma) \ -do { \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define __tlb_remove_tlb_entry(tlb, pte, address) \ - do { } while (0) - -#define tlb_flush(tlb) \ -do { \ - flush_tlb_mm((tlb)->mm); \ -} while (0) - #include #endif /* _SPARC_TLB_H */ diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index a8af6023c1263f7b43a4a52f52089235493923bd..14b93c5564e3572c07993c74217fb0b89ee36573 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -73,6 +73,11 @@ static inline void iommu_batch_start(struct device *dev, unsigned long prot, uns p->npages = 0; } +static inline bool iommu_use_atu(struct iommu *iommu, u64 mask) +{ + return iommu->atu && mask > DMA_BIT_MASK(32); +} + /* Interrupts must be disabled. 
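
The new iommu_use_atu() helper above replaces several open-coded "mask <= DMA_BIT_MASK(32) || !atu" tests, and in the free path it also adds the previously missing check that the ATU exists at all. Centralizing a two-factor predicate like this keeps call sites from drifting apart; a self-contained sketch of the same helper and its intended truth table:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))

struct atu;                             /* opaque: the large-mask unit */
struct iommu { struct atu *atu; };

/* One predicate for every call site: ATU only if present AND needed. */
static inline bool iommu_use_atu(const struct iommu *iommu, uint64_t mask)
{
        return iommu->atu && mask > DMA_BIT_MASK(32);
}

int main(void)
{
        struct iommu no_atu = { 0 };
        struct iommu with_atu = { (struct atu *)1 };    /* fake non-NULL */

        assert(!iommu_use_atu(&no_atu, ~0ULL));         /* no hardware */
        assert(!iommu_use_atu(&with_atu, DMA_BIT_MASK(32)));
        assert(iommu_use_atu(&with_atu, DMA_BIT_MASK(64)));
        return 0;
}
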
*/ static long iommu_batch_flush(struct iommu_batch *p, u64 mask) { @@ -92,7 +97,7 @@ static long iommu_batch_flush(struct iommu_batch *p, u64 mask) prot &= (HV_PCI_MAP_ATTR_READ | HV_PCI_MAP_ATTR_WRITE); while (npages != 0) { - if (mask <= DMA_BIT_MASK(32) || !pbm->iommu->atu) { + if (!iommu_use_atu(pbm->iommu, mask)) { num = pci_sun4v_iommu_map(devhandle, HV_PCI_TSBID(0, entry), npages, @@ -179,7 +184,6 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, unsigned long flags, order, first_page, npages, n; unsigned long prot = 0; struct iommu *iommu; - struct atu *atu; struct iommu_map_table *tbl; struct page *page; void *ret; @@ -205,13 +209,11 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, memset((char *)first_page, 0, PAGE_SIZE << order); iommu = dev->archdata.iommu; - atu = iommu->atu; - mask = dev->coherent_dma_mask; - if (mask <= DMA_BIT_MASK(32) || !atu) + if (!iommu_use_atu(iommu, mask)) tbl = &iommu->tbl; else - tbl = &atu->tbl; + tbl = &iommu->atu->tbl; entry = iommu_tbl_range_alloc(dev, tbl, npages, NULL, (unsigned long)(-1), 0); @@ -333,7 +335,7 @@ static void dma_4v_free_coherent(struct device *dev, size_t size, void *cpu, atu = iommu->atu; devhandle = pbm->devhandle; - if (dvma <= DMA_BIT_MASK(32)) { + if (!iommu_use_atu(iommu, dvma)) { tbl = &iommu->tbl; iotsb_num = 0; /* we don't care for legacy iommu */ } else { @@ -374,7 +376,7 @@ static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page, npages >>= IO_PAGE_SHIFT; mask = *dev->dma_mask; - if (mask <= DMA_BIT_MASK(32)) + if (!iommu_use_atu(iommu, mask)) tbl = &iommu->tbl; else tbl = &atu->tbl; @@ -510,7 +512,7 @@ static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist, IO_PAGE_SIZE) >> IO_PAGE_SHIFT; mask = *dev->dma_mask; - if (mask <= DMA_BIT_MASK(32)) + if (!iommu_use_atu(iommu, mask)) tbl = &iommu->tbl; else tbl = &atu->tbl; diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index b9a5a04b2d2c543791088b69aae612ed56a97e5e..a1dd24307b001aa95801d3e24003ffd719711728 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -469,3 +469,7 @@ 421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 sys_futex sys_futex 423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 00bcbe2326d9ea712bb6e824da7e998cfd9e43f2..b506ad06aefc8cbd7ca4c45d6db0ee1f01ddc584 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += irq_work.h generic-y += kdebug.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += param.h generic-y += pci.h generic-y += percpu.h diff --git a/arch/um/include/asm/syscall-generic.h b/arch/um/include/asm/syscall-generic.h index 9fb9cf8cd39a3b29f45a80d1a4281d2abd82a262..98e50c50c12efb65ee100eecff49ea6a54e74853 100644 --- a/arch/um/include/asm/syscall-generic.h +++ b/arch/um/include/asm/syscall-generic.h @@ -53,84 +53,30 @@ static inline void syscall_set_return_value(struct task_struct *task, static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { 
const struct uml_pt_regs *r = ®s->regs; - switch (i) { - case 0: - if (!n--) - break; - *args++ = UPT_SYSCALL_ARG1(r); - case 1: - if (!n--) - break; - *args++ = UPT_SYSCALL_ARG2(r); - case 2: - if (!n--) - break; - *args++ = UPT_SYSCALL_ARG3(r); - case 3: - if (!n--) - break; - *args++ = UPT_SYSCALL_ARG4(r); - case 4: - if (!n--) - break; - *args++ = UPT_SYSCALL_ARG5(r); - case 5: - if (!n--) - break; - *args++ = UPT_SYSCALL_ARG6(r); - case 6: - if (!n--) - break; - default: - BUG(); - break; - } + *args++ = UPT_SYSCALL_ARG1(r); + *args++ = UPT_SYSCALL_ARG2(r); + *args++ = UPT_SYSCALL_ARG3(r); + *args++ = UPT_SYSCALL_ARG4(r); + *args++ = UPT_SYSCALL_ARG5(r); + *args = UPT_SYSCALL_ARG6(r); } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) { struct uml_pt_regs *r = ®s->regs; - switch (i) { - case 0: - if (!n--) - break; - UPT_SYSCALL_ARG1(r) = *args++; - case 1: - if (!n--) - break; - UPT_SYSCALL_ARG2(r) = *args++; - case 2: - if (!n--) - break; - UPT_SYSCALL_ARG3(r) = *args++; - case 3: - if (!n--) - break; - UPT_SYSCALL_ARG4(r) = *args++; - case 4: - if (!n--) - break; - UPT_SYSCALL_ARG5(r) = *args++; - case 5: - if (!n--) - break; - UPT_SYSCALL_ARG6(r) = *args++; - case 6: - if (!n--) - break; - default: - BUG(); - break; - } + UPT_SYSCALL_ARG1(r) = *args++; + UPT_SYSCALL_ARG2(r) = *args++; + UPT_SYSCALL_ARG3(r) = *args++; + UPT_SYSCALL_ARG4(r) = *args++; + UPT_SYSCALL_ARG5(r) = *args++; + UPT_SYSCALL_ARG6(r) = *args; } /* See arch/x86/um/asm/syscall.h for syscall_get_arch() definition. */ diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index dce6db147f24563eb14310aaabf76cada9a878bb..70ee6038390060a03dd84cbfa7e14ca249be1fd7 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -2,162 +2,8 @@ #ifndef __UM_TLB_H #define __UM_TLB_H -#include -#include -#include -#include #include - -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - -/* struct mmu_gather is an opaque type used by the mm code for passing around - * any data needed by arch specific code for tlb_remove_page. - */ -struct mmu_gather { - struct mm_struct *mm; - unsigned int need_flush; /* Really unmapped some ptes? 
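
One subtlety worth spelling out in the removed code below: a full-mm teardown is encoded as start == 0 with end == ~0UL, and the expression fullmm = !(start | (end + 1)) tests both conditions with a single OR, since end + 1 wraps to 0 exactly when end is all-ones. A tiny check of that identity (unsigned wraparound is well defined in C):

#include <assert.h>

int main(void)
{
        unsigned long start = 0, end = ~0UL;

        /* full mm: 0 | (~0UL + 1) == 0 | 0 == 0, so the ! yields 1 */
        assert(!(start | (end + 1)));

        start = 0x1000;                 /* any partial range fails the test */
        end = 0x2000;
        assert(start | (end + 1));
        return 0;
}
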
*/ - unsigned long start; - unsigned long end; - unsigned int fullmm; /* non-zero means full mm flush */ -}; - -static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, - unsigned long address) -{ - if (tlb->start > address) - tlb->start = address; - if (tlb->end < address + PAGE_SIZE) - tlb->end = address + PAGE_SIZE; -} - -static inline void init_tlb_gather(struct mmu_gather *tlb) -{ - tlb->need_flush = 0; - - tlb->start = TASK_SIZE; - tlb->end = 0; - - if (tlb->fullmm) { - tlb->start = 0; - tlb->end = TASK_SIZE; - } -} - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->start = start; - tlb->end = end; - tlb->fullmm = !(start | (end+1)); - - init_tlb_gather(tlb); -} - -extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end); - -static inline void -tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ - flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end); -} - -static inline void -tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - init_tlb_gather(tlb); -} - -static inline void -tlb_flush_mmu(struct mmu_gather *tlb) -{ - if (!tlb->need_flush) - return; - - tlb_flush_mmu_tlbonly(tlb); - tlb_flush_mmu_free(tlb); -} - -/* arch_tlb_finish_mmu - * Called at the end of the shootdown operation to free up any resources - * that were required. - */ -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (force) { - tlb->start = start; - tlb->end = end; - tlb->need_flush = 1; - } - tlb_flush_mmu(tlb); - - /* keep the page table cache within bounds */ - check_pgt_cache(); -} - -/* tlb_remove_page - * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), - * while handling the additional races in SMP caused by other CPUs - * caching valid mappings in their TLBs. - */ -static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - tlb->need_flush = 1; - free_page_and_swap_cache(page); - return false; /* avoid calling tlb_flush_mmu */ -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - __tlb_remove_page(tlb, page); -} - -static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return __tlb_remove_page(tlb, page); -} - -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return tlb_remove_page(tlb, page); -} - -/** - * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. - * - * Record the fact that pte's were really umapped in ->need_flush, so we can - * later optimise away the tlb invalidate. This helps when userspace is - * unmapping already-unmapped pages, which happens quite a lot. 
- */ -#define tlb_remove_tlb_entry(tlb, ptep, address) \ - do { \ - tlb->need_flush = 1; \ - __tlb_remove_tlb_entry(tlb, ptep, address); \ - } while (0) - -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ -} - -#define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) - -#define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr) - -#define pmd_free_tlb(tlb, pmdp, addr) __pmd_free_tlb(tlb, pmdp, addr) - -#define tlb_migrate_finish(mm) do {} while (0) +#include +#include #endif diff --git a/arch/um/kernel/stacktrace.c b/arch/um/kernel/stacktrace.c index ebe7bcf62684c5312aaec16bbe5cadd4f13ff595..bd95e020d5091858b3ec377ebca5e1127c59a956 100644 --- a/arch/um/kernel/stacktrace.c +++ b/arch/um/kernel/stacktrace.c @@ -63,8 +63,6 @@ static const struct stacktrace_ops dump_ops = { static void __save_stack_trace(struct task_struct *tsk, struct stack_trace *trace) { dump_trace(tsk, &dump_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace(struct stack_trace *trace) diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 817d82608712ab6f603edd6514c16c2744b08584..2445dfcf64446bd1fbcadb4fe75bab9c6ea62efd 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -20,6 +20,7 @@ config UNICORE32 select GENERIC_IOMAP select MODULES_USE_ELF_REL select NEED_DMA_MAP_STATE + select MMU_GATHER_NO_RANGE if MMU help UniCore-32 is 32-bit Instruction Set Architecture, including a series of low-power-consumption RISC chip @@ -38,12 +39,6 @@ config STACKTRACE_SUPPORT config LOCKDEP_SUPPORT def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - -config RWSEM_XCHGADD_ALGORITHM - bool - config ARCH_HAS_ILOG2_U32 bool diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index d77d953c04c1cfbe039bf207fa4db8b362e65f22..b301a0b3c0b2b1f1213174bab772d30a1b144131 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild @@ -22,6 +22,7 @@ generic-y += kvm_para.h generic-y += local.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += parport.h generic-y += percpu.h diff --git a/arch/unicore32/include/asm/tlb.h b/arch/unicore32/include/asm/tlb.h index 9cca15cdae94c706508968a5131acadbba4ca4e6..00a8477333f6db4d745d4da8ef3f7cb28807b838 100644 --- a/arch/unicore32/include/asm/tlb.h +++ b/arch/unicore32/include/asm/tlb.h @@ -12,10 +12,9 @@ #ifndef __UNICORE_TLB_H__ #define __UNICORE_TLB_H__ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) +/* + * unicore32 lacks an efficient flush_tlb_range(), use flush_tlb_mm(). 
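Both conversions in this series (um above, unicore32 here) retire hand-rolled mmu_gather code in favour of the generic implementation; unicore32 additionally selects MMU_GATHER_NO_RANGE, which lets the generic code degrade every range flush into a full flush_tlb_mm(). For orientation, the lifecycle these architectures now inherit from <asm-generic/tlb.h> is roughly the following (signatures as of this series, core-mm internals elided):

    struct mmu_gather tlb;

    tlb_gather_mmu(&tlb, mm, start, end);   /* begin a shootdown batch */
    /* core mm clears PTEs, calling tlb_remove_tlb_entry() and
     * tlb_remove_page() to batch freed pages and grow the flush range */
    tlb_finish_mmu(&tlb, start, end);       /* flush TLBs, free batched pages */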
+ */ #define __pte_free_tlb(tlb, pte, addr) \ do { \ diff --git a/arch/unicore32/kernel/stacktrace.c b/arch/unicore32/kernel/stacktrace.c index 9976e767d51c2eca3803c1dd5f4210549ccf2dc1..e37da8c6837be5782ea968cbf96f379d88c905f9 100644 --- a/arch/unicore32/kernel/stacktrace.c +++ b/arch/unicore32/kernel/stacktrace.c @@ -120,8 +120,6 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) } walk_stackframe(&frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace(struct stack_trace *trace) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5ad92419be19c5d67fae37158bac2b93e105d1bc..0a3cc347143f87fc870e5363e9cab46bd6dfd48a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -14,6 +14,7 @@ config X86_32 select ARCH_WANT_IPC_PARSE_VERSION select CLKSRC_I8253 select CLONE_BACKWARDS + select HAVE_DEBUG_STACKOVERFLOW select MODULES_USE_ELF_REL select OLD_SIGACTION @@ -28,7 +29,6 @@ config X86_64 select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select SWIOTLB - select X86_DEV_DMA_OPS select ARCH_HAS_SYSCALL_WRAPPER # @@ -65,6 +65,7 @@ config X86 select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE select ARCH_HAS_SET_MEMORY + select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE @@ -74,6 +75,7 @@ config X86 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO + select ARCH_STACKWALK select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 @@ -138,7 +140,6 @@ config X86 select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK - select HAVE_DEBUG_STACKOVERFLOW select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS @@ -183,7 +184,6 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if PARAVIRT - select HAVE_RCU_TABLE_INVALIDATE if HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION select HAVE_FUNCTION_ARG_ACCESS_API @@ -268,9 +268,6 @@ config ARCH_MAY_HAVE_PC_FDC def_bool y depends on ISA_DMA_API -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config GENERIC_CALIBRATE_DELAY def_bool y @@ -703,8 +700,6 @@ config STA2X11 bool "STA2X11 Companion Chip Support" depends on X86_32_NON_STANDARD && PCI select ARCH_HAS_PHYS_TO_DMA - select X86_DEV_DMA_OPS - select X86_DMA_REMAP select SWIOTLB select MFD_STA2X11 select GPIOLIB @@ -783,14 +778,6 @@ config PARAVIRT_SPINLOCKS If you are unsure how to answer this question, answer Y. -config QUEUED_LOCK_STAT - bool "Paravirt queued spinlock statistics" - depends on PARAVIRT_SPINLOCKS && DEBUG_FS - ---help--- - Enable the collection of statistical data on the slowpath - behavior of paravirtualized queued spinlocks and report - them on debugfs. - source "arch/x86/xen/Kconfig" config KVM_GUEST @@ -1330,8 +1317,16 @@ config MICROCODE_AMD processors will be enabled. config MICROCODE_OLD_INTERFACE - def_bool y + bool "Ancient loading interface (DEPRECATED)" + default n depends on MICROCODE + ---help--- + DO NOT USE THIS! This is the ancient /dev/cpu/microcode interface + which was used by userspace tools like iucode_tool and microcode.ctl. 
+ It is inadequate because it runs too late to be able to properly + load microcode on a machine and it needs special tools. Instead, you + should've switched to the early loading method with the initrd or + builtin microcode by now: Documentation/x86/microcode.txt config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" @@ -1499,7 +1494,7 @@ config X86_CPA_STATISTICS depends on DEBUG_FS ---help--- Expose statistics about the Change Page Attribute mechanims, which - helps to determine the effectivness of preserving large and huge + helps to determine the effectiveness of preserving large and huge page mappings when mapping protections are changed. config ARCH_HAS_MEM_ENCRYPT @@ -1606,12 +1601,9 @@ config ARCH_FLATMEM_ENABLE depends on X86_32 && !NUMA config ARCH_DISCONTIGMEM_ENABLE - def_bool y - depends on NUMA && X86_32 - -config ARCH_DISCONTIGMEM_DEFAULT - def_bool y + def_bool n depends on NUMA && X86_32 + depends on BROKEN config ARCH_SPARSEMEM_ENABLE def_bool y @@ -1620,8 +1612,7 @@ config ARCH_SPARSEMEM_ENABLE select SPARSEMEM_VMEMMAP_ENABLE if X86_64 config ARCH_SPARSEMEM_DEFAULT - def_bool y - depends on X86_64 + def_bool X86_64 || (NUMA && X86_32) config ARCH_SELECT_MEMORY_MODEL def_bool y @@ -2878,11 +2869,6 @@ config HAVE_ATOMIC_IOMAP config X86_DEV_DMA_OPS bool - depends on X86_64 || STA2X11 - -config X86_DMA_REMAP - bool - depends on STA2X11 config HAVE_GENERIC_GUP def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a587805c6687f6721ae8140da8144701c9abb49b..56e748a7679f4b931b420eaec6bd63f174337e59 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -47,7 +47,7 @@ export REALMODE_CFLAGS export BITS ifdef CONFIG_X86_NEED_RELOCS - LDFLAGS_vmlinux := --emit-relocs + LDFLAGS_vmlinux := --emit-relocs --discard-none endif # diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 0ef4ad55b29b203386c8d94aed8afa3c4e5a2fe1..ad84239e595eac0cb2ea3510f9480d66735e282b 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -276,7 +276,7 @@ static unsigned long get_acpi_srat_table(void) if (acpi_table) { header = (struct acpi_table_header *)acpi_table; - if (ACPI_COMPARE_NAME(header->signature, ACPI_SIG_SRAT)) + if (ACPI_COMPARE_NAMESEG(header->signature, ACPI_SIG_SRAT)) return acpi_table; } entry += size; diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index c0d6c560df69e0e63941a34539660770304ff612..5a237e8dbf8d563504a6cfcb4a67a2b7350bed0a 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -352,7 +352,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, boot_params->hdr.loadflags &= ~KASLR_FLAG; /* Save RSDP address for later use. 
*/ - boot_params->acpi_rsdp_addr = get_rsdp_addr(); + /* boot_params->acpi_rsdp_addr = get_rsdp_addr(); */ sanitize_boot_params(boot_params); diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 9f908112bbb97e35b87aaf4a440fbd8d47c742e7..2b2481acc6615ae3825d6ad74b5afbb8ceaf196a 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -25,18 +25,6 @@ CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_OSF_PARTITION=y -CONFIG_AMIGA_PARTITION=y -CONFIG_MAC_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -CONFIG_UNIXWARE_DISKLABEL=y -CONFIG_SGI_PARTITION=y -CONFIG_SUN_PARTITION=y -CONFIG_KARMA_PARTITION=y -CONFIG_EFI_PARTITION=y CONFIG_SMP=y CONFIG_X86_GENERIC=y CONFIG_HPET_TIMER=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 1d3badfda09ee86d5119599a5ad9d8d9e4960ba3..e8829abf063acb73f91be93e4f25295a8720778c 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -24,18 +24,6 @@ CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_OSF_PARTITION=y -CONFIG_AMIGA_PARTITION=y -CONFIG_MAC_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -CONFIG_UNIXWARE_DISKLABEL=y -CONFIG_SGI_PARTITION=y -CONFIG_SUN_PARTITION=y -CONFIG_KARMA_PARTITION=y -CONFIG_EFI_PARTITION=y CONFIG_SMP=y CONFIG_CALGARY_IOMMU=y CONFIG_NR_CPUS=64 diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S index 3b6e70d085da89775317c8e2a560625ab4799e01..8457cdd47f751167a2321ebf063eb18bdb4ef8aa 100644 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S +++ b/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2) vpaddq t2,t1,t1 vmovq t1x,d4 + # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> + # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small + # amount. Careful: we must not assume the carry bits 'd0 >> 26', + # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit + # integers. It's true in a single-block implementation, but not here. 
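The new comment, continued in the assembly below, is the heart of this fix: once four blocks (AVX2) or two blocks (SSE2) are accumulated before a reduction, the d0..d4 accumulators grow large enough that the carries, and especially (d4 >> 26) * 5, no longer fit in 32 bits, which is why the lea/shr/add sequences below move from %eax/%ebx to %rax/%rbx. In C, the partial reduction the assembly performs looks roughly like this (an illustrative sketch, not the kernel's code):

    #include <stdint.h>

    /* Partial reduction mod 2^130 - 5 over 26-bit limbs with 64-bit
     * carries. After one pass h0, h2, h3, h4 < 2^26 and h1 < 2^26 plus
     * a small excess, exactly as the comment above states. */
    static void poly1305_partial_reduce(uint64_t d[5], uint32_t h[5])
    {
            uint64_t h0;

            d[1] += d[0] >> 26;
            d[2] += d[1] >> 26;
            d[3] += d[2] >> 26;
            d[4] += d[3] >> 26;
            h[1]  = d[1] & 0x3ffffff;
            h[2]  = d[2] & 0x3ffffff;
            h[3]  = d[3] & 0x3ffffff;
            h[4]  = d[4] & 0x3ffffff;

            /* 2^130 == 5 (mod 2^130 - 5), so the top carry folds back in
             * multiplied by 5; this product is the quantity that can
             * exceed 32 bits in the multi-block case. */
            h0    = (d[0] & 0x3ffffff) + (d[4] >> 26) * 5;
            h[1] += (uint32_t)(h0 >> 26);
            h[0]  = h0 & 0x3ffffff;
    }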
+ # d1 += d0 >> 26 mov d0,%rax shr $26,%rax @@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S index e6add74d78a595b63789d419100b7c30b024e0fc..6f0be7a869641c92c4993e378b53c07a7d385f29 100644 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S +++ b/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx @@ -524,6 +524,12 @@ ENTRY(poly1305_2block_sse2) paddq t2,t1 movq t1,d4 + # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> + # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small + # amount. Careful: we must not assume the carry bits 'd0 >> 26', + # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit + # integers. It's true in a single-block implementation, but not here. + # d1 += d0 >> 26 mov d0,%rax shr $26,%rax @@ -562,16 +568,16 @@ ENTRY(poly1305_2block_sse2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index d309f30cf7af84e67ac38910eff4256da9c25a11..7b23431be5cb6a535f657c27868dbb0600d65423 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -650,6 +650,7 @@ ENTRY(__switch_to_asm) pushl %ebx pushl %edi pushl %esi + pushfl /* switch stack */ movl %esp, TASK_threadsp(%eax) @@ -672,6 +673,7 @@ ENTRY(__switch_to_asm) #endif /* restore callee-saved registers */ + popfl popl %esi popl %edi popl %ebx @@ -766,13 +768,12 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) -.Lneed_resched: cmpl $0, PER_CPU_VAR(__preempt_count) jnz restore_all_kernel testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all_kernel call preempt_schedule_irq - jmp .Lneed_resched + jmp restore_all_kernel END(resume_kernel) #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1f0efdb7b6294daba3e315be0b990ba8296b3fea..20e45d9b4e156cc90a464715b0370f27f259ee80 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -298,7 +298,7 @@ ENTRY(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset + movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset #endif #ifdef CONFIG_RETPOLINE @@ -430,8 +430,8 @@ END(irq_entries_start) * it before we actually move ourselves to the IRQ stack. 
*/ - movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) - movq PER_CPU_VAR(irq_stack_ptr), %rsp + movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8) + movq PER_CPU_VAR(hardirq_stack_ptr), %rsp #ifdef CONFIG_DEBUG_ENTRY /* @@ -645,10 +645,9 @@ retint_kernel: /* Check if we need preemption */ btl $9, EFLAGS(%rsp) /* were interrupts off? */ jnc 1f -0: cmpl $0, PER_CPU_VAR(__preempt_count) + cmpl $0, PER_CPU_VAR(__preempt_count) jnz 1f call preempt_schedule_irq - jmp 0b 1: #endif /* @@ -841,7 +840,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /* * Exception entry points. */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) /** * idtentry - Generate an IDT entry stub @@ -879,7 +878,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * @paranoid == 2 is special: the stub will never switch stacks. This is for * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. */ -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -925,13 +924,13 @@ ENTRY(\sym) .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + subq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + addq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif /* these procedures expect "no swapgs" flag in ebx */ @@ -1129,7 +1128,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ hv_stimer0_callback_vector hv_stimer0_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 5bfe2243a08f882c4ab622cd87799ac1a28ff3c2..42fe42e82bafae8918bc3d8470743505e9345654 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -116,7 +116,7 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE targets += vdsox32.lds $(vobjx32s-y) $(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg +$(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 007b3fe9d727cbc8c55f78c1734b3d28551a7dda..98c7d12b945c28380679980deab66c5633633405 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -29,12 +29,12 @@ extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); #ifdef CONFIG_PARAVIRT_CLOCK -extern u8 pvclock_page +extern u8 pvclock_page[PAGE_SIZE] __attribute__((visibility("hidden"))); #endif #ifdef CONFIG_HYPERV_TSCPAGE -extern u8 hvclock_page +extern u8 hvclock_page[PAGE_SIZE] __attribute__((visibility("hidden"))); #endif diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index fa847a620f40f2993005ba10f127a50aafca2c59..a20b134de2a891d52aa9b88b59d4e78fbc13fc6b 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -7,7 +7,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t 
raw_len, void *stripped_addr, size_t stripped_len, - FILE *outfile, const char *name) + FILE *outfile, const char *image_name) { int found_load = 0; unsigned long load_size = -1; /* Work around bogus warning */ @@ -93,11 +93,12 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, int k; ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + GET_LE(&symtab_hdr->sh_entsize) * i; - const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) + - GET_LE(&sym->st_name); + const char *sym_name = raw_addr + + GET_LE(&strtab_hdr->sh_offset) + + GET_LE(&sym->st_name); for (k = 0; k < NSYMS; k++) { - if (!strcmp(name, required_syms[k].name)) { + if (!strcmp(sym_name, required_syms[k].name)) { if (syms[k]) { fail("duplicate symbol %s\n", required_syms[k].name); @@ -134,7 +135,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, if (syms[sym_vvar_start] % 4096) fail("vvar_begin must be a multiple of 4096\n"); - if (!name) { + if (!image_name) { fwrite(stripped_addr, stripped_len, 1, outfile); return; } @@ -157,7 +158,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, } fprintf(outfile, "\n};\n\n"); - fprintf(outfile, "const struct vdso_image %s = {\n", name); + fprintf(outfile, "const struct vdso_image %s = {\n", image_name); fprintf(outfile, "\t.data = raw_data,\n"); fprintf(outfile, "\t.size = %lu,\n", mapping_size); if (alt_sec) { diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 7d2d7c801dba6abb226b630104d1f038242562cf..f15441b07dad8a94b914299e3d0ed1b5fc909677 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -3,10 +3,14 @@ #include #include #include +#include #include +#include #include "../perf_event.h" +static DEFINE_PER_CPU(unsigned int, perf_nmi_counter); + static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -112,23 +116,144 @@ static __initconst const u64 amd_hw_cache_event_ids }, }; +static __initconst const u64 amd_hw_cache_event_ids_f17h + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0040, /* Data Cache Accesses */ + [C(RESULT_MISS)] = 0xc860, /* L2$ access from DC Miss */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0xff5a, /* h/w prefetch DC Fills */ + [C(RESULT_MISS)] = 0, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0080, /* Instruction cache fetches */ + [C(RESULT_MISS)] = 0x0081, /* Instruction cache misses */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0xff45, /* All L2 DTLB accesses */ + [C(RESULT_MISS)] = 0xf045, /* L2 DTLB misses (PT walks) */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0084, /* L1 ITLB misses, L2 ITLB hits */ + [C(RESULT_MISS)] = 0xff85, /* L1 ITLB misses, L2 misses */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + 
[C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c2, /* Retired Branch Instr. */ + [C(RESULT_MISS)] = 0x00c3, /* Retired Mispredicted BI */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + /* - * AMD Performance Monitor K7 and later. + * AMD Performance Monitor K7 and later, up to and including Family 16h: */ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, - [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, + [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ +}; + +/* + * AMD Performance Monitor Family 17h and later: + */ +static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187, }; static u64 amd_pmu_event_map(int hw_event) { + if (boot_cpu_data.x86 >= 0x17) + return amd_f17h_perfmon_event_map[hw_event]; + return amd_perfmon_event_map[hw_event]; } @@ -429,6 +554,132 @@ static void amd_pmu_cpu_dead(int cpu) } } +/* + * When a PMC counter overflows, an NMI is used to process the event and + * reset the counter. NMI latency can result in the counter being updated + * before the NMI can run, which can result in what appear to be spurious + * NMIs. This function is intended to wait for the NMI to run and reset + * the counter to avoid possible unhandled NMI messages. + */ +#define OVERFLOW_WAIT_COUNT 50 + +static void amd_pmu_wait_on_overflow(int idx) +{ + unsigned int i; + u64 counter; + + /* + * Wait for the counter to be reset if it has overflowed. This loop + * should exit very, very quickly, but just in case, don't wait + * forever... 
*/ + for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { + rdmsrl(x86_pmu_event_addr(idx), counter); + if (counter & (1ULL << (x86_pmu.cntval_bits - 1))) + break; + + /* Might be in IRQ context, so can't sleep */ + udelay(1); + } +} + +static void amd_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + x86_pmu_disable_all(); + + /* + * This shouldn't be called from NMI context, but add a safeguard here + * to return, since if we're in NMI context we can't wait for an NMI + * to reset an overflowed counter value. + */ + if (in_nmi()) + return; + + /* + * Check each counter for overflow and wait for it to be reset by the + * NMI if it has overflowed. This relies on the fact that all active + * counters are always enabled when this function is called and + * ARCH_PERFMON_EVENTSEL_INT is always set. + */ + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_wait_on_overflow(idx); + } +} + +static void amd_pmu_disable_event(struct perf_event *event) +{ + x86_pmu_disable_event(event); + + /* + * This can be called from NMI context (via x86_pmu_stop). The counter + * may have overflowed, but either way, we'll never see it get reset + * by the NMI if we're already in the NMI. And the NMI latency support + * below will take care of any pending NMI that might have been + * generated by the overflow. + */ + if (in_nmi()) + return; + + amd_pmu_wait_on_overflow(event->hw.idx); +} + +/* + * Because of NMI latency, if multiple PMC counters are active or other sources + * of NMIs are received, the perf NMI handler can handle one or more overflowed + * PMC counters outside of the NMI associated with the PMC overflow. If the NMI + * doesn't arrive at the LAPIC in time to become a pending NMI, then the kernel + * back-to-back NMI support won't be active. This PMC handler needs to take into + * account that this can occur, otherwise this could result in unknown NMI + * messages being issued. Examples of this are PMC overflow while in the NMI + * handler when multiple PMCs are active or PMC overflow while handling some + * other source of an NMI. + * + * Attempt to mitigate this by using the number of active PMCs to determine + * whether to return NMI_HANDLED if the perf NMI handler did not handle/reset + * any PMCs. The per-CPU perf_nmi_counter variable is set to the lesser of the + * number of active PMCs and 2. The value of 2 is used in case an NMI does not + * arrive at the LAPIC in time to be collapsed into an already pending NMI. + */ +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int active, handled; + + /* + * Obtain the active count before calling x86_pmu_handle_irq() since + * it is possible that x86_pmu_handle_irq() may make a counter + * inactive (through x86_pmu_stop). + */ + active = __bitmap_weight(cpuc->active_mask, X86_PMC_IDX_MAX); + + /* Process any counter overflows */ + handled = x86_pmu_handle_irq(regs); + + /* + * If a counter was handled, record the number of possible remaining + * NMIs that can occur.
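A concrete instance of the failure mode described above: with three counters active, two can overflow almost back to back; a single NMI then services both, and the second NMI is delivered too late to be collapsed into the first but finds no overflowed counter left to handle. Seeding perf_nmi_counter with min(active, 2), here min(3, 2) = 2, lets up to two such stragglers be absorbed as NMI_HANDLED instead of surfacing as "unknown NMI" messages.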
*/ + if (handled) { + this_cpu_write(perf_nmi_counter, + min_t(unsigned int, 2, active)); + + return handled; + } + + if (!this_cpu_read(perf_nmi_counter)) + return NMI_DONE; + + this_cpu_dec(perf_nmi_counter); + + return NMI_HANDLED; +} + static struct event_constraint * amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -621,11 +872,11 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, + .handle_irq = amd_pmu_handle_irq, + .disable_all = amd_pmu_disable_all, .enable_all = x86_pmu_enable_all, .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, + .disable = amd_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_K7_EVNTSEL0, @@ -718,9 +969,10 @@ __init int amd_pmu_init(void) x86_pmu.amd_nb_constraints = 0; } - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + if (boot_cpu_data.x86 >= 0x17) + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids_f17h, sizeof(hw_cache_event_ids)); + else + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids)); return 0; } @@ -732,7 +984,7 @@ void amd_pmu_enable_virt(void) cpuc->perf_ctr_virt_mask = 0; /* Reload all events */ - x86_pmu_disable_all(); + amd_pmu_disable_all(); x86_pmu_enable_all(0); } EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); @@ -750,7 +1002,7 @@ void amd_pmu_disable_virt(void) cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; /* Reload all events */ - x86_pmu_disable_all(); + amd_pmu_disable_all(); x86_pmu_enable_all(0); } EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index e2b1447192a888ffafb2883ddbdfbbd37c1e9315..f315425d8468f473bf11e95c6039b59ac6b318a4 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -560,6 +560,21 @@ int x86_pmu_hw_config(struct perf_event *event) return -EINVAL; } + /* sample_regs_user never supports XMM registers */ + if (unlikely(event->attr.sample_regs_user & PEBS_XMM_REGS)) + return -EINVAL; + /* + * Besides the general purpose registers, XMM registers may + * be collected in PEBS on some platforms, e.g. Icelake + */ + if (unlikely(event->attr.sample_regs_intr & PEBS_XMM_REGS)) { + if (x86_pmu.pebs_no_xmm_regs) + return -EINVAL; + + if (!event->attr.precise_ip) + return -EINVAL; + } + return x86_setup_perfctr(event); } @@ -661,6 +676,10 @@ static inline int is_x86_event(struct perf_event *event) return event->pmu == &pmu; } +struct pmu *x86_get_pmu(void) +{ + return &pmu; +} /* * Event scheduler state: * @@ -849,18 +868,43 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) struct event_constraint *c; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; struct perf_event *e; - int i, wmin, wmax, unsched = 0; + int n0, i, wmin, wmax, unsched = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); + /* + * Compute the number of events already present; see x86_pmu_add(), + * validate_group() and x86_pmu_commit_txn(). For the former two + * cpuc->n_events hasn't been updated yet, while for the latter + * cpuc->n_txn contains the number of events added in the current + * transaction.
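A worked example of the n0 computation that follows: committing a transaction that added two events on top of three already-scheduled ones leaves cpuc->n_events = 5 and cpuc->n_txn = 2, so n0 = 5 - 2 = 3; slots 0..2 reuse their cached constraints while slots 3..4 request fresh ones. The exception, handled just below, is a cached constraint marked PERF_X86_EVENT_DYNAMIC, which is always re-evaluated.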
+ */ + n0 = cpuc->n_events; + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) + n0 -= cpuc->n_txn; + if (x86_pmu.start_scheduling) x86_pmu.start_scheduling(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { - cpuc->event_constraint[i] = NULL; - c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); - cpuc->event_constraint[i] = c; + c = cpuc->event_constraint[i]; + + /* + * Previously scheduled events should have a cached constraint, + * while new events should not have one. + */ + WARN_ON_ONCE((c && i >= n0) || (!c && i < n0)); + + /* + * Request constraints for new events; or for those events that + * have a dynamic constraint -- for those the constraint can + * change due to external factors (sibling state, allow_tfa). + */ + if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) { + c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); + cpuc->event_constraint[i] = c; + } wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); @@ -925,25 +969,20 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (!unsched && assign) { for (i = 0; i < n; i++) { e = cpuc->event_list[i]; - e->hw.flags |= PERF_X86_EVENT_COMMITTED; if (x86_pmu.commit_scheduling) x86_pmu.commit_scheduling(cpuc, i, assign[i]); } } else { - for (i = 0; i < n; i++) { + for (i = n0; i < n; i++) { e = cpuc->event_list[i]; - /* - * do not put_constraint() on comitted events, - * because they are good to go - */ - if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) - continue; /* * release events that failed scheduling */ if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(cpuc, e); + + cpuc->event_constraint[i] = NULL; } } @@ -1349,8 +1388,9 @@ void x86_pmu_stop(struct perf_event *event, int flags) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { + if (test_bit(hwc->idx, cpuc->active_mask)) { x86_pmu.disable(event); + __clear_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; @@ -1371,11 +1411,6 @@ static void x86_pmu_del(struct perf_event *event, int flags) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; - /* - * event is descheduled - */ - event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; - /* * If we're called during a txn, we only need to undo x86_pmu.add. * The events never got scheduled and ->cancel_txn will truncate @@ -1412,6 +1447,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; } + cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; perf_event_update_userpage(event); @@ -1447,16 +1483,8 @@ int x86_pmu_handle_irq(struct pt_regs *regs) apic_write(APIC_LVTPC, APIC_DM_NMI); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) { - /* - * Though we deactivated the counter some cpus - * might still deliver spurious interrupts still - * in flight. 
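The spurious-interrupt catch deleted just below is what the x86_pmu_stop() reordering above makes redundant: the counter is now disabled, and on AMD waited on, while its active_mask bit is still set, so an NMI landing during x86_pmu_stop() still sees the event and disposes of the overflow; the bit is cleared only afterwards. Together with the amd_pmu_handle_irq() grace counter, the cpuc->running bookkeeping no longer has a job to do in this loop.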
Catch them: - */ - if (__test_and_clear_bit(idx, cpuc->running)) - handled++; + if (!test_bit(idx, cpuc->active_mask)) continue; - } event = cpuc->events[idx]; @@ -2031,7 +2059,7 @@ static int validate_event(struct perf_event *event) if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); - c = x86_pmu.get_event_constraints(fake_cpuc, -1, event); + c = x86_pmu.get_event_constraints(fake_cpuc, 0, event); if (!c || !c->weight) ret = -EINVAL; @@ -2079,8 +2107,7 @@ static int validate_group(struct perf_event *event) if (n < 0) goto out; - fake_cpuc->n_events = n; - + fake_cpuc->n_events = 0; ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); out: @@ -2355,6 +2382,15 @@ void arch_perf_update_userpage(struct perf_event *event, cyc2ns_read_end(); } +/* + * Determine whether the regs were taken from an irq/exception handler rather + * than from perf_arch_fetch_caller_regs(). + */ +static bool perf_hw_regs(struct pt_regs *regs) +{ + return regs->flags & X86_EFLAGS_FIXED; +} + void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { @@ -2366,11 +2402,15 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re return; } - if (perf_callchain_store(entry, regs->ip)) - return; + if (perf_hw_regs(regs)) { + if (perf_callchain_store(entry, regs->ip)) + return; + unwind_start(&state, current, regs, NULL); + } else { + unwind_start(&state, current, NULL, (void *)regs->sp); + } - for (unwind_start(&state, current, regs, NULL); !unwind_done(&state); - unwind_next_frame(&state)) { + for (; !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr || perf_callchain_store(entry, addr)) return; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8baa441d8000f6c4efbde5afe72d4e5a518d2184..ef763f535e3abbd034857ad48a678c1281a358c4 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -239,6 +239,35 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct event_constraint intel_icl_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0x1c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf), + INTEL_EVENT_CONSTRAINT(0x32, 0xf), /* SW_PREFETCH_ACCESS.* */ + INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */ + INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */ + INTEL_EVENT_CONSTRAINT(0xa3, 0xf), /* CYCLE_ACTIVITY.* */ + INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xe6, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0xf0, 0xf4, 0xf), + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_icl_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff9fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff9fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), + EVENT_EXTRA_END +}; + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); 
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -1827,6 +1856,45 @@ static __initconst const u64 glp_hw_cache_extra_regs }, }; +#define TNT_LOCAL_DRAM BIT_ULL(26) +#define TNT_DEMAND_READ GLM_DEMAND_DATA_RD +#define TNT_DEMAND_WRITE GLM_DEMAND_RFO +#define TNT_LLC_ACCESS GLM_ANY_RESPONSE +#define TNT_SNP_ANY (SNB_SNP_NOT_NEEDED|SNB_SNP_MISS| \ + SNB_NO_FWD|SNB_SNP_FWD|SNB_HITM) +#define TNT_LLC_MISS (TNT_SNP_ANY|SNB_NON_DRAM|TNT_LOCAL_DRAM) + +static __initconst const u64 tnt_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = TNT_DEMAND_READ| + TNT_LLC_ACCESS, + [C(RESULT_MISS)] = TNT_DEMAND_READ| + TNT_LLC_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TNT_DEMAND_WRITE| + TNT_LLC_ACCESS, + [C(RESULT_MISS)] = TNT_DEMAND_WRITE| + TNT_LLC_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, +}; + +static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffffff9fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xffffff9fffull, RSP_1), + EVENT_EXTRA_END +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -2015,7 +2083,7 @@ static void intel_tfa_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int /* * We're going to use PMC3, make sure TFA is set before we touch it. */ - if (cntr == 3 && !cpuc->is_fake) + if (cntr == 3) intel_set_tfa(cpuc, true); } @@ -2091,15 +2159,19 @@ static void intel_pmu_disable_event(struct perf_event *event) cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); cpuc->intel_cp_status &= ~(1ull << hwc->idx); - if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_disable(event); - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc); return; } x86_pmu_disable_event(event); + + /* + * Needs to be called after x86_pmu_disable_event, + * so we don't trigger the event without PEBS bit set. 
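Returning to the mem-loads encodings above, which EVENT_ATTR_STR() exports through sysfs: a hedged userspace sketch of programming the Nehalem variant directly, using the conventional x86 raw-event packing of event | (umask << 8) with the load-latency threshold carried in config1. perf normally resolves the sysfs strings for you; the raw form is shown only to make the encoding concrete.

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int open_mem_loads(pid_t pid)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_RAW;
            attr.config = 0x0b | (0x10 << 8);       /* event=0x0b, umask=0x10 */
            attr.config1 = 3;                       /* ldlat threshold */
            attr.sample_period = 10007;
            attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
                               PERF_SAMPLE_WEIGHT;
            attr.precise_ip = 2;                    /* PEBS, required for ldlat */

            return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
    }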
+ */ + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_disable(event); } static void intel_pmu_del_event(struct perf_event *event) @@ -2145,6 +2217,11 @@ static void intel_pmu_enable_fixed(struct perf_event *event) bits <<= (idx * 4); mask = 0xfULL << (idx * 4); + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { + bits |= ICL_FIXED_0_ADAPTIVE << (idx * 4); + mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4); + } + rdmsrl(hwc->config_base, ctrl_val); ctrl_val &= ~mask; ctrl_val |= bits; @@ -2688,7 +2765,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) { + if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; } @@ -2838,7 +2915,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; struct intel_excl_states *xlo; int tid = cpuc->excl_thread_id; - int is_excl, i; + int is_excl, i, w; /* * validating a group does not require @@ -2894,36 +2971,40 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * SHARED : sibling counter measuring non-exclusive event * UNUSED : sibling counter unused */ + w = c->weight; for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) { /* * exclusive event in sibling counter * our corresponding counter cannot be used * regardless of our event */ - if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) + if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) { __clear_bit(i, c->idxmsk); + w--; + continue; + } /* * if measuring an exclusive event, sibling * measuring non-exclusive, then counter cannot * be used */ - if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) + if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) { __clear_bit(i, c->idxmsk); + w--; + continue; + } } - /* - * recompute actual bit weight for scheduling algorithm - */ - c->weight = hweight64(c->idxmsk64); - /* * if we return an empty mask, then switch * back to static empty constraint to avoid * the cost of freeing later on */ - if (c->weight == 0) + if (!w) c = &emptyconstraint; + c->weight = w; + return c; } @@ -2931,11 +3012,9 @@ static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { - struct event_constraint *c1 = NULL; - struct event_constraint *c2; + struct event_constraint *c1, *c2; - if (idx >= 0) /* fake does < 0 */ - c1 = cpuc->event_constraint[idx]; + c1 = cpuc->event_constraint[idx]; /* * first time only @@ -2943,7 +3022,8 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, * - dynamic constraint: handled by intel_get_excl_constraints() */ c2 = __intel_get_event_constraints(cpuc, idx, event); - if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) { + if (c1) { + WARN_ON_ONCE(!(c1->flags & PERF_X86_EVENT_DYNAMIC)); bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX); c1->weight = c2->weight; c2 = c1; @@ -3131,7 +3211,7 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event) flags &= ~PERF_SAMPLE_TIME; if (!event->attr.exclude_kernel) flags &= ~PERF_SAMPLE_REGS_USER; - if (event->attr.sample_regs_user & ~PEBS_REGS) + if (event->attr.sample_regs_user & ~PEBS_GP_REGS) flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); return flags; } @@ -3185,7 +3265,7 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret; if (event->attr.precise_ip) { - if (!event->attr.freq) { + if 
(!(event->attr.freq || event->attr.wakeup_events)) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & ~intel_pmu_large_pebs_flags(event))) @@ -3366,6 +3446,12 @@ static struct event_constraint counter0_constraint = static struct event_constraint counter2_constraint = EVENT_CONSTRAINT(0, 0x4, 0); +static struct event_constraint fixed0_constraint = + FIXED_EVENT_CONSTRAINT(0x00c0, 0); + +static struct event_constraint fixed0_counter0_constraint = + INTEL_ALL_EVENT_CONSTRAINT(0, 0x100000001ULL); + static struct event_constraint * hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -3384,6 +3470,21 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return c; } +static struct event_constraint * +icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + /* + * Fixed counter 0 has less skid. + * Force instruction:ppp in Fixed counter 0 + */ + if ((event->attr.precise_ip == 3) && + constraint_match(&fixed0_constraint, event->hw.config)) + return &fixed0_constraint; + + return hsw_get_event_constraints(cpuc, idx, event); +} + static struct event_constraint * glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -3399,6 +3500,29 @@ glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return c; } +static struct event_constraint * +tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + /* + * :ppp means to do reduced skid PEBS, + * which is available on PMC0 and fixed counter 0. + */ + if (event->attr.precise_ip == 3) { + /* Force instruction:ppp on PMC0 and Fixed counter 0 */ + if (constraint_match(&fixed0_constraint, event->hw.config)) + return &fixed0_counter0_constraint; + + return &counter0_constraint; + } + + c = intel_get_event_constraints(cpuc, idx, event); + + return c; +} + static bool allow_tsx_force_abort = true; static struct event_constraint * @@ -3410,7 +3534,7 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx, /* * Without TFA we must not use PMC3. 
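For orientation, what the icl/tnt constraint hooks above buy userspace: a maximally precise sampling request is steered onto the low-skid counter (fixed counter 0, plus PMC0 on Tremont). A minimal attribute sketch:

    /* Userspace equivalent of perf's "instructions:ppp"; with the hooks
     * above, precise_ip == 3 on a matching event forces scheduling onto
     * fixed counter 0. */
    struct perf_event_attr attr = {
            .size          = sizeof(attr),
            .type          = PERF_TYPE_HARDWARE,
            .config        = PERF_COUNT_HW_INSTRUCTIONS,
            .precise_ip    = 3,
            .sample_period = 100003,
    };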
*/ - if (!allow_tsx_force_abort && test_bit(3, c->idxmsk) && idx >= 0) { + if (!allow_tsx_force_abort && test_bit(3, c->idxmsk)) { c = dyn_constraint(cpuc, c, idx); c->idxmsk64 &= ~(1ULL << 3); c->weight--; @@ -3507,6 +3631,8 @@ static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) { + cpuc->pebs_record_size = x86_pmu.pebs_record_size; + if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { cpuc->shared_regs = allocate_shared_regs(cpu); if (!cpuc->shared_regs) @@ -3575,6 +3701,12 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = NULL; + if (x86_pmu.flags & PMU_FL_TFA) { + WARN_ON_ONCE(cpuc->tfa_shadow); + cpuc->tfa_shadow = ~0ULL; + intel_set_tfa(cpuc, false); + } + if (x86_pmu.version > 1) flip_smm_bit(&x86_pmu.attr_freeze_on_smi); @@ -4108,6 +4240,42 @@ static struct attribute *hsw_tsx_events_attrs[] = { NULL }; +EVENT_ATTR_STR(tx-capacity-read, tx_capacity_read, "event=0x54,umask=0x80"); +EVENT_ATTR_STR(tx-capacity-write, tx_capacity_write, "event=0x54,umask=0x2"); +EVENT_ATTR_STR(el-capacity-read, el_capacity_read, "event=0x54,umask=0x80"); +EVENT_ATTR_STR(el-capacity-write, el_capacity_write, "event=0x54,umask=0x2"); + +static struct attribute *icl_events_attrs[] = { + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_hsw), + NULL, +}; + +static struct attribute *icl_tsx_events_attrs[] = { + EVENT_PTR(tx_start), + EVENT_PTR(tx_abort), + EVENT_PTR(tx_commit), + EVENT_PTR(tx_capacity_read), + EVENT_PTR(tx_capacity_write), + EVENT_PTR(tx_conflict), + EVENT_PTR(el_start), + EVENT_PTR(el_abort), + EVENT_PTR(el_commit), + EVENT_PTR(el_capacity_read), + EVENT_PTR(el_capacity_write), + EVENT_PTR(el_conflict), + EVENT_PTR(cycles_t), + EVENT_PTR(cycles_ct), + NULL, +}; + +static __init struct attribute **get_icl_events_attrs(void) +{ + return boot_cpu_has(X86_FEATURE_RTM) ? 
+ merge_attr(icl_events_attrs, icl_tsx_events_attrs) : + icl_events_attrs; +} + static ssize_t freeze_on_smi_show(struct device *cdev, struct device_attribute *attr, char *buf) @@ -4147,6 +4315,50 @@ done: return count; } +static void update_tfa_sched(void *ignored) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * check if PMC3 is used + * and if so force schedule out for all event types all contexts + */ + if (test_bit(3, cpuc->active_mask)) + perf_pmu_resched(x86_get_pmu()); +} + +static ssize_t show_sysctl_tfa(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", allow_tsx_force_abort); +} + +static ssize_t set_sysctl_tfa(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + bool val; + ssize_t ret; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + /* no change */ + if (val == allow_tsx_force_abort) + return count; + + allow_tsx_force_abort = val; + + get_online_cpus(); + on_each_cpu(update_tfa_sched, NULL, 1); + put_online_cpus(); + + return count; +} + + static DEVICE_ATTR_RW(freeze_on_smi); static ssize_t branches_show(struct device *cdev, @@ -4179,7 +4391,9 @@ static struct attribute *intel_pmu_caps_attrs[] = { NULL }; -static DEVICE_BOOL_ATTR(allow_tsx_force_abort, 0644, allow_tsx_force_abort); +static DEVICE_ATTR(allow_tsx_force_abort, 0644, + show_sysctl_tfa, + set_sysctl_tfa); static struct attribute *intel_pmu_attrs[] = { &dev_attr_freeze_on_smi.attr, @@ -4440,6 +4654,32 @@ __init int intel_pmu_init(void) name = "goldmont_plus"; break; + case INTEL_FAM6_ATOM_TREMONT_X: + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; + + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.extra_regs = intel_tnt_extra_regs; + /* + * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS + * for precise cycles. 
+ */ + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.lbr_pt_coexist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.get_event_constraints = tnt_get_event_constraints; + extra_attr = slm_format_attr; + pr_cont("Tremont events, "); + name = "Tremont"; + break; + case INTEL_FAM6_WESTMERE: case INTEL_FAM6_WESTMERE_EP: case INTEL_FAM6_WESTMERE_EX: @@ -4688,13 +4928,41 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = tfa_get_event_constraints; x86_pmu.enable_all = intel_tfa_pmu_enable_all; x86_pmu.commit_scheduling = intel_tfa_commit_scheduling; - intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr.attr; + intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr; } pr_cont("Skylake events, "); name = "skylake"; break; + case INTEL_FAM6_ICELAKE_MOBILE: + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_icl_event_constraints; + x86_pmu.pebs_constraints = intel_icl_pebs_event_constraints; + x86_pmu.extra_regs = intel_icl_extra_regs; + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = icl_get_event_constraints; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; + extra_attr = merge_attr(extra_attr, skl_format_attr); + x86_pmu.cpu_events = get_icl_events_attrs(); + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02); + x86_pmu.lbr_pt_coexist = true; + intel_pmu_pebs_data_source_skl(false); + pr_cont("Icelake events, "); + name = "icelake"; + break; + default: switch (x86_pmu.version) { case 1: diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 94a4b7fc75d0ecf344bade95be1cf563576250d2..6072f92cb8eaffbc141582ff56cc1c2ff840c37c 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -76,15 +76,15 @@ * Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. * perf code: 0x04 - * Available model: HSW ULT,CNL + * Available model: HSW ULT,KBL,CNL * Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. * perf code: 0x05 - * Available model: HSW ULT,CNL + * Available model: HSW ULT,KBL,CNL * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. 
* perf code: 0x06 - * Available model: HSW ULT,GLM,CNL + * Available model: HSW ULT,KBL,GLM,CNL * Scope: Package (physical package) * */ @@ -566,8 +566,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, hswult_cstates), X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_MOBILE, cnl_cstates), @@ -578,6 +578,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_X, glm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), + + X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_MOBILE, snb_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 10c99ce1feaddf5fa196bfbd385cbd02b55ef57a..7a9f5dac5abe4a5f391d7cb3cc18afb5a23ec77b 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -849,6 +849,26 @@ struct event_constraint intel_skl_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_icl_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x400000000ULL), /* SLOTS */ + + INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf), /* MEM_INST_RETIRED.LOAD */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf), /* MEM_INST_RETIRED.STORE */ + + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), /* MEM_LOAD_*_RETIRED.* */ + + INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */ + + /* + * Everything else is handled by PMU_FL_PEBS_ALL, because we + * need the full constraints from the main table. 
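Several hunks in this series replace the open-coded "(config & cmask) == code" test with constraint_match(), whose definition is outside this excerpt. Given the new *_CONSTRAINT_RANGE() entries above, it has to accept an event-code span as well as an exact code; a plausible shape, with the range field name being an assumption here:

    /* Sketch only, inferred from the call sites and the new range
     * constraints; 'size' (the span of accepted codes, 0 for classic
     * exact-match constraints) is assumed, not confirmed by this diff. */
    static inline bool constraint_match(struct event_constraint *c, u64 ecode)
    {
            return ((ecode & c->cmask) - c->code) <= (u64)c->size;
    }

The unsigned subtraction makes codes below the range wrap to a huge value, so a single comparison covers both bounds.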
+ */ + + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -858,7 +878,7 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) if (x86_pmu.pebs_constraints) { for_each_event_constraint(c, x86_pmu.pebs_constraints) { - if ((event->hw.config & c->cmask) == c->code) { + if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; } @@ -906,17 +926,87 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) if (cpuc->n_pebs == cpuc->n_large_pebs) { threshold = ds->pebs_absolute_maximum - - reserved * x86_pmu.pebs_record_size; + reserved * cpuc->pebs_record_size; } else { - threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; + threshold = ds->pebs_buffer_base + cpuc->pebs_record_size; } ds->pebs_interrupt_threshold = threshold; } +static void adaptive_pebs_record_size_update(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 pebs_data_cfg = cpuc->pebs_data_cfg; + int sz = sizeof(struct pebs_basic); + + if (pebs_data_cfg & PEBS_DATACFG_MEMINFO) + sz += sizeof(struct pebs_meminfo); + if (pebs_data_cfg & PEBS_DATACFG_GP) + sz += sizeof(struct pebs_gprs); + if (pebs_data_cfg & PEBS_DATACFG_XMMS) + sz += sizeof(struct pebs_xmm); + if (pebs_data_cfg & PEBS_DATACFG_LBRS) + sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry); + + cpuc->pebs_record_size = sz; +} + +#define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ + PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ + PERF_SAMPLE_TRANSACTION) + +static u64 pebs_update_adaptive_cfg(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + u64 sample_type = attr->sample_type; + u64 pebs_data_cfg = 0; + bool gprs, tsx_weight; + + if (!(sample_type & ~(PERF_SAMPLE_IP|PERF_SAMPLE_TIME)) && + attr->precise_ip > 1) + return pebs_data_cfg; + + if (sample_type & PERF_PEBS_MEMINFO_TYPE) + pebs_data_cfg |= PEBS_DATACFG_MEMINFO; + + /* + * We need GPRs when: + * + user requested them + * + precise_ip < 2 for the non event IP + * + For RTM TSX weight we need GPRs for the abort code. + */ + gprs = (sample_type & PERF_SAMPLE_REGS_INTR) && + (attr->sample_regs_intr & PEBS_GP_REGS); + + tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) && + ((attr->config & INTEL_ARCH_EVENT_MASK) == + x86_pmu.rtm_abort_event); + + if (gprs || (attr->precise_ip < 2) || tsx_weight) + pebs_data_cfg |= PEBS_DATACFG_GP; + + if ((sample_type & PERF_SAMPLE_REGS_INTR) && + (attr->sample_regs_intr & PEBS_XMM_REGS)) + pebs_data_cfg |= PEBS_DATACFG_XMMS; + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + /* + * For now always log all LBRs. Could configure this + * later. + */ + pebs_data_cfg |= PEBS_DATACFG_LBRS | + ((x86_pmu.lbr_nr-1) << PEBS_DATACFG_LBR_SHIFT); + } + + return pebs_data_cfg; +} + static void -pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu) +pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, + struct perf_event *event, bool add) { + struct pmu *pmu = event->ctx->pmu; /* * Make sure we get updated with the first PEBS * event. It will trigger also during removal, but @@ -933,6 +1023,29 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu) update = true; } + /* + * The PEBS record doesn't shrink on pmu::del(). Doing so would require + * iterating all remaining PEBS events to reconstruct the config. 
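+ * The superset config therefore only shrinks again when the first + * event of a fresh batch is added below (cpuc->n_pebs == 1).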
+ */ + if (x86_pmu.intel_cap.pebs_baseline && add) { + u64 pebs_data_cfg; + + /* Clear pebs_data_cfg and pebs_record_size for first PEBS. */ + if (cpuc->n_pebs == 1) { + cpuc->pebs_data_cfg = 0; + cpuc->pebs_record_size = sizeof(struct pebs_basic); + } + + pebs_data_cfg = pebs_update_adaptive_cfg(event); + + /* Update pebs_record_size if new event requires more data. */ + if (pebs_data_cfg & ~cpuc->pebs_data_cfg) { + cpuc->pebs_data_cfg |= pebs_data_cfg; + adaptive_pebs_record_size_update(); + update = true; + } + } + if (update) pebs_update_threshold(cpuc); } @@ -947,7 +1060,7 @@ void intel_pmu_pebs_add(struct perf_event *event) if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs++; - pebs_update_state(needed_cb, cpuc, event->ctx->pmu); + pebs_update_state(needed_cb, cpuc, event, true); } void intel_pmu_pebs_enable(struct perf_event *event) @@ -960,11 +1073,19 @@ void intel_pmu_pebs_enable(struct perf_event *event) cpuc->pebs_enabled |= 1ULL << hwc->idx; - if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5)) cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled |= 1ULL << 63; + if (x86_pmu.intel_cap.pebs_baseline) { + hwc->config |= ICL_EVENTSEL_ADAPTIVE; + if (cpuc->pebs_data_cfg != cpuc->active_pebs_data_cfg) { + wrmsrl(MSR_PEBS_DATA_CFG, cpuc->pebs_data_cfg); + cpuc->active_pebs_data_cfg = cpuc->pebs_data_cfg; + } + } + /* * Use auto-reload if possible to save a MSR write in the PMI. * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD. @@ -991,7 +1112,7 @@ void intel_pmu_pebs_del(struct perf_event *event) if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs--; - pebs_update_state(needed_cb, cpuc, event->ctx->pmu); + pebs_update_state(needed_cb, cpuc, event, false); } void intel_pmu_pebs_disable(struct perf_event *event) @@ -1004,7 +1125,8 @@ void intel_pmu_pebs_disable(struct perf_event *event) cpuc->pebs_enabled &= ~(1ULL << hwc->idx); - if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && + (x86_pmu.version < 5)) cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); @@ -1125,34 +1247,57 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 0; } -static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs) +static inline u64 intel_get_tsx_weight(u64 tsx_tuning) { - if (pebs->tsx_tuning) { - union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; + if (tsx_tuning) { + union hsw_tsx_tuning tsx = { .value = tsx_tuning }; return tsx.cycles_last_block; } return 0; } -static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs) +static inline u64 intel_get_tsx_transaction(u64 tsx_tuning, u64 ax) { - u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; + u64 txn = (tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; /* For RTM XABORTs also log the abort code from AX */ - if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) - txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; + if ((txn & PERF_TXN_TRANSACTION) && (ax & 1)) + txn |= ((ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; return txn; } -static void setup_pebs_sample_data(struct perf_event *event, - struct pt_regs *iregs, void *__pebs, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline u64 get_pebs_status(void *n) { + if (x86_pmu.intel_cap.pebs_format < 4) + return ((struct pebs_record_nhm 
*)n)->status; + return ((struct pebs_basic *)n)->applicable_counters; +} + #define PERF_X86_EVENT_PEBS_HSW_PREC \ (PERF_X86_EVENT_PEBS_ST_HSW | \ PERF_X86_EVENT_PEBS_LD_HSW | \ PERF_X86_EVENT_PEBS_NA_HSW) + +static u64 get_data_src(struct perf_event *event, u64 aux) +{ + u64 val = PERF_MEM_NA; + int fl = event->hw.flags; + bool fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); + + if (fl & PERF_X86_EVENT_PEBS_LDLAT) + val = load_latency_data(aux); + else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) + val = precise_datala_hsw(event, aux); + else if (fst) + val = precise_store_data(aux); + return val; +} + +static void setup_pebs_fixed_sample_data(struct perf_event *event, + struct pt_regs *iregs, void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) +{ /* * We cast to the biggest pebs_record but are careful not to * unconditionally access the 'extra' entries. @@ -1160,17 +1305,13 @@ static void setup_pebs_sample_data(struct perf_event *event, struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct pebs_record_skl *pebs = __pebs; u64 sample_type; - int fll, fst, dsrc; - int fl = event->hw.flags; + int fll; if (pebs == NULL) return; sample_type = event->attr.sample_type; - dsrc = sample_type & PERF_SAMPLE_DATA_SRC; - - fll = fl & PERF_X86_EVENT_PEBS_LDLAT; - fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); + fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT; perf_sample_data_init(data, 0, event->hw.last_period); @@ -1185,16 +1326,8 @@ static void setup_pebs_sample_data(struct perf_event *event, /* * data.data_src encodes the data source */ - if (dsrc) { - u64 val = PERF_MEM_NA; - if (fll) - val = load_latency_data(pebs->dse); - else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) - val = precise_datala_hsw(event, pebs->dse); - else if (fst) - val = precise_store_data(pebs->dse); - data->data_src.val = val; - } + if (sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = get_data_src(event, pebs->dse); /* * We must however always use iregs for the unwinder to stay sane; the @@ -1281,10 +1414,11 @@ static void setup_pebs_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data->weight = intel_hsw_weight(pebs); + data->weight = intel_get_tsx_weight(pebs->tsx_tuning); if (sample_type & PERF_SAMPLE_TRANSACTION) - data->txn = intel_hsw_transaction(pebs); + data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, + pebs->ax); } /* @@ -1301,6 +1435,140 @@ static void setup_pebs_sample_data(struct perf_event *event, data->br_stack = &cpuc->lbr_stack; } +static void adaptive_pebs_save_regs(struct pt_regs *regs, + struct pebs_gprs *gprs) +{ + regs->ax = gprs->ax; + regs->bx = gprs->bx; + regs->cx = gprs->cx; + regs->dx = gprs->dx; + regs->si = gprs->si; + regs->di = gprs->di; + regs->bp = gprs->bp; + regs->sp = gprs->sp; +#ifndef CONFIG_X86_32 + regs->r8 = gprs->r8; + regs->r9 = gprs->r9; + regs->r10 = gprs->r10; + regs->r11 = gprs->r11; + regs->r12 = gprs->r12; + regs->r13 = gprs->r13; + regs->r14 = gprs->r14; + regs->r15 = gprs->r15; +#endif +} + +/* + * With adaptive PEBS the layout depends on what fields are configured. 
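+ * The optional groups always follow struct pebs_basic in a fixed order: + * meminfo, then GPRs, then XMM registers, then LBR entries, which is the + * order in which next_record is advanced below.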
+ */ + +static void setup_pebs_adaptive_sample_data(struct perf_event *event, + struct pt_regs *iregs, void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct pebs_basic *basic = __pebs; + void *next_record = basic + 1; + u64 sample_type; + u64 format_size; + struct pebs_meminfo *meminfo = NULL; + struct pebs_gprs *gprs = NULL; + struct x86_perf_regs *perf_regs; + + if (basic == NULL) + return; + + perf_regs = container_of(regs, struct x86_perf_regs, regs); + perf_regs->xmm_regs = NULL; + + sample_type = event->attr.sample_type; + format_size = basic->format_size; + perf_sample_data_init(data, 0, event->hw.last_period); + data->period = event->hw.last_period; + + if (event->attr.use_clockid == 0) + data->time = native_sched_clock_from_tsc(basic->tsc); + + /* + * We must however always use iregs for the unwinder to stay sane; the + * record BP,SP,IP can point into thin air when the record is from a + * previous PMI context or an (I)RET happened between the record and + * PMI. + */ + if (sample_type & PERF_SAMPLE_CALLCHAIN) + data->callchain = perf_callchain(event, iregs); + + *regs = *iregs; + /* The ip in basic is EventingIP */ + set_linear_ip(regs, basic->ip); + regs->flags = PERF_EFLAGS_EXACT; + + /* + * The record for MEMINFO is in front of GP, + * but PERF_SAMPLE_TRANSACTION needs gprs->ax. + * Save the pointer here and process it later. + */ + if (format_size & PEBS_DATACFG_MEMINFO) { + meminfo = next_record; + next_record = meminfo + 1; + } + + if (format_size & PEBS_DATACFG_GP) { + gprs = next_record; + next_record = gprs + 1; + + if (event->attr.precise_ip < 2) { + set_linear_ip(regs, gprs->ip); + regs->flags &= ~PERF_EFLAGS_EXACT; + } + + if (sample_type & PERF_SAMPLE_REGS_INTR) + adaptive_pebs_save_regs(regs, gprs); + } + + if (format_size & PEBS_DATACFG_MEMINFO) { + if (sample_type & PERF_SAMPLE_WEIGHT) + data->weight = meminfo->latency ?: + intel_get_tsx_weight(meminfo->tsx_tuning); + + if (sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = get_data_src(event, meminfo->aux); + + if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) + data->addr = meminfo->address; + + if (sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, + gprs ?
gprs->ax : 0); + } + + if (format_size & PEBS_DATACFG_XMMS) { + struct pebs_xmm *xmm = next_record; + + next_record = xmm + 1; + perf_regs->xmm_regs = xmm->xmm; + } + + if (format_size & PEBS_DATACFG_LBRS) { + struct pebs_lbr *lbr = next_record; + int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) + & 0xff) + 1; + next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry); + + if (has_branch_stack(event)) { + intel_pmu_store_pebs_lbrs(lbr); + data->br_stack = &cpuc->lbr_stack; + } + } + + WARN_ONCE(next_record != __pebs + (format_size >> 48), + "PEBS record size %llu, expected %llu, config %llx\n", + format_size >> 48, + (u64)(next_record - __pebs), + basic->format_size); +} + static inline void * get_next_pebs_record_by_bit(void *base, void *top, int bit) { @@ -1318,19 +1586,19 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) if (base == NULL) return NULL; - for (at = base; at < top; at += x86_pmu.pebs_record_size) { - struct pebs_record_nhm *p = at; + for (at = base; at < top; at += cpuc->pebs_record_size) { + unsigned long status = get_pebs_status(at); - if (test_bit(bit, (unsigned long *)&p->status)) { + if (test_bit(bit, (unsigned long *)&status)) { /* PEBS v3 has accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) return at; - if (p->status == (1 << bit)) + if (status == (1 << bit)) return at; /* clear non-PEBS bit and re-check */ - pebs_status = p->status & cpuc->pebs_enabled; + pebs_status = status & cpuc->pebs_enabled; pebs_status &= PEBS_COUNTER_MASK; if (pebs_status == (1 << bit)) return at; @@ -1410,11 +1678,18 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) static void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, void *base, void *top, - int bit, int count) + int bit, int count, + void (*setup_sample)(struct perf_event *, + struct pt_regs *, + void *, + struct perf_sample_data *, + struct pt_regs *)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; struct perf_sample_data data; - struct pt_regs regs; + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; void *at = get_next_pebs_record_by_bit(base, top, bit); if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { @@ -1429,20 +1704,20 @@ static void __intel_pmu_pebs_event(struct perf_event *event, return; while (count > 1) { - setup_pebs_sample_data(event, iregs, at, &data, ®s); - perf_event_output(event, &data, ®s); - at += x86_pmu.pebs_record_size; + setup_sample(event, iregs, at, &data, regs); + perf_event_output(event, &data, regs); + at += cpuc->pebs_record_size; at = get_next_pebs_record_by_bit(at, top, bit); count--; } - setup_pebs_sample_data(event, iregs, at, &data, ®s); + setup_sample(event, iregs, at, &data, regs); /* * All but the last records are processed. * The last one is left to be able to call the overflow handler. */ - if (perf_event_overflow(event, &data, ®s)) { + if (perf_event_overflow(event, &data, regs)) { x86_pmu_stop(event, 0); return; } @@ -1483,7 +1758,27 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) return; } - __intel_pmu_pebs_event(event, iregs, at, top, 0, n); + __intel_pmu_pebs_event(event, iregs, at, top, 0, n, + setup_pebs_fixed_sample_data); +} + +static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) +{ + struct perf_event *event; + int bit; + + /* + * The drain_pebs() could be called twice in a short period + * for auto-reload event in pmu::read(). 
No overflows have + * happened in between. + * It needs to call intel_pmu_save_and_restart_reload() to + * update the event->count for this case. + */ + for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) { + event = cpuc->events[bit]; + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) + intel_pmu_save_and_restart_reload(event, 0); + } } static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) @@ -1513,19 +1808,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) } if (unlikely(base >= top)) { - /* - * The drain_pebs() could be called twice in a short period - * for auto-reload event in pmu::read(). There are no - * overflows have happened in between. - * It needs to call intel_pmu_save_and_restart_reload() to - * update the event->count for this case. - */ - for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, - size) { - event = cpuc->events[bit]; - if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) - intel_pmu_save_and_restart_reload(event, 0); - } + intel_pmu_pebs_event_update_no_drain(cpuc, size); return; } @@ -1538,8 +1821,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) /* PEBS v3 has more accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) { - for_each_set_bit(bit, (unsigned long *)&pebs_status, - size) + for_each_set_bit(bit, (unsigned long *)&pebs_status, size) counts[bit]++; continue; @@ -1578,8 +1860,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * If collision happened, the record will be dropped. */ if (p->status != (1ULL << bit)) { - for_each_set_bit(i, (unsigned long *)&pebs_status, - x86_pmu.max_pebs_events) + for_each_set_bit(i, (unsigned long *)&pebs_status, size) error[i]++; continue; } @@ -1587,7 +1868,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) counts[bit]++; } - for (bit = 0; bit < size; bit++) { + for_each_set_bit(bit, (unsigned long *)&mask, size) { if ((counts[bit] == 0) && (error[bit] == 0)) continue; @@ -1608,11 +1889,66 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) if (counts[bit]) { __intel_pmu_pebs_event(event, iregs, base, - top, bit, counts[bit]); + top, bit, counts[bit], + setup_pebs_fixed_sample_data); } } } +static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs) +{ + short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct perf_event *event; + void *base, *at, *top; + int bit, size; + u64 mask; + + if (!x86_pmu.pebs_active) + return; + + base = (struct pebs_basic *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_basic *)(unsigned long)ds->pebs_index; + + ds->pebs_index = ds->pebs_buffer_base; + + mask = ((1ULL << x86_pmu.max_pebs_events) - 1) | + (((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED); + size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed; + + if (unlikely(base >= top)) { + intel_pmu_pebs_event_update_no_drain(cpuc, size); + return; + } + + for (at = base; at < top; at += cpuc->pebs_record_size) { + u64 pebs_status; + + pebs_status = get_pebs_status(at) & cpuc->pebs_enabled; + pebs_status &= mask; + + for_each_set_bit(bit, (unsigned long *)&pebs_status, size) + counts[bit]++; + } + + for_each_set_bit(bit, (unsigned long *)&mask, size) { + if (counts[bit] == 0) + continue; + + event = cpuc->events[bit]; + if (WARN_ON_ONCE(!event)) + continue; + + if (WARN_ON_ONCE(!event->attr.precise_ip)) + continue; + + __intel_pmu_pebs_event(event, iregs, base, + top, bit,
counts[bit], + setup_pebs_adaptive_sample_data); + } +} + /* * BTS, PEBS probe and setup */ @@ -1628,12 +1964,18 @@ void __init intel_ds_init(void) x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; - if (x86_pmu.version <= 4) + if (x86_pmu.version <= 4) { x86_pmu.pebs_no_isolation = 1; + x86_pmu.pebs_no_xmm_regs = 1; + } if (x86_pmu.pebs) { char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; + char *pebs_qual = ""; int format = x86_pmu.intel_cap.pebs_format; + if (format < 4) + x86_pmu.intel_cap.pebs_baseline = 0; + switch (format) { case 0: pr_cont("PEBS fmt0%c, ", pebs_type); @@ -1669,6 +2011,29 @@ void __init intel_ds_init(void) x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; + case 4: + x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl; + x86_pmu.pebs_record_size = sizeof(struct pebs_basic); + if (x86_pmu.intel_cap.pebs_baseline) { + x86_pmu.large_pebs_flags |= + PERF_SAMPLE_BRANCH_STACK | + PERF_SAMPLE_TIME; + x86_pmu.flags |= PMU_FL_PEBS_ALL; + pebs_qual = "-baseline"; + } else { + /* Only basic record supported */ + x86_pmu.pebs_no_xmm_regs = 1; + x86_pmu.large_pebs_flags &= + ~(PERF_SAMPLE_ADDR | + PERF_SAMPLE_TIME | + PERF_SAMPLE_DATA_SRC | + PERF_SAMPLE_TRANSACTION | + PERF_SAMPLE_REGS_USER | + PERF_SAMPLE_REGS_INTR); + } + pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); + break; + default: pr_cont("no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 580c1b91c454024cf6062b8c1013ac1f8a1d5e5a..6f814a27416b4268b94f0d83a69b022548c226e3 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -488,6 +488,8 @@ void intel_pmu_lbr_add(struct perf_event *event) * be 'new'. Conversely, a new event can get installed through the * context switch path for the first time. */ + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0) + cpuc->lbr_pebs_users++; perf_sched_cb_inc(event->ctx->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) intel_pmu_lbr_reset(); @@ -507,8 +509,11 @@ void intel_pmu_lbr_del(struct perf_event *event) task_ctx->lbr_callstack_users--; } + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0) + cpuc->lbr_pebs_users--; cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); + WARN_ON_ONCE(cpuc->lbr_pebs_users < 0); perf_sched_cb_dec(event->ctx->pmu); } @@ -658,7 +663,13 @@ void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (!cpuc->lbr_users) + /* + * Don't read when all LBR users are using adaptive PEBS. + * + * This could be smarter and actually check the event, + * but this simple approach seems to work for now.
+ */ + if (!cpuc->lbr_users || cpuc->lbr_users == cpuc->lbr_pebs_users) return; if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) @@ -1080,6 +1091,28 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) } } +void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int i; + + cpuc->lbr_stack.nr = x86_pmu.lbr_nr; + for (i = 0; i < x86_pmu.lbr_nr; i++) { + u64 info = lbr->lbr[i].info; + struct perf_branch_entry *e = &cpuc->lbr_entries[i]; + + e->from = lbr->lbr[i].from; + e->to = lbr->lbr[i].to; + e->mispred = !!(info & LBR_INFO_MISPRED); + e->predicted = !(info & LBR_INFO_MISPRED); + e->in_tx = !!(info & LBR_INFO_IN_TX); + e->abort = !!(info & LBR_INFO_ABORT); + e->cycles = info & LBR_INFO_CYCLES; + e->reserved = 0; + } + intel_pmu_lbr_filter(cpuc); +} + /* * Map interface branch filters onto LBR filters */ diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index fb3a2f13fc709256e81719a229d5d3fdcde6e430..339d7628080cf2d83bcff2f17db305f0e29ffec7 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1525,8 +1525,7 @@ static __init int pt_init(void) } if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) - pt_pmu.pmu.capabilities = - PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; + pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG; pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; pt_pmu.pmu.attr_groups = pt_attr_groups; diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 94dc564146ca89190cc203d2b78e6651b4742f19..37ebf6fc5415b4f89e881318e27924216cf2e0d1 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -775,6 +775,8 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init), + + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, skl_rapl_init), {}, }; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 9fe64c01a2e5a9572386352669e0c03e833b1a61..fc40a1473058e94f793b211dfa14ebf74a05ce47 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1367,6 +1367,11 @@ static const struct intel_uncore_init_fun skx_uncore_init __initconst = { .pci_init = skx_uncore_pci_init, }; +static const struct intel_uncore_init_fun icl_uncore_init __initconst = { + .cpu_init = icl_uncore_cpu_init, + .pci_init = skl_uncore_pci_init, +}; + static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init), @@ -1393,6 +1398,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, icl_uncore_init), {}, }; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 853a49a8ccf6748024e7da090c8e01c0c8edaeb1..79eb2e21e4f043cc6a18a43b1fd998dcd750b594 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -512,6 +512,7 @@ int skl_uncore_pci_init(void); void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); +void icl_uncore_cpu_init(void); int 
snb_pci2phy_map_init(int devid); /* uncore_snbep.c */ diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 13493f43b24739928c006fb0dc1fe600f21ac9a9..f8431819b3e122b279c5e87af82f29214885f59e 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -34,6 +34,8 @@ #define PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC 0x3e33 #define PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC 0x3eca #define PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC 0x3e32 +#define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02 +#define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -93,6 +95,12 @@ #define SKL_UNC_PERF_GLOBAL_CTL 0xe01 #define SKL_UNC_GLOBAL_CTL_CORE_ALL ((1 << 5) - 1) +/* ICL Cbo register */ +#define ICL_UNC_CBO_CONFIG 0x396 +#define ICL_UNC_NUM_CBO_MASK 0xf +#define ICL_UNC_CBO_0_PER_CTR0 0x702 +#define ICL_UNC_CBO_MSR_OFFSET 0x8 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); @@ -280,6 +288,70 @@ void skl_uncore_cpu_init(void) snb_uncore_arb.ops = &skl_uncore_msr_ops; } +static struct intel_uncore_type icl_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .perf_ctr_bits = 44, + .perf_ctr = ICL_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = ICL_UNC_CBO_MSR_OFFSET, + .ops = &skl_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + +static struct uncore_event_desc icl_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff"), + { /* end: all zeroes */ }, +}; + +static struct attribute *icl_uncore_clock_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group icl_uncore_clock_format_group = { + .name = "format", + .attrs = icl_uncore_clock_formats_attr, +}; + +static struct intel_uncore_type icl_uncore_clockbox = { + .name = "clock", + .num_counters = 1, + .num_boxes = 1, + .fixed_ctr_bits = 48, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_CTL_EV_SEL_MASK, + .format_group = &icl_uncore_clock_format_group, + .ops = &skl_uncore_msr_ops, + .event_descs = icl_uncore_events, +}; + +static struct intel_uncore_type *icl_msr_uncores[] = { + &icl_uncore_cbox, + &snb_uncore_arb, + &icl_uncore_clockbox, + NULL, +}; + +static int icl_get_cbox_num(void) +{ + u64 num_boxes; + + rdmsrl(ICL_UNC_CBO_CONFIG, num_boxes); + + return num_boxes & ICL_UNC_NUM_CBO_MASK; +} + +void icl_uncore_cpu_init(void) +{ + uncore_msr_uncores = icl_msr_uncores; + icl_uncore_cbox.num_boxes = icl_get_cbox_num(); + snb_uncore_arb.ops = &skl_uncore_msr_ops; +} + enum { SNB_PCI_UNCORE_IMC, }; @@ -668,6 +740,18 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { { /* end: all zeroes */ }, }; +static const struct pci_device_id icl_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + static struct pci_driver snb_uncore_pci_driver = { .name = "snb_uncore", .id_table = snb_uncore_pci_ids, @@ -693,6 +777,11 @@ static struct pci_driver skl_uncore_pci_driver = { .id_table = skl_uncore_pci_ids, }; +static struct pci_driver icl_uncore_pci_driver = { + 
.name = "icl_uncore", + .id_table = icl_uncore_pci_ids, +}; + struct imc_uncore_pci_dev { __u32 pci_id; struct pci_driver *driver; @@ -732,6 +821,8 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(CFL_4S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Server */ IMC_DEV(CFL_6S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Server */ IMC_DEV(CFL_8S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Server */ + IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ + IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ { /* end marker */ } }; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index a878e6286e4afa0a6840d90d84f1386ee4934605..f3f4c2263501d1e6dc2390d0b4f404c487e33691 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -89,6 +89,7 @@ static bool test_intel(int idx) case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_KABYLAKE_MOBILE: case INTEL_FAM6_KABYLAKE_DESKTOP: + case INTEL_FAM6_ICELAKE_MOBILE: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index a75955741c50422b9894d454c1a33ec7c9790a77..07fc84bb85c1e9e85138cd9205045215e1d0d528 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -49,28 +49,33 @@ struct event_constraint { unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; u64 idxmsk64; }; - u64 code; - u64 cmask; - int weight; - int overlap; - int flags; + u64 code; + u64 cmask; + int weight; + int overlap; + int flags; + unsigned int size; }; + +static inline bool constraint_match(struct event_constraint *c, u64 ecode) +{ + return ((ecode & c->cmask) - c->code) <= (u64)c->size; +} + /* * struct hw_perf_event.flags flags */ #define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ #define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ #define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ -#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x0800 /* use large PEBS */ - +#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ +#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */ +#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ struct amd_nb { int nb_id; /* NorthBridge id */ @@ -96,25 +101,43 @@ struct amd_nb { PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ PERF_SAMPLE_PERIOD) -#define PEBS_REGS \ - (PERF_REG_X86_AX | \ - PERF_REG_X86_BX | \ - PERF_REG_X86_CX | \ - PERF_REG_X86_DX | \ - PERF_REG_X86_DI | \ - PERF_REG_X86_SI | \ - 
PERF_REG_X86_SP | \ - PERF_REG_X86_BP | \ - PERF_REG_X86_IP | \ - PERF_REG_X86_FLAGS | \ - PERF_REG_X86_R8 | \ - PERF_REG_X86_R9 | \ - PERF_REG_X86_R10 | \ - PERF_REG_X86_R11 | \ - PERF_REG_X86_R12 | \ - PERF_REG_X86_R13 | \ - PERF_REG_X86_R14 | \ - PERF_REG_X86_R15) +#define PEBS_GP_REGS \ + ((1ULL << PERF_REG_X86_AX) | \ + (1ULL << PERF_REG_X86_BX) | \ + (1ULL << PERF_REG_X86_CX) | \ + (1ULL << PERF_REG_X86_DX) | \ + (1ULL << PERF_REG_X86_DI) | \ + (1ULL << PERF_REG_X86_SI) | \ + (1ULL << PERF_REG_X86_SP) | \ + (1ULL << PERF_REG_X86_BP) | \ + (1ULL << PERF_REG_X86_IP) | \ + (1ULL << PERF_REG_X86_FLAGS) | \ + (1ULL << PERF_REG_X86_R8) | \ + (1ULL << PERF_REG_X86_R9) | \ + (1ULL << PERF_REG_X86_R10) | \ + (1ULL << PERF_REG_X86_R11) | \ + (1ULL << PERF_REG_X86_R12) | \ + (1ULL << PERF_REG_X86_R13) | \ + (1ULL << PERF_REG_X86_R14) | \ + (1ULL << PERF_REG_X86_R15)) + +#define PEBS_XMM_REGS \ + ((1ULL << PERF_REG_X86_XMM0) | \ + (1ULL << PERF_REG_X86_XMM1) | \ + (1ULL << PERF_REG_X86_XMM2) | \ + (1ULL << PERF_REG_X86_XMM3) | \ + (1ULL << PERF_REG_X86_XMM4) | \ + (1ULL << PERF_REG_X86_XMM5) | \ + (1ULL << PERF_REG_X86_XMM6) | \ + (1ULL << PERF_REG_X86_XMM7) | \ + (1ULL << PERF_REG_X86_XMM8) | \ + (1ULL << PERF_REG_X86_XMM9) | \ + (1ULL << PERF_REG_X86_XMM10) | \ + (1ULL << PERF_REG_X86_XMM11) | \ + (1ULL << PERF_REG_X86_XMM12) | \ + (1ULL << PERF_REG_X86_XMM13) | \ + (1ULL << PERF_REG_X86_XMM14) | \ + (1ULL << PERF_REG_X86_XMM15)) /* * Per register state. @@ -207,10 +230,16 @@ struct cpu_hw_events { int n_pebs; int n_large_pebs; + /* Current superset of the events' hardware configuration */ + u64 pebs_data_cfg; + u64 active_pebs_data_cfg; + int pebs_record_size; + /* * Intel LBR bits */ int lbr_users; + int lbr_pebs_users; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; struct er_account *lbr_sel; @@ -257,18 +286,29 @@ struct cpu_hw_events { void *kfree_on_online[X86_PERF_KFREE_MAX]; }; -#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ +#define __EVENT_CONSTRAINT_RANGE(c, e, n, m, w, o, f) { \ { .idxmsk64 = (n) }, \ .code = (c), \ + .size = (e) - (c), \ .cmask = (m), \ .weight = (w), \ .overlap = (o), \ .flags = f, \ } +#define __EVENT_CONSTRAINT(c, n, m, w, o, f) \ + __EVENT_CONSTRAINT_RANGE(c, c, n, m, w, o, f) + #define EVENT_CONSTRAINT(c, n, m) \ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) +/* + * The constraint_match() function only works for 'simple' event codes + * and not for extended (AMD64_EVENTSEL_EVENT) event codes.
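+ * + * For example, the Icelake PEBS table uses + * INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf): that stores + * code = 0xd1 and size = 0x3, so constraint_match() accepts any event code in + * [0xd1, 0xd4], while the unsigned comparison ((ecode & cmask) - code) <= size + * rejects everything outside the range.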
+ */ +#define EVENT_CONSTRAINT_RANGE(c, e, n, m) \ + __EVENT_CONSTRAINT_RANGE(c, e, n, m, HWEIGHT(n), 0, 0) + #define INTEL_EXCLEVT_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\ 0, PERF_X86_EVENT_EXCL) @@ -303,6 +343,12 @@ struct cpu_hw_events { #define INTEL_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) +/* + * Constraint on a range of Event codes + */ +#define INTEL_EVENT_CONSTRAINT_RANGE(c, e, n) \ + EVENT_CONSTRAINT_RANGE(c, e, n, ARCH_PERFMON_EVENTSEL_EVENT) + /* * Constraint on the Event code + UMask + fixed-mask * @@ -350,6 +396,9 @@ struct cpu_hw_events { #define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) +#define INTEL_FLAGS_EVENT_CONSTRAINT_RANGE(c, e, n) \ + EVENT_CONSTRAINT_RANGE(c, e, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + /* Check only flags, but allow all event/umask */ #define INTEL_ALL_EVENT_CONSTRAINT(code, n) \ EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS) @@ -366,6 +415,11 @@ struct cpu_hw_events { ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) +#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(code, end, n) \ + __EVENT_CONSTRAINT_RANGE(code, end, n, \ + ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) + #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \ __EVENT_CONSTRAINT(code, n, \ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ @@ -473,6 +527,7 @@ union perf_capabilities { * values > 32bit. */ u64 full_width_write:1; + u64 pebs_baseline:1; }; u64 capabilities; }; @@ -613,14 +668,16 @@ struct x86_pmu { pebs_broken :1, pebs_prec_dist :1, pebs_no_tlb :1, - pebs_no_isolation :1; + pebs_no_isolation :1, + pebs_no_xmm_regs :1; int pebs_record_size; int pebs_buffer_size; + int max_pebs_events; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); - int max_pebs_events; unsigned long large_pebs_flags; + u64 rtm_abort_event; /* * Intel LBR @@ -714,6 +771,7 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \ .event_str_ht = ht, \ } +struct pmu *x86_get_pmu(void); extern struct x86_pmu x86_pmu __read_mostly; static inline bool x86_pmu_has_lbr_callstack(void) @@ -941,6 +999,8 @@ extern struct event_constraint intel_bdw_pebs_event_constraints[]; extern struct event_constraint intel_skl_pebs_event_constraints[]; +extern struct event_constraint intel_icl_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_add(struct perf_event *event); @@ -959,6 +1019,8 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); void intel_pmu_auto_reload_read(struct perf_event *event); +void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); + void intel_ds_init(void); void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 8eb6fbee8e135fdc9861c7a998fcbb11ca4e4077..5c056b8aebefc83468729d0a8bba6e171914c21e 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -86,6 +86,11 @@ static void hv_apic_write(u32 reg, u32 val) static void hv_apic_eoi_write(u32 reg, u32 val) { + struct hv_vp_assist_page *hvp = hv_vp_assist_page[smp_processor_id()]; + + if (hvp && (xchg(&hvp->apic_assist, 0) & 0x1)) + return; + wrmsr(HV_X64_MSR_EOI, val, 0); } diff --git 
a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c index a861b0456b1a7a0bd1e71bb2e948f169ae9a8a09..07f21a06392fb9940c28025e654d66c2740846f1 100644 --- a/arch/x86/hyperv/hv_spinlock.c +++ b/arch/x86/hyperv/hv_spinlock.c @@ -56,7 +56,7 @@ static void hv_qlock_wait(u8 *byte, u8 val) /* * Hyper-V does not support this so far. */ -bool hv_vcpu_is_preempted(int vcpu) +__visible bool hv_vcpu_is_preempted(int vcpu) { return false; } diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 321fe5f5d0e96f8ed3f4962dbf982bc60551cf0e..4d5fcd47ab75a4e2815f2ed381b9356b3c18e7d1 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -61,9 +61,8 @@ } while (0) #define RELOAD_SEG(seg) { \ - unsigned int pre = GET_SEG(seg); \ + unsigned int pre = (seg) | 3; \ unsigned int cur = get_user_seg(seg); \ - pre |= 3; \ if (pre != cur) \ set_user_seg(seg, pre); \ } @@ -72,6 +71,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_32 __user *sc) { unsigned int tmpflags, err = 0; + u16 gs, fs, es, ds; void __user *buf; u32 tmp; @@ -79,16 +79,10 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; get_user_try { - /* - * Reload fs and gs if they have changed in the signal - * handler. This does not handle long fs/gs base changes in - * the handler, but does not clobber them at least in the - * normal case. - */ - RELOAD_SEG(gs); - RELOAD_SEG(fs); - RELOAD_SEG(ds); - RELOAD_SEG(es); + gs = GET_SEG(gs); + fs = GET_SEG(fs); + ds = GET_SEG(ds); + es = GET_SEG(es); COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); COPY(ax); @@ -106,6 +100,17 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, buf = compat_ptr(tmp); } get_user_catch(err); + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. + */ + RELOAD_SEG(gs); + RELOAD_SEG(fs); + RELOAD_SEG(ds); + RELOAD_SEG(es); + err |= fpu__restore_sig(buf, 1); force_iret(); diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index a0ab9ab61c754c6a413744bd54eb9599bfbf5dc1..eebd05942e6ca6a87c8b8e5b6aad95a91cf3ab39 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -11,3 +11,4 @@ generic-y += early_ioremap.h generic-y += export.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 31b627b43a8e01933d6209e746f4c08912d0cdef..464034db299f781104da5f05a6a3604320f8d4d7 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -19,6 +19,17 @@ .endm #endif +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +.macro ANNOTATE_IGNORE_ALTERNATIVE + .Lannotate_\@: + .pushsection .discard.ignore_alts + .long .Lannotate_\@ - . + .popsection +.endm + /* * Issue one struct alt_instr descriptor entry (need to put it into * the section .altinstructions, see below). 
This entry contains diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4c74073a19ccd4b2aa93078c734729e54e65bdd7..094fbc9c0b1c0332a267fb501fb037d3064a85ed 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -45,6 +45,16 @@ #define LOCK_PREFIX "" #endif +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +#define ANNOTATE_IGNORE_ALTERNATIVE \ + "999:\n\t" \ + ".pushsection .discard.ignore_alts\n\t" \ + ".long 999b - .\n\t" \ + ".popsection\n\t" + struct alt_instr { s32 instr_offset; /* original instruction */ s32 repl_offset; /* offset to replacement instruction */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 6467757bb39f6b6622c0121fe40f9f6fbcfd0b39..3ff577c0b1024af1ed0e0fc9f2805f5a8ba5e804 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -148,30 +148,6 @@ _ASM_PTR (entry); \ .popsection -.macro ALIGN_DESTINATION - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - .section .fixup,"ax" -103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE_UA(100b, 103b) - _ASM_EXTABLE_UA(101b, 103b) - .endm - #else # define _EXPAND_EXTABLE_HANDLE(x) #x # define _ASM_EXTABLE_HANDLE(from, to, handler) \ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index d153d570bb04755d9fb106e3375db55dd3114fd7..8e790ec219a5fd5be0e812736ff7be167a5cd20e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -36,16 +36,17 @@ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). */ -#define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) +#define RLONG_ADDR(x) "m" (*(volatile long *) (x)) +#define WBYTE_ADDR(x) "+m" (*(volatile char *) (x)) -#define ADDR BITOP_ADDR(addr) +#define ADDR RLONG_ADDR(addr) /* * We do the locked ops that don't return the old value as * a mask operation on a byte. 
*/ #define IS_IMMEDIATE(nr) (__builtin_constant_p(nr)) -#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3)) +#define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3)) #define CONST_MASK(nr) (1 << ((nr) & 7)) /** @@ -73,7 +74,7 @@ set_bit(long nr, volatile unsigned long *addr) : "memory"); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" - : BITOP_ADDR(addr) : "Ir" (nr) : "memory"); + : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } @@ -88,7 +89,7 @@ set_bit(long nr, volatile unsigned long *addr) */ static __always_inline void __set_bit(long nr, volatile unsigned long *addr) { - asm volatile(__ASM_SIZE(bts) " %1,%0" : ADDR : "Ir" (nr) : "memory"); + asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } /** @@ -110,8 +111,7 @@ clear_bit(long nr, volatile unsigned long *addr) : "iq" ((u8)~CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" - : BITOP_ADDR(addr) - : "Ir" (nr)); + : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } @@ -131,7 +131,7 @@ static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *ad static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) { - asm volatile(__ASM_SIZE(btr) " %1,%0" : ADDR : "Ir" (nr)); + asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) @@ -139,7 +139,7 @@ static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile bool negative; asm volatile(LOCK_PREFIX "andb %2,%1" CC_SET(s) - : CC_OUT(s) (negative), ADDR + : CC_OUT(s) (negative), WBYTE_ADDR(addr) : "ir" ((char) ~(1 << nr)) : "memory"); return negative; } @@ -155,13 +155,9 @@ static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile * __clear_bit() is non-atomic and implies release semantics before the memory * operation. It can be used for an unlock if no other CPUs can concurrently * modify other bits in the word. - * - * No memory barrier is required here, because x86 cannot reorder stores past - * older loads. Same principle as spin_unlock. 
*/ static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { - barrier(); __clear_bit(nr, addr); } @@ -176,7 +172,7 @@ static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long * */ static __always_inline void __change_bit(long nr, volatile unsigned long *addr) { - asm volatile(__ASM_SIZE(btc) " %1,%0" : ADDR : "Ir" (nr)); + asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } /** @@ -196,8 +192,7 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr) : "iq" ((u8)CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" - : BITOP_ADDR(addr) - : "Ir" (nr)); + : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } @@ -242,8 +237,8 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * asm(__ASM_SIZE(bts) " %2,%1" CC_SET(c) - : CC_OUT(c) (oldbit), ADDR - : "Ir" (nr)); + : CC_OUT(c) (oldbit) + : ADDR, "Ir" (nr) : "memory"); return oldbit; } @@ -282,8 +277,8 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long asm volatile(__ASM_SIZE(btr) " %2,%1" CC_SET(c) - : CC_OUT(c) (oldbit), ADDR - : "Ir" (nr)); + : CC_OUT(c) (oldbit) + : ADDR, "Ir" (nr) : "memory"); return oldbit; } @@ -294,8 +289,8 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon asm volatile(__ASM_SIZE(btc) " %2,%1" CC_SET(c) - : CC_OUT(c) (oldbit), ADDR - : "Ir" (nr) : "memory"); + : CC_OUT(c) (oldbit) + : ADDR, "Ir" (nr) : "memory"); return oldbit; } @@ -326,7 +321,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l asm volatile(__ASM_SIZE(bt) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) - : "m" (*(unsigned long *)addr), "Ir" (nr)); + : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory"); return oldbit; } diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 29c70641544355ffb9b6947db066951311b868b3..cff3f3f3bfe0895a4e6c78b515b86c29b54cb2ca 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -7,6 +7,64 @@ #include #include +#ifdef CONFIG_X86_64 + +/* Macro to enforce the same ordering and stack sizes */ +#define ESTACKS_MEMBERS(guardsize, db2_holesize)\ + char DF_stack_guard[guardsize]; \ + char DF_stack[EXCEPTION_STKSZ]; \ + char NMI_stack_guard[guardsize]; \ + char NMI_stack[EXCEPTION_STKSZ]; \ + char DB2_stack_guard[guardsize]; \ + char DB2_stack[db2_holesize]; \ + char DB1_stack_guard[guardsize]; \ + char DB1_stack[EXCEPTION_STKSZ]; \ + char DB_stack_guard[guardsize]; \ + char DB_stack[EXCEPTION_STKSZ]; \ + char MCE_stack_guard[guardsize]; \ + char MCE_stack[EXCEPTION_STKSZ]; \ + char IST_top_guard[guardsize]; \ + +/* The exception stacks' physical storage. No guard pages required */ +struct exception_stacks { + ESTACKS_MEMBERS(0, 0) +}; + +/* The effective cpu entry area mapping with guard pages. 
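+ * Note that DB2 is only a hole in the physical backing store (struct + * exception_stacks instantiates it with db2_holesize == 0), while the CEA + * mapping reserves a full EXCEPTION_STKSZ sized slot for it.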
*/ +struct cea_exception_stacks { + ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ) +}; + +/* + * The exception stack ordering in [cea_]exception_stacks + */ +enum exception_stack_ordering { + ESTACK_DF, + ESTACK_NMI, + ESTACK_DB2, + ESTACK_DB1, + ESTACK_DB, + ESTACK_MCE, + N_EXCEPTION_STACKS +}; + +#define CEA_ESTACK_SIZE(st) \ + sizeof(((struct cea_exception_stacks *)0)->st## _stack) + +#define CEA_ESTACK_BOT(ceastp, st) \ + ((unsigned long)&(ceastp)->st## _stack) + +#define CEA_ESTACK_TOP(ceastp, st) \ + (CEA_ESTACK_BOT(ceastp, st) + CEA_ESTACK_SIZE(st)) + +#define CEA_ESTACK_OFFS(st) \ + offsetof(struct cea_exception_stacks, st## _stack) + +#define CEA_ESTACK_PAGES \ + (sizeof(struct cea_exception_stacks) / PAGE_SIZE) + +#endif + /* * cpu_entry_area is a percpu region that contains things needed by the CPU * and early entry/exit code. Real types aren't used for all fields here @@ -32,12 +90,9 @@ struct cpu_entry_area { #ifdef CONFIG_X86_64 /* - * Exception stacks used for IST entries. - * - * In the future, this should have a separate slot for each stack - * with guard pages between them. + * Exception stacks used for IST entries with guard pages. */ - char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; + struct cea_exception_stacks estacks; #endif #ifdef CONFIG_CPU_SUP_INTEL /* @@ -57,6 +112,7 @@ struct cpu_entry_area { #define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); +DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); extern void setup_cpu_entry_areas(void); extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); @@ -76,4 +132,7 @@ static inline struct entry_stack *cpu_entry_stack(int cpu) return &get_cpu_entry_area(cpu)->entry_stack_page.stack; } +#define __this_cpu_ist_top_va(name) \ + CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name) + #endif diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 0e56ff7e484857a1fdd8673fdfa2e0b784e23cea..1d337c51f7e6e365688047ab4aecad3d024a85e3 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -156,11 +156,14 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #else /* - * Static testing of CPU features. Used the same as boot_cpu_has(). - * These will statically patch the target code for additional - * performance. + * Static testing of CPU features. Used the same as boot_cpu_has(). It + * statically patches the target code for additional performance. Use + * static_cpu_has() only in fast paths, where every cycle counts. Which + * means that the boot_cpu_has() variant is already fast enough for the + * majority of cases and you should stick to using it as it is generally + * only two instructions: a RIP-relative MOV and a TEST. 
*/ -static __always_inline __pure bool _static_cpu_has(u16 bit) +static __always_inline bool _static_cpu_has(u16 bit) { asm_volatile_goto("1: jmp 6f\n" "2:\n" diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 9e5ca30738e5824ac5d9d049030e3c87ae5a72b4..1a8609a15856f6e02076f04324d06a803e3133c0 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -104,11 +104,9 @@ static inline void debug_stack_usage_dec(void) { __this_cpu_dec(debug_stack_usage); } -int is_debug_stack(unsigned long addr); void debug_stack_set_zero(void); void debug_stack_reset(void); #else /* !X86_64 */ -static inline int is_debug_stack(unsigned long addr) { return 0; } static inline void debug_stack_set_zero(void) { } static inline void debug_stack_reset(void) { } static inline void debug_stack_usage_inc(void) { } diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 50ba74a34a37cba234be23e599cb4486c7aae3de..9da8cccdf3fb59d2fa9eb0f895f261709bc50102 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -103,8 +103,6 @@ enum fixed_addresses { #ifdef CONFIG_PARAVIRT FIX_PARAVIRT_BOOTMAP, #endif - FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ - FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ #ifdef CONFIG_X86_INTEL_MID FIX_LNW_VRTC, #endif diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index fb04a3ded7ddb2ab284404f0caf0f1e6b1af23aa..745a19d34f23f245d17fc50e13786d4a6ca6d34a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -253,7 +253,7 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has(X86_FEATURE_XSAVES)) + if (boot_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XSAVES, xstate, lmask, hmask, err); else XSTATE_OP(XSAVE, xstate, lmask, hmask, err); @@ -275,7 +275,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has(X86_FEATURE_XSAVES)) + if (boot_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); @@ -497,8 +497,7 @@ static inline void fpregs_activate(struct fpu *fpu) * - switch_fpu_finish() restores the new state as * necessary. */ -static inline void -switch_fpu_prepare(struct fpu *old_fpu, int cpu) +static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { if (static_cpu_has(X86_FEATURE_FPU) && old_fpu->initialized) { if (!copy_fpregs_to_fpstate(old_fpu)) diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index ae26df1c27896d20d25ba18555d14625b905077a..8380c3ddd4b2ee29ec5a9ca7a117b0f1501bc6f0 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -8,7 +8,7 @@ /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS 8 -#define MAX_FIXED_PEBS_EVENTS 3 +#define MAX_FIXED_PEBS_EVENTS 4 /* * A debug store configuration. 
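The bump of MAX_FIXED_PEBS_EVENTS to 4 above is what lets the Icelake drain loop index fixed counters 32-35. As a rough standalone sketch (not part of the patch itself), the mask/size arithmetic of intel_pmu_drain_pebs_icl() can be reproduced with assumed values - 8 PEBS-capable GP counters, 4 fixed counters and INTEL_PMC_IDX_FIXED == 32 - in place of the real x86_pmu fields:

#include <stdint.h>
#include <stdio.h>

#define INTEL_PMC_IDX_FIXED	32	/* index of the first fixed counter, assumed */

int main(void)
{
	int max_pebs_events = 8;	/* PEBS-capable GP counters, assumed */
	int num_counters_fixed = 4;	/* matches MAX_FIXED_PEBS_EVENTS above */

	/* Same expressions as in intel_pmu_drain_pebs_icl() */
	uint64_t mask = ((1ULL << max_pebs_events) - 1) |
			(((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED);
	int size = INTEL_PMC_IDX_FIXED + num_counters_fixed;

	/* Bits 0-7 cover the GP counters, bits 32-35 the fixed counters */
	printf("mask = %#llx, size = %d\n", (unsigned long long)mask, size);
	return 0;
}

Compiled on its own this prints mask = 0xf000000ff, size = 36, matching the counter bit layout that the counts[] array and the for_each_set_bit() walks rely on.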
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 686247db3106f2b01e058ff6ab3e7a20948b2ddf..a06a9f8294ea5b1c8cb7552d2e1c8d30e3434a74 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -90,8 +90,6 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_writew __writew #define __raw_writel __writel -#define mmiowb() barrier() - #ifdef CONFIG_X86_64 build_mmio_read(readq, "q", u64, "=r", :"memory") diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index fbb16e6b6c18b14de64bce525ab02e236b373eef..8f95686ec27e09662c1be58e2b6cb0d5aa79c67f 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -16,11 +16,7 @@ static inline int irq_canonicalize(int irq) return ((irq == 2) ? 9 : irq); } -#ifdef CONFIG_X86_32 -extern void irq_ctx_init(int cpu); -#else -# define irq_ctx_init(cpu) do { } while (0) -#endif +extern int irq_init_percpu_irqstack(unsigned int cpu); #define __ARCH_HAS_DO_SOFTIRQ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 548d90bbf919e8d75f9983a2cb97235637708bd8..889f8b1b5b7f9bae29753e16bd5b1e4d5ba1f374 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -18,8 +18,8 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts - * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts + * Vectors 129 ... LOCAL_TIMER_VECTOR-1 + * Vectors LOCAL_TIMER_VECTOR ... 255 : special interrupts * * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. * diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 93c4bf598fb06c7e53865141dd3e7faa514194ff..feab24cac610e25f276d3d1f71f4705c23106b00 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -226,7 +226,9 @@ struct x86_emulate_ops { unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); - int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase); + int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, + const char *smstate); + void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt); }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 159b5988292f33ec2d1a079bf7d10ba2bc999d4b..c79abe7ca093cf3c81f4de1938066426c8984f04 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -126,7 +126,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) } #define KVM_PERMILLE_MMU_PAGES 20 -#define KVM_MIN_ALLOC_MMU_PAGES 64 +#define KVM_MIN_ALLOC_MMU_PAGES 64UL #define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 @@ -295,6 +295,7 @@ union kvm_mmu_extended_role { unsigned int valid:1; unsigned int execonly:1; unsigned int cr0_pg:1; + unsigned int cr4_pae:1; unsigned int cr4_pse:1; unsigned int cr4_pke:1; unsigned int cr4_smap:1; @@ -844,9 +845,9 @@ enum kvm_irqchip_mode { }; struct kvm_arch { - unsigned int n_used_mmu_pages; - unsigned int n_requested_mmu_pages; - unsigned int n_max_mmu_pages; + unsigned long n_used_mmu_pages; + unsigned long n_requested_mmu_pages; + unsigned long n_max_mmu_pages; unsigned int indirect_shadow_pages; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* @@ -1182,7 
+1183,7 @@ struct kvm_x86_ops { int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); - int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); + int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); int (*enable_smi_window)(struct kvm_vcpu *vcpu); int (*mem_enc_op)(struct kvm *kvm, void __user *argp); @@ -1256,8 +1257,8 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); -unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); +unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); bool pdptrs_changed(struct kvm_vcpu *vcpu); @@ -1592,4 +1593,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define put_smstate(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val +#define GET_SMSTATE(type, buf, offset) \ + (*(type *)((buf) + (offset) - 0x7e00)) + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 22d05e3835f0b81424690706072a35989b3cc374..dc2d4b206ab7e2417c244050764ef2e0c4953fee 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -210,16 +210,6 @@ static inline void cmci_rediscover(void) {} static inline void cmci_recheck(void) {} #endif -#ifdef CONFIG_X86_MCE_AMD -void mce_amd_feature_init(struct cpuinfo_x86 *c); -int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr); -#else -static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } -static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; }; -#endif - -static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); } - int mce_available(struct cpuinfo_x86 *c); bool mce_is_memory_error(struct mce *m); bool mce_is_correctable(struct mce *m); @@ -345,12 +335,19 @@ extern bool amd_mce_is_memory_error(struct mce *m); extern int mce_threshold_create_device(unsigned int cpu); extern int mce_threshold_remove_device(unsigned int cpu); -#else +void mce_amd_feature_init(struct cpuinfo_x86 *c); +int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr); -static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; -static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; -static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; +#else +static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; +static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; +static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; +static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } +static inline int +umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; }; #endif +static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); } + #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 19d18fae6ec660e8119f21d2f80cf0ac3552486e..93dff19633374ca83a72c551b025dc682403cd7e 100644 --- a/arch/x86/include/asm/mmu_context.h 
+++ b/arch/x86/include/asm/mmu_context.h @@ -13,6 +13,7 @@ #include #include #include +#include extern atomic64_t last_mm_ctx_id; @@ -356,4 +357,59 @@ static inline unsigned long __get_current_cr3_fast(void) return cr3; } +typedef struct { + struct mm_struct *mm; +} temp_mm_state_t; + +/* + * Using a temporary mm allows setting temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also avoids TLB shootdowns when the + * mapping is torn down. + * + * Context: The temporary mm needs to be used exclusively by a single core. To + * harden security, IRQs must be disabled while the temporary mm is + * loaded, thereby preventing interrupt handler bugs from overriding + * the kernel memory protection. + */ +static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) +{ + temp_mm_state_t temp_state; + + lockdep_assert_irqs_disabled(); + temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return temp_state; +} + +static inline void unuse_temporary_mm(temp_mm_state_t prev_state) +{ + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(NULL, prev_state.mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ca5bc0eacb95f56b144a2990b396520f51e0e8bb..1378518cf63ffe6980df592847e8c082e6d4bebb 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -116,6 +116,7 @@ #define LBR_INFO_CYCLES 0xffff #define MSR_IA32_PEBS_ENABLE 0x000003f1 +#define MSR_PEBS_DATA_CFG 0x000003f2 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index dad12b767ba069ede01be842e25a5d04afa35297..daf25b60c9e3a5ff6b83ee80028689b6449e85d2 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -10,6 +10,15 @@ #include #include +/* + * This should be used immediately before a retpoline alternative. It tells + * objtool where the retpolines are so that it can make sense of the control + * flow by just reading the original instruction(s) and ignoring the + * alternatives. + */ +#define ANNOTATE_NOSPEC_ALTERNATIVE \ + ANNOTATE_IGNORE_ALTERNATIVE + /* * Fill the CPU return stack buffer. * @@ -56,19 +65,6 @@ #ifdef __ASSEMBLY__ -/* - * This should be used immediately before a retpoline alternative.
It tells - * objtool where the retpolines are so that it can make sense of the control - * flow by just reading the original instruction(s) and ignoring the - * alternatives. - */ -.macro ANNOTATE_NOSPEC_ALTERNATIVE - .Lannotate_\@: - .pushsection .discard.nospec - .long .Lannotate_\@ - . - .popsection -.endm - /* * This should be used immediately before an indirect jump/call. It tells * objtool the subsequent indirect jump/call is vouched safe for retpoline @@ -152,12 +148,6 @@ #else /* __ASSEMBLY__ */ -#define ANNOTATE_NOSPEC_ALTERNATIVE \ - "999:\n\t" \ - ".pushsection .discard.nospec\n\t" \ - ".long 999b - .\n\t" \ - ".popsection\n\t" - #define ANNOTATE_RETPOLINE_SAFE \ "999:\n\t" \ ".pushsection .discard.retpoline_safe\n\t" \ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 0d5c739eebd715f387d37a81a52b837119daf9ad..565ad755c785e2bff768b2708ee71a30b7bbfa02 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -22,11 +22,9 @@ #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define DOUBLEFAULT_STACK 1 -#define NMI_STACK 0 -#define DEBUG_STACK 0 -#define MCE_STACK 0 -#define N_EXCEPTION_STACKS 1 +#define IRQ_STACK_SIZE THREAD_SIZE + +#define N_EXCEPTION_STACKS 1 #ifdef CONFIG_X86_PAE /* diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 8f657286d599a9577dca86b46b8199c9c547a661..793c14c372cba2d29a05f3f93a883fb25307888a 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,22 +14,20 @@ #define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define CURRENT_MASK (~(THREAD_SIZE - 1)) #define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) -#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) -#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) - #define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define DOUBLEFAULT_STACK 1 -#define NMI_STACK 2 -#define DEBUG_STACK 3 -#define MCE_STACK 4 -#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ +/* + * The index for the tss.ist[] array. The hardware limit is 7 entries. 
+ */ +#define IST_INDEX_DF 0 +#define IST_INDEX_NMI 1 +#define IST_INDEX_DB 2 +#define IST_INDEX_MCE 3 /* * Set __PAGE_OFFSET to the most negative possible address + diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8bdf74902293489a031aa300a605447e83b96341..1392d5e6e8d671fe7d503646399c193dfce2dafa 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -7,7 +7,7 @@ */ #define INTEL_PMC_MAX_GENERIC 32 -#define INTEL_PMC_MAX_FIXED 3 +#define INTEL_PMC_MAX_FIXED 4 #define INTEL_PMC_IDX_FIXED 32 #define X86_PMC_IDX_MAX 64 @@ -32,6 +32,8 @@ #define HSW_IN_TX (1ULL << 32) #define HSW_IN_TX_CHECKPOINTED (1ULL << 33) +#define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) +#define ICL_FIXED_0_ADAPTIVE (1ULL << 32) #define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36) #define AMD64_EVENTSEL_GUESTONLY (1ULL << 40) @@ -87,6 +89,12 @@ #define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 #define ARCH_PERFMON_EVENTS_COUNT 7 +#define PEBS_DATACFG_MEMINFO BIT_ULL(0) +#define PEBS_DATACFG_GP BIT_ULL(1) +#define PEBS_DATACFG_XMMS BIT_ULL(2) +#define PEBS_DATACFG_LBRS BIT_ULL(3) +#define PEBS_DATACFG_LBR_SHIFT 24 + /* * Intel "Architectural Performance Monitoring" CPUID * detection/enumeration details: @@ -176,6 +184,41 @@ struct x86_pmu_capability { #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58) #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55) +/* + * Adaptive PEBS v4 + */ + +struct pebs_basic { + u64 format_size; + u64 ip; + u64 applicable_counters; + u64 tsc; +}; + +struct pebs_meminfo { + u64 address; + u64 aux; + u64 latency; + u64 tsx_tuning; +}; + +struct pebs_gprs { + u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di; + u64 r8, r9, r10, r11, r12, r13, r14, r15; +}; + +struct pebs_xmm { + u64 xmm[16*2]; /* two entries for each register */ +}; + +struct pebs_lbr_entry { + u64 from, to, info; +}; + +struct pebs_lbr { + struct pebs_lbr_entry lbr[0]; /* Variable length */ +}; + /* * IBS cpuid feature detection */ @@ -248,6 +291,11 @@ extern void perf_events_lapic_init(void); #define PERF_EFLAGS_VM (1UL << 5) struct pt_regs; +struct x86_perf_regs { + struct pt_regs regs; + u64 *xmm_regs; +}; + extern unsigned long perf_instruction_pointer(struct pt_regs *regs); extern unsigned long perf_misc_flags(struct pt_regs *regs); #define perf_misc_flags(regs) perf_misc_flags(regs) @@ -260,14 +308,9 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs); */ #define perf_arch_fetch_caller_regs(regs, __ip) { \ (regs)->ip = (__ip); \ - (regs)->bp = caller_frame_pointer(); \ + (regs)->sp = (unsigned long)__builtin_frame_address(0); \ (regs)->cs = __KERNEL_CS; \ regs->flags = 0; \ - asm volatile( \ - _ASM_MOV "%%"_ASM_SP ", %0\n" \ - : "=m" ((regs)->sp) \ - :: "memory" \ - ); \ } struct perf_guest_switch_msr { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2779ace16d23f21d5cb7b65faf87f384b3b05268..3a221942f80587faa960c90b0992f553f127c4a7 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -46,7 +46,7 @@ void ptdump_walk_user_pgd_level_checkwx(void); */ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __visible; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) extern spinlock_t pgd_lock; extern struct list_head pgd_list; @@ -1021,6 +1021,9 @@ static inline void __meminit init_trampoline_default(void) /* Default trampoline pgd value */ trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; } + 
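The pebs_* structs above describe one variable-length adaptive PEBS (v4) record: the basic group is always present, and the meminfo, GPR, XMM and LBR groups follow in that fixed order when the corresponding PEBS_DATACFG_* bits are set. A hedged walking sketch under the assumed layout (the record's format_size field echoing the data-cfg bits, and the LBR entry count minus one sitting above PEBS_DATACFG_LBR_SHIFT); parse_adaptive_pebs() is illustrative, not code from this patch:

/* Illustrative only: walk one adaptive PEBS record. A real consumer
 * would copy each group into the perf sample instead. */
static void parse_adaptive_pebs(void *record)
{
	struct pebs_basic *basic = record;
	u64 cfg = basic->format_size;	/* assumed: echoes MSR_PEBS_DATA_CFG */
	void *next = basic + 1;		/* groups follow in fixed order */

	if (cfg & PEBS_DATACFG_MEMINFO) {
		struct pebs_meminfo *meminfo = next;
		next = meminfo + 1;
	}
	if (cfg & PEBS_DATACFG_GP) {
		struct pebs_gprs *gprs = next;
		next = gprs + 1;
	}
	if (cfg & PEBS_DATACFG_XMMS) {
		struct pebs_xmm *xmm = next;
		next = xmm + 1;
	}
	if (cfg & PEBS_DATACFG_LBRS) {
		struct pebs_lbr *lbr = next;
		/* assumed: entry count - 1 above PEBS_DATACFG_LBR_SHIFT */
		u64 nr = ((cfg >> PEBS_DATACFG_LBR_SHIFT) & 0xff) + 1;
		next = &lbr->lbr[nr];
	}
}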
+void __init poking_init(void); + # ifdef CONFIG_RANDOMIZE_MEMORY void __meminit init_trampoline(void); # else diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2bb3a648fc12c966951caa1443063a972ef56e1a..7e99ef67bff08301d5d43097421e6ffe10ee7d16 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -367,6 +367,13 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); #define __KERNEL_TSS_LIMIT \ (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) +/* Per CPU interrupt stacks */ +struct irq_stack { + char stack[IRQ_STACK_SIZE]; +} __aligned(IRQ_STACK_SIZE); + +DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); + #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #else @@ -374,38 +381,25 @@ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 #endif -/* - * Save the original ist values for checking stack pointers during debugging - */ -struct orig_ist { - unsigned long ist[7]; -}; - #ifdef CONFIG_X86_64 -DECLARE_PER_CPU(struct orig_ist, orig_ist); - -union irq_stack_union { - char irq_stack[IRQ_STACK_SIZE]; +struct fixed_percpu_data { /* * GCC hardcodes the stack canary as %gs:40. Since the * irq_stack is the object at %gs:0, we reserve the bottom * 48 bytes of the irq stack for the canary. */ - struct { - char gs_base[40]; - unsigned long stack_canary; - }; + char gs_base[40]; + unsigned long stack_canary; }; -DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; -DECLARE_INIT_PER_CPU(irq_stack_union); +DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible; +DECLARE_INIT_PER_CPU(fixed_percpu_data); static inline unsigned long cpu_kernelmode_gs_base(int cpu) { - return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu); + return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -DECLARE_PER_CPU(char *, irq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); @@ -427,15 +421,8 @@ struct stack_canary { }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -/* - * per-CPU IRQ handling stacks - */ -struct irq_stack { - u32 stack[THREAD_SIZE/sizeof(u32)]; -} __aligned(THREAD_SIZE); - -DECLARE_PER_CPU(struct irq_stack *, hardirq_stack); -DECLARE_PER_CPU(struct irq_stack *, softirq_stack); +/* Per CPU softirq stack pointer */ +DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* X86_64 */ extern unsigned int fpu_kernel_xstate_size; diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h deleted file mode 100644 index 4c25cf6caefa1b64e732a6c1d5d7a6636ebd6a62..0000000000000000000000000000000000000000 --- a/arch/x86/include/asm/rwsem.h +++ /dev/null @@ -1,237 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* rwsem.h: R/W semaphores implemented using XADD/CMPXCHG for i486+ - * - * Written by David Howells (dhowells@redhat.com). - * - * Derived from asm-x86/semaphore.h - * - * - * The MSW of the count is the negated number of active writers and waiting - * lockers, and the LSW is the total number of active locks - * - * The lock count is initialized to 0 (no active and no waiting lockers). - * - * When a writer subtracts WRITE_BIAS, it'll get 0xffff0001 for the case of an - * uncontended lock. This can be determined because XADD returns the old value. 
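Since this whole header is being deleted, a compact reminder of the counting scheme its comment documents may help review. This is a hedged userspace illustration of the bias arithmetic only, using GCC atomics; it models none of the waiter handling and is not the kernel implementation:

#include <stdio.h>

#define ACTIVE_MASK	0x0000ffffL		/* 32-bit layout */
#define ACTIVE_BIAS	0x00000001L
#define WAITING_BIAS	(-ACTIVE_MASK - 1)
#define WRITE_BIAS	(WAITING_BIAS + ACTIVE_BIAS)	/* 0xffff0001 */

int main(void)
{
	long count = 0;

	/* Writer: xadd of WRITE_BIAS; a zero active mask in the old
	 * value means the lock was uncontended. */
	long old = __atomic_fetch_add(&count, WRITE_BIAS, __ATOMIC_ACQUIRE);
	printf("writer: old active mask %#lx -> %s\n", old & ACTIVE_MASK,
	       (old & ACTIVE_MASK) == 0 ? "fast path" : "slow path");

	/* Reader: adds 1 and must sleep if the result is negative. */
	long res = __atomic_add_fetch(&count, ACTIVE_BIAS, __ATOMIC_ACQUIRE);
	printf("reader: count %ld -> %s\n", res,
	       res >= 0 ? "fast path" : "writer active, slow path");
	return 0;
}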
- * Readers increment by 1 and see a positive value when uncontended, negative - * if there are writers (and maybe) readers waiting (in which case it goes to - * sleep). - * - * The value of WAITING_BIAS supports up to 32766 waiting processes. This can - * be extended to 65534 by manually checking the whole MSW rather than relying - * on the S flag. - * - * The value of ACTIVE_BIAS supports up to 65535 active processes. - * - * This should be totally fair - if anything is waiting, a process that wants a - * lock will go to the back of the queue. When the currently active lock is - * released, if there's a writer at the front of the queue, then that and only - * that will be woken up; if there's a bunch of consecutive readers at the - * front, then they'll all be woken up, but no other readers will be. - */ - -#ifndef _ASM_X86_RWSEM_H -#define _ASM_X86_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ -#include - -/* - * The bias values and the counter type limits the number of - * potential readers/writers to 32767 for 32 bits and 2147483647 - * for 64 bits. - */ - -#ifdef CONFIG_X86_64 -# define RWSEM_ACTIVE_MASK 0xffffffffL -#else -# define RWSEM_ACTIVE_MASK 0x0000ffffL -#endif - -#define RWSEM_UNLOCKED_VALUE 0x00000000L -#define RWSEM_ACTIVE_BIAS 0x00000001L -#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -#define ____down_read(sem, slow_path) \ -({ \ - struct rw_semaphore* ret; \ - asm volatile("# beginning down_read\n\t" \ - LOCK_PREFIX _ASM_INC "(%[sem])\n\t" \ - /* adds 0x00000001 */ \ - " jns 1f\n" \ - " call " slow_path "\n" \ - "1:\n\t" \ - "# ending down_read\n\t" \ - : "+m" (sem->count), "=a" (ret), \ - ASM_CALL_CONSTRAINT \ - : [sem] "a" (sem) \ - : "memory", "cc"); \ - ret; \ -}) - -static inline void __down_read(struct rw_semaphore *sem) -{ - ____down_read(sem, "call_rwsem_down_read_failed"); -} - -static inline int __down_read_killable(struct rw_semaphore *sem) -{ - if (IS_ERR(____down_read(sem, "call_rwsem_down_read_failed_killable"))) - return -EINTR; - return 0; -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -static inline bool __down_read_trylock(struct rw_semaphore *sem) -{ - long result, tmp; - asm volatile("# beginning __down_read_trylock\n\t" - " mov %[count],%[result]\n\t" - "1:\n\t" - " mov %[result],%[tmp]\n\t" - " add %[inc],%[tmp]\n\t" - " jle 2f\n\t" - LOCK_PREFIX " cmpxchg %[tmp],%[count]\n\t" - " jnz 1b\n\t" - "2:\n\t" - "# ending __down_read_trylock\n\t" - : [count] "+m" (sem->count), [result] "=&a" (result), - [tmp] "=&r" (tmp) - : [inc] "i" (RWSEM_ACTIVE_READ_BIAS) - : "memory", "cc"); - return result >= 0; -} - -/* - * lock for writing - */ -#define ____down_write(sem, slow_path) \ -({ \ - long tmp; \ - struct rw_semaphore* ret; \ - \ - asm volatile("# beginning down_write\n\t" \ - LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" \ - /* adds 0xffff0001, returns the old value */ \ - " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \ - /* was the active mask 0 before? 
*/\ - " jz 1f\n" \ - " call " slow_path "\n" \ - "1:\n" \ - "# ending down_write" \ - : "+m" (sem->count), [tmp] "=d" (tmp), \ - "=a" (ret), ASM_CALL_CONSTRAINT \ - : [sem] "a" (sem), "[tmp]" (RWSEM_ACTIVE_WRITE_BIAS) \ - : "memory", "cc"); \ - ret; \ -}) - -static inline void __down_write(struct rw_semaphore *sem) -{ - ____down_write(sem, "call_rwsem_down_write_failed"); -} - -static inline int __down_write_killable(struct rw_semaphore *sem) -{ - if (IS_ERR(____down_write(sem, "call_rwsem_down_write_failed_killable"))) - return -EINTR; - - return 0; -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -static inline bool __down_write_trylock(struct rw_semaphore *sem) -{ - bool result; - long tmp0, tmp1; - asm volatile("# beginning __down_write_trylock\n\t" - " mov %[count],%[tmp0]\n\t" - "1:\n\t" - " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" - /* was the active mask 0 before? */ - " jnz 2f\n\t" - " mov %[tmp0],%[tmp1]\n\t" - " add %[inc],%[tmp1]\n\t" - LOCK_PREFIX " cmpxchg %[tmp1],%[count]\n\t" - " jnz 1b\n\t" - "2:\n\t" - CC_SET(e) - "# ending __down_write_trylock\n\t" - : [count] "+m" (sem->count), [tmp0] "=&a" (tmp0), - [tmp1] "=&r" (tmp1), CC_OUT(e) (result) - : [inc] "er" (RWSEM_ACTIVE_WRITE_BIAS) - : "memory"); - return result; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - long tmp; - asm volatile("# beginning __up_read\n\t" - LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" - /* subtracts 1, returns the old value */ - " jns 1f\n\t" - " call call_rwsem_wake\n" /* expects old value in %edx */ - "1:\n" - "# ending __up_read\n" - : "+m" (sem->count), [tmp] "=d" (tmp) - : [sem] "a" (sem), "[tmp]" (-RWSEM_ACTIVE_READ_BIAS) - : "memory", "cc"); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - long tmp; - asm volatile("# beginning __up_write\n\t" - LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" - /* subtracts 0xffff0001, returns the old value */ - " jns 1f\n\t" - " call call_rwsem_wake\n" /* expects old value in %edx */ - "1:\n\t" - "# ending __up_write\n" - : "+m" (sem->count), [tmp] "=d" (tmp) - : [sem] "a" (sem), "[tmp]" (-RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc"); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX _ASM_ADD "%[inc],(%[sem])\n\t" - /* - * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386) - * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64) - */ - " jns 1f\n\t" - " call call_rwsem_downgrade_wake\n" - "1:\n\t" - "# ending __downgrade_write\n" - : "+m" (sem->count) - : [sem] "a" (sem), [inc] "er" (-RWSEM_WAITING_BIAS) - : "memory", "cc"); -} - -#endif /* __KERNEL__ */ -#endif /* _ASM_X86_RWSEM_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 07a25753e85c5cd53b2613a71db91862fa31684f..ae7b909dc242d17f0bc1be10130a42d08c3785ae 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -85,6 +85,9 @@ int set_pages_nx(struct page *page, int numpages); int set_pages_ro(struct page *page, int numpages); int set_pages_rw(struct page *page, int numpages); +int set_direct_map_invalid_noflush(struct page *page); +int set_direct_map_default_noflush(struct page *page); + extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 
db333300bd4be17205daf3b2e820c0af101ccc2d..f94a7d0ddd490e19a168cb7404a4a0cbda2e7d28 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -13,13 +13,12 @@ #ifndef _ASM_X86_SMAP_H #define _ASM_X86_SMAP_H -#include #include #include /* "Raw" instruction opcodes */ -#define __ASM_CLAC .byte 0x0f,0x01,0xca -#define __ASM_STAC .byte 0x0f,0x01,0xcb +#define __ASM_CLAC ".byte 0x0f,0x01,0xca" +#define __ASM_STAC ".byte 0x0f,0x01,0xcb" #ifdef __ASSEMBLY__ @@ -28,10 +27,10 @@ #ifdef CONFIG_X86_SMAP #define ASM_CLAC \ - ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP + ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP #define ASM_STAC \ - ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP + ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP #else /* CONFIG_X86_SMAP */ @@ -49,26 +48,46 @@ static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ - alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP); + alternative("", __ASM_CLAC, X86_FEATURE_SMAP); } static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ - alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); + alternative("", __ASM_STAC, X86_FEATURE_SMAP); +} + +static __always_inline unsigned long smap_save(void) +{ + unsigned long flags; + + asm volatile (ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC, + X86_FEATURE_SMAP) + : "=rm" (flags) : : "memory", "cc"); + + return flags; +} + +static __always_inline void smap_restore(unsigned long flags) +{ + asm volatile (ALTERNATIVE("", "push %0; popf", X86_FEATURE_SMAP) + : : "g" (flags) : "memory", "cc"); } /* These macros can be used in asm() statements */ #define ASM_CLAC \ - ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __ASM_CLAC, X86_FEATURE_SMAP) #define ASM_STAC \ - ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP) #else /* CONFIG_X86_SMAP */ static inline void clac(void) { } static inline void stac(void) { } +static inline unsigned long smap_save(void) { return 0; } +static inline void smap_restore(unsigned long flags) { } + #define ASM_CLAC #define ASM_STAC diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 2e95b6c1bca3f517555e99c81b262d386bae5530..da545df207b2affed15a511ceb728bc999c35360 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -131,7 +131,7 @@ void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); -void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); +int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); int common_cpu_die(unsigned int cpu); diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 8ec97a62c245175e87d96c0376eb962f51f8e91d..91e29b6a86a5e9203c7e432f01c0046bf0bc561e 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -13,7 +13,7 @@ * On x86_64, %gs is shared by percpu area and stack canary. All * percpu symbols are zero based and %gs points to the base of percpu * area. The first occupant of the percpu area is always - * irq_stack_union which contains stack_canary at offset 40. Userland + * fixed_percpu_data which contains stack_canary at offset 40. 
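The offset-40 contract is easy to sanity-check outside the kernel; a minimal sketch, with an illustrative struct name, of the invariant that the rename to fixed_percpu_data must preserve:

#include <assert.h>
#include <stddef.h>

/* GCC emits stack-protector reads as %gs:40, so whatever object sits
 * first in the per-CPU area must keep its canary exactly 40 bytes in. */
struct fixed_percpu_data_demo {
	char gs_base[40];
	unsigned long stack_canary;
};

static_assert(offsetof(struct fixed_percpu_data_demo, stack_canary) == 40,
	      "stack canary must live at %gs:40");

int main(void) { return 0; }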
Userland * %gs is always saved and restored on kernel entry and exit using * swapgs, so stack protector doesn't add any complexity there. * @@ -64,7 +64,7 @@ static __always_inline void boot_init_stack_canary(void) u64 tsc; #ifdef CONFIG_X86_64 - BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40); + BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40); #endif /* * We both use the random pool and the current TSC as a source @@ -79,7 +79,7 @@ static __always_inline void boot_init_stack_canary(void) current->stack_canary = canary; #ifdef CONFIG_X86_64 - this_cpu_write(irq_stack_union.stack_canary, canary); + this_cpu_write(fixed_percpu_data.stack_canary, canary); #else this_cpu_write(stack_canary.canary, canary); #endif diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f335aad404a479e98e4a5d38dc41ee5e5aa419ec..a8d0cdf4861665a9b048a78af4ff110c2fbbefc7 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -9,6 +9,8 @@ #include #include + +#include #include enum stack_type { @@ -98,19 +100,6 @@ struct stack_frame_ia32 { u32 return_address; }; -static inline unsigned long caller_frame_pointer(void) -{ - struct stack_frame *frame; - - frame = __builtin_frame_address(0); - -#ifdef CONFIG_FRAME_POINTER - frame = frame->next_frame; -#endif - - return (unsigned long)frame; -} - void show_opcodes(struct pt_regs *regs, const char *loglvl); void show_ip(struct pt_regs *regs, const char *loglvl); #endif /* _ASM_X86_STACKTRACE_H */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 7cf1a270d89101822da3c9390f4e1f112258939e..18a4b6890fa82f589b9609ce1e509574a5411bf5 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -46,6 +46,7 @@ struct inactive_task_frame { unsigned long r13; unsigned long r12; #else + unsigned long flags; unsigned long si; unsigned long di; #endif diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 2fe745356fb119d1dc9497c00caeccb3a04ab5c3..6d8d6bc183b7421ddfc367062b1ebb7164245ee3 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -14,6 +14,8 @@ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). */ +#include + #define ADDR (*(volatile long *)addr) /** @@ -29,7 +31,7 @@ */ static inline void sync_set_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; bts %1,%0" + asm volatile("lock; " __ASM_SIZE(bts) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -47,7 +49,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr) */ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btr %1,%0" + asm volatile("lock; " __ASM_SIZE(btr) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -64,7 +66,7 @@ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) */ static inline void sync_change_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btc %1,%0" + asm volatile("lock; " __ASM_SIZE(btc) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -78,14 +80,9 @@ static inline void sync_change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. 
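The sync_bitops conversions here keep the observable semantics and only change how the result comes back: GEN_BINARY_RMWcc() lets the compiler branch on the CPU carry flag directly instead of materializing it with setc. A hedged userspace equivalent of what sync_test_and_set_bit() computes:

#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG_DEMO (8 * sizeof(unsigned long))

/* Same result as sync_test_and_set_bit(): atomically set bit nr and
 * report its previous value (x86 does this with lock bts + carry). */
static bool test_and_set_bit_demo(long nr, unsigned long *addr)
{
	unsigned long mask = 1UL << (nr % BITS_PER_LONG_DEMO);
	unsigned long old = __atomic_fetch_or(&addr[nr / BITS_PER_LONG_DEMO],
					      mask, __ATOMIC_SEQ_CST);
	return old & mask;
}

int main(void)
{
	unsigned long map[2] = { 0, 0 };

	printf("%d %d\n", test_and_set_bit_demo(65, map),
	       test_and_set_bit_demo(65, map));	/* prints: 0 1 */
	return 0;
}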
*/ -static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) +static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr) { - unsigned char oldbit; - - asm volatile("lock; bts %2,%1\n\tsetc %0" - : "=qm" (oldbit), "+m" (ADDR) - : "Ir" (nr) : "memory"); - return oldbit; + return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(bts), *addr, c, "Ir", nr); } /** @@ -98,12 +95,7 @@ static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { - unsigned char oldbit; - - asm volatile("lock; btr %2,%1\n\tsetc %0" - : "=qm" (oldbit), "+m" (ADDR) - : "Ir" (nr) : "memory"); - return oldbit; + return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btr), *addr, c, "Ir", nr); } /** @@ -116,12 +108,7 @@ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { - unsigned char oldbit; - - asm volatile("lock; btc %2,%1\n\tsetc %0" - : "=qm" (oldbit), "+m" (ADDR) - : "Ir" (nr) : "memory"); - return oldbit; + return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btc), *addr, c, "Ir", nr); } #define sync_test_bit(nr, addr) test_bit(nr, addr) diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index d653139857af2a1121f877b611c8d56d4e4690f0..4c305471ec3312e3b0adc7063c30e9d3edf2de7f 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -91,11 +91,9 @@ static inline void syscall_set_return_value(struct task_struct *task, static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { - BUG_ON(i + n > 6); - memcpy(args, ®s->bx + i, n * sizeof(args[0])); + memcpy(args, ®s->bx, 6 * sizeof(args[0])); } static inline void syscall_set_arguments(struct task_struct *task, @@ -116,124 +114,50 @@ static inline int syscall_get_arch(void) static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, unsigned long *args) { # ifdef CONFIG_IA32_EMULATION - if (task->thread_info.status & TS_COMPAT) - switch (i) { - case 0: - if (!n--) break; - *args++ = regs->bx; - case 1: - if (!n--) break; - *args++ = regs->cx; - case 2: - if (!n--) break; - *args++ = regs->dx; - case 3: - if (!n--) break; - *args++ = regs->si; - case 4: - if (!n--) break; - *args++ = regs->di; - case 5: - if (!n--) break; - *args++ = regs->bp; - case 6: - if (!n--) break; - default: - BUG(); - break; - } - else + if (task->thread_info.status & TS_COMPAT) { + *args++ = regs->bx; + *args++ = regs->cx; + *args++ = regs->dx; + *args++ = regs->si; + *args++ = regs->di; + *args = regs->bp; + } else # endif - switch (i) { - case 0: - if (!n--) break; - *args++ = regs->di; - case 1: - if (!n--) break; - *args++ = regs->si; - case 2: - if (!n--) break; - *args++ = regs->dx; - case 3: - if (!n--) break; - *args++ = regs->r10; - case 4: - if (!n--) break; - *args++ = regs->r8; - case 5: - if (!n--) break; - *args++ = regs->r9; - case 6: - if (!n--) break; - default: - BUG(); - break; - } + { + *args++ = regs->di; + *args++ = regs->si; + *args++ = regs->dx; + *args++ = regs->r10; + *args++ = regs->r8; + *args = regs->r9; + } } static inline void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, - unsigned int i, unsigned int n, const unsigned long *args) { # ifdef CONFIG_IA32_EMULATION - if (task->thread_info.status & TS_COMPAT) - switch 
(i) { - case 0: - if (!n--) break; - regs->bx = *args++; - case 1: - if (!n--) break; - regs->cx = *args++; - case 2: - if (!n--) break; - regs->dx = *args++; - case 3: - if (!n--) break; - regs->si = *args++; - case 4: - if (!n--) break; - regs->di = *args++; - case 5: - if (!n--) break; - regs->bp = *args++; - case 6: - if (!n--) break; - default: - BUG(); - break; - } - else + if (task->thread_info.status & TS_COMPAT) { + regs->bx = *args++; + regs->cx = *args++; + regs->dx = *args++; + regs->si = *args++; + regs->di = *args++; + regs->bp = *args; + } else # endif - switch (i) { - case 0: - if (!n--) break; - regs->di = *args++; - case 1: - if (!n--) break; - regs->si = *args++; - case 2: - if (!n--) break; - regs->dx = *args++; - case 3: - if (!n--) break; - regs->r10 = *args++; - case 4: - if (!n--) break; - regs->r8 = *args++; - case 5: - if (!n--) break; - regs->r9 = *args++; - case 6: - if (!n--) break; - default: - BUG(); - break; - } + { + regs->di = *args++; + regs->si = *args++; + regs->dx = *args++; + regs->r10 = *args++; + regs->r8 = *args++; + regs->r9 = *args; + } } static inline int syscall_get_arch(void) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index e85ff65c43c3efc85702841ab77bffe145778707..c90678fd391a45d8f48453f48b66fa95b73b89d7 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -18,7 +18,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define __parainstructions_end NULL #endif -extern void *text_poke_early(void *addr, const void *opcode, size_t len); +extern void text_poke_early(void *addr, const void *opcode, size_t len); /* * Clear and restore the kernel write-protection flag on the local CPU. @@ -35,8 +35,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); * inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); -extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); extern int after_bootmem; +extern __ro_after_init struct mm_struct *poking_mm; +extern __ro_after_init unsigned long poking_addr; #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 404b8b1d44f5899bb2db788a5aee7c588445748d..f23e7aaff4cd0914517d2b76bcfadb0cf9c70d1d 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -6,6 +6,7 @@ #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) +#define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); #include diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index f4204bf377fcf72d597f1d0e438a3f85a8c54127..dee37583196288df06c704b6dc78b6b58b819e28 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -167,7 +167,7 @@ struct tlb_state { */ struct mm_struct *loaded_mm; -#define LOADED_MM_SWITCHING ((struct mm_struct *)1) +#define LOADED_MM_SWITCHING ((struct mm_struct *)1UL) /* Last user mm for optimizing IBPB */ union { @@ -274,6 +274,8 @@ static inline bool nmi_uaccess_okay(void) return true; } +#define nmi_uaccess_okay nmi_uaccess_okay + /* Initialize cr4 shadow for this CPU. 
*/ static inline void cr4_init_shadow(void) { diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 1954dd5552a2e2fbeaf21937ad4c6d98c6ba0aff..c82abd6e4ca39ad7e5d8c3ae454fc5d7a8671da3 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -427,10 +427,11 @@ do { \ ({ \ __label__ __pu_label; \ int __pu_err = -EFAULT; \ - __typeof__(*(ptr)) __pu_val; \ - __pu_val = x; \ + __typeof__(*(ptr)) __pu_val = (x); \ + __typeof__(ptr) __pu_ptr = (ptr); \ + __typeof__(size) __pu_size = (size); \ __uaccess_begin(); \ - __put_user_size(__pu_val, (ptr), (size), __pu_label); \ + __put_user_size(__pu_val, __pu_ptr, __pu_size, __pu_label); \ __pu_err = 0; \ __pu_label: \ __uaccess_end(); \ @@ -585,7 +586,6 @@ extern void __cmpxchg_wrong_size(void) #define __user_atomic_cmpxchg_inatomic(uval, ptr, old, new, size) \ ({ \ int __ret = 0; \ - __typeof__(ptr) __uval = (uval); \ __typeof__(*(ptr)) __old = (old); \ __typeof__(*(ptr)) __new = (new); \ __uaccess_begin_nospec(); \ @@ -661,7 +661,7 @@ extern void __cmpxchg_wrong_size(void) __cmpxchg_wrong_size(); \ } \ __uaccess_end(); \ - *__uval = __old; \ + *(uval) = __old; \ __ret; \ }) @@ -705,7 +705,7 @@ extern struct movsl_mask { * checking before using them, but you have to surround them with the * user_access_begin/end() pair. */ -static __must_check inline bool user_access_begin(const void __user *ptr, size_t len) +static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) { if (unlikely(!access_ok(ptr,len))) return 0; @@ -715,6 +715,9 @@ static __must_check inline bool user_access_begin(const void __user *ptr, size_t #define user_access_begin(a,b) user_access_begin(a,b) #define user_access_end() __uaccess_end() +#define user_access_save() smap_save() +#define user_access_restore(x) smap_restore(x) + #define unsafe_put_user(x, ptr, label) \ __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index a9d637bc301d7dd0086b5126a5ebac8f042c62c9..5cd1caa8bc6537c8795218581118c60552128ad8 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -207,9 +207,6 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) return __copy_user_flushcache(dst, src, size); } -unsigned long -copy_user_handle_tail(char *to, char *from, unsigned len); - unsigned long mcsafe_handle_tail(char *to, char *from, unsigned len); diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index de6f0d59a24f418febf72e40dd595e41dcb3c7c0..d50c7b747d8b879182cee633b22e3809b66af2b6 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -206,6 +206,9 @@ xen_single_call(unsigned int call, __HYPERCALL_DECLS; __HYPERCALL_5ARG(a1, a2, a3, a4, a5); + if (call >= PAGE_SIZE / sizeof(hypercall_page[0])) + return -EINVAL; + asm volatile(CALL_NOSPEC : __HYPERCALL_5PARAM : [thunk_target] "a" (&hypercall_page[call]) @@ -214,6 +217,22 @@ xen_single_call(unsigned int call, return (long)__res; } +static __always_inline void __xen_stac(void) +{ + /* + * Suppress objtool seeing the STAC/CLAC and getting confused about it + * calling random code with AC=1. 
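The new smap_save()/smap_restore() pair, surfaced as user_access_save()/user_access_restore() in the uaccess.h hunk above, exists so that code which may run in the middle of a user_access_begin() section can park EFLAGS.AC and put it back afterwards. A hedged usage sketch (the function is hypothetical, not from the patch):

static void example_instrumentation_hook(void)	/* hypothetical */
{
	/* pushf/pop + clac when SMAP is enabled, 0 otherwise */
	unsigned long flags = user_access_save();

	/* ... work that must never run with user accesses enabled ... */

	user_access_restore(flags);	/* push/popf: AC back as it was */
}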
+ */ + asm volatile(ANNOTATE_IGNORE_ALTERNATIVE + ASM_STAC ::: "memory", "flags"); +} + +static __always_inline void __xen_clac(void) +{ + asm volatile(ANNOTATE_IGNORE_ALTERNATIVE + ASM_CLAC ::: "memory", "flags"); +} + static inline long privcmd_call(unsigned int call, unsigned long a1, unsigned long a2, @@ -222,9 +241,9 @@ privcmd_call(unsigned int call, { long res; - stac(); + __xen_stac(); res = xen_single_call(call, a1, a2, a3, a4, a5); - clac(); + __xen_clac(); return res; } @@ -421,9 +440,9 @@ HYPERVISOR_dm_op( domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs) { int ret; - stac(); + __xen_stac(); ret = _hypercall3(int, dm_op, dom, nr_bufs, bufs); - clac(); + __xen_clac(); return ret; } diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index dabfcf7c3941aa90a92a91ee37f1164447c71655..7a0e64ccd6ff5d02108a4424fc72a36f49018987 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -381,6 +381,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) #define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) +#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h index f3329cabce5c6d9e7c605a0fb46f764e2d643141..ac67bbea10cae36848ff0be197c40a3a7af7c0f6 100644 --- a/arch/x86/include/uapi/asm/perf_regs.h +++ b/arch/x86/include/uapi/asm/perf_regs.h @@ -27,8 +27,29 @@ enum perf_event_x86_regs { PERF_REG_X86_R13, PERF_REG_X86_R14, PERF_REG_X86_R15, - + /* These are the limits for the GPRs. */ PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1, PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1, + + /* These all need two bits set because they are 128bit */ + PERF_REG_X86_XMM0 = 32, + PERF_REG_X86_XMM1 = 34, + PERF_REG_X86_XMM2 = 36, + PERF_REG_X86_XMM3 = 38, + PERF_REG_X86_XMM4 = 40, + PERF_REG_X86_XMM5 = 42, + PERF_REG_X86_XMM6 = 44, + PERF_REG_X86_XMM7 = 46, + PERF_REG_X86_XMM8 = 48, + PERF_REG_X86_XMM9 = 50, + PERF_REG_X86_XMM10 = 52, + PERF_REG_X86_XMM11 = 54, + PERF_REG_X86_XMM12 = 56, + PERF_REG_X86_XMM13 = 58, + PERF_REG_X86_XMM14 = 60, + PERF_REG_X86_XMM15 = 62, + + /* These include both GPRs and XMMX registers */ + PERF_REG_X86_XMM_MAX = PERF_REG_X86_XMM15 + 2, }; #endif /* _ASM_X86_PERF_REGS_H */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index f0b0c90dd398246eb2882050d69c6b53ccca11af..d213ec5c3766db0dd5176c951b13e5f3c1514cfb 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -146,6 +146,7 @@ #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 +#define VMX_ABORT_VMCS_CORRUPTED 3 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 158ad1483c4352b2f93c7cbb931dfdb8dd40c3bf..cb6e076a6d3989d30fbe7fbc1824df27663e16c5 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -51,6 +51,18 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, if (c->x86_vendor == X86_VENDOR_INTEL && (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) flags->bm_control = 0; + /* + * For all recent Centaur CPUs, the ucode will make sure that each + * core can keep cache coherence with each other while entering C3 + * type state. 
So, set bm_check to 1 to indicate that the kernel + * doesn't need to execute a cache flush operation (WBINVD) when + * entering C3 type state. + */ + if (c->x86_vendor == X86_VENDOR_CENTAUR) { + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f && + c->x86_stepping >= 0x0e)) + flags->bm_check = 1; + } } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9a79c7808f9cc13903521b2beb6b0c5c12210485..7b9b49dfc05affe6bfd4adba2869287869f9c9b7 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -264,7 +265,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -void *text_poke_early(void *addr, const void *opcode, size_t len); +void text_poke_early(void *addr, const void *opcode, size_t len); /* * Are we looking at a near JMP with a 1 or 4-byte displacement. @@ -666,16 +667,136 @@ void __init alternative_instructions(void) * instructions. And on the local CPU you need to be protected again NMI or MCE * handlers seeing an inconsistent instruction while you patch. */ -void *__init_or_module text_poke_early(void *addr, const void *opcode, - size_t len) +void __init_or_module text_poke_early(void *addr, const void *opcode, + size_t len) { unsigned long flags; + + if (boot_cpu_has(X86_FEATURE_NX) && + is_module_text_address((unsigned long)addr)) { + /* + * Modules text is marked initially as non-executable, so the + * code cannot be running and speculative code-fetches are + * prevented. Just change the code. + */ + memcpy(addr, opcode, len); + } else { + local_irq_save(flags); + memcpy(addr, opcode, len); + local_irq_restore(flags); + sync_core(); + + /* + * Could also do a CLFLUSH here to speed up CPU recovery; but + * that causes hangs on some VIA CPUs. + */ + } +} + +__ro_after_init struct mm_struct *poking_mm; +__ro_after_init unsigned long poking_addr; + +static void *__text_poke(void *addr, const void *opcode, size_t len) +{ + bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; + struct page *pages[2] = {NULL}; + temp_mm_state_t prev; + unsigned long flags; + pte_t pte, *ptep; + spinlock_t *ptl; + pgprot_t pgprot; + + /* + * While boot memory allocator is running we cannot use struct pages as + * they are not yet initialized. There is no way to recover. + */ + BUG_ON(!after_bootmem); + + if (!core_kernel_text((unsigned long)addr)) { + pages[0] = vmalloc_to_page(addr); + if (cross_page_boundary) + pages[1] = vmalloc_to_page(addr + PAGE_SIZE); + } else { + pages[0] = virt_to_page(addr); + WARN_ON(!PageReserved(pages[0])); + if (cross_page_boundary) + pages[1] = virt_to_page(addr + PAGE_SIZE); + } + /* + * If something went wrong, crash and burn since recovery paths are not + * implemented. + */ + BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); + local_irq_save(flags); - memcpy(addr, opcode, len); + + /* + * Map the page without the global bit, as TLB flushing is done with + * flush_tlb_mm_range(), which is intended for non-global PTEs. + */ + pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); + + /* + * The lock is not really needed, but this allows to avoid open-coding. + */ + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + + /* + * This must not fail; preallocated in poking_init(). 
+ */ + VM_BUG_ON(!ptep); + + pte = mk_pte(pages[0], pgprot); + set_pte_at(poking_mm, poking_addr, ptep, pte); + + if (cross_page_boundary) { + pte = mk_pte(pages[1], pgprot); + set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + } + + /* + * Loading the temporary mm behaves as a compiler barrier, which + * guarantees that the PTE will be set at the time memcpy() is done. + */ + prev = use_temporary_mm(poking_mm); + + kasan_disable_current(); + memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); + kasan_enable_current(); + + /* + * Ensure that the PTE is only cleared after the instructions of memcpy + * were issued by using a compiler barrier. + */ + barrier(); + + pte_clear(poking_mm, poking_addr, ptep); + if (cross_page_boundary) + pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + + /* + * Loading the previous page-table hierarchy requires a serializing + * instruction that already allows the core to see the updated version. + * Xen-PV is assumed to serialize execution in a similar manner. + */ + unuse_temporary_mm(prev); + + /* + * Flushing the TLB might involve IPIs, which would require enabled + * IRQs, but not if the mm is not used, as it is at this point. + */ + flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, + PAGE_SHIFT, false); + + /* + * If the text does not match what we just wrote then something is + * fundamentally screwy; there's nothing we can really do about that. + */ + BUG_ON(memcmp(addr, opcode, len)); + + pte_unmap_unlock(ptep, ptl); local_irq_restore(flags); return addr; } @@ -689,48 +810,36 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. + * + * Note that the caller must ensure that if the modified code is part of a + * module, the module would not be removed during poking. This can be achieved + * by registering a module notifier, and ordering module removal and patching + * through a mutex. */ void *text_poke(void *addr, const void *opcode, size_t len) { - unsigned long flags; - char *vaddr; - struct page *pages[2]; - int i; - - /* - * While boot memory allocator is runnig we cannot use struct - * pages as they are not yet initialized. - */ - BUG_ON(!after_bootmem); - lockdep_assert_held(&text_mutex); - if (!core_kernel_text((unsigned long)addr)) { - pages[0] = vmalloc_to_page(addr); - pages[1] = vmalloc_to_page(addr + PAGE_SIZE); - } else { - pages[0] = virt_to_page(addr); - WARN_ON(!PageReserved(pages[0])); - pages[1] = virt_to_page(addr + PAGE_SIZE); - } - BUG_ON(!pages[0]); - local_irq_save(flags); - set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); - if (pages[1]) - set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); - vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); - memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - clear_fixmap(FIX_TEXT_POKE0); - if (pages[1]) - clear_fixmap(FIX_TEXT_POKE1); - local_flush_tlb(); - sync_core(); - /* Could also do a CLFLUSH here to speed up CPU recovery; but - that causes hangs on some VIA CPUs.
*/ - for (i = 0; i < len; i++) - BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); - local_irq_restore(flags); - return addr; + return __text_poke(addr, opcode, len); +} + +/** + * text_poke_kgdb - Update instructions on a live kernel by kgdb + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Only atomic text poke/set should be allowed when not doing early patching. + * It means the size must be writable atomically and the address must be aligned + * in a way that permits an atomic write. It also makes sure we fit on a single + * page. + * + * Context: should only be used by kgdb, which ensures no other core is running, + * despite the fact it does not hold the text_mutex. + */ +void *text_poke_kgdb(void *addr, const void *opcode, size_t len) +{ + return __text_poke(addr, opcode, len); } static void do_sync_core(void *info) @@ -788,7 +897,7 @@ NOKPROBE_SYMBOL(poke_int3_handler); * replacing opcode * - sync cores */ -void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) { unsigned char int3 = 0xcc; @@ -830,7 +939,5 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) * the writing of the new instruction. */ bp_patching_in_progress = false; - - return addr; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b7bcdd7816513a0eeb4def5ca0b58b752c161a84..ab6af775f06c2235062a35b8e4003736ad2d0940 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -802,6 +802,24 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) return 0; } +static int __init lapic_init_clockevent(void) +{ + if (!lapic_timer_frequency) + return -1; + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, + TICK_NSEC, lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; + + return 0; +} + static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = this_cpu_ptr(&lapic_events); @@ -810,25 +828,21 @@ static int __init calibrate_APIC_clock(void) long delta, deltatsc; int pm_referenced = 0; - /** - * check if lapic timer has already been calibrated by platform - * specific routine, such as tsc calibration code. if so, we just fill + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return 0; + + /* + * Check if lapic timer has already been calibrated by platform + * specific routine, such as tsc calibration code. If so just fill * in the clockevent structure and return. 
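For review, the scaled math that lapic_init_clockevent() sets up reduces to mult being the timer's cycles-per-nanosecond ratio shifted left, so that programming a delta is cycles = (ns * mult) >> shift. A hedged numeric check with made-up values (the frequency below is illustrative, not from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t hz = 25000000;		/* assumed 25 MHz timer */
	unsigned int shift = 32;
	/* div_sc()-style factor: (hz << shift) / NSEC_PER_SEC */
	uint64_t mult = (hz << shift) / 1000000000ULL;
	uint64_t ns = 1000000;		/* program a 1 ms delta */

	/* ~25000 cycles for 1 ms at 25 MHz */
	printf("mult=%llu cycles=%llu\n", (unsigned long long)mult,
	       (unsigned long long)((ns * mult) >> shift));
	return 0;
}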
*/ - - if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { - return 0; - } else if (lapic_timer_frequency) { + if (!lapic_init_clockevent()) { apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", - lapic_timer_frequency); - lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, - TICK_NSEC, lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.max_delta_ticks = 0x7FFFFF; - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - lapic_clockevent.min_delta_ticks = 0xF; + lapic_timer_frequency); + /* + * Direct calibration methods must have an always running + * local APIC timer, no need for broadcast timer. + */ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; return 0; } @@ -869,17 +883,8 @@ static int __init calibrate_APIC_clock(void) pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, &delta, &deltatsc); - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); - lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - lapic_clockevent.min_delta_ticks = 0xF; - lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + lapic_init_clockevent(); apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 78778b54f904a8f15c24fa257a4699b232d3f842..a5464b8b6c464d117d8d2e03c8273bc21531f412 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -175,7 +175,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) this_cpu_write(cpu_llc_id, node); /* Account for nodes per socket in multi-core-module processors */ - if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { + if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index ddced33184b552ba0ad3a17267d7a99291591d1c..d3d075226c0aa39761e9f4a33ae05cec321d36d7 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -68,10 +68,12 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); + DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) - + offsetof(struct cea_exception_stacks, DB1_stack)); BLANK(); #ifdef CONFIG_STACKPROTECTOR - DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); + DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); BLANK(); #endif diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cfd24f9f761442392c9d42540835c31001bda71f..1796d2bdcaaa2f1f23bf420b92e8be54bcd9bee1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -28,7 +28,7 @@ obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o intel_epb.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/amd.c 
b/arch/x86/kernel/cpu/amd.c index 01004bfb1a1bcdd4a9f5b41c987939fe43f6ec62..fb6a64bd765fc99c9362aedd8e64240eff267852 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -82,11 +82,14 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) * performance at the same time.. */ +#ifdef CONFIG_X86_32 extern __visible void vide(void); -__asm__(".globl vide\n" +__asm__(".text\n" + ".globl vide\n" ".type vide, @function\n" ".align 4\n" "vide: ret\n"); +#endif static void init_amd_k5(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 804c49493938bfc06a8a5f91507a36f993b467b9..64d5aec24203fcebd54563248a8fad3ae130b470 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -83,7 +83,7 @@ unsigned int aperfmperf_get_khz(int cpu) if (!cpu_khz) return 0; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; aperfmperf_snapshot_cpu(cpu, ktime_get(), true); @@ -99,7 +99,7 @@ void arch_freq_prepare_all(void) if (!cpu_khz) return; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return; for_each_online_cpu(cpu) @@ -115,7 +115,7 @@ unsigned int arch_freq_get_on_cpu(int cpu) if (!cpu_khz) return 0; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 2da82eff0eb4f8498c8cdd65bd9f9dd5fa1fa6eb..29630393f300733a7a750f3a4eec24091a3e958a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -275,7 +275,7 @@ static const struct { const char *option; enum spectre_v2_user_cmd cmd; bool secure; -} v2_user_options[] __initdata = { +} v2_user_options[] __initconst = { { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, { "off", SPECTRE_V2_USER_CMD_NONE, false }, { "on", SPECTRE_V2_USER_CMD_FORCE, true }, @@ -419,7 +419,7 @@ static const struct { const char *option; enum spectre_v2_mitigation_cmd cmd; bool secure; -} mitigation_options[] __initdata = { +} mitigation_options[] __initconst = { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, @@ -440,7 +440,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || + cpu_mitigations_off()) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); @@ -658,7 +659,7 @@ static const char * const ssb_strings[] = { static const struct { const char *option; enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] __initdata = { +} ssb_mitigation_options[] __initconst = { { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ @@ -672,7 +673,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || + cpu_mitigations_off()) { return SPEC_STORE_BYPASS_CMD_NONE; } else { ret = 
cmdline_find_option(boot_command_line, "spec_store_bypass_disable", @@ -1008,6 +1010,11 @@ static void __init l1tf_select_mitigation(void) if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; + if (cpu_mitigations_off()) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb28e98a0659abc5051b1e7fcb936b692a1a1264..37640544e12fd1e874b6c6fa6725a9c25084c7ca 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -507,19 +507,6 @@ void load_percpu_segment(int cpu) DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); #endif -#ifdef CONFIG_X86_64 -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ -}; -#endif - /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) { @@ -1511,9 +1498,9 @@ static __init int setup_clearcpuid(char *arg) __setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 -DEFINE_PER_CPU_FIRST(union irq_stack_union, - irq_stack_union) __aligned(PAGE_SIZE) __visible; -EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union); +DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, + fixed_percpu_data) __aligned(PAGE_SIZE) __visible; +EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); /* * The following percpu variables are hot. Align current_task to @@ -1523,9 +1510,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; - +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; @@ -1562,23 +1547,7 @@ void syscall_init(void) X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - -static DEFINE_PER_CPU(unsigned long, debug_stack_addr); DEFINE_PER_CPU(int, debug_stack_usage); - -int is_debug_stack(unsigned long addr) -{ - return __this_cpu_read(debug_stack_usage) || - (addr <= __this_cpu_read(debug_stack_addr) && - addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ)); -} -NOKPROBE_SYMBOL(is_debug_stack); - DEFINE_PER_CPU(u32, debug_idt_ctr); void debug_stack_set_zero(void) @@ -1668,7 +1637,7 @@ static void setup_getcpu(int cpu) unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); struct desc_struct d = { }; - if (static_cpu_has(X86_FEATURE_RDTSCP)) + if (boot_cpu_has(X86_FEATURE_RDTSCP)) write_rdtscp_aux(cpudata); /* Store CPU and node number in limit. */ @@ -1690,17 +1659,14 @@ static void setup_getcpu(int cpu) * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. 
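[ The irq_stack_union -> fixed_percpu_data rename above decouples the IRQ
  stack from the start of the per-CPU area while keeping the stack-protector
  canary at its ABI-mandated location: the compiler emits %gs:40 accesses for
  CONFIG_STACKPROTECTOR, so the canary must stay at offset 40. A minimal
  sketch of that layout constraint, mirroring the patch's struct: ]

#include <stdio.h>
#include <stddef.h>

struct fixed_percpu_data {
	char gs_base[40];            /* padding that pins the canary at 40 */
	unsigned long stack_canary;
};

int main(void)
{
	/* must print 40, or stack-protector code would read garbage */
	printf("stack_canary offset = %zu\n",
	       offsetof(struct fixed_percpu_data, stack_canary));
	return 0;
}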
- * A lot of state is already set up in PDA init for 64 bit */ #ifdef CONFIG_X86_64 void cpu_init(void) { - struct orig_ist *oist; + int cpu = raw_smp_processor_id(); struct task_struct *me; struct tss_struct *t; - unsigned long v; - int cpu = raw_smp_processor_id(); int i; wait_for_master_cpu(cpu); @@ -1715,7 +1681,6 @@ void cpu_init(void) load_ucode_ap(); t = &per_cpu(cpu_tss_rw, cpu); - oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA if (this_cpu_read(numa_node) == 0 && @@ -1753,16 +1718,11 @@ void cpu_init(void) /* * set up and load the per-CPU TSS */ - if (!oist->ist[0]) { - char *estacks = get_cpu_entry_area(cpu)->exception_stacks; - - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - estacks += exception_stack_sizes[v]; - oist->ist[v] = t->x86_tss.ist[v] = - (unsigned long)estacks; - if (v == DEBUG_STACK-1) - per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; - } + if (!t->x86_tss.ist[0]) { + t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); + t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); + t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); + t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); } t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; @@ -1864,23 +1824,6 @@ void cpu_init(void) } #endif -static void bsp_resume(void) -{ - if (this_cpu->c_bsp_resume) - this_cpu->c_bsp_resume(&boot_cpu_data); -} - -static struct syscore_ops cpu_syscore_ops = { - .resume = bsp_resume, -}; - -static int __init init_cpu_syscore(void) -{ - register_syscore_ops(&cpu_syscore_ops); - return 0; -} -core_initcall(init_cpu_syscore); - /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 5eb946b9a9f36d3e1dadad391c3df5c7658f7c71..c0e2407abdd6ab0342982b795f3deb8a5037d474 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -14,7 +14,6 @@ struct cpu_dev { void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); void (*c_detect_tlb)(struct cpuinfo_x86 *); - void (*c_bsp_resume)(struct cpuinfo_x86 *); int c_x86_vendor; #ifdef CONFIG_X86_32 /* Optional vendor specific routine to obtain the cache size. */ diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index cf25405444ab37814d11073ccc2e949327c2fe8b..415621ddb8a236a232b2974a0a19cc373b677051 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -19,6 +19,8 @@ #include "cpu.h" +#define APICID_SOCKET_ID_BIT 6 + /* * nodes_per_socket: Stores the number of nodes per socket. * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8] @@ -87,6 +89,9 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); + /* Socket ID is ApicId[6] for these processors. */ + c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; + cacheinfo_hygon_init_llc_id(c, cpu, node_id); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fc3c07fe7df58a22c01c8c1180d0b394bde8b59a..f17c1a714779b5a77cdedbae01a2091d9bce361a 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -596,36 +596,6 @@ detect_keyid_bits: c->x86_phys_bits -= keyid_bits; } -static void init_intel_energy_perf(struct cpuinfo_x86 *c) -{ - u64 epb; - - /* - * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized. 
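[ The hygon.c hunk above derives the package ID directly from the APIC ID:
  everything at and above bit 6 selects the socket. A tiny model of that
  extraction; the APIC ID value is made up for illustration: ]

#include <stdio.h>

#define APICID_SOCKET_ID_BIT 6

int main(void)
{
	unsigned int apicid = 0x45;   /* illustrative APIC ID */

	/* phys_proc_id = apicid >> 6, as in hygon_get_topology() */
	printf("apicid=0x%x -> socket %u\n",
	       apicid, apicid >> APICID_SOCKET_ID_BIT);
	return 0;
}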
- * (x86_energy_perf_policy(8) is available to change it at run-time.) - */ - if (!cpu_has(c, X86_FEATURE_EPB)) - return; - - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); - if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE) - return; - - pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); - pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); - epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); -} - -static void intel_bsp_resume(struct cpuinfo_x86 *c) -{ - /* - * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume, - * so reinitialize it properly like during bootup: - */ - init_intel_energy_perf(c); -} - static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; @@ -763,8 +733,6 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_TME)) detect_tme(c); - init_intel_energy_perf(c); - init_intel_misc_features(c); } @@ -1023,9 +991,7 @@ static const struct cpu_dev intel_cpu_dev = { .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, .c_init = init_intel, - .c_bsp_resume = intel_bsp_resume, .c_x86_vendor = X86_VENDOR_INTEL, }; cpu_dev_register(intel_cpu_dev); - diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c new file mode 100644 index 0000000000000000000000000000000000000000..f4dd73396f2882b2780e36e1f0bc88b42722f417 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Performance and Energy Bias Hint support. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Rafael J. Wysocki + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/** + * DOC: overview + * + * The Performance and Energy Bias Hint (EPB) allows software to specify its + * preference with respect to the power-performance tradeoffs present in the + * processor. Generally, the EPB is expected to be set by user space (directly + * via sysfs or with the help of the x86_energy_perf_policy tool), but there are + * two reasons for the kernel to update it. + * + * First, there are systems where the platform firmware resets the EPB during + * system-wide transitions from sleep states back into the working state + * effectively causing the previous EPB updates by user space to be lost. + * Thus the kernel needs to save the current EPB values for all CPUs during + * system-wide transitions to sleep states and restore them on the way back to + * the working state. That can be achieved by saving EPB for secondary CPUs + * when they are taken offline during transitions into system sleep states and + * for the boot CPU in a syscore suspend operation, so that it can be restored + * for the boot CPU in a syscore resume operation and for the other CPUs when + * they are brought back online. However, CPUs that are already offline when + * a system-wide PM transition is started are not taken offline again, but their + * EPB values may still be reset by the platform firmware during the transition, + * so in fact it is necessary to save the EPB of any CPU taken offline and to + * restore it when the given CPU goes back online at all times. + * + * Second, on many systems the initial EPB value coming from the platform + * firmware is 0 ('performance') and at least on some of them that is because + * the platform firmware does not initialize EPB at all with the assumption that + * the OS will do that anyway. 
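[ The rest of intel_epb.c implements the save/restore scheme the DOC comment
  above describes. A self-contained model of the mechanism, with the MSR and
  the per-CPU variable replaced by plain globals so it can run standalone: ]

#include <stdio.h>
#include <stdint.h>

#define EPB_MASK   0x0fULL
#define EPB_SAVED  0x10ULL
#define ENERGY_PERF_BIAS_PERFORMANCE 0
#define ENERGY_PERF_BIAS_NORMAL      6

static uint64_t msr_epb;   /* stands in for MSR_IA32_ENERGY_PERF_BIAS */
static uint8_t saved_epb;  /* stands in for the per-CPU saved_epb */

static void epb_save(void)
{
	/* EPB_SAVED keeps the slot nonzero even if the saved value is 0 */
	saved_epb = (uint8_t)((msr_epb & EPB_MASK) | EPB_SAVED);
}

static void epb_restore(void)
{
	uint64_t val = saved_epb;

	if (val) {
		val &= EPB_MASK;        /* restore what was saved earlier */
	} else {
		/* first online: EPB 0 means firmware never set it up */
		val = msr_epb & EPB_MASK;
		if (val == ENERGY_PERF_BIAS_PERFORMANCE)
			val = ENERGY_PERF_BIAS_NORMAL;
	}
	msr_epb = (msr_epb & ~EPB_MASK) | val;
}

int main(void)
{
	epb_restore();                  /* boot: 0 -> 6 ('normal') */
	printf("after first online: %llu\n", (unsigned long long)msr_epb);
	epb_save();
	msr_epb = 0;                    /* firmware reset during suspend */
	epb_restore();                  /* comes back as 6, not 0 */
	printf("after resume: %llu\n", (unsigned long long)msr_epb);
	return 0;
}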
That sometimes is problematic, as it may cause + * the system battery to drain too fast, for example, so it is better to adjust + * it on CPU bring-up and if the initial EPB value for a given CPU is 0, the + * kernel changes it to 6 ('normal'). + */ + +static DEFINE_PER_CPU(u8, saved_epb); + +#define EPB_MASK 0x0fULL +#define EPB_SAVED 0x10ULL +#define MAX_EPB EPB_MASK + +static int intel_epb_save(void) +{ + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + /* + * Ensure that saved_epb will always be nonzero after this write even if + * the EPB value read from the MSR is 0. + */ + this_cpu_write(saved_epb, (epb & EPB_MASK) | EPB_SAVED); + + return 0; +} + +static void intel_epb_restore(void) +{ + u64 val = this_cpu_read(saved_epb); + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if (val) { + val &= EPB_MASK; + } else { + /* + * Because intel_epb_save() has not run for the current CPU yet, + * it is going online for the first time, so if its EPB value is + * 0 ('performance') at this point, assume that it has not been + * initialized by the platform firmware and set it to 6 + * ('normal'). + */ + val = epb & EPB_MASK; + if (val == ENERGY_PERF_BIAS_PERFORMANCE) { + val = ENERGY_PERF_BIAS_NORMAL; + pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + } + } + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); +} + +static struct syscore_ops intel_epb_syscore_ops = { + .suspend = intel_epb_save, + .resume = intel_epb_restore, +}; + +static const char * const energy_perf_strings[] = { + "performance", + "balance-performance", + "normal", + "balance-power", + "power" +}; +static const u8 energ_perf_values[] = { + ENERGY_PERF_BIAS_PERFORMANCE, + ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, + ENERGY_PERF_BIAS_NORMAL, + ENERGY_PERF_BIAS_BALANCE_POWERSAVE, + ENERGY_PERF_BIAS_POWERSAVE +}; + +static ssize_t energy_perf_bias_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + unsigned int cpu = dev->id; + u64 epb; + int ret; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + return sprintf(buf, "%llu\n", epb); +} + +static ssize_t energy_perf_bias_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int cpu = dev->id; + u64 epb, val; + int ret; + + ret = __sysfs_match_string(energy_perf_strings, + ARRAY_SIZE(energy_perf_strings), buf); + if (ret >= 0) + val = energ_perf_values[ret]; + else if (kstrtou64(buf, 0, &val) || val > MAX_EPB) + return -EINVAL; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, + (epb & ~EPB_MASK) | val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR_RW(energy_perf_bias); + +static struct attribute *intel_epb_attrs[] = { + &dev_attr_energy_perf_bias.attr, + NULL +}; + +static const struct attribute_group intel_epb_attr_group = { + .name = power_group_name, + .attrs = intel_epb_attrs +}; + +static int intel_epb_online(unsigned int cpu) +{ + struct device *cpu_dev = get_cpu_device(cpu); + + intel_epb_restore(); + if (!cpuhp_tasks_frozen) + sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group); + + return 0; +} + +static int intel_epb_offline(unsigned int cpu) +{ + struct device *cpu_dev = get_cpu_device(cpu); + + if (!cpuhp_tasks_frozen) + sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group); + + intel_epb_save(); + return 0; +} + +static __init int intel_epb_init(void) +{ + int ret; + + if 
(!boot_cpu_has(X86_FEATURE_EPB)) + return -ENODEV; + + ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE, + "x86/intel/epb:online", intel_epb_online, + intel_epb_offline); + if (ret < 0) + goto err_out_online; + + register_syscore_ops(&intel_epb_syscore_ops); + return 0; + +err_out_online: + cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE); + return ret; +} +subsys_initcall(intel_epb_init); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index e64de5149e50e8c35518cf56ed4d0ebc61e9c78f..d904aafe640945cf809dcbaa9c1ab549bec1b508 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -563,33 +563,59 @@ out: return offset; } +bool amd_filter_mce(struct mce *m) +{ + enum smca_bank_types bank_type = smca_get_bank_type(m->bank); + struct cpuinfo_x86 *c = &boot_cpu_data; + u8 xec = (m->status >> 16) & 0x3F; + + /* See Family 17h Models 10h-2Fh Erratum #1114. */ + if (c->x86 == 0x17 && + c->x86_model >= 0x10 && c->x86_model <= 0x2F && + bank_type == SMCA_IF && xec == 10) + return true; + + return false; +} + /* - * Turn off MC4_MISC thresholding banks on all family 0x15 models since - * they're not supported there. + * Turn off thresholding banks for the following conditions: + * - MC4_MISC thresholding is not supported on Family 0x15. + * - Prevent possible spurious interrupts from the IF bank on Family 0x17 + * Models 0x10-0x2F due to Erratum #1114. */ -void disable_err_thresholding(struct cpuinfo_x86 *c) +void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) { - int i; + int i, num_msrs; u64 hwcr; bool need_toggle; - u32 msrs[] = { - 0x00000413, /* MC4_MISC0 */ - 0xc0000408, /* MC4_MISC1 */ - }; + u32 msrs[NR_BLOCKS]; + + if (c->x86 == 0x15 && bank == 4) { + msrs[0] = 0x00000413; /* MC4_MISC0 */ + msrs[1] = 0xc0000408; /* MC4_MISC1 */ + num_msrs = 2; + } else if (c->x86 == 0x17 && + (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) { - if (c->x86 != 0x15) + if (smca_get_bank_type(bank) != SMCA_IF) + return; + + msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank); + num_msrs = 1; + } else { return; + } rdmsrl(MSR_K7_HWCR, hwcr); /* McStatusWrEn has to be set */ need_toggle = !(hwcr & BIT(18)); - if (need_toggle) wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); /* Clear CntP bit safely */ - for (i = 0; i < ARRAY_SIZE(msrs); i++) + for (i = 0; i < num_msrs; i++) msr_clear_bit(msrs[i], 62); /* restore old settings */ @@ -604,12 +630,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) unsigned int bank, block, cpu = smp_processor_id(); int offset = -1; - disable_err_thresholding(c); - for (bank = 0; bank < mca_cfg.banks; ++bank) { if (mce_flags.smca) smca_configure(bank, cpu); + disable_err_thresholding(c, bank); + for (block = 0; block < NR_BLOCKS; ++block) { address = get_block_address(address, low, high, bank, block); if (!address) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b7fb541a4873f7803b216b7987fa66517bc5fff6..5112a50e6486fdd4c96083375e6e919591bab90b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -460,23 +460,6 @@ static void mce_irq_work_cb(struct irq_work *entry) mce_schedule_work(); } -static void mce_report_event(struct pt_regs *regs) -{ - if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { - mce_notify_irq(); - /* - * Triggering the work queue here is just an insurance - * policy in case the syscall exit notify handler - * doesn't run soon enough or ends up running on the - * wrong CPU (can happen when audit sleeps) - */ - mce_schedule_work(); - return; - } - - 
irq_work_queue(&mce_irq_work); -} - /* * Check if the address reported by the CPU is in a format we can parse. * It would be possible to add code for most other cases, but all would @@ -712,19 +695,49 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) barrier(); m.status = mce_rdmsrl(msr_ops.status(i)); + + /* If this entry is not valid, ignore it */ if (!(m.status & MCI_STATUS_VAL)) continue; /* - * Uncorrected or signalled events are handled by the exception - * handler when it is enabled, so don't process those here. - * - * TBD do the same check for MCI_STATUS_EN here? + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. */ - if (!(flags & MCP_UC) && - (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) - continue; + if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) + goto log_it; + + /* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ + if (!mca_cfg.ser) { + if (m.status & MCI_STATUS_UC) + continue; + goto log_it; + } + /* Log "not enabled" (speculative) errors */ + if (!(m.status & MCI_STATUS_EN)) + goto log_it; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) + goto log_it; + + /* + * Skip anything else. Presumption is that our read of this + * bank is racing with a machine check. Leave the log alone + * for do_machine_check() to deal with it. + */ + continue; + +log_it: error_seen = true; mce_read_aux(&m, i); @@ -1301,7 +1314,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_panic("Fatal machine check on current CPU", &m, msg); if (worst > 0) - mce_report_event(regs); + irq_work_queue(&mce_irq_work); + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); sync_core(); @@ -1451,13 +1465,12 @@ EXPORT_SYMBOL_GPL(mce_notify_irq); static int __mcheck_cpu_mce_banks_init(void) { int i; - u8 num_banks = mca_cfg.banks; - mce_banks = kcalloc(num_banks, sizeof(struct mce_bank), GFP_KERNEL); + mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL); if (!mce_banks) return -ENOMEM; - for (i = 0; i < num_banks; i++) { + for (i = 0; i < MAX_NR_BANKS; i++) { struct mce_bank *b = &mce_banks[i]; b->ctl = -1ULL; @@ -1471,28 +1484,19 @@ static int __mcheck_cpu_mce_banks_init(void) */ static int __mcheck_cpu_cap_init(void) { - unsigned b; u64 cap; + u8 b; rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - if (!mca_cfg.banks) - pr_info("CPU supports %d MCE banks\n", b); - - if (b > MAX_NR_BANKS) { - pr_warn("Using only %u machine check banks out of %u\n", - MAX_NR_BANKS, b); + if (WARN_ON_ONCE(b > MAX_NR_BANKS)) b = MAX_NR_BANKS; - } - /* Don't support asymmetric configurations today */ - WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); - mca_cfg.banks = b; + mca_cfg.banks = max(mca_cfg.banks, b); if (!mce_banks) { int err = __mcheck_cpu_mce_banks_init(); - if (err) return err; } @@ -1771,6 +1775,14 @@ static void __mcheck_cpu_init_timer(void) mce_start_timer(t); } +bool filter_mce(struct mce *m) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return amd_filter_mce(m); + + return false; +} + /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { @@ -2425,8 +2437,8 @@ static int fake_panic_set(void *data, u64 val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, 
fake_panic_get, - fake_panic_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, + "%llu\n"); static int __init mcheck_debugfs_init(void) { @@ -2435,8 +2447,8 @@ static int __init mcheck_debugfs_init(void) dmce = mce_get_debugfs_dir(); if (!dmce) return -ENOMEM; - ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, - &fake_panic_fops); + ffake_panic = debugfs_create_file_unsafe("fake_panic", 0444, dmce, + NULL, &fake_panic_fops); if (!ffake_panic) return -ENOMEM; @@ -2451,6 +2463,8 @@ EXPORT_SYMBOL_GPL(mcsafe_key); static int __init mcheck_late_init(void) { + pr_info("Using %d MCE banks\n", mca_cfg.banks); + if (mca_cfg.recovery) static_branch_inc(&mcsafe_key); diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index 3395549c51d3f74502c18060ac5160ad934a1b48..64d1d5a00f397acae1ba23d8d97efe96f86a9d0c 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -99,6 +99,9 @@ int mce_gen_pool_add(struct mce *mce) { struct mce_evt_llist *node; + if (filter_mce(mce)) + return -EINVAL; + if (!mce_evt_pool) return -EINVAL; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 8492ef7d9015086fb44e08ec532438bf43056d5c..a6026170af925cbd86ef21e6fdcf46c2e8247cf2 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -46,8 +46,6 @@ static struct mce i_mce; static struct dentry *dfs_inj; -static u8 n_banks; - #define MAX_FLAG_OPT_SIZE 4 #define NBCFG 0x44 @@ -528,7 +526,7 @@ static void do_inject(void) * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for * Fam10h and later BKDGs. */ - if (static_cpu_has(X86_FEATURE_AMD_DCM) && + if (boot_cpu_has(X86_FEATURE_AMD_DCM) && b == 4 && boot_cpu_data.x86 < 0x17) { toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); @@ -570,9 +568,15 @@ err: static int inj_bank_set(void *data, u64 val) { struct mce *m = (struct mce *)data; + u8 n_banks; + u64 cap; + + /* Get bank count on target CPU so we can handle non-uniform values. */ + rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); + n_banks = cap & MCG_BANKCNT_MASK; if (val >= n_banks) { - pr_err("Non-existent MCE bank: %llu\n", val); + pr_err("MCA bank %llu non-existent on CPU%d\n", val, m->extcpu); return -EINVAL; } @@ -665,10 +669,6 @@ static struct dfs_node { static int __init debugfs_init(void) { unsigned int i; - u64 cap; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - n_banks = cap & MCG_BANKCNT_MASK; dfs_inj = debugfs_create_dir("mce-inject", NULL); if (!dfs_inj) diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index af5eab1e65e2707416ecab9e179c3255ec1320a3..a34b55baa7aa8e4eb3ac69ebc11a2cc10e4da278 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -173,4 +173,13 @@ struct mca_msr_regs { extern struct mca_msr_regs msr_ops; +/* Decide whether to add MCE record to MCE event pool or filter it out. 
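[ filter_mce()/amd_filter_mce() declared above implement the Erratum #1114
  screening added in mce/amd.c. A standalone model of the check; the
  bank-to-type mapping is stubbed, the field extraction matches the patch: ]

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

enum smca_bank_types { SMCA_LS, SMCA_IF, SMCA_L2_CACHE };

/* illustrative stand-in for the real SMCA bank lookup */
static enum smca_bank_types smca_get_bank_type(int bank)
{
	return bank == 1 ? SMCA_IF : SMCA_LS;
}

static bool amd_filter_mce(int bank, uint64_t status,
			   unsigned int family, unsigned int model)
{
	uint8_t xec = (status >> 16) & 0x3F;  /* extended error code */

	/* Family 17h Models 10h-2Fh, Erratum #1114: spurious IF-bank
	 * errors with extended error code 10 must be discarded. */
	return family == 0x17 && model >= 0x10 && model <= 0x2F &&
	       smca_get_bank_type(bank) == SMCA_IF && xec == 10;
}

int main(void)
{
	uint64_t status = (uint64_t)10 << 16;  /* xec = 10 */

	printf("filtered: %d\n", amd_filter_mce(1, status, 0x17, 0x18));
	return 0;
}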
*/ +extern bool filter_mce(struct mce *m); + +#ifdef CONFIG_X86_MCE_AMD +extern bool amd_filter_mce(struct mce *m); +#else +static inline bool amd_filter_mce(struct mce *m) { return false; }; +#endif + #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 5260185cbf7ba1a77ecc30bdd61a99a2338b159b..8a4a7823451acf2d9d9668f0ddf2a29af3a444f9 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -418,8 +418,9 @@ static int do_microcode_update(const void __user *buf, size_t size) if (ustate == UCODE_ERROR) { error = -1; break; - } else if (ustate == UCODE_OK) + } else if (ustate == UCODE_NEW) { apply_microcode_on_target(cpu); + } } return error; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 16936a24795c8457b789f7b428216cfc5bd1fb4e..a44bdbe7c55eb4706976155afc5296d48c4f519b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -861,32 +862,33 @@ out: return ret; } -static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; - int new_rev = uci->cpu_sig.rev; - unsigned int leftover = size; unsigned int curr_mc_size = 0, new_mc_size = 0; - unsigned int csig, cpf; enum ucode_state ret = UCODE_OK; + int new_rev = uci->cpu_sig.rev; + u8 *new_mc = NULL, *mc = NULL; + unsigned int csig, cpf; - while (leftover) { + while (iov_iter_count(iter)) { struct microcode_header_intel mc_header; - unsigned int mc_size; + unsigned int mc_size, data_size; + u8 *data; - if (leftover < sizeof(mc_header)) { - pr_err("error! Truncated header in microcode data file\n"); + if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) { + pr_err("error! Truncated or inaccessible header in microcode data file\n"); break; } - if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) - break; - mc_size = get_totalsize(&mc_header); - if (!mc_size || mc_size > leftover) { - pr_err("error! Bad data in microcode data file\n"); + if (mc_size < sizeof(mc_header)) { + pr_err("error! Bad data in microcode data file (totalsize too small)\n"); + break; + } + data_size = mc_size - sizeof(mc_header); + if (data_size > iov_iter_count(iter)) { + pr_err("error! 
Bad data in microcode data file (truncated file?)\n"); break; } @@ -899,7 +901,9 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, curr_mc_size = mc_size; } - if (get_ucode_data(mc, ucode_ptr, mc_size) || + memcpy(mc, &mc_header, sizeof(mc_header)); + data = mc + sizeof(mc_header); + if (!copy_from_iter_full(data, data_size, iter) || microcode_sanity_check(mc, 1) < 0) { break; } @@ -914,14 +918,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, mc = NULL; /* trigger new vmalloc */ ret = UCODE_NEW; } - - ucode_ptr += mc_size; - leftover -= mc_size; } vfree(mc); - if (leftover) { + if (iov_iter_count(iter)) { vfree(new_mc); return UCODE_ERROR; } @@ -945,12 +946,6 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, return ret; } -static int get_ucode_fw(void *to, const void *from, size_t n) -{ - memcpy(to, from, n); - return 0; -} - static bool is_blacklisted(unsigned int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -977,10 +972,12 @@ static bool is_blacklisted(unsigned int cpu) static enum ucode_state request_microcode_fw(int cpu, struct device *device, bool refresh_fw) { - char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; + struct iov_iter iter; enum ucode_state ret; + struct kvec kvec; + char name[30]; if (is_blacklisted(cpu)) return UCODE_NFOUND; @@ -993,26 +990,30 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, return UCODE_NFOUND; } - ret = generic_load_microcode(cpu, (void *)firmware->data, - firmware->size, &get_ucode_fw); + kvec.iov_base = (void *)firmware->data; + kvec.iov_len = firmware->size; + iov_iter_kvec(&iter, WRITE, &kvec, 1, firmware->size); + ret = generic_load_microcode(cpu, &iter); release_firmware(firmware); return ret; } -static int get_ucode_user(void *to, const void *from, size_t n) -{ - return copy_from_user(to, from, n); -} - static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { + struct iov_iter iter; + struct iovec iov; + if (is_blacklisted(cpu)) return UCODE_NFOUND; - return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); + iov.iov_base = (void __user *)buf; + iov.iov_len = size; + iov_iter_init(&iter, WRITE, &iov, 1, size); + + return generic_load_microcode(cpu, &iter); } static struct microcode_ops microcode_intel_ops = { diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 2c8522a39ed5dbc388bada821ed144f2435adac2..cb2e49810d687fe67ae304edcb480469b95480b7 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -35,11 +35,11 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: yes\n", - static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", - static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", - static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", - static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", - static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? 
"yes" : "no", c->cpuid_level); } #else diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 2dbd990a2eb78b30a8208da673bf06f936945e3a..89320c0396b1f2266beee67f8492a4dd8e4915c5 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -342,10 +342,10 @@ int update_domains(struct rdt_resource *r, int closid) if (cpumask_empty(cpu_mask) || mba_sc) goto done; cpu = get_cpu(); - /* Update CBM on this cpu if it's in cpu_mask. */ + /* Update resource control msr on this CPU if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) rdt_ctrl_update(&msr_param); - /* Update CBM on other cpus. */ + /* Update resource control msr on other CPUs. */ smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); put_cpu(); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 399601eda8e43c2cf8a855b44b4dc811a247c9e5..333c177a2471e01161fdb1131ecb6007a3c2ce67 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2039,14 +2039,14 @@ out: enum rdt_param { Opt_cdp, Opt_cdpl2, - Opt_mba_mpbs, + Opt_mba_mbps, nr__rdt_params }; static const struct fs_parameter_spec rdt_param_specs[] = { fsparam_flag("cdp", Opt_cdp), fsparam_flag("cdpl2", Opt_cdpl2), - fsparam_flag("mba_mpbs", Opt_mba_mpbs), + fsparam_flag("mba_MBps", Opt_mba_mbps), {} }; @@ -2072,7 +2072,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_cdpl2: ctx->enable_cdpl2 = true; return 0; - case Opt_mba_mpbs: + case Opt_mba_mbps: if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -EINVAL; ctx->enable_mba_mbps = true; @@ -2516,103 +2516,131 @@ static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r) bitmap_clear(val, zero_bit, cbm_len - zero_bit); } -/** - * rdtgroup_init_alloc - Initialize the new RDT group's allocations - * - * A new RDT group is being created on an allocation capable (CAT) - * supporting system. Set this group up to start off with all usable - * allocations. That is, all shareable and unused bits. +/* + * Initialize cache resources per RDT domain * - * All-zero CBM is invalid. If there are no more shareable bits available - * on any domain then the entire allocation will fail. + * Set the RDT domain up to start off with all usable allocations. That is, + * all shareable and unused bits. All-zero CBM is invalid. 
*/ -static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, + u32 closid) { struct rdt_resource *r_cdp = NULL; struct rdt_domain *d_cdp = NULL; u32 used_b = 0, unused_b = 0; - u32 closid = rdtgrp->closid; - struct rdt_resource *r; unsigned long tmp_cbm; enum rdtgrp_mode mode; - struct rdt_domain *d; u32 peer_ctl, *ctrl; - int i, ret; + int i; - for_each_alloc_enabled_rdt_resource(r) { - /* - * Only initialize default allocations for CBM cache - * resources - */ - if (r->rid == RDT_RESOURCE_MBA) - continue; - list_for_each_entry(d, &r->domains, list) { - rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); - d->have_new_ctrl = false; - d->new_ctrl = r->cache.shareable_bits; - used_b = r->cache.shareable_bits; - ctrl = d->ctrl_val; - for (i = 0; i < closids_supported(); i++, ctrl++) { - if (closid_allocated(i) && i != closid) { - mode = rdtgroup_mode_by_closid(i); - if (mode == RDT_MODE_PSEUDO_LOCKSETUP) - break; - /* - * If CDP is active include peer - * domain's usage to ensure there - * is no overlap with an exclusive - * group. - */ - if (d_cdp) - peer_ctl = d_cdp->ctrl_val[i]; - else - peer_ctl = 0; - used_b |= *ctrl | peer_ctl; - if (mode == RDT_MODE_SHAREABLE) - d->new_ctrl |= *ctrl | peer_ctl; - } - } - if (d->plr && d->plr->cbm > 0) - used_b |= d->plr->cbm; - unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); - unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - d->new_ctrl |= unused_b; - /* - * Force the initial CBM to be valid, user can - * modify the CBM based on system availability. - */ - cbm_ensure_valid(&d->new_ctrl, r); + rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); + d->have_new_ctrl = false; + d->new_ctrl = r->cache.shareable_bits; + used_b = r->cache.shareable_bits; + ctrl = d->ctrl_val; + for (i = 0; i < closids_supported(); i++, ctrl++) { + if (closid_allocated(i) && i != closid) { + mode = rdtgroup_mode_by_closid(i); + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) + break; /* - * Assign the u32 CBM to an unsigned long to ensure - * that bitmap_weight() does not access out-of-bound - * memory. + * If CDP is active include peer domain's + * usage to ensure there is no overlap + * with an exclusive group. */ - tmp_cbm = d->new_ctrl; - if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < - r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", - r->name, d->id); - return -ENOSPC; - } - d->have_new_ctrl = true; + if (d_cdp) + peer_ctl = d_cdp->ctrl_val[i]; + else + peer_ctl = 0; + used_b |= *ctrl | peer_ctl; + if (mode == RDT_MODE_SHAREABLE) + d->new_ctrl |= *ctrl | peer_ctl; } } + if (d->plr && d->plr->cbm > 0) + used_b |= d->plr->cbm; + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; + d->new_ctrl |= unused_b; + /* + * Force the initial CBM to be valid, user can + * modify the CBM based on system availability. + */ + cbm_ensure_valid(&d->new_ctrl, r); + /* + * Assign the u32 CBM to an unsigned long to ensure that + * bitmap_weight() does not access out-of-bound memory. + */ + tmp_cbm = d->new_ctrl; + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("No space on %s:%d\n", r->name, d->id); + return -ENOSPC; + } + d->have_new_ctrl = true; + + return 0; +} + +/* + * Initialize cache resources with default values. + * + * A new RDT group is being created on an allocation capable (CAT) + * supporting system. Set this group up to start off with all usable + * allocations. 
+ * + * If there are no more shareable bits available on any domain then + * the entire allocation will fail. + */ +static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid) +{ + struct rdt_domain *d; + int ret; + + list_for_each_entry(d, &r->domains, list) { + ret = __init_one_rdt_domain(d, r, closid); + if (ret < 0) + return ret; + } + + return 0; +} + +/* Initialize MBA resource with default values. */ +static void rdtgroup_init_mba(struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + d->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; + d->have_new_ctrl = true; + } +} + +/* Initialize the RDT group's allocations. */ +static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r; + int ret; for_each_alloc_enabled_rdt_resource(r) { - /* - * Only initialize default allocations for CBM cache - * resources - */ - if (r->rid == RDT_RESOURCE_MBA) - continue; + if (r->rid == RDT_RESOURCE_MBA) { + rdtgroup_init_mba(r); + } else { + ret = rdtgroup_init_cat(r, rdtgrp->closid); + if (ret < 0) + return ret; + } + ret = update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Failed to initialize allocations\n"); return ret; } - rdtgrp->mode = RDT_MODE_SHAREABLE; + } + rdtgrp->mode = RDT_MODE_SHAREABLE; + return 0; } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 17ffc869cab822d03e85baea56bade232a0e4598..a96ca85848039878be760b54f6d77765344c737b 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -204,8 +204,7 @@ static struct crash_mem *fill_up_crash_elf_data(void) * another range split. So add extra two slots here. */ nr_ranges += 2; - cmem = vzalloc(sizeof(struct crash_mem) + - sizeof(struct crash_mem_range) * nr_ranges); + cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return NULL; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index cd53f3030e4004e89d7d027cee217fb8e0ea08a7..64a59d72663952f0fb1eb8eba100d74baa93a0f7 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -34,14 +34,14 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -59,14 +59,14 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. 
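[ The '<=' to '<' change above makes the bottom-most slot of the software
  stack count as in-range, while 'end' (one past the highest slot) remains a
  valid, empty-stack pointer. A compact model of the corrected check: ]

#include <stdio.h>
#include <stdbool.h>

static bool on_stack(const unsigned long *sp, const unsigned long *begin,
		     const unsigned long *end)
{
	/* end is valid (empty stack); begin is now valid too */
	return !(sp < begin || sp > end);
}

int main(void)
{
	unsigned long mem[20];
	unsigned long *begin = &mem[2], *end = &mem[18];

	printf("%d %d %d\n",
	       on_stack(&mem[2], begin, end),   /* 1: bottom slot      */
	       on_stack(&mem[18], begin, end),  /* 1: empty stack      */
	       on_stack(&mem[1], begin, end));  /* 0: below the stack  */
	return 0;
}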
*/ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 5cdb9e84da57db3d4a494fdd41805219303bd7b6..753b8cfe8b8a212d7423ec1622e3699431a3b6f2 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,23 +16,21 @@ #include #include +#include #include -static char *exception_stack_names[N_EXCEPTION_STACKS] = { - [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ NMI_STACK-1 ] = "NMI", - [ DEBUG_STACK-1 ] = "#DB", - [ MCE_STACK-1 ] = "#MC", -}; - -static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ +static const char * const exception_stack_names[] = { + [ ESTACK_DF ] = "#DF", + [ ESTACK_NMI ] = "NMI", + [ ESTACK_DB2 ] = "#DB2", + [ ESTACK_DB1 ] = "#DB1", + [ ESTACK_DB ] = "#DB", + [ ESTACK_MCE ] = "#MC", }; const char *stack_type_name(enum stack_type type) { - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); if (type == STACK_TYPE_IRQ) return "IRQ"; @@ -52,43 +50,84 @@ const char *stack_type_name(enum stack_type type) return NULL; } +/** + * struct estack_pages - Page descriptor for exception stacks + * @offs: Offset from the start of the exception stack area + * @size: Size of the exception stack + * @type: Type to store in the stack_info struct + */ +struct estack_pages { + u32 offs; + u16 size; + u16 type; +}; + +#define EPAGERANGE(st) \ + [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ + PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ + .offs = CEA_ESTACK_OFFS(st), \ + .size = CEA_ESTACK_SIZE(st), \ + .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, } + +/* + * Array of exception stack page descriptors. If the stack is larger than + * PAGE_SIZE, all pages covering a particular stack will have the same + * info. The guard pages including the not mapped DB2 stack are zeroed + * out. + */ +static const +struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { + EPAGERANGE(DF), + EPAGERANGE(NMI), + EPAGERANGE(DB1), + EPAGERANGE(DB), + EPAGERANGE(MCE), +}; + static bool in_exception_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin, *end; + unsigned long begin, end, stk = (unsigned long)stack; + const struct estack_pages *ep; struct pt_regs *regs; - unsigned k; + unsigned int k; - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; - begin = end - (exception_stack_sizes[k] / sizeof(long)); - regs = (struct pt_regs *)end - 1; - - if (stack <= begin || stack >= end) - continue; + begin = (unsigned long)__this_cpu_read(cea_exception_stacks); + end = begin + sizeof(struct cea_exception_stacks); + /* Bail if @stack is outside the exception stack area. */ + if (stk < begin || stk >= end) + return false; - info->type = STACK_TYPE_EXCEPTION + k; - info->begin = begin; - info->end = end; - info->next_sp = (unsigned long *)regs->sp; + /* Calc page offset from start of exception stacks */ + k = (stk - begin) >> PAGE_SHIFT; + /* Lookup the page descriptor */ + ep = &estack_pages[k]; + /* Guard page? 
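[ estack_pages[] above turns the exception-stack classification into a
  constant-time page-index lookup instead of a loop over all stacks. A
  standalone model with an invented layout: one page each for #DF/NMI/#MC, a
  two-page #DB stack, and an all-zero guard page; sizes, offsets and type
  numbers are illustrative, and the range designators are the same GCC
  extension the kernel uses: ]

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1u << PAGE_SHIFT)

struct estack_page { uint32_t offs; uint16_t size; uint16_t type; };

static const struct estack_page estack_pages[6] = {
	[0] = { 0 * PAGE_SIZE, PAGE_SIZE,     1 /* #DF */ },
	[1] = { 1 * PAGE_SIZE, PAGE_SIZE,     2 /* NMI */ },
	/* [2] stays all-zero: guard page */
	[3 ... 4] = { 3 * PAGE_SIZE, 2 * PAGE_SIZE, 3 /* #DB */ },
	[5] = { 5 * PAGE_SIZE, PAGE_SIZE,     4 /* #MC */ },
};

static int classify(uintptr_t base, uintptr_t stk)
{
	unsigned int k = (stk - base) >> PAGE_SHIFT;  /* O(1) page index */
	const struct estack_page *ep = &estack_pages[k];

	return ep->size ? ep->type : -1;              /* -1: guard page */
}

int main(void)
{
	uintptr_t base = 0x100000;

	printf("%d %d\n",
	       classify(base, base + 0x3100),   /* 3: inside #DB  */
	       classify(base, base + 0x2010));  /* -1: guard page */
	return 0;
}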
*/ + if (!ep->size) + return false; - return true; - } + begin += (unsigned long)ep->offs; + end = begin + (unsigned long)ep->size; + regs = (struct pt_regs *)end - 1; - return false; + info->type = ep->type; + info->begin = (unsigned long *)begin; + info->end = (unsigned long *)end; + info->next_sp = (unsigned long *)regs->sp; + return true; } static bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack >= end) return false; info->type = STACK_TYPE_IRQ; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index ef49517f6bb24e350b9235b29e6ba1885022f918..0caf8122d68078a1c712aa0dbdf77d7b7eb1c780 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -678,12 +678,8 @@ static inline void *alloc_tramp(unsigned long size) { return module_alloc(size); } -static inline void tramp_free(void *tramp, int size) +static inline void tramp_free(void *tramp) { - int npages = PAGE_ALIGN(size) >> PAGE_SHIFT; - - set_memory_nx((unsigned long)tramp, npages); - set_memory_rw((unsigned long)tramp, npages); module_memfree(tramp); } #else @@ -692,7 +688,7 @@ static inline void *alloc_tramp(unsigned long size) { return NULL; } -static inline void tramp_free(void *tramp, int size) { } +static inline void tramp_free(void *tramp) { } #endif /* Defined as markers to the end of the ftrace default trampolines */ @@ -730,6 +726,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long end_offset; unsigned long op_offset; unsigned long offset; + unsigned long npages; unsigned long size; unsigned long retq; unsigned long *ptr; @@ -762,6 +759,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) return 0; *tramp_size = size + RET_SIZE + sizeof(void *); + npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ ret = probe_kernel_read(trampoline, (void *)start_offset, size); @@ -806,9 +804,17 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* ALLOC_TRAMP flags lets us know we created it */ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; + set_vm_flush_reset_perms(trampoline); + + /* + * Module allocation needs to be completed by making the page + * executable. The page is still writable, which is a security hazard, + * but anyhow ftrace breaks W^X completely. 
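[ set_vm_flush_reset_perms() plus set_memory_x() above establish a
  write-then-execute ordering for the trampoline, so the mapping is never
  writable and executable at once. A userspace analog of the same discipline
  using mmap/mprotect; x86-64 Linux is assumed for the code bytes: ]

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	/* mov eax, 42; ret */
	unsigned char code[] = { 0xb8, 0x2a, 0x00, 0x00, 0x00, 0xc3 };
	void *tramp = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (tramp == MAP_FAILED)
		return 1;

	memcpy(tramp, code, sizeof(code));            /* write while RW */
	mprotect(tramp, 4096, PROT_READ | PROT_EXEC); /* then flip to RX */

	printf("%d\n", ((int (*)(void))tramp)());     /* prints 42 */
	return 0;
}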
+ */ + set_memory_x((unsigned long)trampoline, npages); return (unsigned long)trampoline; fail: - tramp_free(trampoline, *tramp_size); + tramp_free(trampoline); return 0; } @@ -939,7 +945,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) return; - tramp_free((void *)ops->trampoline, ops->trampoline_size); + tramp_free((void *)ops->trampoline); ops->trampoline = 0; } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d1dbe8e4eb8243e725b0c23609e7e28cad238f87..bcd206c8ac90064e9e702bc4bc1f2aa967a38065 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -265,7 +265,7 @@ ENDPROC(start_cpu0) GLOBAL(initial_code) .quad x86_64_start_kernel GLOBAL(initial_gs) - .quad INIT_PER_CPU_VAR(irq_stack_union) + .quad INIT_PER_CPU_VAR(fixed_percpu_data) GLOBAL(initial_stack) /* * The SIZEOF_PTREGS gap is a convention which helps the in-kernel diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 01adea278a71024076137a2571794fcd5ab9d246..6d8917875f44a844e1f1eb20a35fc73a65655984 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -41,13 +41,12 @@ struct idt_data { #define SYSG(_vector, _addr) \ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) -/* Interrupt gate with interrupt stack */ +/* + * Interrupt gate with interrupt stack. The _ist index is the index in + * the tss.ist[] array, but for the descriptor it needs to start at 1. + */ #define ISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) - -/* System interrupt gate with interrupt stack */ -#define SISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS) + G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS) /* Task gate */ #define TSKG(_vector, _gdt) \ @@ -184,11 +183,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, DEBUG_STACK), - ISTG(X86_TRAP_NMI, nmi, NMI_STACK), - ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), + ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), + ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE), #endif }; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 95600a99ae93652dbbd4143c9e538e808911bb24..fc34816c6f044923ffdc82c0800f6efa68982e9a 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -51,8 +51,8 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack); -DEFINE_PER_CPU(struct irq_stack *, softirq_stack); +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); +DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr); static void call_on_stack(void *func, void *stack) { @@ -76,7 +76,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) u32 *isp, *prev_esp, arg1; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(hardirq_stack); + irqstk = __this_cpu_read(hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. 
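[ execute_on_irq_stack() builds its call frame at the very top of the per-CPU
  stack and parks the interrupted stack pointer in the bottom slot so the
  unwinder can hop back. A sketch of that pointer math; THREAD_SIZE and the
  fake esp value are stand-ins: ]

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define THREAD_SIZE 8192

struct irq_stack { char stack[THREAD_SIZE]; };

int main(void)
{
	struct irq_stack *irqstk = aligned_alloc(THREAD_SIZE, sizeof(*irqstk));
	uint32_t fake_esp = 0xdeadbeef;  /* the interrupted stack pointer */

	/* frames grow down, so the switch starts at the top... */
	uint32_t *isp = (uint32_t *)((char *)irqstk + sizeof(*irqstk));
	/* ...and prev_esp lives in the bottom slot for the unwinder */
	uint32_t *prev_esp = (uint32_t *)irqstk;

	*prev_esp = fake_esp;
	printf("top=%p saved_esp=%#x\n", (void *)isp, *prev_esp);
	free(irqstk);
	return 0;
}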
However, if we are @@ -107,27 +107,28 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) } /* - * allocate per-cpu stacks for hardirq and for softirq processing + * Allocate per-cpu stacks for hardirq and softirq processing */ -void irq_ctx_init(int cpu) +int irq_init_percpu_irqstack(unsigned int cpu) { - struct irq_stack *irqstk; - - if (per_cpu(hardirq_stack, cpu)) - return; + int node = cpu_to_node(cpu); + struct page *ph, *ps; - irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), - THREADINFO_GFP, - THREAD_SIZE_ORDER)); - per_cpu(hardirq_stack, cpu) = irqstk; + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; - irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), - THREADINFO_GFP, - THREAD_SIZE_ORDER)); - per_cpu(softirq_stack, cpu) = irqstk; + ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ph) + return -ENOMEM; + ps = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ps) { + __free_pages(ph, THREAD_SIZE_ORDER); + return -ENOMEM; + } - printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu)); + per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(softirq_stack_ptr, cpu) = page_address(ps); + return 0; } void do_softirq_own_stack(void) @@ -135,7 +136,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(softirq_stack); + irqstk = __this_cpu_read(softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 0469cd078db15c0c26700fc7a50f9c3f8144ff2a..6bf6517a05bba3dafbcab3ad0499843d04e05be0 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,63 +18,64 @@ #include #include #include + +#include #include #include -int sysctl_panic_on_stackoverflow; +DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; +DECLARE_INIT_PER_CPU(irq_stack_backing_store); -/* - * Probabilistic stack overflow check: - * - * Only check the stack in process context, because everything else - * runs on the big interrupt stacks. Checking reliably is too expensive, - * so we just check from interrupts. 
- */ -static inline void stack_overflow_check(struct pt_regs *regs) +bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { -#ifdef CONFIG_DEBUG_STACKOVERFLOW -#define STACK_TOP_MARGIN 128 - struct orig_ist *oist; - u64 irq_stack_top, irq_stack_bottom; - u64 estack_top, estack_bottom; - u64 curbase = (u64)task_stack_page(current); + if (IS_ERR_OR_NULL(desc)) + return false; - if (user_mode(regs)) - return; + generic_handle_irq_desc(desc); + return true; +} - if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && - regs->sp <= curbase + THREAD_SIZE) - return; +#ifdef CONFIG_VMAP_STACK +/* + * VMAP the backing store with guard pages + */ +static int map_irq_stack(unsigned int cpu) +{ + char *stack = (char *)per_cpu_ptr(&irq_stack_backing_store, cpu); + struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE]; + void *va; + int i; - irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) + - STACK_TOP_MARGIN; - irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr); - if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) - return; + for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) { + phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT)); - oist = this_cpu_ptr(&orig_ist); - estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; - estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; - if (regs->sp >= estack_top && regs->sp <= estack_bottom) - return; + pages[i] = pfn_to_page(pa >> PAGE_SHIFT); + } - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", - current->comm, curbase, regs->sp, - irq_stack_top, irq_stack_bottom, - estack_top, estack_bottom, (void *)regs->ip); + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + if (!va) + return -ENOMEM; - if (sysctl_panic_on_stackoverflow) - panic("low stack detected by irq handler - check messages\n"); -#endif + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; } - -bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) +#else +/* + * If VMAP stacks are disabled due to KASAN, just use the per cpu + * backing store without guard pages. 
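[ map_irq_stack() above gets guard pages for free from vmap(), which is what
  lets the probabilistic stack_overflow_check() go away: an overflow now
  faults deterministically. A userspace analog of the idea, flanking the
  stack with PROT_NONE pages: ]

#include <stdio.h>
#include <sys/mman.h>

#define PAGE 4096
#define IRQ_STACK_SIZE (4 * PAGE)

int main(void)
{
	/* reserve guard + stack + guard, all inaccessible... */
	char *area = mmap(NULL, IRQ_STACK_SIZE + 2 * PAGE, PROT_NONE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (area == MAP_FAILED)
		return 1;

	/* ...then open up only the stack pages in the middle */
	mprotect(area + PAGE, IRQ_STACK_SIZE, PROT_READ | PROT_WRITE);

	char *stack_top = area + PAGE + IRQ_STACK_SIZE;  /* grows down */
	stack_top[-1] = 1;                               /* fine */
	printf("stack [%p, %p) usable, guard pages on both sides\n",
	       (void *)(area + PAGE), (void *)stack_top);
	/* writing stack_top[0] or area[0] would SIGSEGV here */
	return 0;
}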
+ */ +static int map_irq_stack(unsigned int cpu) { - stack_overflow_check(regs); + void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); - if (IS_ERR_OR_NULL(desc)) - return false; + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; +} +#endif - generic_handle_irq_desc(desc); - return true; +int irq_init_percpu_irqstack(unsigned int cpu) +{ + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; + return map_irq_stack(cpu); } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a0693b71cfc1cc4bb99af19ed7b9d07bfca80c33..16919a9671fa93f89aac7ed279097898f6516b33 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -91,6 +91,8 @@ void __init init_IRQ(void) for (i = 0; i < nr_legacy_irqs(); i++) per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i); + BUG_ON(irq_init_percpu_irqstack(smp_processor_id())); + x86_init.irqs.intr_init(); } @@ -104,6 +106,4 @@ void __init native_init_IRQ(void) if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); - - irq_ctx_init(smp_processor_id()); } diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index f99bd26bd3f11371a9d36781d0be26c183a793b1..e631c358f7f4a69c58df1aa83663ee9e5b9569ac 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -37,7 +37,6 @@ static void bug_at(unsigned char *ip, int line) static void __ref __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, - void *(*poker)(void *, const void *, size_t), int init) { union jump_code_union jmp; @@ -50,9 +49,6 @@ static void __ref __jump_label_transform(struct jump_entry *entry, jmp.offset = jump_entry_target(entry) - (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); - if (early_boot_irqs_disabled) - poker = text_poke_early; - if (type == JUMP_LABEL_JMP) { if (init) { expect = default_nop; line = __LINE__; @@ -75,16 +71,19 @@ static void __ref __jump_label_transform(struct jump_entry *entry, bug_at((void *)jump_entry_code(entry), line); /* - * Make text_poke_bp() a default fallback poker. + * As long as only a single processor is running and the code is still + * not marked as RO, text_poke_early() can be used; Checking that + * system_state is SYSTEM_BOOTING guarantees it. It will be set to + * SYSTEM_SCHEDULING before other cores are awaken and before the + * code is write-protected. 
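[ The rewritten jump-label fallback above uses text_poke_early() only while a
  single CPU is running on still-writable text, and the int3 machinery
  otherwise. A minimal model of that selection; both pokers are stubbed with
  memcpy, with poke_bp standing in for the full breakpoint protocol: ]

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

enum system_states { SYSTEM_BOOTING, SYSTEM_SCHEDULING, SYSTEM_RUNNING };
static enum system_states system_state = SYSTEM_BOOTING;

static unsigned char text[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; /* NOP5 */

/* plain write: safe only with one CPU and RW kernel text */
static void poke_early(void *at, const void *op, size_t len)
{
	memcpy(at, op, len);
}

/* stand-in for the int3 breakpoint-based live-patching protocol */
static void poke_bp(void *at, const void *op, size_t len)
{
	memcpy(at, op, len);
}

static void transform(bool init, const void *code, size_t len)
{
	if (init || system_state == SYSTEM_BOOTING)
		poke_early(text, code, len);  /* no other CPUs yet */
	else
		poke_bp(text, code, len);     /* live-patching path */
}

int main(void)
{
	const unsigned char jmp[5] = { 0xe9, 0x0b, 0x00, 0x00, 0x00 };

	transform(false, jmp, 5);             /* boot: poke_early */
	system_state = SYSTEM_RUNNING;
	transform(false, jmp, 5);             /* now: poke_bp */
	printf("first byte: 0x%02x\n", text[0]);
	return 0;
}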
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index f99bd26bd3f11371a9d36781d0be26c183a793b1..e631c358f7f4a69c58df1aa83663ee9e5b9569ac 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -37,7 +37,6 @@ static void bug_at(unsigned char *ip, int line)
 
 static void __ref __jump_label_transform(struct jump_entry *entry,
 					 enum jump_label_type type,
-					 void *(*poker)(void *, const void *, size_t),
 					 int init)
 {
 	union jump_code_union jmp;
@@ -50,9 +49,6 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
 	jmp.offset = jump_entry_target(entry) -
 		     (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
 
-	if (early_boot_irqs_disabled)
-		poker = text_poke_early;
-
 	if (type == JUMP_LABEL_JMP) {
 		if (init) {
 			expect = default_nop; line = __LINE__;
@@ -75,16 +71,19 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
 		bug_at((void *)jump_entry_code(entry), line);
 
 	/*
-	 * Make text_poke_bp() a default fallback poker.
+	 * As long as only a single processor is running and the code is still
+	 * not marked as RO, text_poke_early() can be used; checking that
+	 * system_state is SYSTEM_BOOTING guarantees it. It will be set to
+	 * SYSTEM_SCHEDULING before other cores are awakened and before the
+	 * code is write-protected.
 	 *
 	 * At the time the change is being done, just ignore whether we
 	 * are doing nop -> jump or jump -> nop transition, and assume
 	 * always nop being the 'currently valid' instruction
-	 *
 	 */
-	if (poker) {
-		(*poker)((void *)jump_entry_code(entry), code,
-			 JUMP_LABEL_NOP_SIZE);
+	if (init || system_state == SYSTEM_BOOTING) {
+		text_poke_early((void *)jump_entry_code(entry), code,
+				JUMP_LABEL_NOP_SIZE);
 		return;
 	}
 
@@ -96,7 +95,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
 			       enum jump_label_type type)
 {
 	mutex_lock(&text_mutex);
-	__jump_label_transform(entry, type, NULL, 0);
+	__jump_label_transform(entry, type, 0);
 	mutex_unlock(&text_mutex);
 }
 
@@ -126,5 +125,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
 		jlstate = JL_STATE_NO_UPDATE;
 	}
 	if (jlstate == JL_STATE_UPDATE)
-		__jump_label_transform(entry, type, text_poke_early, 1);
+		__jump_label_transform(entry, type, 1);
 }
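The new comment above captures the rule that replaces the removed poker callback: while the system is still single-threaded (init, or system_state == SYSTEM_BOOTING), instructions can be patched with a plain memcpy()-style text_poke_early(); afterwards the INT3-based text_poke_bp() protocol is required, because other CPUs may execute the code mid-write. A distilled model of that selection logic in user-space C, under our own names rather than the kernel API:

#include <stdbool.h>
#include <string.h>

enum sys_state { STATE_BOOTING, STATE_SCHEDULING };
static enum sys_state system_state = STATE_BOOTING;

/* Single-threaded case: nobody else can observe a torn instruction write. */
static void poke_early(void *at, const void *insn, size_t len)
{
	memcpy(at, insn, len);
}

/* Placeholder for the breakpoint-based protocol needed once other CPUs run. */
static void poke_bp(void *at, const void *insn, size_t len)
{
	/* real code: plant INT3, sync cores, write the tail, fix the first byte */
	memcpy(at, insn, len);
}

static void transform(void *at, const void *insn, size_t len, bool init)
{
	if (init || system_state == STATE_BOOTING)
		poke_early(at, insn, len);
	else
		poke_bp(at, insn, len);
}

int main(void)
{
	unsigned char site[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };	/* 5-byte NOP */
	const unsigned char jmp[5] = { 0xe9, 0x00, 0x00, 0x00, 0x00 };	/* JMP rel32 */

	transform(site, jmp, sizeof(jmp), false);	/* booting: early poke */
	system_state = STATE_SCHEDULING;
	transform(site, jmp, sizeof(jmp), false);	/* later: breakpoint poke */
	return 0;
}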
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 4ff6b4cdb94190827847d5e7c4b51198c958b23d..13b13311b792564f12d0adf78d64fd770be6d16c 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -747,7 +747,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
 int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
 {
 	int err;
-	char opc[BREAK_INSTR_SIZE];
 
 	bpt->type = BP_BREAKPOINT;
 	err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
@@ -759,18 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
 	if (!err)
 		return err;
 	/*
-	 * It is safe to call text_poke() because normal kernel execution
+	 * It is safe to call text_poke_kgdb() because normal kernel execution
 	 * is stopped on all cores, so long as the text_mutex is not locked.
 	 */
 	if (mutex_is_locked(&text_mutex))
 		return -EBUSY;
-	text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
-		  BREAK_INSTR_SIZE);
-	err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
-	if (err)
-		return err;
-	if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
-		return -EINVAL;
+	text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
+		       BREAK_INSTR_SIZE);
 	bpt->type = BP_POKE_BREAKPOINT;
 
 	return err;
@@ -778,22 +772,17 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
 
 int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
 {
-	int err;
-	char opc[BREAK_INSTR_SIZE];
-
 	if (bpt->type != BP_POKE_BREAKPOINT)
 		goto knl_write;
 	/*
-	 * It is safe to call text_poke() because normal kernel execution
+	 * It is safe to call text_poke_kgdb() because normal kernel execution
 	 * is stopped on all cores, so long as the text_mutex is not locked.
 	 */
 	if (mutex_is_locked(&text_mutex))
 		goto knl_write;
-	text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
-	err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
-	if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
-		goto knl_write;
-	return err;
+	text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
+		       BREAK_INSTR_SIZE);
+	return 0;
 
 knl_write:
 	return probe_kernel_write((char *)bpt->bpt_addr,
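Both kgdb paths above now go through text_poke_kgdb() and drop the read-back verification (probe_kernel_read() plus memcmp()), trusting that the poke succeeds while all other cores are stopped. The underlying cycle is unchanged: save the original bytes, write the breakpoint instruction, and later restore the saved bytes. A self-contained user-space sketch of that cycle, patching a byte buffer rather than live code (struct bkpt and the helpers are our stand-ins):

#include <assert.h>
#include <string.h>

#define BREAK_INSTR_SIZE 1
static const unsigned char bpt_instr[BREAK_INSTR_SIZE] = { 0xcc };	/* INT3 */

struct bkpt {
	unsigned char saved[BREAK_INSTR_SIZE];
	unsigned char *addr;
};

static void set_bkpt(struct bkpt *b, unsigned char *addr)
{
	b->addr = addr;
	memcpy(b->saved, addr, BREAK_INSTR_SIZE);	/* stash original bytes */
	memcpy(addr, bpt_instr, BREAK_INSTR_SIZE);	/* plant the breakpoint */
}

static void remove_bkpt(struct bkpt *b)
{
	memcpy(b->addr, b->saved, BREAK_INSTR_SIZE);	/* restore original */
}

int main(void)
{
	unsigned char text[4] = { 0x90, 0x90, 0x90, 0x90 };	/* NOP sled */
	struct bkpt b;

	set_bkpt(&b, &text[1]);
	assert(text[1] == 0xcc);
	remove_bkpt(&b);
	assert(text[1] == 0x90);
	return 0;
}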
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index a034cb808e7eb482e6fd8eae3fac9afca63b429c..cf52ee0d87111c13e0eef8a2429db889ed585962 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -431,8 +431,21 @@ void *alloc_insn_page(void)
 	void *page;
 
 	page = module_alloc(PAGE_SIZE);
-	if (page)
-		set_memory_ro((unsigned long)page & PAGE_MASK, 1);
+	if (!page)
+		return NULL;
+
+	set_vm_flush_reset_perms(page);
+	/*
+	 * First make the page read-only, and only then make it executable to
+	 * prevent it from being W+X in between.
+	 */
+	set_memory_ro((unsigned long)page, 1);
+
+	/*
+	 * TODO: Once additional kernel code protection mechanisms are in
+	 * place, ensure that the page was not maliciously altered and that
+	 * it is still zeroed.
+	 */
+	set_memory_x((unsigned long)page, 1);
 
 	return page;
 }
@@ -440,8 +453,6 @@ void *alloc_insn_page(void)
 /* Recover page to RW mode before releasing it */
 void free_insn_page(void *page)
 {
-	set_memory_nx((unsigned long)page & PAGE_MASK, 1);
-	set_memory_rw((unsigned long)page & PAGE_MASK, 1);
 	module_memfree(page);
 }
 
@@ -569,6 +580,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
 	unsigned long *sara = stack_addr(regs);
 
 	ri->ret_addr = (kprobe_opcode_t *) *sara;
+	ri->fp = sara;
 
 	/* Replace the return addr with trampoline addr */
 	*sara = (unsigned long) &kretprobe_trampoline;
@@ -715,6 +727,7 @@ NOKPROBE_SYMBOL(kprobe_int3_handler);
 * calls trampoline_handler() runs, which calls the kretprobe's handler.
 */
 asm(
+	".text\n"
	".global kretprobe_trampoline\n"
	".type kretprobe_trampoline, @function\n"
	"kretprobe_trampoline:\n"
@@ -748,26 +761,48 @@ asm(
 NOKPROBE_SYMBOL(kretprobe_trampoline);
 STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
 
+static struct kprobe kretprobe_kprobe = {
+	.addr = (void *)kretprobe_trampoline,
+};
+
 /*
 * Called from kretprobe_trampoline
 */
 static __used void *trampoline_handler(struct pt_regs *regs)
 {
+	struct kprobe_ctlblk *kcb;
 	struct kretprobe_instance *ri = NULL;
 	struct hlist_head *head, empty_rp;
 	struct hlist_node *tmp;
 	unsigned long flags, orig_ret_address = 0;
 	unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
 	kprobe_opcode_t *correct_ret_addr = NULL;
+	void *frame_pointer;
+	bool skipped = false;
+
+	preempt_disable();
+
+	/*
+	 * Set a dummy kprobe for avoiding kretprobe recursion.
+	 * Since kretprobes never run inside kprobe handlers, no kprobe
+	 * can be running at this point.
+	 */
+	kcb = get_kprobe_ctlblk();
+	__this_cpu_write(current_kprobe, &kretprobe_kprobe);
+	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 
 	INIT_HLIST_HEAD(&empty_rp);
 	kretprobe_hash_lock(current, &head, &flags);
 	/* fixup registers */
 #ifdef CONFIG_X86_64
 	regs->cs = __KERNEL_CS;
+	/* On x86-64, we use pt_regs->sp for return address holder. */
+	frame_pointer = &regs->sp;
 #else
 	regs->cs = __KERNEL_CS | get_kernel_rpl();
 	regs->gs = 0;
+	/* On x86-32, we use pt_regs->flags for return address holder. */
+	frame_pointer = &regs->flags;
 #endif
 	regs->ip = trampoline_address;
 	regs->orig_ax = ~0UL;
@@ -789,8 +824,25 @@ static __used void *trampoline_handler(struct pt_regs *regs)
 		if (ri->task != current)
 			/* another task is sharing our hash bucket */
 			continue;
+		/*
+		 * Return probes must be pushed onto this hash list in the
+		 * correct order (same as the return order) so that they can
+		 * be popped correctly. However, if we find an entry pushed
+		 * in the wrong order, we have hit a function which should
+		 * not be probed: the out-of-order entry was pushed on the
+		 * path of processing another kretprobe itself.
+		 */
+		if (ri->fp != frame_pointer) {
+			if (!skipped)
+				pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n");
+			skipped = true;
+			continue;
+		}
 
 		orig_ret_address = (unsigned long)ri->ret_addr;
+		if (skipped)
+			pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n",
+				ri->rp->kp.addr);
 
 		if (orig_ret_address != trampoline_address)
 			/*
@@ -808,14 +860,15 @@ static __used void *trampoline_handler(struct pt_regs *regs)
 		if (ri->task != current)
 			/* another task is sharing our hash bucket */
 			continue;
+		if (ri->fp != frame_pointer)
+			continue;
 
 		orig_ret_address = (unsigned long)ri->ret_addr;
 		if (ri->rp && ri->rp->handler) {
 			__this_cpu_write(current_kprobe, &ri->rp->kp);
-			get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
 			ri->ret_addr = correct_ret_addr;
 			ri->rp->handler(ri, regs);
-			__this_cpu_write(current_kprobe, NULL);
+			__this_cpu_write(current_kprobe, &kretprobe_kprobe);
 		}
 
 		recycle_rp_inst(ri, &empty_rp);
@@ -831,6 +884,9 @@ static __used void *trampoline_handler(struct pt_regs *regs)
 
 	kretprobe_hash_unlock(current, &flags);
 
+	__this_cpu_write(current_kprobe, NULL);
+	preempt_enable();
+
 	hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
 		hlist_del(&ri->hlist);
 		kfree(ri);
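The ri->fp field recorded by arch_prepare_kretprobe() gives trampoline_handler() a way to tell which instances on the per-task hash list actually belong to the frame that is returning: an instance recorded against a different stack slot was pushed out of order and is skipped with a warning instead of being popped. A reduced, self-contained illustration of that matching loop (the types and addresses are made up for the example):

#include <stdio.h>

struct instance {
	void *fp;		/* stack slot whose return address was hijacked */
	const char *name;
};

int main(void)
{
	void *frame_pointer = (void *)0x1000;	/* slot of the frame returning now */
	struct instance hash_list[] = {
		{ (void *)0x0f00, "stale_entry" },	/* pushed out of order */
		{ (void *)0x1000, "returning_func" },
	};

	for (unsigned int i = 0; i < sizeof(hash_list) / sizeof(hash_list[0]); i++) {
		if (hash_list[i].fp != frame_pointer) {
			printf("skip %s: recorded frame %p != %p\n",
			       hash_list[i].name, hash_list[i].fp, frame_pointer);
			continue;	/* the kernel warns and marks 'skipped' here */
		}
		printf("pop %s\n", hash_list[i].name);
	}
	return 0;
}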
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5c93a65ee1e5c2ec56e83eda147bc1bc31e159cd..3f0cc828cc364fd039f452dc32b4317ed73a6131 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -67,7 +67,7 @@ static int __init parse_no_stealacc(char *arg)
 early_param("no-steal-acc", parse_no_stealacc);
 
 static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
-static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64);
+DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
 static int has_steal_clock = 0;
 
 /*
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 6135ae8ce0364772f5cc72f73b4bb8f2ad3a8d9e..b2463fcb20a8116921203fb246d3b7ffa0ef1e88 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -113,7 +113,7 @@ static void do_sanity_check(struct mm_struct *mm,
 		 * tables.
 		 */
 		WARN_ON(!had_kernel_mapping);
-		if (static_cpu_has(X86_FEATURE_PTI))
+		if (boot_cpu_has(X86_FEATURE_PTI))
 			WARN_ON(!had_user_mapping);
 	} else {
 		/*
@@ -121,7 +121,7 @@ static void do_sanity_check(struct mm_struct *mm,
 		 * Sync the pgd to the usermode tables.
 		 */
 		WARN_ON(had_kernel_mapping);
-		if (static_cpu_has(X86_FEATURE_PTI))
+		if (boot_cpu_has(X86_FEATURE_PTI))
 			WARN_ON(had_user_mapping);
 	}
 }
@@ -156,7 +156,7 @@ static void map_ldt_struct_to_user(struct mm_struct *mm)
 	k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
 	u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
 
-	if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
+	if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
 		set_pmd(u_pmd, *k_pmd);
 }
 
@@ -181,7 +181,7 @@ static void map_ldt_struct_to_user(struct mm_struct *mm)
 {
 	pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
 
-	if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
+	if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
 		set_pgd(kernel_to_user_pgdp(pgd), *pgd);
 }
 
@@ -208,7 +208,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
 	spinlock_t *ptl;
 	int i, nr_pages;
 
-	if (!static_cpu_has(X86_FEATURE_PTI))
+	if (!boot_cpu_has(X86_FEATURE_PTI))
 		return 0;
 
 	/*
@@ -271,7 +271,7 @@ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
 		return;
 
 	/* LDT map/unmap is only required for PTI */
-	if (!static_cpu_has(X86_FEATURE_PTI))
+	if (!boot_cpu_has(X86_FEATURE_PTI))
 		return;
 
 	nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
@@ -311,7 +311,7 @@ static void free_ldt_pgtables(struct mm_struct *mm)
 	unsigned long start = LDT_BASE_ADDR;
 	unsigned long end = LDT_END_ADDR;
 
-	if (!static_cpu_has(X86_FEATURE_PTI))
+	if (!boot_cpu_has(X86_FEATURE_PTI))
 		return;
 
 	tlb_gather_mmu(&tlb, mm, start, end);
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index b052e883dd8cc35df741a4e18c5236e38a234936..cfa3106faee42ee4879d512d698149a399f21ac8 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -87,7 +87,7 @@ void *module_alloc(unsigned long size)
 	p = __vmalloc_node_range(size, MODULE_ALIGN,
 				    MODULES_VADDR + get_module_load_offset(),
 				    MODULES_END, GFP_KERNEL,
-				    PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				    PAGE_KERNEL, 0, NUMA_NO_NODE,
 				    __builtin_return_address(0));
 	if (p && (kasan_module_alloc(p, size) < 0)) {
 		vfree(p);
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 18bc9b51ac9b99ffaf51e85daf490b0ba108bcc9..3755d0310026aab7d8496afe2efb57b7e9f747bb 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -21,13 +21,14 @@
 #include
 #include
 #include
+#include
 #include
 
 #if defined(CONFIG_EDAC)
 #include
 #endif
 
-#include
+#include
 #include
 #include
 #include
@@ -487,6 +488,23 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2);
 * switch back to the original IDT.
 */
 static DEFINE_PER_CPU(int, update_debug_stack);
+
+static bool notrace is_debug_stack(unsigned long addr)
+{
+	struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks);
+	unsigned long top = CEA_ESTACK_TOP(cs, DB);
+	unsigned long bot = CEA_ESTACK_BOT(cs, DB1);
+
+	if (__this_cpu_read(debug_stack_usage))
+		return true;
+	/*
+	 * Note, this covers the guard page between DB and DB1 as well to
+	 * avoid two checks. But by all means @addr can never point into
+	 * the guard page.
+ */ + return addr >= bot && addr < top; +} +NOKPROBE_SYMBOL(is_debug_stack); #endif dotraplinkage notrace void diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c0e0101133f352ba6a8ac8369eef15a3e5301be3..7bbaa6baf37f9b9ada524cf8f07385cf23717739 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -121,7 +121,7 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); void __init native_pv_lock_init(void) { - if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) static_branch_disable(&virt_spin_lock_key); } diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index c06c4c16c6b69c0d251505fa4c03a658c5f938a6..07c30ee1742542f15923b6e4ab7020b22bc634ad 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -59,18 +59,34 @@ static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { u64 perf_reg_value(struct pt_regs *regs, int idx) { + struct x86_perf_regs *perf_regs; + + if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) { + perf_regs = container_of(regs, struct x86_perf_regs, regs); + if (!perf_regs->xmm_regs) + return 0; + return perf_regs->xmm_regs[idx - PERF_REG_X86_XMM0]; + } + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset))) return 0; return regs_get_register(regs, pt_regs_offset[idx]); } -#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL)) - #ifdef CONFIG_X86_32 +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \ + (1ULL << PERF_REG_X86_R9) | \ + (1ULL << PERF_REG_X86_R10) | \ + (1ULL << PERF_REG_X86_R11) | \ + (1ULL << PERF_REG_X86_R12) | \ + (1ULL << PERF_REG_X86_R13) | \ + (1ULL << PERF_REG_X86_R14) | \ + (1ULL << PERF_REG_X86_R15)) + int perf_reg_validate(u64 mask) { - if (!mask || mask & REG_RESERVED) + if (!mask || (mask & REG_NOSUPPORT)) return -EINVAL; return 0; @@ -96,10 +112,7 @@ void perf_get_regs_user(struct perf_regs *regs_user, int perf_reg_validate(u64 mask) { - if (!mask || mask & REG_RESERVED) - return -EINVAL; - - if (mask & REG_NOSUPPORT) + if (!mask || (mask & REG_NOSUPPORT)) return -EINVAL; return 0; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 58ac7be52c7a6df944dca7305492b8ce70ed8d8e..d1d312d012a616ea346ad6e4e917de60bc26bcfd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -236,7 +236,7 @@ static int get_cpuid_mode(void) static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) { - if (!static_cpu_has(X86_FEATURE_CPUID_FAULT)) + if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; if (cpuid_enabled) @@ -426,6 +426,8 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, u64 msr = x86_spec_ctrl_base; bool updmsr = false; + lockdep_assert_irqs_disabled(); + /* * If TIF_SSBD is different, select the proper mitigation * method. Note that if SSBD mitigation is disabled or permanentely @@ -477,10 +479,12 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) void speculation_ctrl_update(unsigned long tif) { + unsigned long flags; + /* Forced update. 
Make sure all relevant TIF flags are different */ - preempt_disable(); + local_irq_save(flags); __speculation_ctrl_update(~tif, tif); - preempt_enable(); + local_irq_restore(flags); } /* Called from seccomp/prctl update */ @@ -666,7 +670,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) if (c->x86_vendor != X86_VENDOR_INTEL) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR)) + if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR)) return 0; return 1; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index e471d8e6f0b248951a44654f5222ef217fd4dd2e..70933193878caafa4a6414dd7313aa3b7a840d84 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -127,6 +127,13 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, struct task_struct *tsk; int err; + /* + * For a new task use the RESET flags value since there is no before. + * All the status flags are zero; DF and all the system flags must also + * be 0, specifically IF must be 0 because we context switch to the new + * task with interrupts disabled. + */ + frame->flags = X86_EFLAGS_FIXED; frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6a62f4af9fcf72d8979f7a842d2932fa34338afc..844a28b29967ded4b78e3e69ffda3a6d6a2097d1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -392,6 +392,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; + frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 725624b6c0c05cdc0c94175214a7ce796df47eee..09d6bded3c1e569fbce3426ecfd10995212ec6e8 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -81,6 +81,19 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } +/* + * Some machines don't handle the default ACPI reboot method and + * require the EFI reboot method: + */ +static int __init set_efi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_EFI && !efi_runtime_disabled()) { + reboot_type = BOOT_EFI; + pr_info("%s series board detected. Selecting EFI-method for reboot.\n", d->ident); + } + return 0; +} + void __noreturn machine_real_restart(unsigned int type) { local_irq_disable(); @@ -108,7 +121,7 @@ void __noreturn machine_real_restart(unsigned int type) write_cr3(real_mode_header->trampoline_pgd); /* Exiting long mode will fail if CR4.PCIDE is set. 
*/ - if (static_cpu_has(X86_FEATURE_PCID)) + if (boot_cpu_has(X86_FEATURE_PCID)) cr4_clear_bits(X86_CR4_PCIDE); #endif @@ -166,6 +179,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), }, }, + { /* Handle reboot issue on Acer TravelMate X514-51T */ + .callback = set_efi_reboot, + .ident = "Acer TravelMate X514-51T", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate X514-51T"), + }, + }, /* Apple */ { /* Handle problems with rebooting on Apple MacBook5 */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3d872a527cd966facecec3bd8440677ae1d99204..905dae880563889db4e7bbabfd0abcf37d5d5995 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include