diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg new file mode 100644 index 0000000000000000000000000000000000000000..727e270b11e4e3b5871aecd1084d2c1b4585e056 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg @@ -0,0 +1,474 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + struct + + rcu_data + + CPU 0 + + struct + + rcu_data + + CPU 15 + + struct + + rcu_data + + CPU 1007 + + struct + + rcu_data + + CPU 1023 + + struct rcu_state + + struct + + rcu_node + + rcu_node + + struct + + struct + + rcu_node + + + + + + + + diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg new file mode 100644 index 0000000000000000000000000000000000000000..9bbb1944f962d5a8c22861492f136b9e5431f9a9 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg @@ -0,0 +1,499 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + rcu_bh + + struct + + rcu_node + + struct + + rcu_node + + rcu_node + + struct + + struct + + rcu_data + + struct + + rcu_data + + struct + + rcu_data + + struct + + rcu_data + + struct rcu_state + + rcu_sched + + + + + + + + + + + diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg new file mode 100644 index 0000000000000000000000000000000000000000..21ba7823479d4f031612c2bd28af839bdf779487 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg @@ -0,0 +1,695 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + rcu_bh + + struct + + rcu_node + + struct + + rcu_node + + rcu_node + + struct + + struct + + rcu_data + + struct + + rcu_data + + struct + + rcu_data + + struct + + rcu_data + + struct rcu_state + + struct + + rcu_dynticks + + struct + + rcu_dynticks + + struct + + rcu_dynticks + + struct + + rcu_dynticks + + rcu_sched + + + + + diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg new file mode 100644 index 0000000000000000000000000000000000000000..15adcac036c73351c81bf8fdf38e505a21a057a0 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg @@ -0,0 +1,741 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + rcu_bh + + struct + + rcu_node + + struct + + rcu_node + + rcu_node + + struct + + struct + + rcu_data + + struct + + rcu_data + + struct + + rcu_data + + struct + + rcu_data + + struct rcu_state + + struct + + rcu_dynticks + + struct + + rcu_dynticks + + struct + + rcu_dynticks + + struct + + rcu_dynticks + + rcu_preempt + + rcu_sched + + + + + diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg new file mode 100644 index 0000000000000000000000000000000000000000..bbc3801470d095a95be916503fd5799f52229ef6 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg @@ -0,0 +1,858 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + struct + + rcu_head + + struct + + rcu_head + + struct + + rcu_head + + rcu_sched + + rcu_bh + + struct + + rcu_node + + struct + + rcu_node + + rcu_node + + struct + + struct + + rcu_data + + struct + + rcu_data + + struct + + rcu_data + + struct + + rcu_data + + struct rcu_state + + struct + + rcu_dynticks + + struct + + rcu_dynticks + + struct + + rcu_dynticks + + struct + + rcu_dynticks + + rcu_preempt + + + + + diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html new file mode 100644 index 0000000000000000000000000000000000000000..7eb47ac25ad772bdc25b812e016982d974752f8c --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -0,0 +1,1333 @@ + + + A Tour Through TREE_RCU's Data Structures [LWN.net] + + +

January 27, 2016

+

This article was contributed by Paul E. McKenney

+ +

Introduction

+ +This document describes RCU's major data structures and their relationship +to each other. + +
    +
  1. + Data-Structure Relationships +
  2. + The rcu_state Structure +
  3. + The rcu_node Structure +
  4. + The rcu_data Structure +
  5. + The rcu_dynticks Structure +
  6. + The rcu_head Structure +
  7. + RCU-Specific Fields in the task_struct Structure +
  8. + Accessor Functions +
+ +At the end we have the +answers to the quick quizzes. + +

Data-Structure Relationships

+ +

RCU is for all intents and purposes a large state machine, and its +data structures maintain the state in such a way as to allow RCU readers +to execute extremely quickly, while also processing the RCU grace periods +requested by updaters in an efficient and extremely scalable fashion. +The efficiency and scalability of RCU updaters is provided primarily +by a combining tree, as shown below: + +

BigTreeClassicRCU.svg + +

This diagram shows an enclosing rcu_state structure +containing a tree of rcu_node structures. +Each leaf node of the rcu_node tree has up to 16 +rcu_data structures associated with it, so that there +are NR_CPUS number of rcu_data structures, +one for each possible CPU. +This structure is adjusted at boot time, if needed, to handle the +common case where nr_cpu_ids is much less than +NR_CPUs. +For example, a number of Linux distributions set NR_CPUs=4096, +which results in a three-level rcu_node tree. +If the actual hardware has only 16 CPUs, RCU will adjust itself +at boot time, resulting in an rcu_node tree with only a single node. + +

The purpose of this combining tree is to allow per-CPU events +such as quiescent states, dyntick-idle transitions, +and CPU hotplug operations to be processed efficiently +and scalably. +Quiescent states are recorded by the per-CPU rcu_data structures, +and other events are recorded by the leaf-level rcu_node +structures. +All of these events are combined at each level of the tree until finally +grace periods are completed at the tree's root rcu_node +structure. +A grace period can be completed at the root once every CPU +(or, in the case of CONFIG_PREEMPT_RCU, task) +has passed through a quiescent state. +Once a grace period has completed, record of that fact is propagated +back down the tree. + +

As can be seen from the diagram, on a 64-bit system +a two-level tree with 64 leaves can accommodate 1,024 CPUs, with a fanout +of 64 at the root and a fanout of 16 at the leaves. + + + + + + + + +
 
Quick Quiz:
+ Why isn't the fanout at the leaves also 64? +
Answer:
+ Because there are more types of events that affect the leaf-level + rcu_node structures than further up the tree. + Therefore, if the leaf rcu_node structures have fanout of + 64, the contention on these structures' ->structures + becomes excessive. + Experimentation on a wide variety of systems has shown that a fanout + of 16 works well for the leaves of the rcu_node tree. + + +

Of course, further experience with + systems having hundreds or thousands of CPUs may demonstrate + that the fanout for the non-leaf rcu_node structures + must also be reduced. + Such reduction can be easily carried out when and if it proves + necessary. + In the meantime, if you are using such a system and running into + contention problems on the non-leaf rcu_node structures, + you may use the CONFIG_RCU_FANOUT kernel configuration + parameter to reduce the non-leaf fanout as needed. + + +

Kernels built for systems with + strong NUMA characteristics might also need to adjust + CONFIG_RCU_FANOUT so that the domains of the + rcu_node structures align with hardware boundaries. + However, there has thus far been no need for this. +

 
+ +

If your system has more than 1,024 CPUs (or more than 512 CPUs on +a 32-bit system), then RCU will automatically add more levels to the +tree. +For example, if you are crazy enough to build a 64-bit system with 65,536 +CPUs, RCU would configure the rcu_node tree as follows: + +

HugeTreeClassicRCU.svg + +

RCU currently permits up to a four-level tree, which on a 64-bit system +accommodates up to 4,194,304 CPUs, though only a mere 524,288 CPUs for +32-bit systems. +On the other hand, you can set CONFIG_RCU_FANOUT to be +as small as 2 if you wish, which would permit only 16 CPUs, which +is useful for testing. + +

This multi-level combining tree allows us to get most of the +performance and scalability +benefits of partitioning, even though RCU grace-period detection is +inherently a global operation. +The trick here is that only the last CPU to report a quiescent state +into a given rcu_node structure need advance to the rcu_node +structure at the next level up the tree. +This means that at the leaf-level rcu_node structure, only +one access out of sixteen will progress up the tree. +For the internal rcu_node structures, the situation is even +more extreme: Only one access out of sixty-four will progress up +the tree. +Because the vast majority of the CPUs do not progress up the tree, +the lock contention remains roughly constant up the tree. +No matter how many CPUs there are in the system, at most 64 quiescent-state +reports per grace period will progress all the way to the root +rcu_node structure, thus ensuring that the lock contention +on that root rcu_node structure remains acceptably low. + +

In effect, the combining tree acts like a big shock absorber, +keeping lock contention under control at all tree levels regardless +of the level of loading on the system. + +

The Linux kernel actually supports multiple flavors of RCU +running concurrently, so RCU builds separate data structures for each +flavor. +For example, for CONFIG_TREE_RCU=y kernels, RCU provides +rcu_sched and rcu_bh, as shown below: + +

BigTreeClassicRCUBH.svg + +

Energy efficiency is increasingly important, and for that +reason the Linux kernel provides CONFIG_NO_HZ_IDLE, which +turns off the scheduling-clock interrupts on idle CPUs, which in +turn allows those CPUs to attain deeper sleep states and to consume +less energy. +CPUs whose scheduling-clock interrupts have been turned off are +said to be in dyntick-idle mode. +RCU must handle dyntick-idle CPUs specially +because RCU would otherwise wake up each CPU on every grace period, +which would defeat the whole purpose of CONFIG_NO_HZ_IDLE. +RCU uses the rcu_dynticks structure to track +which CPUs are in dyntick idle mode, as shown below: + +

BigTreeClassicRCUBHdyntick.svg + +

However, if a CPU is in dyntick-idle mode, it is in that mode +for all flavors of RCU. +Therefore, a single rcu_dynticks structure is allocated per +CPU, and all of a given CPU's rcu_data structures share +that rcu_dynticks, as shown in the figure. + +

Kernels built with CONFIG_PREEMPT_RCU support +rcu_preempt in addition to rcu_sched and rcu_bh, as shown below: + +

BigTreePreemptRCUBHdyntick.svg + +

RCU updaters wait for normal grace periods by registering +RCU callbacks, either directly via call_rcu() and +friends (namely call_rcu_bh() and call_rcu_sched()), +there being a separate interface per flavor of RCU) +or indirectly via synchronize_rcu() and friends. +RCU callbacks are represented by rcu_head structures, +which are queued on rcu_data structures while they are +waiting for a grace period to elapse, as shown in the following figure: + +

BigTreePreemptRCUBHdyntickCB.svg + +

This figure shows how TREE_RCU's and +PREEMPT_RCU's major data structures are related. +Lesser data structures will be introduced with the algorithms that +make use of them. + +

Note that each of the data structures in the above figure has +its own synchronization: + +

    +
  1. Each rcu_state structures has a lock and a mutex, + and some fields are protected by the corresponding root + rcu_node structure's lock. +
  2. Each rcu_node structure has a spinlock. +
  3. The fields in rcu_data are private to the corresponding + CPU, although a few can be read and written by other CPUs. +
  4. Similarly, the fields in rcu_dynticks are private + to the corresponding CPU, although a few can be read by + other CPUs. +
+ +

It is important to note that different data structures can have +very different ideas about the state of RCU at any given time. +For but one example, awareness of the start or end of a given RCU +grace period propagates slowly through the data structures. +This slow propagation is absolutely necessary for RCU to have good +read-side performance. +If this balkanized implementation seems foreign to you, one useful +trick is to consider each instance of these data structures to be +a different person, each having the usual slightly different +view of reality. + +

The general role of each of these data structures is as +follows: + +

    +
  1. rcu_state: + This structure forms the interconnection between the + rcu_node and rcu_data structures, + tracks grace periods, serves as short-term repository + for callbacks orphaned by CPU-hotplug events, + maintains rcu_barrier() state, + tracks expedited grace-period state, + and maintains state used to force quiescent states when + grace periods extend too long, +
  2. rcu_node: This structure forms the combining + tree that propagates quiescent-state + information from the leaves to the root, and also propagates + grace-period information from the root to the leaves. + It provides local copies of the grace-period state in order + to allow this information to be accessed in a synchronized + manner without suffering the scalability limitations that + would otherwise be imposed by global locking. + In CONFIG_PREEMPT_RCU kernels, it manages the lists + of tasks that have blocked while in their current + RCU read-side critical section. + In CONFIG_PREEMPT_RCU with + CONFIG_RCU_BOOST, it manages the + per-rcu_node priority-boosting + kernel threads (kthreads) and state. + Finally, it records CPU-hotplug state in order to determine + which CPUs should be ignored during a given grace period. +
  3. rcu_data: This per-CPU structure is the + focus of quiescent-state detection and RCU callback queuing. + It also tracks its relationship to the corresponding leaf + rcu_node structure to allow more-efficient + propagation of quiescent states up the rcu_node + combining tree. + Like the rcu_node structure, it provides a local + copy of the grace-period information to allow for-free + synchronized + access to this information from the corresponding CPU. + Finally, this structure records past dyntick-idle state + for the corresponding CPU and also tracks statistics. +
  4. rcu_dynticks: + This per-CPU structure tracks the current dyntick-idle + state for the corresponding CPU. + Unlike the other three structures, the rcu_dynticks + structure is not replicated per RCU flavor. +
  5. rcu_head: + This structure represents RCU callbacks, and is the + only structure allocated and managed by RCU users. + The rcu_head structure is normally embedded + within the RCU-protected data structure. +
+ +

If all you wanted from this article was a general notion of how +RCU's data structures are related, you are done. +Otherwise, each of the following sections give more details on +the rcu_state, rcu_node, rcu_data, +and rcu_dynticks data structures. + +

+The rcu_state Structure

+ +

The rcu_state structure is the base structure that +represents a flavor of RCU. +This structure forms the interconnection between the +rcu_node and rcu_data structures, +tracks grace periods, contains the lock used to +synchronize with CPU-hotplug events, +and maintains state used to force quiescent states when +grace periods extend too long, + +

A few of the rcu_state structure's fields are discussed, +singly and in groups, in the following sections. +The more specialized fields are covered in the discussion of their +use. + +

Relationship to rcu_node and rcu_data Structures
+ +This portion of the rcu_state structure is declared +as follows: + +
+  1   struct rcu_node node[NUM_RCU_NODES];
+  2   struct rcu_node *level[NUM_RCU_LVLS + 1];
+  3   struct rcu_data __percpu *rda;
+
+ + + + + + + + +
 
Quick Quiz:
+ Wait a minute! + You said that the rcu_node structures formed a tree, + but they are declared as a flat array! + What gives? +
Answer:
+ The tree is laid out in the array. + The first node In the array is the head, the next set of nodes in the + array are children of the head node, and so on until the last set of + nodes in the array are the leaves. + + +

See the following diagrams to see how + this works. +

 
+ +

The rcu_node tree is embedded into the +->node[] array as shown in the following figure: + +

TreeMapping.svg + +

One interesting consequence of this mapping is that a +breadth-first traversal of the tree is implemented as a simple +linear scan of the array, which is in fact what the +rcu_for_each_node_breadth_first() macro does. +This macro is used at the beginning and ends of grace periods. + +

Each entry of the ->level array references +the first rcu_node structure on the corresponding level +of the tree, for example, as shown below: + +

TreeMappingLevel.svg + +

The zeroth element of the array references the root +rcu_node structure, the first element references the +first child of the root rcu_node, and finally the second +element references the first leaf rcu_node structure. + +

For whatever it is worth, if you draw the tree to be tree-shaped +rather than array-shaped, it is easy to draw a planar representation: + +

TreeLevel.svg + +

Finally, the ->rda field references a per-CPU +pointer to the corresponding CPU's rcu_data structure. + +

All of these fields are constant once initialization is complete, +and therefore need no protection. + +

Grace-Period Tracking
+ +

This portion of the rcu_state structure is declared +as follows: + +

+  1   unsigned long gpnum;
+  2   unsigned long completed;
+
+ +

RCU grace periods are numbered, and +the ->gpnum field contains the number of the grace +period that started most recently. +The ->completed field contains the number of the +grace period that completed most recently. +If the two fields are equal, the RCU grace period that most recently +started has already completed, and therefore the corresponding +flavor of RCU is idle. +If ->gpnum is one greater than ->completed, +then ->gpnum gives the number of the current RCU +grace period, which has not yet completed. +Any other combination of values indicates that something is broken. +These two fields are protected by the root rcu_node's +->lock field. + +

There are ->gpnum and ->completed fields +in the rcu_node and rcu_data structures +as well. +The fields in the rcu_state structure represent the +most current values, and those of the other structures are compared +in order to detect the start of a new grace period in a distributed +fashion. +The values flow from rcu_state to rcu_node +(down the tree from the root to the leaves) to rcu_data. + +

Miscellaneous
+ +

This portion of the rcu_state structure is declared +as follows: + +

+  1   unsigned long gp_max;
+  2   char abbr;
+  3   char *name;
+
+ +

The ->gp_max field tracks the duration of the longest +grace period in jiffies. +It is protected by the root rcu_node's ->lock. + +

The ->name field points to the name of the RCU flavor +(for example, “rcu_sched”), and is constant. +The ->abbr field contains a one-character abbreviation, +for example, “s” for RCU-sched. + +

+The rcu_node Structure

+ +

The rcu_node structures form the combining +tree that propagates quiescent-state +information from the leaves to the root and also that propagates +grace-period information from the root down to the leaves. +They provides local copies of the grace-period state in order +to allow this information to be accessed in a synchronized +manner without suffering the scalability limitations that +would otherwise be imposed by global locking. +In CONFIG_PREEMPT_RCU kernels, they manage the lists +of tasks that have blocked while in their current +RCU read-side critical section. +In CONFIG_PREEMPT_RCU with +CONFIG_RCU_BOOST, they manage the +per-rcu_node priority-boosting +kernel threads (kthreads) and state. +Finally, they record CPU-hotplug state in order to determine +which CPUs should be ignored during a given grace period. + +

The rcu_node structure's fields are discussed, +singly and in groups, in the following sections. + +

Connection to Combining Tree
+ +

This portion of the rcu_node structure is declared +as follows: + +

+  1   struct rcu_node *parent;
+  2   u8 level;
+  3   u8 grpnum;
+  4   unsigned long grpmask;
+  5   int grplo;
+  6   int grphi;
+
+ +

The ->parent pointer references the rcu_node +one level up in the tree, and is NULL for the root +rcu_node. +The RCU implementation makes heavy use of this field to push quiescent +states up the tree. +The ->level field gives the level in the tree, with +the root being at level zero, its children at level one, and so on. +The ->grpnum field gives this node's position within +the children of its parent, so this number can range between 0 and 31 +on 32-bit systems and between 0 and 63 on 64-bit systems. +The ->level and ->grpnum fields are +used only during initialization and for tracing. +The ->grpmask field is the bitmask counterpart of +->grpnum, and therefore always has exactly one bit set. +This mask is used to clear the bit corresponding to this rcu_node +structure in its parent's bitmasks, which are described later. +Finally, the ->grplo and ->grphi fields +contain the lowest and highest numbered CPU served by this +rcu_node structure, respectively. + +

All of these fields are constant, and thus do not require any +synchronization. + +

Synchronization
+ +

This field of the rcu_node structure is declared +as follows: + +

+  1   raw_spinlock_t lock;
+
+ +

This field is used to protect the remaining fields in this structure, +unless otherwise stated. +That said, all of the fields in this structure can be accessed without +locking for tracing purposes. +Yes, this can result in confusing traces, but better some tracing confusion +than to be heisenbugged out of existence. + +

Grace-Period Tracking
+ +

This portion of the rcu_node structure is declared +as follows: + +

+  1   unsigned long gpnum;
+  2   unsigned long completed;
+
+ +

These fields are the counterparts of the fields of the same name in +the rcu_state structure. +They each may lag up to one behind their rcu_state +counterparts. +If a given rcu_node structure's ->gpnum and +->complete fields are equal, then this rcu_node +structure believes that RCU is idle. +Otherwise, as with the rcu_state structure, +the ->gpnum field will be one greater than the +->complete fields, with ->gpnum +indicating which grace period this rcu_node believes +is still being waited for. + +

The >gpnum field of each rcu_node +structure is updated at the beginning +of each grace period, and the ->completed fields are +updated at the end of each grace period. + +

Quiescent-State Tracking
+ +

These fields manage the propagation of quiescent states up the +combining tree. + +

This portion of the rcu_node structure has fields +as follows: + +

+  1   unsigned long qsmask;
+  2   unsigned long expmask;
+  3   unsigned long qsmaskinit;
+  4   unsigned long expmaskinit;
+
+ +

The ->qsmask field tracks which of this +rcu_node structure's children still need to report +quiescent states for the current normal grace period. +Such children will have a value of 1 in their corresponding bit. +Note that the leaf rcu_node structures should be +thought of as having rcu_data structures as their +children. +Similarly, the ->expmask field tracks which +of this rcu_node structure's children still need to report +quiescent states for the current expedited grace period. +An expedited grace period has +the same conceptual properties as a normal grace period, but the +expedited implementation accepts extreme CPU overhead to obtain +much lower grace-period latency, for example, consuming a few +tens of microseconds worth of CPU time to reduce grace-period +duration from milliseconds to tens of microseconds. +The ->qsmaskinit field tracks which of this +rcu_node structure's children cover for at least +one online CPU. +This mask is used to initialize ->qsmask, +and ->expmaskinit is used to initialize +->expmask and the beginning of the +normal and expedited grace periods, respectively. + + + + + + + + +
 
Quick Quiz:
+ Why are these bitmasks protected by locking? + Come on, haven't you heard of atomic instructions??? +
Answer:
+ Lockless grace-period computation! Such a tantalizing possibility! + + +

But consider the following sequence of events: + + +

    +
  1. CPU 0 has been in dyntick-idle + mode for quite some time. + When it wakes up, it notices that the current RCU + grace period needs it to report in, so it sets a + flag where the scheduling clock interrupt will find it. +

    +

  2. Meanwhile, CPU 1 is running + force_quiescent_state(), + and notices that CPU 0 has been in dyntick idle mode, + which qualifies as an extended quiescent state. +

    +

  3. CPU 0's scheduling clock + interrupt fires in the + middle of an RCU read-side critical section, and notices + that the RCU core needs something, so commences RCU softirq + processing. + +

    +

  4. CPU 0's softirq handler + executes and is just about ready + to report its quiescent state up the rcu_node + tree. +

    +

  5. But CPU 1 beats it to the punch, + completing the current + grace period and starting a new one. +

    +

  6. CPU 0 now reports its quiescent + state for the wrong + grace period. + That grace period might now end before the RCU read-side + critical section. + If that happens, disaster will ensue. + +
+ +

So the locking is absolutely required in + order to coordinate + clearing of the bits with the grace-period numbers in + ->gpnum and ->completed. +

 
+ +

Blocked-Task Management
+ +

PREEMPT_RCU allows tasks to be preempted in the +midst of their RCU read-side critical sections, and these tasks +must be tracked explicitly. +The details of exactly why and how they are tracked will be covered +in a separate article on RCU read-side processing. +For now, it is enough to know that the rcu_node +structure tracks them. + +

+  1   struct list_head blkd_tasks;
+  2   struct list_head *gp_tasks;
+  3   struct list_head *exp_tasks;
+  4   bool wait_blkd_tasks;
+
+ +

The ->blkd_tasks field is a list header for +the list of blocked and preempted tasks. +As tasks undergo context switches within RCU read-side critical +sections, their task_struct structures are enqueued +(via the task_struct's ->rcu_node_entry +field) onto the head of the ->blkd_tasks list for the +leaf rcu_node structure corresponding to the CPU +on which the outgoing context switch executed. +As these tasks later exit their RCU read-side critical sections, +they remove themselves from the list. +This list is therefore in reverse time order, so that if one of the tasks +is blocking the current grace period, all subsequent tasks must +also be blocking that same grace period. +Therefore, a single pointer into this list suffices to track +all tasks blocking a given grace period. +That pointer is stored in ->gp_tasks for normal +grace periods and in ->exp_tasks for expedited +grace periods. +These last two fields are NULL if either there is +no grace period in flight or if there are no blocked tasks +preventing that grace period from completing. +If either of these two pointers is referencing a task that +removes itself from the ->blkd_tasks list, +then that task must advance the pointer to the next task on +the list, or set the pointer to NULL if there +are no subsequent tasks on the list. + +

For example, suppose that tasks T1, T2, and T3 are +all hard-affinitied to the largest-numbered CPU in the system. +Then if task T1 blocked in an RCU read-side +critical section, then an expedited grace period started, +then task T2 blocked in an RCU read-side critical section, +then a normal grace period started, and finally task 3 blocked +in an RCU read-side critical section, then the state of the +last leaf rcu_node structure's blocked-task list +would be as shown below: + +

blkd_task.svg + +

Task T1 is blocking both grace periods, task T2 is +blocking only the normal grace period, and task T3 is blocking +neither grace period. +Note that these tasks will not remove themselves from this list +immediately upon resuming execution. +They will instead remain on the list until they execute the outermost +rcu_read_unlock() that ends their RCU read-side critical +section. + +

+The ->wait_blkd_tasks field indicates whether or not +the current grace period is waiting on a blocked task. + +

Sizing the rcu_node Array
+ +

The rcu_node array is sized via a series of +C-preprocessor expressions as follows: + +

+ 1 #ifdef CONFIG_RCU_FANOUT
+ 2 #define RCU_FANOUT CONFIG_RCU_FANOUT
+ 3 #else
+ 4 # ifdef CONFIG_64BIT
+ 5 # define RCU_FANOUT 64
+ 6 # else
+ 7 # define RCU_FANOUT 32
+ 8 # endif
+ 9 #endif
+10
+11 #ifdef CONFIG_RCU_FANOUT_LEAF
+12 #define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
+13 #else
+14 # ifdef CONFIG_64BIT
+15 # define RCU_FANOUT_LEAF 64
+16 # else
+17 # define RCU_FANOUT_LEAF 32
+18 # endif
+19 #endif
+20
+21 #define RCU_FANOUT_1        (RCU_FANOUT_LEAF)
+22 #define RCU_FANOUT_2        (RCU_FANOUT_1 * RCU_FANOUT)
+23 #define RCU_FANOUT_3        (RCU_FANOUT_2 * RCU_FANOUT)
+24 #define RCU_FANOUT_4        (RCU_FANOUT_3 * RCU_FANOUT)
+25
+26 #if NR_CPUS <= RCU_FANOUT_1
+27 #  define RCU_NUM_LVLS        1
+28 #  define NUM_RCU_LVL_0        1
+29 #  define NUM_RCU_NODES        NUM_RCU_LVL_0
+30 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0 }
+31 #  define RCU_NODE_NAME_INIT  { "rcu_node_0" }
+32 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0" }
+33 #  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0" }
+34 #elif NR_CPUS <= RCU_FANOUT_2
+35 #  define RCU_NUM_LVLS        2
+36 #  define NUM_RCU_LVL_0        1
+37 #  define NUM_RCU_LVL_1        DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+38 #  define NUM_RCU_NODES        (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
+39 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
+40 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1" }
+41 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1" }
+42 #  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1" }
+43 #elif NR_CPUS <= RCU_FANOUT_3
+44 #  define RCU_NUM_LVLS        3
+45 #  define NUM_RCU_LVL_0        1
+46 #  define NUM_RCU_LVL_1        DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+47 #  define NUM_RCU_LVL_2        DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+48 #  define NUM_RCU_NODES        (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
+49 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
+50 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
+51 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
+52 #  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
+53 #elif NR_CPUS <= RCU_FANOUT_4
+54 #  define RCU_NUM_LVLS        4
+55 #  define NUM_RCU_LVL_0        1
+56 #  define NUM_RCU_LVL_1        DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
+57 #  define NUM_RCU_LVL_2        DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+58 #  define NUM_RCU_LVL_3        DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+59 #  define NUM_RCU_NODES        (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+60 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
+61 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
+62 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
+63 #  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
+64 #else
+65 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+66 #endif
+
+ +

The maximum number of levels in the rcu_node structure +is currently limited to four, as specified by lines 21-24 +and the structure of the subsequent “if” statement. +For 32-bit systems, this allows 16*32*32*32=524,288 CPUs, which +should be sufficient for the next few years at least. +For 64-bit systems, 16*64*64*64=4,194,304 CPUs is allowed, which +should see us through the next decade or so. +This four-level tree also allows kernels built with +CONFIG_RCU_FANOUT=8 to support up to 4096 CPUs, +which might be useful in very large systems having eight CPUs per +socket (but please note that no one has yet shown any measurable +performance degradation due to misaligned socket and rcu_node +boundaries). +In addition, building kernels with a full four levels of rcu_node +tree permits better testing of RCU's combining-tree code. + +

The RCU_FANOUT symbol controls how many children +are permitted at each non-leaf level of the rcu_node tree. +If the CONFIG_RCU_FANOUT Kconfig option is not specified, +it is set based on the word size of the system, which is also +the Kconfig default. + +

The RCU_FANOUT_LEAF symbol controls how many CPUs are +handled by each leaf rcu_node structure. +Experience has shown that allowing a given leaf rcu_node +structure to handle 64 CPUs, as permitted by the number of bits in +the ->qsmask field on a 64-bit system, results in +excessive contention for the leaf rcu_node structures' +->lock fields. +The number of CPUs per leaf rcu_node structure is therefore +limited to 16 given the default value of CONFIG_RCU_FANOUT_LEAF. +If CONFIG_RCU_FANOUT_LEAF is unspecified, the value +selected is based on the word size of the system, just as for +CONFIG_RCU_FANOUT. +Lines 11-19 perform this computation. + +

Lines 21-24 compute the maximum number of CPUs supported by +a single-level (which contains a single rcu_node structure), +two-level, three-level, and four-level rcu_node tree, +respectively, given the fanout specified by RCU_FANOUT +and RCU_FANOUT_LEAF. +These numbers of CPUs are retained in the +RCU_FANOUT_1, +RCU_FANOUT_2, +RCU_FANOUT_3, and +RCU_FANOUT_4 +C-preprocessor variables, respectively. + +

These variables are used to control the C-preprocessor #if +statement spanning lines 26-66 that computes the number of +rcu_node structures required for each level of the tree, +as well as the number of levels required. +The number of levels is placed in the NUM_RCU_LVLS +C-preprocessor variable by lines 27, 35, 44, and 54. +The number of rcu_node structures for the topmost level +of the tree is always exactly one, and this value is unconditionally +placed into NUM_RCU_LVL_0 by lines 28, 36, 45, and 55. +The rest of the levels (if any) of the rcu_node tree +are computed by dividing the maximum number of CPUs by the +fanout supported by the number of levels from the current level down, +rounding up. This computation is performed by lines 37, +46-47, and 56-58. +Lines 31-33, 40-42, 50-52, and 62-63 create initializers +for lockdep lock-class names. +Finally, lines 64-66 produce an error if the maximum number of +CPUs is too large for the specified fanout. + +

+The rcu_data Structure

+ +

The rcu_data maintains the per-CPU state for the +corresponding flavor of RCU. +The fields in this structure may be accessed only from the corresponding +CPU (and from tracing) unless otherwise stated. +This structure is the +focus of quiescent-state detection and RCU callback queuing. +It also tracks its relationship to the corresponding leaf +rcu_node structure to allow more-efficient +propagation of quiescent states up the rcu_node +combining tree. +Like the rcu_node structure, it provides a local +copy of the grace-period information to allow for-free +synchronized +access to this information from the corresponding CPU. +Finally, this structure records past dyntick-idle state +for the corresponding CPU and also tracks statistics. + +

The rcu_data structure's fields are discussed, +singly and in groups, in the following sections. + +

Connection to Other Data Structures
+ +

This portion of the rcu_data structure is declared +as follows: + +

+  1   int cpu;
+  2   struct rcu_state *rsp;
+  3   struct rcu_node *mynode;
+  4   struct rcu_dynticks *dynticks;
+  5   unsigned long grpmask;
+  6   bool beenonline;
+
+ +

The ->cpu field contains the number of the +corresponding CPU, the ->rsp pointer references +the corresponding rcu_state structure (and is most frequently +used to locate the name of the corresponding flavor of RCU for tracing), +and the ->mynode field references the corresponding +rcu_node structure. +The ->mynode is used to propagate quiescent states +up the combining tree. +

The ->dynticks pointer references the +rcu_dynticks structure corresponding to this +CPU. +Recall that a single per-CPU instance of the rcu_dynticks +structure is shared among all flavors of RCU. +These first four fields are constant and therefore require not +synchronization. + +

The ->grpmask field indicates the bit in +the ->mynode->qsmask corresponding to this +rcu_data structure, and is also used when propagating +quiescent states. +The ->beenonline flag is set whenever the corresponding +CPU comes online, which means that the debugfs tracing need not dump +out any rcu_data structure for which this flag is not set. + +

Quiescent-State and Grace-Period Tracking
+ +

This portion of the rcu_data structure is declared +as follows: + +

+  1   unsigned long completed;
+  2   unsigned long gpnum;
+  3   bool cpu_no_qs;
+  4   bool core_needs_qs;
+  5   bool gpwrap;
+  6   unsigned long rcu_qs_ctr_snap;
+
+ +

The completed and gpnum +fields are the counterparts of the fields of the same name +in the rcu_state and rcu_node structures. +They may each lag up to one behind their rcu_node +counterparts, but in CONFIG_NO_HZ_IDLE and +CONFIG_NO_HZ_FULL kernels can lag +arbitrarily far behind for CPUs in dyntick-idle mode (but these counters +will catch up upon exit from dyntick-idle mode). +If a given rcu_data structure's ->gpnum and +->complete fields are equal, then this rcu_data +structure believes that RCU is idle. +Otherwise, as with the rcu_state and rcu_node +structure, +the ->gpnum field will be one greater than the +->complete fields, with ->gpnum +indicating which grace period this rcu_data believes +is still being waited for. + + + + + + + + +
 
Quick Quiz:
+ All this replication of the grace period numbers can only cause + massive confusion. + Why not just keep a global pair of counters and be done with it??? +
Answer:
+ Because if there was only a single global pair of grace-period + numbers, there would need to be a single global lock to allow + safely accessing and updating them. + And if we are not going to have a single global lock, we need + to carefully manage the numbers on a per-node basis. + Recall from the answer to a previous Quick Quiz that the consequences + of applying a previously sampled quiescent state to the wrong + grace period are quite severe. +
 
+ +

The ->cpu_no_qs flag indicates that the +CPU has not yet passed through a quiescent state, +while the ->core_needs_qs flag indicates that the +RCU core needs a quiescent state from the corresponding CPU. +The ->gpwrap field indicates that the corresponding +CPU has remained idle for so long that the completed +and gpnum counters are in danger of overflow, which +will cause the CPU to disregard the values of its counters on +its next exit from idle. +Finally, the rcu_qs_ctr_snap field is used to detect +cases where a given operation has resulted in a quiescent state +for all flavors of RCU, for example, cond_resched_rcu_qs(). + +

RCU Callback Handling
+ +

In the absence of CPU-hotplug events, RCU callbacks are invoked by +the same CPU that registered them. +This is strictly a cache-locality optimization: callbacks can and +do get invoked on CPUs other than the one that registered them. +After all, if the CPU that registered a given callback has gone +offline before the callback can be invoked, there really is no other +choice. + +

This portion of the rcu_data structure is declared +as follows: + +

+ 1 struct rcu_head *nxtlist;
+ 2 struct rcu_head **nxttail[RCU_NEXT_SIZE];
+ 3 unsigned long nxtcompleted[RCU_NEXT_SIZE];
+ 4 long qlen_lazy;
+ 5 long qlen;
+ 6 long qlen_last_fqs_check;
+ 7 unsigned long n_force_qs_snap;
+ 8 unsigned long n_cbs_invoked;
+ 9 unsigned long n_cbs_orphaned;
+10 unsigned long n_cbs_adopted;
+11 long blimit;
+
+ +

The ->nxtlist pointer and the +->nxttail[] array form a four-segment list with +older callbacks near the head and newer ones near the tail. +Each segment contains callbacks with the corresponding relationship +to the current grace period. +The pointer out of the end of each of the four segments is referenced +by the element of the ->nxttail[] array indexed by +RCU_DONE_TAIL (for callbacks handled by a prior grace period), +RCU_WAIT_TAIL (for callbacks waiting on the current grace period), +RCU_NEXT_READY_TAIL (for callbacks that will wait on the next +grace period), and +RCU_NEXT_TAIL (for callbacks that are not yet associated +with a specific grace period) +respectively, as shown in the following figure. + +

nxtlist.svg + +

In this figure, the ->nxtlist pointer references the +first +RCU callback in the list. +The ->nxttail[RCU_DONE_TAIL] array element references +the ->nxtlist pointer itself, indicating that none +of the callbacks is ready to invoke. +The ->nxttail[RCU_WAIT_TAIL] array element references callback +CB 2's ->next pointer, which indicates that +CB 1 and CB 2 are both waiting on the current grace period. +The ->nxttail[RCU_NEXT_READY_TAIL] array element +references the same RCU callback that ->nxttail[RCU_WAIT_TAIL] +does, which indicates that there are no callbacks waiting on the next +RCU grace period. +The ->nxttail[RCU_NEXT_TAIL] array element references +CB 4's ->next pointer, indicating that all the +remaining RCU callbacks have not yet been assigned to an RCU grace +period. +Note that the ->nxttail[RCU_NEXT_TAIL] array element +always references the last RCU callback's ->next pointer +unless the callback list is empty, in which case it references +the ->nxtlist pointer. + +

CPUs advance their callbacks from the +RCU_NEXT_TAIL to the RCU_NEXT_READY_TAIL to the +RCU_WAIT_TAIL to the RCU_DONE_TAIL list segments +as grace periods advance. +The CPU advances the callbacks in its rcu_data structure +whenever it notices that another RCU grace period has completed. +The CPU detects the completion of an RCU grace period by noticing +that the value of its rcu_data structure's +->completed field differs from that of its leaf +rcu_node structure. +Recall that each rcu_node structure's +->completed field is updated at the end of each +grace period. + +

The ->nxtcompleted[] array records grace-period +numbers corresponding to the list segments. +This allows CPUs that go idle for extended periods to determine +which of their callbacks are ready to be invoked after reawakening. + +

The ->qlen counter contains the number of +callbacks in ->nxtlist, and the +->qlen_lazy contains the number of those callbacks that +are known to only free memory, and whose invocation can therefore +be safely deferred. +The ->qlen_last_fqs_check and +->n_force_qs_snap coordinate the forcing of quiescent +states from call_rcu() and friends when callback +lists grow excessively long. + +

The ->n_cbs_invoked, +->n_cbs_orphaned, and ->n_cbs_adopted +fields count the number of callbacks invoked, +sent to other CPUs when this CPU goes offline, +and received from other CPUs when those other CPUs go offline. +Finally, the ->blimit counter is the maximum number of +RCU callbacks that may be invoked at a given time. + +

Dyntick-Idle Handling
+ +

This portion of the rcu_data structure is declared +as follows: + +

+  1   int dynticks_snap;
+  2   unsigned long dynticks_fqs;
+
+ +The ->dynticks_snap field is used to take a snapshot +of the corresponding CPU's dyntick-idle state when forcing +quiescent states, and is therefore accessed from other CPUs. +Finally, the ->dynticks_fqs field is used to +count the number of times this CPU is determined to be in +dyntick-idle state, and is used for tracing and debugging purposes. + +

+The rcu_dynticks Structure

+ +

The rcu_dynticks maintains the per-CPU dyntick-idle state +for the corresponding CPU. +Unlike the other structures, rcu_dynticks is not +replicated over the different flavors of RCU. +The fields in this structure may be accessed only from the corresponding +CPU (and from tracing) unless otherwise stated. +Its fields are as follows: + +

+  1   int dynticks_nesting;
+  2   int dynticks_nmi_nesting;
+  3   atomic_t dynticks;
+
+ +

The ->dynticks_nesting field counts the +nesting depth of normal interrupts. +In addition, this counter is incremented when exiting dyntick-idle +mode and decremented when entering it. +This counter can therefore be thought of as counting the number +of reasons why this CPU cannot be permitted to enter dyntick-idle +mode, aside from non-maskable interrupts (NMIs). +NMIs are counted by the ->dynticks_nmi_nesting +field, except that NMIs that interrupt non-dyntick-idle execution +are not counted. + +

Finally, the ->dynticks field counts the corresponding +CPU's transitions to and from dyntick-idle mode, so that this counter +has an even value when the CPU is in dyntick-idle mode and an odd +value otherwise. + + + + + + + + +
 
Quick Quiz:
+ Why not just count all NMIs? + Wouldn't that be simpler and less error prone? +
Answer:
+ It seems simpler only until you think hard about how to go about + updating the rcu_dynticks structure's + ->dynticks field. +
 
+ +

Additional fields are present for some special-purpose +builds, and are discussed separately. + +

+The rcu_head Structure

+ +

Each rcu_head structure represents an RCU callback. +These structures are normally embedded within RCU-protected data +structures whose algorithms use asynchronous grace periods. +In contrast, when using algorithms that block waiting for RCU grace periods, +RCU users need not provide rcu_head structures. + +

The rcu_head structure has fields as follows: + +

+  1   struct rcu_head *next;
+  2   void (*func)(struct rcu_head *head);
+
+ +

The ->next field is used +to link the rcu_head structures together in the +lists within the rcu_data structures. +The ->func field is a pointer to the function +to be called when the callback is ready to be invoked, and +this function is passed a pointer to the rcu_head +structure. +However, kfree_rcu() uses the ->func +field to record the offset of the rcu_head +structure within the enclosing RCU-protected data structure. + +

Both of these fields are used internally by RCU. +From the viewpoint of RCU users, this structure is an +opaque “cookie”. + + + + + + + + +
 
Quick Quiz:
+ Given that the callback function ->func + is passed a pointer to the rcu_head structure, + how is that function supposed to find the beginning of the + enclosing RCU-protected data structure? +
Answer:
+ In actual practice, there is a separate callback function per + type of RCU-protected data structure. + The callback function can therefore use the container_of() + macro in the Linux kernel (or other pointer-manipulation facilities + in other software environments) to find the beginning of the + enclosing structure. +
 
+ +

+RCU-Specific Fields in the task_struct Structure

+ +

The CONFIG_PREEMPT_RCU implementation uses some +additional fields in the task_struct structure: + +

+ 1 #ifdef CONFIG_PREEMPT_RCU
+ 2   int rcu_read_lock_nesting;
+ 3   union rcu_special rcu_read_unlock_special;
+ 4   struct list_head rcu_node_entry;
+ 5   struct rcu_node *rcu_blocked_node;
+ 6 #endif /* #ifdef CONFIG_PREEMPT_RCU */
+ 7 #ifdef CONFIG_TASKS_RCU
+ 8   unsigned long rcu_tasks_nvcsw;
+ 9   bool rcu_tasks_holdout;
+10   struct list_head rcu_tasks_holdout_list;
+11   int rcu_tasks_idle_cpu;
+12 #endif /* #ifdef CONFIG_TASKS_RCU */
+
+ +

The ->rcu_read_lock_nesting field records the +nesting level for RCU read-side critical sections, and +the ->rcu_read_unlock_special field is a bitmask +that records special conditions that require rcu_read_unlock() +to do additional work. +The ->rcu_node_entry field is used to form lists of +tasks that have blocked within preemptible-RCU read-side critical +sections and the ->rcu_blocked_node field references +the rcu_node structure whose list this task is a member of, +or NULL if it is not blocked within a preemptible-RCU +read-side critical section. + +

The ->rcu_tasks_nvcsw field tracks the number of +voluntary context switches that this task had undergone at the +beginning of the current tasks-RCU grace period, +->rcu_tasks_holdout is set if the current tasks-RCU +grace period is waiting on this task, ->rcu_tasks_holdout_list +is a list element enqueuing this task on the holdout list, +and ->rcu_tasks_idle_cpu tracks which CPU this +idle task is running, but only if the task is currently running, +that is, if the CPU is currently idle. + +

+Accessor Functions

+ +

The following listing shows the +rcu_get_root(), rcu_for_each_node_breadth_first, +rcu_for_each_nonleaf_node_breadth_first(), and +rcu_for_each_leaf_node() function and macros: + +

+  1 static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+  2 {
+  3   return &rsp->node[0];
+  4 }
+  5
+  6 #define rcu_for_each_node_breadth_first(rsp, rnp) \
+  7   for ((rnp) = &(rsp)->node[0]; \
+  8        (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+  9
+ 10 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
+ 11   for ((rnp) = &(rsp)->node[0]; \
+ 12        (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
+ 13
+ 14 #define rcu_for_each_leaf_node(rsp, rnp) \
+ 15   for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
+ 16        (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+
+ +

The rcu_get_root() simply returns a pointer to the +first element of the specified rcu_state structure's +->node[] array, which is the root rcu_node +structure. + +

As noted earlier, the rcu_for_each_node_breadth_first() +macro takes advantage of the layout of the rcu_node +structures in the rcu_state structure's +->node[] array, performing a breadth-first traversal by +simply traversing the array in order. +The rcu_for_each_nonleaf_node_breadth_first() macro operates +similarly, but traverses only the first part of the array, thus excluding +the leaf rcu_node structures. +Finally, the rcu_for_each_leaf_node() macro traverses only +the last part of the array, thus traversing only the leaf +rcu_node structures. + + + + + + + + +
 
Quick Quiz:
+ What do rcu_for_each_nonleaf_node_breadth_first() and + rcu_for_each_leaf_node() do if the rcu_node tree + contains only a single node? +
Answer:
+ In the single-node case, + rcu_for_each_nonleaf_node_breadth_first() is a no-op + and rcu_for_each_leaf_node() traverses the single node. +
 
+ +

+Summary

+ +So each flavor of RCU is represented by an rcu_state structure, +which contains a combining tree of rcu_node and +rcu_data structures. +Finally, in CONFIG_NO_HZ_IDLE kernels, each CPU's dyntick-idle +state is tracked by an rcu_dynticks structure. + +If you made it this far, you are well prepared to read the code +walkthroughs in the other articles in this series. + +

+Acknowledgments

+ +I owe thanks to Cyrill Gorcunov, Mathieu Desnoyers, Dhaval Giani, Paul +Turner, Abhishek Srivastava, Matt Kowalczyk, and Serge Hallyn +for helping me get this document into a more human-readable state. + +

+Legal Statement

+ +

This work represents the view of the author and does not necessarily +represent the view of IBM. + +

Linux is a registered trademark of Linus Torvalds. + +

Other company, product, and service names may be trademarks or +service marks of others. + + diff --git a/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg b/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg new file mode 100644 index 0000000000000000000000000000000000000000..2bf12b46820602fb8076d22aee6158a090e124ac --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg @@ -0,0 +1,939 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + rcu_node + + struct + + struct + + rcu_node + + struct + + rcu_node + + rcu_node + + struct + + rcu_node + + struct + + struct + + rcu_node + + CPU 0 + + struct + + rcu_data + + CPU 15 + + struct + + rcu_data + + struct + + rcu_data + + CPU 21823 + + CPU 21839 + + rcu_data + + struct + + struct + + rcu_data + + CPU 43679 + + CPU 43695 + + rcu_data + + struct + + struct + + rcu_data + + CPU 65519 + + CPU 65535 + + rcu_data + + struct + + struct rcu_state + + struct + + rcu_node + + diff --git a/Documentation/RCU/Design/Data-Structures/TreeLevel.svg b/Documentation/RCU/Design/Data-Structures/TreeLevel.svg new file mode 100644 index 0000000000000000000000000000000000000000..7a7eb3bac95cc9c30521025baaba446891d095ab --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/TreeLevel.svg @@ -0,0 +1,828 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + rcu_node + + struct + + struct + + rcu_node + + struct + + rcu_node + + rcu_node + + struct + + rcu_node + + struct + + struct + + rcu_node + + ->level[0] + + ->level[1] + + ->level[2] + + struct + + rcu_node + + CPU 15 + + CPU 0 + + CPU 65535 + + CPU 65519 + + CPU 43695 + + CPU 43679 + + CPU 21839 + + CPU 21823 + + struct rcu_state + + diff --git a/Documentation/RCU/Design/Data-Structures/TreeMapping.svg b/Documentation/RCU/Design/Data-Structures/TreeMapping.svg new file mode 100644 index 0000000000000000000000000000000000000000..729cfa9e6cdb8ed1c3c923e362273168ebdf2d8f --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/TreeMapping.svg @@ -0,0 +1,305 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0:7 + + 4:7 + + 0:1 + + 2:3 + + 4:5 + + 6:7 + + 0:3 + + struct rcu_state + + diff --git a/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg b/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg new file mode 100644 index 0000000000000000000000000000000000000000..5b416a4b8453f69b27b78dd5bd4384e3ab91ac01 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg @@ -0,0 +1,380 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ->level[0] + + ->level[1] + + ->level[2] + + 0:7 + + 4:7 + + 0:1 + + 2:3 + + 4:5 + + 6:7 + + 0:3 + + struct rcu_state + + + + + + + + + diff --git a/Documentation/RCU/Design/Data-Structures/blkd_task.svg b/Documentation/RCU/Design/Data-Structures/blkd_task.svg new file mode 100644 index 0000000000000000000000000000000000000000..00e810bb84194ae5b73a4dbe4bc0aa2ced0046df --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/blkd_task.svg @@ -0,0 +1,843 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + rcu_bh + + struct + + rcu_node + + struct + + rcu_node + + struct + + rcu_data + + struct + + rcu_data + + struct + + rcu_data + + struct + + rcu_data + + struct rcu_state + + struct + + rcu_dynticks + + struct + + rcu_dynticks + + struct + + rcu_dynticks + + struct + + rcu_dynticks + + rcu_sched + + T3 + + T2 + + T1 + + + + + + + + + + + + + rcu_node + + struct + + blkd_tasks + + gp_tasks + + exp_tasks + + diff --git a/Documentation/RCU/Design/Data-Structures/nxtlist.svg b/Documentation/RCU/Design/Data-Structures/nxtlist.svg new file mode 100644 index 0000000000000000000000000000000000000000..abc4cc73a0977195317d7a0397f1f5e7c52f35b3 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/nxtlist.svg @@ -0,0 +1,396 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nxtlist + + nxttail[RCU_DONE_TAIL] + + nxttail[RCU_WAIT_TAIL] + + nxttail[RCU_NEXT_READY_TAIL] + + nxttail[RCU_NEXT_TAIL] + + CB 1 + + next + + CB 3 + + next + + CB 4 + + next + + CB 2 + + next + + diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png deleted file mode 100644 index 7496a55e4e7b41becdb658a2bd34765fbe80013f..0000000000000000000000000000000000000000 Binary files a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png and /dev/null differ diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg deleted file mode 100644 index ebcbeee391ed7babdce76130c60977f76ca10832..0000000000000000000000000000000000000000 --- a/Documentation/RCU/Design/Requirements/RCUApplicability.svg +++ /dev/null @@ -1,237 +0,0 @@ - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - Read-Mostly, Stale & - - Inconsistent Data OK - - (RCU Works Great!!!) - - (RCU Works Well) - - Read-Mostly, Need Consistent Data - - Read-Write, Need Consistent Data - - Update-Mostly, Need Consistent Data - - (RCU Might Be OK...) - - (1) Provide Existence Guarantees For Update-Friendly Mechanisms - - (2) Provide Wait-Free Read-Side Primitives for Real-Time Use) - - (RCU is Very Unlikely to be the Right Tool For The Job, But it Can: - - diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index a725f9900ec8962a43dfb888f71f1a2b05efabf0..e7e24b3e86e29e2c721c447c237de890b11e7062 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1,5 +1,3 @@ - - @@ -65,8 +63,8 @@ All that aside, here are the categories of currently known RCU requirements:

This is followed by a summary, -which is in turn followed by the inevitable -answers to the quick quizzes. +however, the answers to each quick quiz immediately follows the quiz. +Select the big white space with your mouse to see the answer.

Fundamental Requirements

@@ -153,13 +151,27 @@ Therefore, the outcome: cannot happen. -

Quick Quiz 1: -Wait a minute! -You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -synchronize_rcu()!!! -Just who are you trying to fool??? -
Answer + + + + + + + +
 
Quick Quiz:
+ Wait a minute! + You said that updaters can make useful forward progress concurrently + with readers, but pre-existing readers will block + synchronize_rcu()!!! + Just who are you trying to fool??? +
Answer:
+ First, if updaters do not wish to be blocked by readers, they can use + call_rcu() or kfree_rcu(), which will + be discussed later. + Second, even when using synchronize_rcu(), the other + update-side code does run concurrently with readers, whether + pre-existing or not. +
 

This scenario resembles one of the first uses of RCU in @@ -210,9 +222,20 @@ to guarantee that do_something() never runs concurrently with recovery(), but with little or no synchronization overhead in do_something_dlm(). -

Quick Quiz 2: -Why is the synchronize_rcu() on line 28 needed? -
Answer + + + + + + + +
 
Quick Quiz:
+ Why is the synchronize_rcu() on line 28 needed? +
Answer:
+ Without that extra grace period, memory reordering could result in + do_something_dlm() executing do_something() + concurrently with the last bits of recovery(). +
 

In order to avoid fatal problems such as deadlocks, @@ -332,12 +355,27 @@ It also prevents any number of “interesting” compiler optimizations, for example, the use of gp as a scratch location immediately preceding the assignment. -

Quick Quiz 3: -But rcu_assign_pointer() does nothing to prevent the -two assignments to p->a and p->b -from being reordered. -Can't that also cause problems? -
Answer + + + + + + + +
 
Quick Quiz:
+ But rcu_assign_pointer() does nothing to prevent the + two assignments to p->a and p->b + from being reordered. + Can't that also cause problems? +
Answer:
+ No, it cannot. + The readers cannot see either of these two fields until + the assignment to gp, by which time both fields are + fully initialized. + So reordering the assignments + to p->a and p->b cannot possibly + cause any problems. +
 

It is tempting to assume that the reader need not do anything special @@ -494,11 +532,42 @@ The rcu_access_pointer() on line 6 is similar to code protected by the corresponding update-side lock. -

Quick Quiz 4: -Without the rcu_dereference() or the -rcu_access_pointer(), what destructive optimizations -might the compiler make use of? -
Answer + + + + + + + +
 
Quick Quiz:
+ Without the rcu_dereference() or the + rcu_access_pointer(), what destructive optimizations + might the compiler make use of? +
Answer:
+ Let's start with what happens to do_something_gp() + if it fails to use rcu_dereference(). + It could reuse a value formerly fetched from this same pointer. + It could also fetch the pointer from gp in a byte-at-a-time + manner, resulting in load tearing, in turn resulting a bytewise + mash-up of two distince pointer values. + It might even use value-speculation optimizations, where it makes + a wrong guess, but by the time it gets around to checking the + value, an update has changed the pointer to match the wrong guess. + Too bad about any dereferences that returned pre-initialization garbage + in the meantime! + + +

+ For remove_gp_synchronous(), as long as all modifications + to gp are carried out while holding gp_lock, + the above optimizations are harmless. + However, + with CONFIG_SPARSE_RCU_POINTER=y, + sparse will complain if you + define gp with __rcu and then + access it without using + either rcu_access_pointer() or rcu_dereference(). +

 

In short, RCU's publish-subscribe guarantee is provided by the combination @@ -571,17 +640,156 @@ systems with more than one CPU: synchronize_rcu() migrates in the meantime. -

Quick Quiz 5: -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of synchronize_rcu()? -
Answer - -

Quick Quiz 6: -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers really required? -
Answer + + + + + + + +
 
Quick Quiz:
+ Given that multiple CPUs can start RCU read-side critical sections + at any time without any ordering whatsoever, how can RCU possibly + tell whether or not a given RCU read-side critical section starts + before a given instance of synchronize_rcu()? +
Answer:
+ If RCU cannot tell whether or not a given + RCU read-side critical section starts before a + given instance of synchronize_rcu(), + then it must assume that the RCU read-side critical section + started first. + In other words, a given instance of synchronize_rcu() + can avoid waiting on a given RCU read-side critical section only + if it can prove that synchronize_rcu() started first. +
 
+ + + + + + + + +
 
Quick Quiz:
+ The first and second guarantees require unbelievably strict ordering! + Are all these memory barriers really required? +
Answer:
+ Yes, they really are required. + To see why the first guarantee is required, consider the following + sequence of events: + + +
    +
  1. + CPU 1: rcu_read_lock() + +
  2. + CPU 1: q = rcu_dereference(gp); + /* Very likely to return p. */ + +
  3. + CPU 0: list_del_rcu(p); + +
  4. + CPU 0: synchronize_rcu() starts. + +
  5. + CPU 1: do_something_with(q->a); + /* No smp_mb(), so might happen after kfree(). */ + +
  6. + CPU 1: rcu_read_unlock() + +
  7. + CPU 0: synchronize_rcu() returns. + +
  8. + CPU 0: kfree(p); + +
+ +

+ Therefore, there absolutely must be a full memory barrier between the + end of the RCU read-side critical section and the end of the + grace period. + + +

+ The sequence of events demonstrating the necessity of the second rule + is roughly similar: + + +

    +
  1. CPU 0: list_del_rcu(p); + +
  2. CPU 0: synchronize_rcu() starts. + +
  3. CPU 1: rcu_read_lock() + +
  4. CPU 1: q = rcu_dereference(gp); + /* Might return p if no memory barrier. */ + +
  5. CPU 0: synchronize_rcu() returns. + +
  6. CPU 0: kfree(p); + +
  7. + CPU 1: do_something_with(q->a); /* Boom!!! */ + +
  8. CPU 1: rcu_read_unlock() + +
+ +

+ And similarly, without a memory barrier between the beginning of the + grace period and the beginning of the RCU read-side critical section, + CPU 1 might end up accessing the freelist. + + +

+ The “as if” rule of course applies, so that any + implementation that acts as if the appropriate memory barriers + were in place is a correct implementation. + That said, it is much easier to fool yourself into believing + that you have adhered to the as-if rule than it is to actually + adhere to it! +

 
+ + + + + + + + +
 
Quick Quiz:
+ You claim that rcu_read_lock() and rcu_read_unlock() + generate absolutely no code in some kernel builds. + This means that the compiler might arbitrarily rearrange consecutive + RCU read-side critical sections. + Given such rearrangement, if a given RCU read-side critical section + is done, how can you be sure that all prior RCU read-side critical + sections are done? + Won't the compiler rearrangements make that impossible to determine? +
Answer:
+ In cases where rcu_read_lock() and rcu_read_unlock() + generate absolutely no code, RCU infers quiescent states only at + special locations, for example, within the scheduler. + Because calls to schedule() had better prevent calling-code + accesses to shared variables from being rearranged across the call to + schedule(), if RCU detects the end of a given RCU read-side + critical section, it will necessarily detect the end of all prior + RCU read-side critical sections, no matter how aggressively the + compiler scrambles the code. + + +

+ Again, this all assumes that the compiler cannot scramble code across + calls to the scheduler, out of interrupt handlers, into the idle loop, + into user-mode code, and so on. + But if your kernel build allows that sort of scrambling, you have broken + far more than just RCU! +

 

Note that these memory-barrier requirements do not replace the fundamental @@ -626,9 +834,19 @@ inconvenience can be avoided through use of the call_rcu() and kfree_rcu() API members described later in this document. -

Quick Quiz 7: -But how does the upgrade-to-write operation exclude other readers? -
Answer + + + + + + + +
 
Quick Quiz:
+ But how does the upgrade-to-write operation exclude other readers? +
Answer:
+ It doesn't, just like normal RCU updates, which also do not exclude + RCU readers. +
 

This guarantee allows lookup code to be shared between read-side @@ -714,9 +932,20 @@ to do significant reordering. This is by design: Any significant ordering constraints would slow down these fast-path APIs. -

Quick Quiz 8: -Can't the compiler also reorder this code? -
Answer + + + + + + + +
 
Quick Quiz:
+ Can't the compiler also reorder this code? +
Answer:
+ No, the volatile casts in READ_ONCE() and + WRITE_ONCE() prevent the compiler from reordering in + this particular case. +
 

Readers Do Not Exclude Updaters

@@ -769,10 +998,28 @@ new readers can start immediately after synchronize_rcu() starts, and synchronize_rcu() is under no obligation to wait for these new readers. -

Quick Quiz 9: -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? -
Answer + + + + + + + +
 
Quick Quiz:
+ Suppose that synchronize_rcu() did wait until all + readers had completed instead of waiting only on + pre-existing readers. + For how long would the updater be able to rely on there + being no readers? +
Answer:
+ For no time at all. + Even if synchronize_rcu() were to wait until + all readers had completed, a new reader might start immediately after + synchronize_rcu() completed. + Therefore, the code following + synchronize_rcu() can never rely on there being + no readers. +
 

Grace Periods Don't Partition Read-Side Critical Sections

@@ -969,11 +1216,24 @@ grace period. As a result, an RCU read-side critical section cannot partition a pair of RCU grace periods. -

Quick Quiz 10: -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? -
Answer + + + + + + + +
 
Quick Quiz:
+ How long a sequence of grace periods, each separated by an RCU + read-side critical section, would be required to partition the RCU + read-side critical sections at the beginning and end of the chain? +
Answer:
+ In theory, an infinite number. + In practice, an unknown number that is sensitive to both implementation + details and timing considerations. + Therefore, even in practice, RCU users must abide by the + theoretical rather than the practical answer. +
 

Disabling Preemption Does Not Block Grace Periods

@@ -1109,12 +1369,27 @@ These classes is covered in the following sections.

Specialization

-RCU is and always has been intended primarily for read-mostly situations, as -illustrated by the following figure. -This means that RCU's read-side primitives are optimized, often at the +RCU is and always has been intended primarily for read-mostly situations, +which means that RCU's read-side primitives are optimized, often at the expense of its update-side primitives. +Experience thus far is captured by the following list of situations: -

RCUApplicability.svg

+
    +
  1. Read-mostly data, where stale and inconsistent data is not + a problem: RCU works great! +
  2. Read-mostly data, where data must be consistent: + RCU works well. +
  3. Read-write data, where data must be consistent: + RCU might work OK. + Or not. +
  4. Write-mostly data, where data must be consistent: + RCU is very unlikely to be the right tool for the job, + with the following exceptions, where RCU can provide: +
      +
    1. Existence guarantees for update-friendly mechanisms. +
    2. Wait-free read-side primitives for real-time use. +
    +

This focus on read-mostly situations means that RCU must interoperate @@ -1127,9 +1402,43 @@ synchronization primitives be legal within RCU read-side critical sections, including spinlocks, sequence locks, atomic operations, reference counters, and memory barriers. -

Quick Quiz 11: -What about sleeping locks? -
Answer + + + + + + + +
 
Quick Quiz:
+ What about sleeping locks? +
Answer:
+ These are forbidden within Linux-kernel RCU read-side critical + sections because it is not legal to place a quiescent state + (in this case, voluntary context switch) within an RCU read-side + critical section. + However, sleeping locks may be used within userspace RCU read-side + critical sections, and also within Linux-kernel sleepable RCU + (SRCU) + read-side critical sections. + In addition, the -rt patchset turns spinlocks into a + sleeping locks so that the corresponding critical sections + can be preempted, which also means that these sleeplockified + spinlocks (but not other sleeping locks!) may be acquire within + -rt-Linux-kernel RCU read-side critical sections. + + +

+ Note that it is legal for a normal RCU read-side + critical section to conditionally acquire a sleeping locks + (as in mutex_trylock()), but only as long as it does + not loop indefinitely attempting to conditionally acquire that + sleeping locks. + The key point is that things like mutex_trylock() + either return with the mutex held, or return an error indication if + the mutex was not immediately available. + Either way, mutex_trylock() returns immediately without + sleeping. +

 

It often comes as a surprise that many algorithms do not require a @@ -1160,10 +1469,7 @@ some period of time, so the exact wait period is a judgment call. One of our pair of veternarians might wait 30 seconds before pronouncing the cat dead, while the other might insist on waiting a full minute. The two veternarians would then disagree on the state of the cat during -the final 30 seconds of the minute following the last heartbeat, as -fancifully illustrated below: - -

2013-08-is-it-dead.png

+the final 30 seconds of the minute following the last heartbeat.

Interestingly enough, this same situation applies to hardware. @@ -1343,7 +1649,8 @@ situations where neither synchronize_rcu() nor synchronize_rcu_expedited() would be legal, including within preempt-disable code, local_bh_disable() code, interrupt-disable code, and interrupt handlers. -However, even call_rcu() is illegal within NMI handlers. +However, even call_rcu() is illegal within NMI handlers +and from idle and offline CPUs. The callback function (remove_gp_cb() in this case) will be executed within softirq (software interrupt) environment within the Linux kernel, @@ -1354,12 +1661,27 @@ write an RCU callback function that takes too long. Long-running operations should be relegated to separate threads or (in the Linux kernel) workqueues. -

Quick Quiz 12: -Why does line 19 use rcu_access_pointer()? -After all, call_rcu() on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that rcu_dereference() is required? -
Answer + + + + + + + +
 
Quick Quiz:
+ Why does line 19 use rcu_access_pointer()? + After all, call_rcu() on line 25 stores into the + structure, which would interact badly with concurrent insertions. + Doesn't this mean that rcu_dereference() is required? +
Answer:
+ Presumably the ->gp_lock acquired on line 18 excludes + any changes, including any insertions that rcu_dereference() + would protect against. + Therefore, any insertions will be delayed until after + ->gp_lock + is released on line 25, which in turn means that + rcu_access_pointer() suffices. +
 

However, all that remove_gp_cb() is doing is @@ -1406,14 +1728,31 @@ This was due to the fact that RCU was not heavily used within DYNIX/ptx, so the very few places that needed something like synchronize_rcu() simply open-coded it. -

Quick Quiz 13: -Earlier it was claimed that call_rcu() and -kfree_rcu() allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? -
Answer + + + + + + + +
 
Quick Quiz:
+ Earlier it was claimed that call_rcu() and + kfree_rcu() allowed updaters to avoid being blocked + by readers. + But how can that be correct, given that the invocation of the callback + and the freeing of the memory (respectively) must still wait for + a grace period to elapse? +
Answer:
+ We could define things this way, but keep in mind that this sort of + definition would say that updates in garbage-collected languages + cannot complete until the next time the garbage collector runs, + which does not seem at all reasonable. + The key point is that in most cases, an updater using either + call_rcu() or kfree_rcu() can proceed to the + next update as soon as it has invoked call_rcu() or + kfree_rcu(), without having to wait for a subsequent + grace period. +
 

But what if the updater must wait for the completion of code to be @@ -1838,11 +2177,26 @@ kthreads to be spawned. Therefore, invoking synchronize_rcu() during scheduler initialization can result in deadlock. -

Quick Quiz 14: -So what happens with synchronize_rcu() during -scheduler initialization for CONFIG_PREEMPT=n -kernels? -
Answer + + + + + + + +
 
Quick Quiz:
+ So what happens with synchronize_rcu() during + scheduler initialization for CONFIG_PREEMPT=n + kernels? +
Answer:
+ In CONFIG_PREEMPT=n kernel, synchronize_rcu() + maps directly to synchronize_sched(). + Therefore, synchronize_rcu() works normally throughout + boot in CONFIG_PREEMPT=n kernels. + However, your code must also work in CONFIG_PREEMPT=y kernels, + so it is still necessary to avoid invoking synchronize_rcu() + during scheduler initialization. +
 

I learned of these boot-time requirements as a result of a series of @@ -2170,6 +2524,14 @@ up to and including systems with 4096 CPUs. This real-time requirement motivated the grace-period kthread, which also simplified handling of a number of race conditions. +

+RCU must avoid degrading real-time response for CPU-bound threads, whether +executing in usermode (which is one use case for +CONFIG_NO_HZ_FULL=y) or in the kernel. +That said, CPU-bound loops in the kernel must execute +cond_resched_rcu_qs() at least once per few tens of milliseconds +in order to avoid receiving an IPI from RCU. +

Finally, RCU's status as a synchronization primitive means that any RCU failure can result in arbitrary memory corruption that can be @@ -2223,6 +2585,8 @@ described in a separate section.

  • Sched Flavor
  • Sleepable RCU
  • Tasks RCU +
  • + Waiting for Multiple Grace Periods

    Bottom-Half Flavor

    @@ -2472,6 +2836,94 @@ The tasks-RCU API is quite compact, consisting only of synchronize_rcu_tasks(), and rcu_barrier_tasks(). +

    +Waiting for Multiple Grace Periods

    + +

    +Perhaps you have an RCU protected data structure that is accessed from +RCU read-side critical sections, from softirq handlers, and from +hardware interrupt handlers. +That is three flavors of RCU, the normal flavor, the bottom-half flavor, +and the sched flavor. +How to wait for a compound grace period? + +

    +The best approach is usually to “just say no!” and +insert rcu_read_lock() and rcu_read_unlock() +around each RCU read-side critical section, regardless of what +environment it happens to be in. +But suppose that some of the RCU read-side critical sections are +on extremely hot code paths, and that use of CONFIG_PREEMPT=n +is not a viable option, so that rcu_read_lock() and +rcu_read_unlock() are not free. +What then? + +

    +You could wait on all three grace periods in succession, as follows: + +

    +
    + 1 synchronize_rcu();
    + 2 synchronize_rcu_bh();
    + 3 synchronize_sched();
    +
    +
    + +

    +This works, but triples the update-side latency penalty. +In cases where this is not acceptable, synchronize_rcu_mult() +may be used to wait on all three flavors of grace period concurrently: + +

    +
    + 1 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched);
    +
    +
    + +

    +But what if it is necessary to also wait on SRCU? +This can be done as follows: + +

    +
    + 1 static void call_my_srcu(struct rcu_head *head,
    + 2        void (*func)(struct rcu_head *head))
    + 3 {
    + 4   call_srcu(&my_srcu, head, func);
    + 5 }
    + 6
    + 7 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched, call_my_srcu);
    +
    +
    + +

    +If you needed to wait on multiple different flavors of SRCU +(but why???), you would need to create a wrapper function resembling +call_my_srcu() for each SRCU flavor. + + + + + + + + +
     
    Quick Quiz:
    + But what if I need to wait for multiple RCU flavors, but I also need + the grace periods to be expedited? +
    Answer:
    + If you are using expedited grace periods, there should be less penalty + for waiting on them in succession. + But if that is nevertheless a problem, you can use workqueues + or multiple kthreads to wait on the various expedited grace + periods concurrently. +
     
    + +

    +Again, it is usually better to adjust the RCU read-side critical sections +to use a single flavor of RCU, but when this is not feasible, you can use +synchronize_rcu_mult(). +

    Possible Future Changes

    @@ -2569,329 +3021,4 @@ and is provided under the terms of the Creative Commons Attribution-Share Alike 3.0 United States license. -

    -Answers to Quick Quizzes

    - - -

    Quick Quiz 1: -Wait a minute! -You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -synchronize_rcu()!!! -Just who are you trying to fool??? - - -

    Answer: -First, if updaters do not wish to be blocked by readers, they can use -call_rcu() or kfree_rcu(), which will -be discussed later. -Second, even when using synchronize_rcu(), the other -update-side code does run concurrently with readers, whether pre-existing -or not. - - -

    Back to Quick Quiz 1. - - -

    Quick Quiz 2: -Why is the synchronize_rcu() on line 28 needed? - - -

    Answer: -Without that extra grace period, memory reordering could result in -do_something_dlm() executing do_something() -concurrently with the last bits of recovery(). - - -

    Back to Quick Quiz 2. - - -

    Quick Quiz 3: -But rcu_assign_pointer() does nothing to prevent the -two assignments to p->a and p->b -from being reordered. -Can't that also cause problems? - - -

    Answer: -No, it cannot. -The readers cannot see either of these two fields until -the assignment to gp, by which time both fields are -fully initialized. -So reordering the assignments -to p->a and p->b cannot possibly -cause any problems. - - -

    Back to Quick Quiz 3. - - -

    Quick Quiz 4: -Without the rcu_dereference() or the -rcu_access_pointer(), what destructive optimizations -might the compiler make use of? - - -

    Answer: -Let's start with what happens to do_something_gp() -if it fails to use rcu_dereference(). -It could reuse a value formerly fetched from this same pointer. -It could also fetch the pointer from gp in a byte-at-a-time -manner, resulting in load tearing, in turn resulting a bytewise -mash-up of two distince pointer values. -It might even use value-speculation optimizations, where it makes a wrong -guess, but by the time it gets around to checking the value, an update -has changed the pointer to match the wrong guess. -Too bad about any dereferences that returned pre-initialization garbage -in the meantime! - -

    -For remove_gp_synchronous(), as long as all modifications -to gp are carried out while holding gp_lock, -the above optimizations are harmless. -However, -with CONFIG_SPARSE_RCU_POINTER=y, -sparse will complain if you -define gp with __rcu and then -access it without using -either rcu_access_pointer() or rcu_dereference(). - - -

    Back to Quick Quiz 4. - - -

    Quick Quiz 5: -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of synchronize_rcu()? - - -

    Answer: -If RCU cannot tell whether or not a given -RCU read-side critical section starts before a -given instance of synchronize_rcu(), -then it must assume that the RCU read-side critical section -started first. -In other words, a given instance of synchronize_rcu() -can avoid waiting on a given RCU read-side critical section only -if it can prove that synchronize_rcu() started first. - - -

    Back to Quick Quiz 5. - - -

    Quick Quiz 6: -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers really required? - - -

    Answer: -Yes, they really are required. -To see why the first guarantee is required, consider the following -sequence of events: - -

      -
    1. CPU 1: rcu_read_lock() -
    2. CPU 1: q = rcu_dereference(gp); - /* Very likely to return p. */ -
    3. CPU 0: list_del_rcu(p); -
    4. CPU 0: synchronize_rcu() starts. -
    5. CPU 1: do_something_with(q->a); - /* No smp_mb(), so might happen after kfree(). */ -
    6. CPU 1: rcu_read_unlock() -
    7. CPU 0: synchronize_rcu() returns. -
    8. CPU 0: kfree(p); -
    - -

    -Therefore, there absolutely must be a full memory barrier between the -end of the RCU read-side critical section and the end of the -grace period. - -

    -The sequence of events demonstrating the necessity of the second rule -is roughly similar: - -

      -
    1. CPU 0: list_del_rcu(p); -
    2. CPU 0: synchronize_rcu() starts. -
    3. CPU 1: rcu_read_lock() -
    4. CPU 1: q = rcu_dereference(gp); - /* Might return p if no memory barrier. */ -
    5. CPU 0: synchronize_rcu() returns. -
    6. CPU 0: kfree(p); -
    7. CPU 1: do_something_with(q->a); /* Boom!!! */ -
    8. CPU 1: rcu_read_unlock() -
    - -

    -And similarly, without a memory barrier between the beginning of the -grace period and the beginning of the RCU read-side critical section, -CPU 1 might end up accessing the freelist. - -

    -The “as if” rule of course applies, so that any implementation -that acts as if the appropriate memory barriers were in place is a -correct implementation. -That said, it is much easier to fool yourself into believing that you have -adhered to the as-if rule than it is to actually adhere to it! - - -

    Back to Quick Quiz 6. - - -

    Quick Quiz 7: -But how does the upgrade-to-write operation exclude other readers? - - -

    Answer: -It doesn't, just like normal RCU updates, which also do not exclude -RCU readers. - - -

    Back to Quick Quiz 7. - - -

    Quick Quiz 8: -Can't the compiler also reorder this code? - - -

    Answer: -No, the volatile casts in READ_ONCE() and -WRITE_ONCE() prevent the compiler from reordering in -this particular case. - - -

    Back to Quick Quiz 8. - - -

    Quick Quiz 9: -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? - - -

    Answer: -No. -Even if synchronize_rcu() were to wait until -all readers had completed, a new reader might start immediately after -synchronize_rcu() completed. -Therefore, the code following -synchronize_rcu() cannot rely on there being no readers -in any case. - - -

    Back to Quick Quiz 9. - - -

    Quick Quiz 10: -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? - - -

    Answer: -In theory, an infinite number. -In practice, an unknown number that is sensitive to both implementation -details and timing considerations. -Therefore, even in practice, RCU users must abide by the theoretical rather -than the practical answer. - - -

    Back to Quick Quiz 10. - - -

    Quick Quiz 11: -What about sleeping locks? - - -

    Answer: -These are forbidden within Linux-kernel RCU read-side critical sections -because it is not legal to place a quiescent state (in this case, -voluntary context switch) within an RCU read-side critical section. -However, sleeping locks may be used within userspace RCU read-side critical -sections, and also within Linux-kernel sleepable RCU -(SRCU) -read-side critical sections. -In addition, the -rt patchset turns spinlocks into a sleeping locks so -that the corresponding critical sections can be preempted, which -also means that these sleeplockified spinlocks (but not other sleeping locks!) -may be acquire within -rt-Linux-kernel RCU read-side critical sections. - -

    -Note that it is legal for a normal RCU read-side critical section -to conditionally acquire a sleeping locks (as in mutex_trylock()), -but only as long as it does not loop indefinitely attempting to -conditionally acquire that sleeping locks. -The key point is that things like mutex_trylock() -either return with the mutex held, or return an error indication if -the mutex was not immediately available. -Either way, mutex_trylock() returns immediately without sleeping. - - -

    Back to Quick Quiz 11. - - -

    Quick Quiz 12: -Why does line 19 use rcu_access_pointer()? -After all, call_rcu() on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that rcu_dereference() is required? - - -

    Answer: -Presumably the ->gp_lock acquired on line 18 excludes -any changes, including any insertions that rcu_dereference() -would protect against. -Therefore, any insertions will be delayed until after ->gp_lock -is released on line 25, which in turn means that -rcu_access_pointer() suffices. - - -

    Back to Quick Quiz 12. - - -

    Quick Quiz 13: -Earlier it was claimed that call_rcu() and -kfree_rcu() allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? - - -

    Answer: -We could define things this way, but keep in mind that this sort of -definition would say that updates in garbage-collected languages -cannot complete until the next time the garbage collector runs, -which does not seem at all reasonable. -The key point is that in most cases, an updater using either -call_rcu() or kfree_rcu() can proceed to the -next update as soon as it has invoked call_rcu() or -kfree_rcu(), without having to wait for a subsequent -grace period. - - -

    Back to Quick Quiz 13. - - -

    Quick Quiz 14: -So what happens with synchronize_rcu() during -scheduler initialization for CONFIG_PREEMPT=n -kernels? - - -

    Answer: -In CONFIG_PREEMPT=n kernel, synchronize_rcu() -maps directly to synchronize_sched(). -Therefore, synchronize_rcu() works normally throughout -boot in CONFIG_PREEMPT=n kernels. -However, your code must also work in CONFIG_PREEMPT=y kernels, -so it is still necessary to avoid invoking synchronize_rcu() -during scheduler initialization. - - -

    Back to Quick Quiz 14. - - diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx deleted file mode 100644 index 3a97ba490c42b03d606c5fe36e5afe1a9b213e28..0000000000000000000000000000000000000000 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ /dev/null @@ -1,2741 +0,0 @@ - - - A Tour Through RCU's Requirements [LWN.net] - - -

    A Tour Through RCU's Requirements

    - -

    Copyright IBM Corporation, 2015

    -

    Author: Paul E. McKenney

    -

    The initial version of this document appeared in the -LWN articles -here, -here, and -here.

    - -

    Introduction

    - -

    -Read-copy update (RCU) is a synchronization mechanism that is often -used as a replacement for reader-writer locking. -RCU is unusual in that updaters do not block readers, -which means that RCU's read-side primitives can be exceedingly fast -and scalable. -In addition, updaters can make useful forward progress concurrently -with readers. -However, all this concurrency between RCU readers and updaters does raise -the question of exactly what RCU readers are doing, which in turn -raises the question of exactly what RCU's requirements are. - -

    -This document therefore summarizes RCU's requirements, and can be thought -of as an informal, high-level specification for RCU. -It is important to understand that RCU's specification is primarily -empirical in nature; -in fact, I learned about many of these requirements the hard way. -This situation might cause some consternation, however, not only -has this learning process been a lot of fun, but it has also been -a great privilege to work with so many people willing to apply -technologies in interesting new ways. - -

    -All that aside, here are the categories of currently known RCU requirements: -

    - -
      -
    1. - Fundamental Requirements -
    2. Fundamental Non-Requirements -
    3. - Parallelism Facts of Life -
    4. - Quality-of-Implementation Requirements -
    5. - Linux Kernel Complications -
    6. - Software-Engineering Requirements -
    7. - Other RCU Flavors -
    8. - Possible Future Changes -
    - -

    -This is followed by a summary, -which is in turn followed by the inevitable -answers to the quick quizzes. - -

    Fundamental Requirements

    - -

    -RCU's fundamental requirements are the closest thing RCU has to hard -mathematical requirements. -These are: - -

      -
    1. - Grace-Period Guarantee -
    2. - Publish-Subscribe Guarantee -
    3. - Memory-Barrier Guarantees -
    4. - RCU Primitives Guaranteed to Execute Unconditionally -
    5. - Guaranteed Read-to-Write Upgrade -
    - -

    Grace-Period Guarantee

    - -

    -RCU's grace-period guarantee is unusual in being premeditated: -Jack Slingwine and I had this guarantee firmly in mind when we started -work on RCU (then called “rclock”) in the early 1990s. -That said, the past two decades of experience with RCU have produced -a much more detailed understanding of this guarantee. - -

    -RCU's grace-period guarantee allows updaters to wait for the completion -of all pre-existing RCU read-side critical sections. -An RCU read-side critical section -begins with the marker rcu_read_lock() and ends with -the marker rcu_read_unlock(). -These markers may be nested, and RCU treats a nested set as one -big RCU read-side critical section. -Production-quality implementations of rcu_read_lock() and -rcu_read_unlock() are extremely lightweight, and in -fact have exactly zero overhead in Linux kernels built for production -use with CONFIG_PREEMPT=n. - -

    -This guarantee allows ordering to be enforced with extremely low -overhead to readers, for example: - -

    -
    - 1 int x, y;
    - 2
    - 3 void thread0(void)
    - 4 {
    - 5   rcu_read_lock();
    - 6   r1 = READ_ONCE(x);
    - 7   r2 = READ_ONCE(y);
    - 8   rcu_read_unlock();
    - 9 }
    -10
    -11 void thread1(void)
    -12 {
    -13   WRITE_ONCE(x, 1);
    -14   synchronize_rcu();
    -15   WRITE_ONCE(y, 1);
    -16 }
    -
    -
    - -

    -Because the synchronize_rcu() on line 14 waits for -all pre-existing readers, any instance of thread0() that -loads a value of zero from x must complete before -thread1() stores to y, so that instance must -also load a value of zero from y. -Similarly, any instance of thread0() that loads a value of -one from y must have started after the -synchronize_rcu() started, and must therefore also load -a value of one from x. -Therefore, the outcome: -

    -
    -(r1 == 0 && r2 == 1)
    -
    -
    -cannot happen. - -

    @@QQ@@ -Wait a minute! -You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -synchronize_rcu()!!! -Just who are you trying to fool??? -

    @@QQA@@ -First, if updaters do not wish to be blocked by readers, they can use -call_rcu() or kfree_rcu(), which will -be discussed later. -Second, even when using synchronize_rcu(), the other -update-side code does run concurrently with readers, whether pre-existing -or not. -

    @@QQE@@ - -

    -This scenario resembles one of the first uses of RCU in -DYNIX/ptx, -which managed a distributed lock manager's transition into -a state suitable for handling recovery from node failure, -more or less as follows: - -

    -
    - 1 #define STATE_NORMAL        0
    - 2 #define STATE_WANT_RECOVERY 1
    - 3 #define STATE_RECOVERING    2
    - 4 #define STATE_WANT_NORMAL   3
    - 5
    - 6 int state = STATE_NORMAL;
    - 7
    - 8 void do_something_dlm(void)
    - 9 {
    -10   int state_snap;
    -11
    -12   rcu_read_lock();
    -13   state_snap = READ_ONCE(state);
    -14   if (state_snap == STATE_NORMAL)
    -15     do_something();
    -16   else
    -17     do_something_carefully();
    -18   rcu_read_unlock();
    -19 }
    -20
    -21 void start_recovery(void)
    -22 {
    -23   WRITE_ONCE(state, STATE_WANT_RECOVERY);
    -24   synchronize_rcu();
    -25   WRITE_ONCE(state, STATE_RECOVERING);
    -26   recovery();
    -27   WRITE_ONCE(state, STATE_WANT_NORMAL);
    -28   synchronize_rcu();
    -29   WRITE_ONCE(state, STATE_NORMAL);
    -30 }
    -
    -
    - -

    -The RCU read-side critical section in do_something_dlm() -works with the synchronize_rcu() in start_recovery() -to guarantee that do_something() never runs concurrently -with recovery(), but with little or no synchronization -overhead in do_something_dlm(). - -

    @@QQ@@ -Why is the synchronize_rcu() on line 28 needed? -

    @@QQA@@ -Without that extra grace period, memory reordering could result in -do_something_dlm() executing do_something() -concurrently with the last bits of recovery(). -

    @@QQE@@ - -

    -In order to avoid fatal problems such as deadlocks, -an RCU read-side critical section must not contain calls to -synchronize_rcu(). -Similarly, an RCU read-side critical section must not -contain anything that waits, directly or indirectly, on completion of -an invocation of synchronize_rcu(). - -

    -Although RCU's grace-period guarantee is useful in and of itself, with -quite a few use cases, -it would be good to be able to use RCU to coordinate read-side -access to linked data structures. -For this, the grace-period guarantee is not sufficient, as can -be seen in function add_gp_buggy() below. -We will look at the reader's code later, but in the meantime, just think of -the reader as locklessly picking up the gp pointer, -and, if the value loaded is non-NULL, locklessly accessing the -->a and ->b fields. - -

    -
    - 1 bool add_gp_buggy(int a, int b)
    - 2 {
    - 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
    - 4   if (!p)
    - 5     return -ENOMEM;
    - 6   spin_lock(&gp_lock);
    - 7   if (rcu_access_pointer(gp)) {
    - 8     spin_unlock(&gp_lock);
    - 9     return false;
    -10   }
    -11   p->a = a;
    -12   p->b = a;
    -13   gp = p; /* ORDERING BUG */
    -14   spin_unlock(&gp_lock);
    -15   return true;
    -16 }
    -
    -
    - -

    -The problem is that both the compiler and weakly ordered CPUs are within -their rights to reorder this code as follows: - -

    -
    - 1 bool add_gp_buggy_optimized(int a, int b)
    - 2 {
    - 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
    - 4   if (!p)
    - 5     return -ENOMEM;
    - 6   spin_lock(&gp_lock);
    - 7   if (rcu_access_pointer(gp)) {
    - 8     spin_unlock(&gp_lock);
    - 9     return false;
    -10   }
    -11   gp = p; /* ORDERING BUG */
    -12   p->a = a;
    -13   p->b = a;
    -14   spin_unlock(&gp_lock);
    -15   return true;
    -16 }
    -
    -
    - -

    -If an RCU reader fetches gp just after -add_gp_buggy_optimized executes line 11, -it will see garbage in the ->a and ->b -fields. -And this is but one of many ways in which compiler and hardware optimizations -could cause trouble. -Therefore, we clearly need some way to prevent the compiler and the CPU from -reordering in this manner, which brings us to the publish-subscribe -guarantee discussed in the next section. - -

    Publish/Subscribe Guarantee

    - -

    -RCU's publish-subscribe guarantee allows data to be inserted -into a linked data structure without disrupting RCU readers. -The updater uses rcu_assign_pointer() to insert the -new data, and readers use rcu_dereference() to -access data, whether new or old. -The following shows an example of insertion: - -

    -
    - 1 bool add_gp(int a, int b)
    - 2 {
    - 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
    - 4   if (!p)
    - 5     return -ENOMEM;
    - 6   spin_lock(&gp_lock);
    - 7   if (rcu_access_pointer(gp)) {
    - 8     spin_unlock(&gp_lock);
    - 9     return false;
    -10   }
    -11   p->a = a;
    -12   p->b = a;
    -13   rcu_assign_pointer(gp, p);
    -14   spin_unlock(&gp_lock);
    -15   return true;
    -16 }
    -
    -
    - -

    -The rcu_assign_pointer() on line 13 is conceptually -equivalent to a simple assignment statement, but also guarantees -that its assignment will -happen after the two assignments in lines 11 and 12, -similar to the C11 memory_order_release store operation. -It also prevents any number of “interesting” compiler -optimizations, for example, the use of gp as a scratch -location immediately preceding the assignment. - -

    @@QQ@@ -But rcu_assign_pointer() does nothing to prevent the -two assignments to p->a and p->b -from being reordered. -Can't that also cause problems? -

    @@QQA@@ -No, it cannot. -The readers cannot see either of these two fields until -the assignment to gp, by which time both fields are -fully initialized. -So reordering the assignments -to p->a and p->b cannot possibly -cause any problems. -

    @@QQE@@ - -

    -It is tempting to assume that the reader need not do anything special -to control its accesses to the RCU-protected data, -as shown in do_something_gp_buggy() below: - -

    -
    - 1 bool do_something_gp_buggy(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   p = gp;  /* OPTIMIZATIONS GALORE!!! */
    - 5   if (p) {
    - 6     do_something(p->a, p->b);
    - 7     rcu_read_unlock();
    - 8     return true;
    - 9   }
    -10   rcu_read_unlock();
    -11   return false;
    -12 }
    -
    -
    - -

    -However, this temptation must be resisted because there are a -surprisingly large number of ways that the compiler -(to say nothing of -DEC Alpha CPUs) -can trip this code up. -For but one example, if the compiler were short of registers, it -might choose to refetch from gp rather than keeping -a separate copy in p as follows: - -

    -
    - 1 bool do_something_gp_buggy_optimized(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   if (gp) { /* OPTIMIZATIONS GALORE!!! */
    - 5     do_something(gp->a, gp->b);
    - 6     rcu_read_unlock();
    - 7     return true;
    - 8   }
    - 9   rcu_read_unlock();
    -10   return false;
    -11 }
    -
    -
    - -

    -If this function ran concurrently with a series of updates that -replaced the current structure with a new one, -the fetches of gp->a -and gp->b might well come from two different structures, -which could cause serious confusion. -To prevent this (and much else besides), do_something_gp() uses -rcu_dereference() to fetch from gp: - -

    -
    - 1 bool do_something_gp(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   p = rcu_dereference(gp);
    - 5   if (p) {
    - 6     do_something(p->a, p->b);
    - 7     rcu_read_unlock();
    - 8     return true;
    - 9   }
    -10   rcu_read_unlock();
    -11   return false;
    -12 }
    -
    -
    - -

    -The rcu_dereference() uses volatile casts and (for DEC Alpha) -memory barriers in the Linux kernel. -Should a -high-quality implementation of C11 memory_order_consume [PDF] -ever appear, then rcu_dereference() could be implemented -as a memory_order_consume load. -Regardless of the exact implementation, a pointer fetched by -rcu_dereference() may not be used outside of the -outermost RCU read-side critical section containing that -rcu_dereference(), unless protection of -the corresponding data element has been passed from RCU to some -other synchronization mechanism, most commonly locking or -reference counting. - -

    -In short, updaters use rcu_assign_pointer() and readers -use rcu_dereference(), and these two RCU API elements -work together to ensure that readers have a consistent view of -newly added data elements. - -

    -Of course, it is also necessary to remove elements from RCU-protected -data structures, for example, using the following process: - -

      -
    1. Remove the data element from the enclosing structure. -
    2. Wait for all pre-existing RCU read-side critical sections - to complete (because only pre-existing readers can possibly have - a reference to the newly removed data element). -
    3. At this point, only the updater has a reference to the - newly removed data element, so it can safely reclaim - the data element, for example, by passing it to kfree(). -
    - -This process is implemented by remove_gp_synchronous(): - -
    -
    - 1 bool remove_gp_synchronous(void)
    - 2 {
    - 3   struct foo *p;
    - 4
    - 5   spin_lock(&gp_lock);
    - 6   p = rcu_access_pointer(gp);
    - 7   if (!p) {
    - 8     spin_unlock(&gp_lock);
    - 9     return false;
    -10   }
    -11   rcu_assign_pointer(gp, NULL);
    -12   spin_unlock(&gp_lock);
    -13   synchronize_rcu();
    -14   kfree(p);
    -15   return true;
    -16 }
    -
    -
    - -

    -This function is straightforward, with line 13 waiting for a grace -period before line 14 frees the old data element. -This waiting ensures that readers will reach line 7 of -do_something_gp() before the data element referenced by -p is freed. -The rcu_access_pointer() on line 6 is similar to -rcu_dereference(), except that: - -

      -
    1. The value returned by rcu_access_pointer() - cannot be dereferenced. - If you want to access the value pointed to as well as - the pointer itself, use rcu_dereference() - instead of rcu_access_pointer(). -
    2. The call to rcu_access_pointer() need not be - protected. - In contrast, rcu_dereference() must either be - within an RCU read-side critical section or in a code - segment where the pointer cannot change, for example, in - code protected by the corresponding update-side lock. -
    - -

    @@QQ@@ -Without the rcu_dereference() or the -rcu_access_pointer(), what destructive optimizations -might the compiler make use of? -

    @@QQA@@ -Let's start with what happens to do_something_gp() -if it fails to use rcu_dereference(). -It could reuse a value formerly fetched from this same pointer. -It could also fetch the pointer from gp in a byte-at-a-time -manner, resulting in load tearing, in turn resulting a bytewise -mash-up of two distince pointer values. -It might even use value-speculation optimizations, where it makes a wrong -guess, but by the time it gets around to checking the value, an update -has changed the pointer to match the wrong guess. -Too bad about any dereferences that returned pre-initialization garbage -in the meantime! - -

    -For remove_gp_synchronous(), as long as all modifications -to gp are carried out while holding gp_lock, -the above optimizations are harmless. -However, -with CONFIG_SPARSE_RCU_POINTER=y, -sparse will complain if you -define gp with __rcu and then -access it without using -either rcu_access_pointer() or rcu_dereference(). -

    @@QQE@@ - -

    -In short, RCU's publish-subscribe guarantee is provided by the combination -of rcu_assign_pointer() and rcu_dereference(). -This guarantee allows data elements to be safely added to RCU-protected -linked data structures without disrupting RCU readers. -This guarantee can be used in combination with the grace-period -guarantee to also allow data elements to be removed from RCU-protected -linked data structures, again without disrupting RCU readers. - -

    -This guarantee was only partially premeditated. -DYNIX/ptx used an explicit memory barrier for publication, but had nothing -resembling rcu_dereference() for subscription, nor did it -have anything resembling the smp_read_barrier_depends() -that was later subsumed into rcu_dereference(). -The need for these operations made itself known quite suddenly at a -late-1990s meeting with the DEC Alpha architects, back in the days when -DEC was still a free-standing company. -It took the Alpha architects a good hour to convince me that any sort -of barrier would ever be needed, and it then took me a good two hours -to convince them that their documentation did not make this point clear. -More recent work with the C and C++ standards committees have provided -much education on tricks and traps from the compiler. -In short, compilers were much less tricky in the early 1990s, but in -2015, don't even think about omitting rcu_dereference()! - -

    Memory-Barrier Guarantees

    - -

    -The previous section's simple linked-data-structure scenario clearly -demonstrates the need for RCU's stringent memory-ordering guarantees on -systems with more than one CPU: - -

      -
    1. Each CPU that has an RCU read-side critical section that - begins before synchronize_rcu() starts is - guaranteed to execute a full memory barrier between the time - that the RCU read-side critical section ends and the time that - synchronize_rcu() returns. - Without this guarantee, a pre-existing RCU read-side critical section - might hold a reference to the newly removed struct foo - after the kfree() on line 14 of - remove_gp_synchronous(). -
    2. Each CPU that has an RCU read-side critical section that ends - after synchronize_rcu() returns is guaranteed - to execute a full memory barrier between the time that - synchronize_rcu() begins and the time that the RCU - read-side critical section begins. - Without this guarantee, a later RCU read-side critical section - running after the kfree() on line 14 of - remove_gp_synchronous() might - later run do_something_gp() and find the - newly deleted struct foo. -
    3. If the task invoking synchronize_rcu() remains - on a given CPU, then that CPU is guaranteed to execute a full - memory barrier sometime during the execution of - synchronize_rcu(). - This guarantee ensures that the kfree() on - line 14 of remove_gp_synchronous() really does - execute after the removal on line 11. -
    4. If the task invoking synchronize_rcu() migrates - among a group of CPUs during that invocation, then each of the - CPUs in that group is guaranteed to execute a full memory barrier - sometime during the execution of synchronize_rcu(). - This guarantee also ensures that the kfree() on - line 14 of remove_gp_synchronous() really does - execute after the removal on - line 11, but also in the case where the thread executing the - synchronize_rcu() migrates in the meantime. -
    - -

    @@QQ@@ -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of synchronize_rcu()? -

    @@QQA@@ -If RCU cannot tell whether or not a given -RCU read-side critical section starts before a -given instance of synchronize_rcu(), -then it must assume that the RCU read-side critical section -started first. -In other words, a given instance of synchronize_rcu() -can avoid waiting on a given RCU read-side critical section only -if it can prove that synchronize_rcu() started first. -

    @@QQE@@ - -

    @@QQ@@ -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers really required? -

    @@QQA@@ -Yes, they really are required. -To see why the first guarantee is required, consider the following -sequence of events: - -

      -
    1. CPU 1: rcu_read_lock() -
    2. CPU 1: q = rcu_dereference(gp); - /* Very likely to return p. */ -
    3. CPU 0: list_del_rcu(p); -
    4. CPU 0: synchronize_rcu() starts. -
    5. CPU 1: do_something_with(q->a); - /* No smp_mb(), so might happen after kfree(). */ -
    6. CPU 1: rcu_read_unlock() -
    7. CPU 0: synchronize_rcu() returns. -
    8. CPU 0: kfree(p); -
    - -

    -Therefore, there absolutely must be a full memory barrier between the -end of the RCU read-side critical section and the end of the -grace period. - -

    -The sequence of events demonstrating the necessity of the second rule -is roughly similar: - -

      -
    1. CPU 0: list_del_rcu(p); -
    2. CPU 0: synchronize_rcu() starts. -
    3. CPU 1: rcu_read_lock() -
    4. CPU 1: q = rcu_dereference(gp); - /* Might return p if no memory barrier. */ -
    5. CPU 0: synchronize_rcu() returns. -
    6. CPU 0: kfree(p); -
    7. CPU 1: do_something_with(q->a); /* Boom!!! */ -
    8. CPU 1: rcu_read_unlock() -
    - -

    -And similarly, without a memory barrier between the beginning of the -grace period and the beginning of the RCU read-side critical section, -CPU 1 might end up accessing the freelist. - -

    -The “as if” rule of course applies, so that any implementation -that acts as if the appropriate memory barriers were in place is a -correct implementation. -That said, it is much easier to fool yourself into believing that you have -adhered to the as-if rule than it is to actually adhere to it! -

    @@QQE@@ - -

    -Note that these memory-barrier requirements do not replace the fundamental -RCU requirement that a grace period wait for all pre-existing readers. -On the contrary, the memory barriers called out in this section must operate in -such a way as to enforce this fundamental requirement. -Of course, different implementations enforce this requirement in different -ways, but enforce it they must. - -

    RCU Primitives Guaranteed to Execute Unconditionally

    - -

    -The common-case RCU primitives are unconditional. -They are invoked, they do their job, and they return, with no possibility -of error, and no need to retry. -This is a key RCU design philosophy. - -

    -However, this philosophy is pragmatic rather than pigheaded. -If someone comes up with a good justification for a particular conditional -RCU primitive, it might well be implemented and added. -After all, this guarantee was reverse-engineered, not premeditated. -The unconditional nature of the RCU primitives was initially an -accident of implementation, and later experience with synchronization -primitives with conditional primitives caused me to elevate this -accident to a guarantee. -Therefore, the justification for adding a conditional primitive to -RCU would need to be based on detailed and compelling use cases. - -

    Guaranteed Read-to-Write Upgrade

    - -

    -As far as RCU is concerned, it is always possible to carry out an -update within an RCU read-side critical section. -For example, that RCU read-side critical section might search for -a given data element, and then might acquire the update-side -spinlock in order to update that element, all while remaining -in that RCU read-side critical section. -Of course, it is necessary to exit the RCU read-side critical section -before invoking synchronize_rcu(), however, this -inconvenience can be avoided through use of the -call_rcu() and kfree_rcu() API members -described later in this document. - -

    @@QQ@@ -But how does the upgrade-to-write operation exclude other readers? -

    @@QQA@@ -It doesn't, just like normal RCU updates, which also do not exclude -RCU readers. -

    @@QQE@@ - -

    -This guarantee allows lookup code to be shared between read-side -and update-side code, and was premeditated, appearing in the earliest -DYNIX/ptx RCU documentation. - -

    Fundamental Non-Requirements

    - -

    -RCU provides extremely lightweight readers, and its read-side guarantees, -though quite useful, are correspondingly lightweight. -It is therefore all too easy to assume that RCU is guaranteeing more -than it really is. -Of course, the list of things that RCU does not guarantee is infinitely -long, however, the following sections list a few non-guarantees that -have caused confusion. -Except where otherwise noted, these non-guarantees were premeditated. - -

      -
    1. - Readers Impose Minimal Ordering -
    2. - Readers Do Not Exclude Updaters -
    3. - Updaters Only Wait For Old Readers -
    4. - Grace Periods Don't Partition Read-Side Critical Sections -
    5. - Read-Side Critical Sections Don't Partition Grace Periods -
    6. - Disabling Preemption Does Not Block Grace Periods -
    - -

    Readers Impose Minimal Ordering

    - -

    -Reader-side markers such as rcu_read_lock() and -rcu_read_unlock() provide absolutely no ordering guarantees -except through their interaction with the grace-period APIs such as -synchronize_rcu(). -To see this, consider the following pair of threads: - -

    -
    - 1 void thread0(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   WRITE_ONCE(x, 1);
    - 5   rcu_read_unlock();
    - 6   rcu_read_lock();
    - 7   WRITE_ONCE(y, 1);
    - 8   rcu_read_unlock();
    - 9 }
    -10
    -11 void thread1(void)
    -12 {
    -13   rcu_read_lock();
    -14   r1 = READ_ONCE(y);
    -15   rcu_read_unlock();
    -16   rcu_read_lock();
    -17   r2 = READ_ONCE(x);
    -18   rcu_read_unlock();
    -19 }
    -
    -
    - -

    -After thread0() and thread1() execute -concurrently, it is quite possible to have - -

    -
    -(r1 == 1 && r2 == 0)
    -
    -
    - -(that is, y appears to have been assigned before x), -which would not be possible if rcu_read_lock() and -rcu_read_unlock() had much in the way of ordering -properties. -But they do not, so the CPU is within its rights -to do significant reordering. -This is by design: Any significant ordering constraints would slow down -these fast-path APIs. - -

    @@QQ@@ -Can't the compiler also reorder this code? -

    @@QQA@@ -No, the volatile casts in READ_ONCE() and -WRITE_ONCE() prevent the compiler from reordering in -this particular case. -

    @@QQE@@ - -

    Readers Do Not Exclude Updaters

    - -

    -Neither rcu_read_lock() nor rcu_read_unlock() -exclude updates. -All they do is to prevent grace periods from ending. -The following example illustrates this: - -

    -
    - 1 void thread0(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   r1 = READ_ONCE(y);
    - 5   if (r1) {
    - 6     do_something_with_nonzero_x();
    - 7     r2 = READ_ONCE(x);
    - 8     WARN_ON(!r2); /* BUG!!! */
    - 9   }
    -10   rcu_read_unlock();
    -11 }
    -12
    -13 void thread1(void)
    -14 {
    -15   spin_lock(&my_lock);
    -16   WRITE_ONCE(x, 1);
    -17   WRITE_ONCE(y, 1);
    -18   spin_unlock(&my_lock);
    -19 }
    -
    -
    - -

    -If the thread0() function's rcu_read_lock() -excluded the thread1() function's update, -the WARN_ON() could never fire. -But the fact is that rcu_read_lock() does not exclude -much of anything aside from subsequent grace periods, of which -thread1() has none, so the -WARN_ON() can and does fire. - -

    Updaters Only Wait For Old Readers

    - -

    -It might be tempting to assume that after synchronize_rcu() -completes, there are no readers executing. -This temptation must be avoided because -new readers can start immediately after synchronize_rcu() -starts, and synchronize_rcu() is under no -obligation to wait for these new readers. - -

    @@QQ@@ -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? -

    @@QQA@@ -No. -Even if synchronize_rcu() were to wait until -all readers had completed, a new reader might start immediately after -synchronize_rcu() completed. -Therefore, the code following -synchronize_rcu() cannot rely on there being no readers -in any case. -

    @@QQE@@ - -

    -Grace Periods Don't Partition Read-Side Critical Sections

    - -

    -It is tempting to assume that if any part of one RCU read-side critical -section precedes a given grace period, and if any part of another RCU -read-side critical section follows that same grace period, then all of -the first RCU read-side critical section must precede all of the second. -However, this just isn't the case: A single grace period does not -partition the set of RCU read-side critical sections. -An example of this situation can be illustrated as follows, where -x, y, and z are initially all zero: - -

    -
    - 1 void thread0(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   WRITE_ONCE(a, 1);
    - 5   WRITE_ONCE(b, 1);
    - 6   rcu_read_unlock();
    - 7 }
    - 8
    - 9 void thread1(void)
    -10 {
    -11   r1 = READ_ONCE(a);
    -12   synchronize_rcu();
    -13   WRITE_ONCE(c, 1);
    -14 }
    -15
    -16 void thread2(void)
    -17 {
    -18   rcu_read_lock();
    -19   r2 = READ_ONCE(b);
    -20   r3 = READ_ONCE(c);
    -21   rcu_read_unlock();
    -22 }
    -
    -
    - -

    -It turns out that the outcome: - -

    -
    -(r1 == 1 && r2 == 0 && r3 == 1)
    -
    -
    - -is entirely possible. -The following figure show how this can happen, with each circled -QS indicating the point at which RCU recorded a -quiescent state for each thread, that is, a state in which -RCU knows that the thread cannot be in the midst of an RCU read-side -critical section that started before the current grace period: - -

    GPpartitionReaders1.svg

    - -

    -If it is necessary to partition RCU read-side critical sections in this -manner, it is necessary to use two grace periods, where the first -grace period is known to end before the second grace period starts: - -

    -
    - 1 void thread0(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   WRITE_ONCE(a, 1);
    - 5   WRITE_ONCE(b, 1);
    - 6   rcu_read_unlock();
    - 7 }
    - 8
    - 9 void thread1(void)
    -10 {
    -11   r1 = READ_ONCE(a);
    -12   synchronize_rcu();
    -13   WRITE_ONCE(c, 1);
    -14 }
    -15
    -16 void thread2(void)
    -17 {
    -18   r2 = READ_ONCE(c);
    -19   synchronize_rcu();
    -20   WRITE_ONCE(d, 1);
    -21 }
    -22
    -23 void thread3(void)
    -24 {
    -25   rcu_read_lock();
    -26   r3 = READ_ONCE(b);
    -27   r4 = READ_ONCE(d);
    -28   rcu_read_unlock();
    -29 }
    -
    -
    - -

    -Here, if (r1 == 1), then -thread0()'s write to b must happen -before the end of thread1()'s grace period. -If in addition (r4 == 1), then -thread3()'s read from b must happen -after the beginning of thread2()'s grace period. -If it is also the case that (r2 == 1), then the -end of thread1()'s grace period must precede the -beginning of thread2()'s grace period. -This mean that the two RCU read-side critical sections cannot overlap, -guaranteeing that (r3 == 1). -As a result, the outcome: - -

    -
    -(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1)
    -
    -
    - -cannot happen. - -

    -This non-requirement was also non-premeditated, but became apparent -when studying RCU's interaction with memory ordering. - -

    -Read-Side Critical Sections Don't Partition Grace Periods

    - -

    -It is also tempting to assume that if an RCU read-side critical section -happens between a pair of grace periods, then those grace periods cannot -overlap. -However, this temptation leads nowhere good, as can be illustrated by -the following, with all variables initially zero: - -

    -
    - 1 void thread0(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   WRITE_ONCE(a, 1);
    - 5   WRITE_ONCE(b, 1);
    - 6   rcu_read_unlock();
    - 7 }
    - 8
    - 9 void thread1(void)
    -10 {
    -11   r1 = READ_ONCE(a);
    -12   synchronize_rcu();
    -13   WRITE_ONCE(c, 1);
    -14 }
    -15
    -16 void thread2(void)
    -17 {
    -18   rcu_read_lock();
    -19   WRITE_ONCE(d, 1);
    -20   r2 = READ_ONCE(c);
    -21   rcu_read_unlock();
    -22 }
    -23
    -24 void thread3(void)
    -25 {
    -26   r3 = READ_ONCE(d);
    -27   synchronize_rcu();
    -28   WRITE_ONCE(e, 1);
    -29 }
    -30
    -31 void thread4(void)
    -32 {
    -33   rcu_read_lock();
    -34   r4 = READ_ONCE(b);
    -35   r5 = READ_ONCE(e);
    -36   rcu_read_unlock();
    -37 }
    -
    -
    - -

    -In this case, the outcome: - -

    -
    -(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1)
    -
    -
    - -is entirely possible, as illustrated below: - -

    ReadersPartitionGP1.svg

    - -

    -Again, an RCU read-side critical section can overlap almost all of a -given grace period, just so long as it does not overlap the entire -grace period. -As a result, an RCU read-side critical section cannot partition a pair -of RCU grace periods. - -

    @@QQ@@ -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? -

    @@QQA@@ -In theory, an infinite number. -In practice, an unknown number that is sensitive to both implementation -details and timing considerations. -Therefore, even in practice, RCU users must abide by the theoretical rather -than the practical answer. -

    @@QQE@@ - -

    -Disabling Preemption Does Not Block Grace Periods

    - -

    -There was a time when disabling preemption on any given CPU would block -subsequent grace periods. -However, this was an accident of implementation and is not a requirement. -And in the current Linux-kernel implementation, disabling preemption -on a given CPU in fact does not block grace periods, as Oleg Nesterov -demonstrated. - -

    -If you need a preempt-disable region to block grace periods, you need to add -rcu_read_lock() and rcu_read_unlock(), for example -as follows: - -

    -
    - 1 preempt_disable();
    - 2 rcu_read_lock();
    - 3 do_something();
    - 4 rcu_read_unlock();
    - 5 preempt_enable();
    - 6
    - 7 /* Spinlocks implicitly disable preemption. */
    - 8 spin_lock(&mylock);
    - 9 rcu_read_lock();
    -10 do_something();
    -11 rcu_read_unlock();
    -12 spin_unlock(&mylock);
    -
    -
    - -

    -In theory, you could enter the RCU read-side critical section first, -but it is more efficient to keep the entire RCU read-side critical -section contained in the preempt-disable region as shown above. -Of course, RCU read-side critical sections that extend outside of -preempt-disable regions will work correctly, but such critical sections -can be preempted, which forces rcu_read_unlock() to do -more work. -And no, this is not an invitation to enclose all of your RCU -read-side critical sections within preempt-disable regions, because -doing so would degrade real-time response. - -

    -This non-requirement appeared with preemptible RCU. -If you need a grace period that waits on non-preemptible code regions, use -RCU-sched. - -

    Parallelism Facts of Life

    - -

    -These parallelism facts of life are by no means specific to RCU, but -the RCU implementation must abide by them. -They therefore bear repeating: - -

      -
    1. Any CPU or task may be delayed at any time, - and any attempts to avoid these delays by disabling - preemption, interrupts, or whatever are completely futile. - This is most obvious in preemptible user-level - environments and in virtualized environments (where - a given guest OS's VCPUs can be preempted at any time by - the underlying hypervisor), but can also happen in bare-metal - environments due to ECC errors, NMIs, and other hardware - events. - Although a delay of more than about 20 seconds can result - in splats, the RCU implementation is obligated to use - algorithms that can tolerate extremely long delays, but where - “extremely long” is not long enough to allow - wrap-around when incrementing a 64-bit counter. -
    2. Both the compiler and the CPU can reorder memory accesses. - Where it matters, RCU must use compiler directives and - memory-barrier instructions to preserve ordering. -
    3. Conflicting writes to memory locations in any given cache line - will result in expensive cache misses. - Greater numbers of concurrent writes and more-frequent - concurrent writes will result in more dramatic slowdowns. - RCU is therefore obligated to use algorithms that have - sufficient locality to avoid significant performance and - scalability problems. -
    4. As a rough rule of thumb, only one CPU's worth of processing - may be carried out under the protection of any given exclusive - lock. - RCU must therefore use scalable locking designs. -
    5. Counters are finite, especially on 32-bit systems. - RCU's use of counters must therefore tolerate counter wrap, - or be designed such that counter wrap would take way more - time than a single system is likely to run. - An uptime of ten years is quite possible, a runtime - of a century much less so. - As an example of the latter, RCU's dyntick-idle nesting counter - allows 54 bits for interrupt nesting level (this counter - is 64 bits even on a 32-bit system). - Overflowing this counter requires 254 - half-interrupts on a given CPU without that CPU ever going idle. - If a half-interrupt happened every microsecond, it would take - 570 years of runtime to overflow this counter, which is currently - believed to be an acceptably long time. -
    6. Linux systems can have thousands of CPUs running a single - Linux kernel in a single shared-memory environment. - RCU must therefore pay close attention to high-end scalability. -
    - -

    -This last parallelism fact of life means that RCU must pay special -attention to the preceding facts of life. -The idea that Linux might scale to systems with thousands of CPUs would -have been met with some skepticism in the 1990s, but these requirements -would have otherwise have been unsurprising, even in the early 1990s. - -

    Quality-of-Implementation Requirements

    - -

    -These sections list quality-of-implementation requirements. -Although an RCU implementation that ignores these requirements could -still be used, it would likely be subject to limitations that would -make it inappropriate for industrial-strength production use. -Classes of quality-of-implementation requirements are as follows: - -

      -
    1. Specialization -
    2. Performance and Scalability -
    3. Composability -
    4. Corner Cases -
    - -

    -These classes is covered in the following sections. - -

    Specialization

    - -

    -RCU is and always has been intended primarily for read-mostly situations, as -illustrated by the following figure. -This means that RCU's read-side primitives are optimized, often at the -expense of its update-side primitives. - -

    RCUApplicability.svg

    - -

    -This focus on read-mostly situations means that RCU must interoperate -with other synchronization primitives. -For example, the add_gp() and remove_gp_synchronous() -examples discussed earlier use RCU to protect readers and locking to -coordinate updaters. -However, the need extends much farther, requiring that a variety of -synchronization primitives be legal within RCU read-side critical sections, -including spinlocks, sequence locks, atomic operations, reference -counters, and memory barriers. - -

    @@QQ@@ -What about sleeping locks? -

    @@QQA@@ -These are forbidden within Linux-kernel RCU read-side critical sections -because it is not legal to place a quiescent state (in this case, -voluntary context switch) within an RCU read-side critical section. -However, sleeping locks may be used within userspace RCU read-side critical -sections, and also within Linux-kernel sleepable RCU -(SRCU) -read-side critical sections. -In addition, the -rt patchset turns spinlocks into a sleeping locks so -that the corresponding critical sections can be preempted, which -also means that these sleeplockified spinlocks (but not other sleeping locks!) -may be acquire within -rt-Linux-kernel RCU read-side critical sections. - -

    -Note that it is legal for a normal RCU read-side critical section -to conditionally acquire a sleeping locks (as in mutex_trylock()), -but only as long as it does not loop indefinitely attempting to -conditionally acquire that sleeping locks. -The key point is that things like mutex_trylock() -either return with the mutex held, or return an error indication if -the mutex was not immediately available. -Either way, mutex_trylock() returns immediately without sleeping. -

    @@QQE@@ - -

    -It often comes as a surprise that many algorithms do not require a -consistent view of data, but many can function in that mode, -with network routing being the poster child. -Internet routing algorithms take significant time to propagate -updates, so that by the time an update arrives at a given system, -that system has been sending network traffic the wrong way for -a considerable length of time. -Having a few threads continue to send traffic the wrong way for a -few more milliseconds is clearly not a problem: In the worst case, -TCP retransmissions will eventually get the data where it needs to go. -In general, when tracking the state of the universe outside of the -computer, some level of inconsistency must be tolerated due to -speed-of-light delays if nothing else. - -

    -Furthermore, uncertainty about external state is inherent in many cases. -For example, a pair of veternarians might use heartbeat to determine -whether or not a given cat was alive. -But how long should they wait after the last heartbeat to decide that -the cat is in fact dead? -Waiting less than 400 milliseconds makes no sense because this would -mean that a relaxed cat would be considered to cycle between death -and life more than 100 times per minute. -Moreover, just as with human beings, a cat's heart might stop for -some period of time, so the exact wait period is a judgment call. -One of our pair of veternarians might wait 30 seconds before pronouncing -the cat dead, while the other might insist on waiting a full minute. -The two veternarians would then disagree on the state of the cat during -the final 30 seconds of the minute following the last heartbeat, as -fancifully illustrated below: - -

    2013-08-is-it-dead.png

    - -

    -Interestingly enough, this same situation applies to hardware. -When push comes to shove, how do we tell whether or not some -external server has failed? -We send messages to it periodically, and declare it failed if we -don't receive a response within a given period of time. -Policy decisions can usually tolerate short -periods of inconsistency. -The policy was decided some time ago, and is only now being put into -effect, so a few milliseconds of delay is normally inconsequential. - -

    -However, there are algorithms that absolutely must see consistent data. -For example, the translation between a user-level SystemV semaphore -ID to the corresponding in-kernel data structure is protected by RCU, -but it is absolutely forbidden to update a semaphore that has just been -removed. -In the Linux kernel, this need for consistency is accommodated by acquiring -spinlocks located in the in-kernel data structure from within -the RCU read-side critical section, and this is indicated by the -green box in the figure above. -Many other techniques may be used, and are in fact used within the -Linux kernel. - -

    -In short, RCU is not required to maintain consistency, and other -mechanisms may be used in concert with RCU when consistency is required. -RCU's specialization allows it to do its job extremely well, and its -ability to interoperate with other synchronization mechanisms allows -the right mix of synchronization tools to be used for a given job. - -

    Performance and Scalability

    - -

    -Energy efficiency is a critical component of performance today, -and Linux-kernel RCU implementations must therefore avoid unnecessarily -awakening idle CPUs. -I cannot claim that this requirement was premeditated. -In fact, I learned of it during a telephone conversation in which I -was given “frank and open” feedback on the importance -of energy efficiency in battery-powered systems and on specific -energy-efficiency shortcomings of the Linux-kernel RCU implementation. -In my experience, the battery-powered embedded community will consider -any unnecessary wakeups to be extremely unfriendly acts. -So much so that mere Linux-kernel-mailing-list posts are -insufficient to vent their ire. - -

    -Memory consumption is not particularly important for in most -situations, and has become decreasingly -so as memory sizes have expanded and memory -costs have plummeted. -However, as I learned from Matt Mackall's -bloatwatch -efforts, memory footprint is critically important on single-CPU systems with -non-preemptible (CONFIG_PREEMPT=n) kernels, and thus -tiny RCU -was born. -Josh Triplett has since taken over the small-memory banner with his -Linux kernel tinification -project, which resulted in -SRCU -becoming optional for those kernels not needing it. - -

    -The remaining performance requirements are, for the most part, -unsurprising. -For example, in keeping with RCU's read-side specialization, -rcu_dereference() should have negligible overhead (for -example, suppression of a few minor compiler optimizations). -Similarly, in non-preemptible environments, rcu_read_lock() and -rcu_read_unlock() should have exactly zero overhead. - -

    -In preemptible environments, in the case where the RCU read-side -critical section was not preempted (as will be the case for the -highest-priority real-time process), rcu_read_lock() and -rcu_read_unlock() should have minimal overhead. -In particular, they should not contain atomic read-modify-write -operations, memory-barrier instructions, preemption disabling, -interrupt disabling, or backwards branches. -However, in the case where the RCU read-side critical section was preempted, -rcu_read_unlock() may acquire spinlocks and disable interrupts. -This is why it is better to nest an RCU read-side critical section -within a preempt-disable region than vice versa, at least in cases -where that critical section is short enough to avoid unduly degrading -real-time latencies. - -

    -The synchronize_rcu() grace-period-wait primitive is -optimized for throughput. -It may therefore incur several milliseconds of latency in addition to -the duration of the longest RCU read-side critical section. -On the other hand, multiple concurrent invocations of -synchronize_rcu() are required to use batching optimizations -so that they can be satisfied by a single underlying grace-period-wait -operation. -For example, in the Linux kernel, it is not unusual for a single -grace-period-wait operation to serve more than -1,000 separate invocations -of synchronize_rcu(), thus amortizing the per-invocation -overhead down to nearly zero. -However, the grace-period optimization is also required to avoid -measurable degradation of real-time scheduling and interrupt latencies. - -

    -In some cases, the multi-millisecond synchronize_rcu() -latencies are unacceptable. -In these cases, synchronize_rcu_expedited() may be used -instead, reducing the grace-period latency down to a few tens of -microseconds on small systems, at least in cases where the RCU read-side -critical sections are short. -There are currently no special latency requirements for -synchronize_rcu_expedited() on large systems, but, -consistent with the empirical nature of the RCU specification, -that is subject to change. -However, there most definitely are scalability requirements: -A storm of synchronize_rcu_expedited() invocations on 4096 -CPUs should at least make reasonable forward progress. -In return for its shorter latencies, synchronize_rcu_expedited() -is permitted to impose modest degradation of real-time latency -on non-idle online CPUs. -That said, it will likely be necessary to take further steps to reduce this -degradation, hopefully to roughly that of a scheduling-clock interrupt. - -

    -There are a number of situations where even -synchronize_rcu_expedited()'s reduced grace-period -latency is unacceptable. -In these situations, the asynchronous call_rcu() can be -used in place of synchronize_rcu() as follows: - -

    -
    - 1 struct foo {
    - 2   int a;
    - 3   int b;
    - 4   struct rcu_head rh;
    - 5 };
    - 6
    - 7 static void remove_gp_cb(struct rcu_head *rhp)
    - 8 {
    - 9   struct foo *p = container_of(rhp, struct foo, rh);
    -10
    -11   kfree(p);
    -12 }
    -13
    -14 bool remove_gp_asynchronous(void)
    -15 {
    -16   struct foo *p;
    -17
    -18   spin_lock(&gp_lock);
    -19   p = rcu_dereference(gp);
    -20   if (!p) {
    -21     spin_unlock(&gp_lock);
    -22     return false;
    -23   }
    -24   rcu_assign_pointer(gp, NULL);
    -25   call_rcu(&p->rh, remove_gp_cb);
    -26   spin_unlock(&gp_lock);
    -27   return true;
    -28 }
    -
    -
    - -

    -A definition of struct foo is finally needed, and appears -on lines 1-5. -The function remove_gp_cb() is passed to call_rcu() -on line 25, and will be invoked after the end of a subsequent -grace period. -This gets the same effect as remove_gp_synchronous(), -but without forcing the updater to wait for a grace period to elapse. -The call_rcu() function may be used in a number of -situations where neither synchronize_rcu() nor -synchronize_rcu_expedited() would be legal, -including within preempt-disable code, local_bh_disable() code, -interrupt-disable code, and interrupt handlers. -However, even call_rcu() is illegal within NMI handlers. -The callback function (remove_gp_cb() in this case) will be -executed within softirq (software interrupt) environment within the -Linux kernel, -either within a real softirq handler or under the protection -of local_bh_disable(). -In both the Linux kernel and in userspace, it is bad practice to -write an RCU callback function that takes too long. -Long-running operations should be relegated to separate threads or -(in the Linux kernel) workqueues. - -

    @@QQ@@ -Why does line 19 use rcu_access_pointer()? -After all, call_rcu() on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that rcu_dereference() is required? -

    @@QQA@@ -Presumably the ->gp_lock acquired on line 18 excludes -any changes, including any insertions that rcu_dereference() -would protect against. -Therefore, any insertions will be delayed until after ->gp_lock -is released on line 25, which in turn means that -rcu_access_pointer() suffices. -

    @@QQE@@ - -

    -However, all that remove_gp_cb() is doing is -invoking kfree() on the data element. -This is a common idiom, and is supported by kfree_rcu(), -which allows “fire and forget” operation as shown below: - -

    -
    - 1 struct foo {
    - 2   int a;
    - 3   int b;
    - 4   struct rcu_head rh;
    - 5 };
    - 6
    - 7 bool remove_gp_faf(void)
    - 8 {
    - 9   struct foo *p;
    -10
    -11   spin_lock(&gp_lock);
    -12   p = rcu_dereference(gp);
    -13   if (!p) {
    -14     spin_unlock(&gp_lock);
    -15     return false;
    -16   }
    -17   rcu_assign_pointer(gp, NULL);
    -18   kfree_rcu(p, rh);
    -19   spin_unlock(&gp_lock);
    -20   return true;
    -21 }
    -
    -
    - -

    -Note that remove_gp_faf() simply invokes -kfree_rcu() and proceeds, without any need to pay any -further attention to the subsequent grace period and kfree(). -It is permissible to invoke kfree_rcu() from the same -environments as for call_rcu(). -Interestingly enough, DYNIX/ptx had the equivalents of -call_rcu() and kfree_rcu(), but not -synchronize_rcu(). -This was due to the fact that RCU was not heavily used within DYNIX/ptx, -so the very few places that needed something like -synchronize_rcu() simply open-coded it. - -

    @@QQ@@ -Earlier it was claimed that call_rcu() and -kfree_rcu() allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? -

    @@QQA@@ -We could define things this way, but keep in mind that this sort of -definition would say that updates in garbage-collected languages -cannot complete until the next time the garbage collector runs, -which does not seem at all reasonable. -The key point is that in most cases, an updater using either -call_rcu() or kfree_rcu() can proceed to the -next update as soon as it has invoked call_rcu() or -kfree_rcu(), without having to wait for a subsequent -grace period. -

    @@QQE@@ - -

    -But what if the updater must wait for the completion of code to be -executed after the end of the grace period, but has other tasks -that can be carried out in the meantime? -The polling-style get_state_synchronize_rcu() and -cond_synchronize_rcu() functions may be used for this -purpose, as shown below: - -

    -
    - 1 bool remove_gp_poll(void)
    - 2 {
    - 3   struct foo *p;
    - 4   unsigned long s;
    - 5
    - 6   spin_lock(&gp_lock);
    - 7   p = rcu_access_pointer(gp);
    - 8   if (!p) {
    - 9     spin_unlock(&gp_lock);
    -10     return false;
    -11   }
    -12   rcu_assign_pointer(gp, NULL);
    -13   spin_unlock(&gp_lock);
    -14   s = get_state_synchronize_rcu();
    -15   do_something_while_waiting();
    -16   cond_synchronize_rcu(s);
    -17   kfree(p);
    -18   return true;
    -19 }
    -
    -
    - -

    -On line 14, get_state_synchronize_rcu() obtains a -“cookie” from RCU, -then line 15 carries out other tasks, -and finally, line 16 returns immediately if a grace period has -elapsed in the meantime, but otherwise waits as required. -The need for get_state_synchronize_rcu and -cond_synchronize_rcu() has appeared quite recently, -so it is too early to tell whether they will stand the test of time. - -

    -RCU thus provides a range of tools to allow updaters to strike the -required tradeoff between latency, flexibility and CPU overhead. - -

    Composability

    - -

    -Composability has received much attention in recent years, perhaps in part -due to the collision of multicore hardware with object-oriented techniques -designed in single-threaded environments for single-threaded use. -And in theory, RCU read-side critical sections may be composed, and in -fact may be nested arbitrarily deeply. -In practice, as with all real-world implementations of composable -constructs, there are limitations. - -

    -Implementations of RCU for which rcu_read_lock() -and rcu_read_unlock() generate no code, such as -Linux-kernel RCU when CONFIG_PREEMPT=n, can be -nested arbitrarily deeply. -After all, there is no overhead. -Except that if all these instances of rcu_read_lock() -and rcu_read_unlock() are visible to the compiler, -compilation will eventually fail due to exhausting memory, -mass storage, or user patience, whichever comes first. -If the nesting is not visible to the compiler, as is the case with -mutually recursive functions each in its own translation unit, -stack overflow will result. -If the nesting takes the form of loops, either the control variable -will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. -Nevertheless, this class of RCU implementations is one -of the most composable constructs in existence. - -

    -RCU implementations that explicitly track nesting depth -are limited by the nesting-depth counter. -For example, the Linux kernel's preemptible RCU limits nesting to -INT_MAX. -This should suffice for almost all practical purposes. -That said, a consecutive pair of RCU read-side critical sections -between which there is an operation that waits for a grace period -cannot be enclosed in another RCU read-side critical section. -This is because it is not legal to wait for a grace period within -an RCU read-side critical section: To do so would result either -in deadlock or -in RCU implicitly splitting the enclosing RCU read-side critical -section, neither of which is conducive to a long-lived and prosperous -kernel. - -

    -It is worth noting that RCU is not alone in limiting composability. -For example, many transactional-memory implementations prohibit -composing a pair of transactions separated by an irrevocable -operation (for example, a network receive operation). -For another example, lock-based critical sections can be composed -surprisingly freely, but only if deadlock is avoided. - -

    -In short, although RCU read-side critical sections are highly composable, -care is required in some situations, just as is the case for any other -composable synchronization mechanism. - -

    Corner Cases

    - -

    -A given RCU workload might have an endless and intense stream of -RCU read-side critical sections, perhaps even so intense that there -was never a point in time during which there was not at least one -RCU read-side critical section in flight. -RCU cannot allow this situation to block grace periods: As long as -all the RCU read-side critical sections are finite, grace periods -must also be finite. - -

    -That said, preemptible RCU implementations could potentially result -in RCU read-side critical sections being preempted for long durations, -which has the effect of creating a long-duration RCU read-side -critical section. -This situation can arise only in heavily loaded systems, but systems using -real-time priorities are of course more vulnerable. -Therefore, RCU priority boosting is provided to help deal with this -case. -That said, the exact requirements on RCU priority boosting will likely -evolve as more experience accumulates. - -

    -Other workloads might have very high update rates. -Although one can argue that such workloads should instead use -something other than RCU, the fact remains that RCU must -handle such workloads gracefully. -This requirement is another factor driving batching of grace periods, -but it is also the driving force behind the checks for large numbers -of queued RCU callbacks in the call_rcu() code path. -Finally, high update rates should not delay RCU read-side critical -sections, although some read-side delays can occur when using -synchronize_rcu_expedited(), courtesy of this function's use -of try_stop_cpus(). -(In the future, synchronize_rcu_expedited() will be -converted to use lighter-weight inter-processor interrupts (IPIs), -but this will still disturb readers, though to a much smaller degree.) - -

    -Although all three of these corner cases were understood in the early -1990s, a simple user-level test consisting of close(open(path)) -in a tight loop -in the early 2000s suddenly provided a much deeper appreciation of the -high-update-rate corner case. -This test also motivated addition of some RCU code to react to high update -rates, for example, if a given CPU finds itself with more than 10,000 -RCU callbacks queued, it will cause RCU to take evasive action by -more aggressively starting grace periods and more aggressively forcing -completion of grace-period processing. -This evasive action causes the grace period to complete more quickly, -but at the cost of restricting RCU's batching optimizations, thus -increasing the CPU overhead incurred by that grace period. - -

    -Software-Engineering Requirements

    - -

    -Between Murphy's Law and “To err is human”, it is necessary to -guard against mishaps and misuse: - -

      -
    1. It is all too easy to forget to use rcu_read_lock() - everywhere that it is needed, so kernels built with - CONFIG_PROVE_RCU=y will spat if - rcu_dereference() is used outside of an - RCU read-side critical section. - Update-side code can use rcu_dereference_protected(), - which takes a - lockdep expression - to indicate what is providing the protection. - If the indicated protection is not provided, a lockdep splat - is emitted. - -

      - Code shared between readers and updaters can use - rcu_dereference_check(), which also takes a - lockdep expression, and emits a lockdep splat if neither - rcu_read_lock() nor the indicated protection - is in place. - In addition, rcu_dereference_raw() is used in those - (hopefully rare) cases where the required protection cannot - be easily described. - Finally, rcu_read_lock_held() is provided to - allow a function to verify that it has been invoked within - an RCU read-side critical section. - I was made aware of this set of requirements shortly after Thomas - Gleixner audited a number of RCU uses. -

    2. A given function might wish to check for RCU-related preconditions - upon entry, before using any other RCU API. - The rcu_lockdep_assert() does this job, - asserting the expression in kernels having lockdep enabled - and doing nothing otherwise. -
    3. It is also easy to forget to use rcu_assign_pointer() - and rcu_dereference(), perhaps (incorrectly) - substituting a simple assignment. - To catch this sort of error, a given RCU-protected pointer may be - tagged with __rcu, after which running sparse - with CONFIG_SPARSE_RCU_POINTER=y will complain - about simple-assignment accesses to that pointer. - Arnd Bergmann made me aware of this requirement, and also - supplied the needed - patch series. -
    4. Kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y - will splat if a data element is passed to call_rcu() - twice in a row, without a grace period in between. - (This error is similar to a double free.) - The corresponding rcu_head structures that are - dynamically allocated are automatically tracked, but - rcu_head structures allocated on the stack - must be initialized with init_rcu_head_on_stack() - and cleaned up with destroy_rcu_head_on_stack(). - Similarly, statically allocated non-stack rcu_head - structures must be initialized with init_rcu_head() - and cleaned up with destroy_rcu_head(). - Mathieu Desnoyers made me aware of this requirement, and also - supplied the needed - patch. -
    5. An infinite loop in an RCU read-side critical section will - eventually trigger an RCU CPU stall warning splat, with - the duration of “eventually” being controlled by the - RCU_CPU_STALL_TIMEOUT Kconfig option, or, - alternatively, by the - rcupdate.rcu_cpu_stall_timeout boot/sysfs - parameter. - However, RCU is not obligated to produce this splat - unless there is a grace period waiting on that particular - RCU read-side critical section. -

      - Some extreme workloads might intentionally delay - RCU grace periods, and systems running those workloads can - be booted with rcupdate.rcu_cpu_stall_suppress - to suppress the splats. - This kernel parameter may also be set via sysfs. - Furthermore, RCU CPU stall warnings are counter-productive - during sysrq dumps and during panics. - RCU therefore supplies the rcu_sysrq_start() and - rcu_sysrq_end() API members to be called before - and after long sysrq dumps. - RCU also supplies the rcu_panic() notifier that is - automatically invoked at the beginning of a panic to suppress - further RCU CPU stall warnings. - -

      - This requirement made itself known in the early 1990s, pretty - much the first time that it was necessary to debug a CPU stall. - That said, the initial implementation in DYNIX/ptx was quite - generic in comparison with that of Linux. -

    6. Although it would be very good to detect pointers leaking out - of RCU read-side critical sections, there is currently no - good way of doing this. - One complication is the need to distinguish between pointers - leaking and pointers that have been handed off from RCU to - some other synchronization mechanism, for example, reference - counting. -
    7. In kernels built with CONFIG_RCU_TRACE=y, RCU-related - information is provided via both debugfs and event tracing. -
    8. Open-coded use of rcu_assign_pointer() and - rcu_dereference() to create typical linked - data structures can be surprisingly error-prone. - Therefore, RCU-protected - linked lists - and, more recently, RCU-protected - hash tables - are available. - Many other special-purpose RCU-protected data structures are - available in the Linux kernel and the userspace RCU library. -
    9. Some linked structures are created at compile time, but still - require __rcu checking. - The RCU_POINTER_INITIALIZER() macro serves this - purpose. -
    10. It is not necessary to use rcu_assign_pointer() - when creating linked structures that are to be published via - a single external pointer. - The RCU_INIT_POINTER() macro is provided for - this task and also for assigning NULL pointers - at runtime. -
    - -

    -This not a hard-and-fast list: RCU's diagnostic capabilities will -continue to be guided by the number and type of usage bugs found -in real-world RCU usage. - -

    Linux Kernel Complications

    - -

    -The Linux kernel provides an interesting environment for all kinds of -software, including RCU. -Some of the relevant points of interest are as follows: - -

      -
    1. Configuration. -
    2. Firmware Interface. -
    3. Early Boot. -
    4. - Interrupts and non-maskable interrupts (NMIs). -
    5. Loadable Modules. -
    6. Hotplug CPU. -
    7. Scheduler and RCU. -
    8. Tracing and RCU. -
    9. Energy Efficiency. -
    10. Memory Efficiency. -
    11. - Performance, Scalability, Response Time, and Reliability. -
    - -

    -This list is probably incomplete, but it does give a feel for the -most notable Linux-kernel complications. -Each of the following sections covers one of the above topics. - -

    Configuration

    - -

    -RCU's goal is automatic configuration, so that almost nobody -needs to worry about RCU's Kconfig options. -And for almost all users, RCU does in fact work well -“out of the box.” - -

    -However, there are specialized use cases that are handled by -kernel boot parameters and Kconfig options. -Unfortunately, the Kconfig system will explicitly ask users -about new Kconfig options, which requires almost all of them -be hidden behind a CONFIG_RCU_EXPERT Kconfig option. - -

    -This all should be quite obvious, but the fact remains that -Linus Torvalds recently had to -remind -me of this requirement. - -

    Firmware Interface

    - -

    -In many cases, kernel obtains information about the system from the -firmware, and sometimes things are lost in translation. -Or the translation is accurate, but the original message is bogus. - -

    -For example, some systems' firmware overreports the number of CPUs, -sometimes by a large factor. -If RCU naively believed the firmware, as it used to do, -it would create too many per-CPU kthreads. -Although the resulting system will still run correctly, the extra -kthreads needlessly consume memory and can cause confusion -when they show up in ps listings. - -

    -RCU must therefore wait for a given CPU to actually come online before -it can allow itself to believe that the CPU actually exists. -The resulting “ghost CPUs” (which are never going to -come online) cause a number of -interesting complications. - -

    Early Boot

    - -

    -The Linux kernel's boot sequence is an interesting process, -and RCU is used early, even before rcu_init() -is invoked. -In fact, a number of RCU's primitives can be used as soon as the -initial task's task_struct is available and the -boot CPU's per-CPU variables are set up. -The read-side primitives (rcu_read_lock(), -rcu_read_unlock(), rcu_dereference(), -and rcu_access_pointer()) will operate normally very early on, -as will rcu_assign_pointer(). - -

    -Although call_rcu() may be invoked at any -time during boot, callbacks are not guaranteed to be invoked until after -the scheduler is fully up and running. -This delay in callback invocation is due to the fact that RCU does not -invoke callbacks until it is fully initialized, and this full initialization -cannot occur until after the scheduler has initialized itself to the -point where RCU can spawn and run its kthreads. -In theory, it would be possible to invoke callbacks earlier, -however, this is not a panacea because there would be severe restrictions -on what operations those callbacks could invoke. - -

    -Perhaps surprisingly, synchronize_rcu(), -synchronize_rcu_bh() -(discussed below), -and -synchronize_sched() -will all operate normally -during very early boot, the reason being that there is only one CPU -and preemption is disabled. -This means that the call synchronize_rcu() (or friends) -itself is a quiescent -state and thus a grace period, so the early-boot implementation can -be a no-op. - -

    -Both synchronize_rcu_bh() and synchronize_sched() -continue to operate normally through the remainder of boot, courtesy -of the fact that preemption is disabled across their RCU read-side -critical sections and also courtesy of the fact that there is still -only one CPU. -However, once the scheduler starts initializing, preemption is enabled. -There is still only a single CPU, but the fact that preemption is enabled -means that the no-op implementation of synchronize_rcu() no -longer works in CONFIG_PREEMPT=y kernels. -Therefore, as soon as the scheduler starts initializing, the early-boot -fastpath is disabled. -This means that synchronize_rcu() switches to its runtime -mode of operation where it posts callbacks, which in turn means that -any call to synchronize_rcu() will block until the corresponding -callback is invoked. -Unfortunately, the callback cannot be invoked until RCU's runtime -grace-period machinery is up and running, which cannot happen until -the scheduler has initialized itself sufficiently to allow RCU's -kthreads to be spawned. -Therefore, invoking synchronize_rcu() during scheduler -initialization can result in deadlock. - -

    @@QQ@@ -So what happens with synchronize_rcu() during -scheduler initialization for CONFIG_PREEMPT=n -kernels? -

    @@QQA@@ -In CONFIG_PREEMPT=n kernel, synchronize_rcu() -maps directly to synchronize_sched(). -Therefore, synchronize_rcu() works normally throughout -boot in CONFIG_PREEMPT=n kernels. -However, your code must also work in CONFIG_PREEMPT=y kernels, -so it is still necessary to avoid invoking synchronize_rcu() -during scheduler initialization. -

    @@QQE@@ - -

    -I learned of these boot-time requirements as a result of a series of -system hangs. - -

    Interrupts and NMIs

    - -

    -The Linux kernel has interrupts, and RCU read-side critical sections are -legal within interrupt handlers and within interrupt-disabled regions -of code, as are invocations of call_rcu(). - -

    -Some Linux-kernel architectures can enter an interrupt handler from -non-idle process context, and then just never leave it, instead stealthily -transitioning back to process context. -This trick is sometimes used to invoke system calls from inside the kernel. -These “half-interrupts” mean that RCU has to be very careful -about how it counts interrupt nesting levels. -I learned of this requirement the hard way during a rewrite -of RCU's dyntick-idle code. - -

    -The Linux kernel has non-maskable interrupts (NMIs), and -RCU read-side critical sections are legal within NMI handlers. -Thankfully, RCU update-side primitives, including -call_rcu(), are prohibited within NMI handlers. - -

    -The name notwithstanding, some Linux-kernel architectures -can have nested NMIs, which RCU must handle correctly. -Andy Lutomirski -surprised me -with this requirement; -he also kindly surprised me with -an algorithm -that meets this requirement. - -

    Loadable Modules

    - -

    -The Linux kernel has loadable modules, and these modules can -also be unloaded. -After a given module has been unloaded, any attempt to call -one of its functions results in a segmentation fault. -The module-unload functions must therefore cancel any -delayed calls to loadable-module functions, for example, -any outstanding mod_timer() must be dealt with -via del_timer_sync() or similar. - -

    -Unfortunately, there is no way to cancel an RCU callback; -once you invoke call_rcu(), the callback function is -going to eventually be invoked, unless the system goes down first. -Because it is normally considered socially irresponsible to crash the system -in response to a module unload request, we need some other way -to deal with in-flight RCU callbacks. - -

    -RCU therefore provides -rcu_barrier(), -which waits until all in-flight RCU callbacks have been invoked. -If a module uses call_rcu(), its exit function should therefore -prevent any future invocation of call_rcu(), then invoke -rcu_barrier(). -In theory, the underlying module-unload code could invoke -rcu_barrier() unconditionally, but in practice this would -incur unacceptable latencies. - -

    -Nikita Danilov noted this requirement for an analogous filesystem-unmount -situation, and Dipankar Sarma incorporated rcu_barrier() into RCU. -The need for rcu_barrier() for module unloading became -apparent later. - -

    Hotplug CPU

    - -

    -The Linux kernel supports CPU hotplug, which means that CPUs -can come and go. -It is of course illegal to use any RCU API member from an offline CPU. -This requirement was present from day one in DYNIX/ptx, but -on the other hand, the Linux kernel's CPU-hotplug implementation -is “interesting.” - -

    -The Linux-kernel CPU-hotplug implementation has notifiers that -are used to allow the various kernel subsystems (including RCU) -to respond appropriately to a given CPU-hotplug operation. -Most RCU operations may be invoked from CPU-hotplug notifiers, -including even normal synchronous grace-period operations -such as synchronize_rcu(). -However, expedited grace-period operations such as -synchronize_rcu_expedited() are not supported, -due to the fact that current implementations block CPU-hotplug -operations, which could result in deadlock. - -

    -In addition, all-callback-wait operations such as -rcu_barrier() are also not supported, due to the -fact that there are phases of CPU-hotplug operations where -the outgoing CPU's callbacks will not be invoked until after -the CPU-hotplug operation ends, which could also result in deadlock. - -

    Scheduler and RCU

    - -

    -RCU depends on the scheduler, and the scheduler uses RCU to -protect some of its data structures. -This means the scheduler is forbidden from acquiring -the runqueue locks and the priority-inheritance locks -in the middle of an outermost RCU read-side critical section unless either -(1) it releases them before exiting that same -RCU read-side critical section, or -(2) interrupts are disabled across -that entire RCU read-side critical section. -This same prohibition also applies (recursively!) to any lock that is acquired -while holding any lock to which this prohibition applies. -Adhering to this rule prevents preemptible RCU from invoking -rcu_read_unlock_special() while either runqueue or -priority-inheritance locks are held, thus avoiding deadlock. - -

    -Prior to v4.4, it was only necessary to disable preemption across -RCU read-side critical sections that acquired scheduler locks. -In v4.4, expedited grace periods started using IPIs, and these -IPIs could force a rcu_read_unlock() to take the slowpath. -Therefore, this expedited-grace-period change required disabling of -interrupts, not just preemption. - -

    -For RCU's part, the preemptible-RCU rcu_read_unlock() -implementation must be written carefully to avoid similar deadlocks. -In particular, rcu_read_unlock() must tolerate an -interrupt where the interrupt handler invokes both -rcu_read_lock() and rcu_read_unlock(). -This possibility requires rcu_read_unlock() to use -negative nesting levels to avoid destructive recursion via -interrupt handler's use of RCU. - -

    -This pair of mutual scheduler-RCU requirements came as a -complete surprise. - -

    -As noted above, RCU makes use of kthreads, and it is necessary to -avoid excessive CPU-time accumulation by these kthreads. -This requirement was no surprise, but RCU's violation of it -when running context-switch-heavy workloads when built with -CONFIG_NO_HZ_FULL=y -did come as a surprise [PDF]. -RCU has made good progress towards meeting this requirement, even -for context-switch-have CONFIG_NO_HZ_FULL=y workloads, -but there is room for further improvement. - -

    Tracing and RCU

    - -

    -It is possible to use tracing on RCU code, but tracing itself -uses RCU. -For this reason, rcu_dereference_raw_notrace() -is provided for use by tracing, which avoids the destructive -recursion that could otherwise ensue. -This API is also used by virtualization in some architectures, -where RCU readers execute in environments in which tracing -cannot be used. -The tracing folks both located the requirement and provided the -needed fix, so this surprise requirement was relatively painless. - -

    Energy Efficiency

    - -

    -Interrupting idle CPUs is considered socially unacceptable, -especially by people with battery-powered embedded systems. -RCU therefore conserves energy by detecting which CPUs are -idle, including tracking CPUs that have been interrupted from idle. -This is a large part of the energy-efficiency requirement, -so I learned of this via an irate phone call. - -

    -Because RCU avoids interrupting idle CPUs, it is illegal to -execute an RCU read-side critical section on an idle CPU. -(Kernels built with CONFIG_PROVE_RCU=y will splat -if you try it.) -The RCU_NONIDLE() macro and _rcuidle -event tracing is provided to work around this restriction. -In addition, rcu_is_watching() may be used to -test whether or not it is currently legal to run RCU read-side -critical sections on this CPU. -I learned of the need for diagnostics on the one hand -and RCU_NONIDLE() on the other while inspecting -idle-loop code. -Steven Rostedt supplied _rcuidle event tracing, -which is used quite heavily in the idle loop. - -

    -It is similarly socially unacceptable to interrupt an -nohz_full CPU running in userspace. -RCU must therefore track nohz_full userspace -execution. -And in -CONFIG_NO_HZ_FULL_SYSIDLE=y -kernels, RCU must separately track idle CPUs on the one hand and -CPUs that are either idle or executing in userspace on the other. -In both cases, RCU must be able to sample state at two points in -time, and be able to determine whether or not some other CPU spent -any time idle and/or executing in userspace. - -

    -These energy-efficiency requirements have proven quite difficult to -understand and to meet, for example, there have been more than five -clean-sheet rewrites of RCU's energy-efficiency code, the last of -which was finally able to demonstrate -real energy savings running on real hardware [PDF]. -As noted earlier, -I learned of many of these requirements via angry phone calls: -Flaming me on the Linux-kernel mailing list was apparently not -sufficient to fully vent their ire at RCU's energy-efficiency bugs! - -

    Memory Efficiency

    - -

    -Although small-memory non-realtime systems can simply use Tiny RCU, -code size is only one aspect of memory efficiency. -Another aspect is the size of the rcu_head structure -used by call_rcu() and kfree_rcu(). -Although this structure contains nothing more than a pair of pointers, -it does appear in many RCU-protected data structures, including -some that are size critical. -The page structure is a case in point, as evidenced by -the many occurrences of the union keyword within that structure. - -

    -This need for memory efficiency is one reason that RCU uses hand-crafted -singly linked lists to track the rcu_head structures that -are waiting for a grace period to elapse. -It is also the reason why rcu_head structures do not contain -debug information, such as fields tracking the file and line of the -call_rcu() or kfree_rcu() that posted them. -Although this information might appear in debug-only kernel builds at some -point, in the meantime, the ->func field will often provide -the needed debug information. - -

    -However, in some cases, the need for memory efficiency leads to even -more extreme measures. -Returning to the page structure, the rcu_head field -shares storage with a great many other structures that are used at -various points in the corresponding page's lifetime. -In order to correctly resolve certain -race conditions, -the Linux kernel's memory-management subsystem needs a particular bit -to remain zero during all phases of grace-period processing, -and that bit happens to map to the bottom bit of the -rcu_head structure's ->next field. -RCU makes this guarantee as long as call_rcu() -is used to post the callback, as opposed to kfree_rcu() -or some future “lazy” -variant of call_rcu() that might one day be created for -energy-efficiency purposes. - -

    -Performance, Scalability, Response Time, and Reliability

    - -

    -Expanding on the -earlier discussion, -RCU is used heavily by hot code paths in performance-critical -portions of the Linux kernel's networking, security, virtualization, -and scheduling code paths. -RCU must therefore use efficient implementations, especially in its -read-side primitives. -To that end, it would be good if preemptible RCU's implementation -of rcu_read_lock() could be inlined, however, doing -this requires resolving #include issues with the -task_struct structure. - -

    -The Linux kernel supports hardware configurations with up to -4096 CPUs, which means that RCU must be extremely scalable. -Algorithms that involve frequent acquisitions of global locks or -frequent atomic operations on global variables simply cannot be -tolerated within the RCU implementation. -RCU therefore makes heavy use of a combining tree based on the -rcu_node structure. -RCU is required to tolerate all CPUs continuously invoking any -combination of RCU's runtime primitives with minimal per-operation -overhead. -In fact, in many cases, increasing load must decrease the -per-operation overhead, witness the batching optimizations for -synchronize_rcu(), call_rcu(), -synchronize_rcu_expedited(), and rcu_barrier(). -As a general rule, RCU must cheerfully accept whatever the -rest of the Linux kernel decides to throw at it. - -

    -The Linux kernel is used for real-time workloads, especially -in conjunction with the --rt patchset. -The real-time-latency response requirements are such that the -traditional approach of disabling preemption across RCU -read-side critical sections is inappropriate. -Kernels built with CONFIG_PREEMPT=y therefore -use an RCU implementation that allows RCU read-side critical -sections to be preempted. -This requirement made its presence known after users made it -clear that an earlier -real-time patch -did not meet their needs, in conjunction with some -RCU issues -encountered by a very early version of the -rt patchset. - -

    -In addition, RCU must make do with a sub-100-microsecond real-time latency -budget. -In fact, on smaller systems with the -rt patchset, the Linux kernel -provides sub-20-microsecond real-time latencies for the whole kernel, -including RCU. -RCU's scalability and latency must therefore be sufficient for -these sorts of configurations. -To my surprise, the sub-100-microsecond real-time latency budget - -applies to even the largest systems [PDF], -up to and including systems with 4096 CPUs. -This real-time requirement motivated the grace-period kthread, which -also simplified handling of a number of race conditions. - -

    -Finally, RCU's status as a synchronization primitive means that -any RCU failure can result in arbitrary memory corruption that can be -extremely difficult to debug. -This means that RCU must be extremely reliable, which in -practice also means that RCU must have an aggressive stress-test -suite. -This stress-test suite is called rcutorture. - -

    -Although the need for rcutorture was no surprise, -the current immense popularity of the Linux kernel is posing -interesting—and perhaps unprecedented—validation -challenges. -To see this, keep in mind that there are well over one billion -instances of the Linux kernel running today, given Android -smartphones, Linux-powered televisions, and servers. -This number can be expected to increase sharply with the advent of -the celebrated Internet of Things. - -

    -Suppose that RCU contains a race condition that manifests on average -once per million years of runtime. -This bug will be occurring about three times per day across -the installed base. -RCU could simply hide behind hardware error rates, given that no one -should really expect their smartphone to last for a million years. -However, anyone taking too much comfort from this thought should -consider the fact that in most jurisdictions, a successful multi-year -test of a given mechanism, which might include a Linux kernel, -suffices for a number of types of safety-critical certifications. -In fact, rumor has it that the Linux kernel is already being used -in production for safety-critical applications. -I don't know about you, but I would feel quite bad if a bug in RCU -killed someone. -Which might explain my recent focus on validation and verification. - -

    Other RCU Flavors

    - -

    -One of the more surprising things about RCU is that there are now -no fewer than five flavors, or API families. -In addition, the primary flavor that has been the sole focus up to -this point has two different implementations, non-preemptible and -preemptible. -The other four flavors are listed below, with requirements for each -described in a separate section. - -

      -
    1. Bottom-Half Flavor -
    2. Sched Flavor -
    3. Sleepable RCU -
    4. Tasks RCU -
    - -

    Bottom-Half Flavor

    - -

    -The softirq-disable (AKA “bottom-half”, -hence the “_bh” abbreviations) -flavor of RCU, or RCU-bh, was developed by -Dipankar Sarma to provide a flavor of RCU that could withstand the -network-based denial-of-service attacks researched by Robert -Olsson. -These attacks placed so much networking load on the system -that some of the CPUs never exited softirq execution, -which in turn prevented those CPUs from ever executing a context switch, -which, in the RCU implementation of that time, prevented grace periods -from ever ending. -The result was an out-of-memory condition and a system hang. - -

    -The solution was the creation of RCU-bh, which does -local_bh_disable() -across its read-side critical sections, and which uses the transition -from one type of softirq processing to another as a quiescent state -in addition to context switch, idle, user mode, and offline. -This means that RCU-bh grace periods can complete even when some of -the CPUs execute in softirq indefinitely, thus allowing algorithms -based on RCU-bh to withstand network-based denial-of-service attacks. - -

    -Because -rcu_read_lock_bh() and rcu_read_unlock_bh() -disable and re-enable softirq handlers, any attempt to start a softirq -handlers during the -RCU-bh read-side critical section will be deferred. -In this case, rcu_read_unlock_bh() -will invoke softirq processing, which can take considerable time. -One can of course argue that this softirq overhead should be associated -with the code following the RCU-bh read-side critical section rather -than rcu_read_unlock_bh(), but the fact -is that most profiling tools cannot be expected to make this sort -of fine distinction. -For example, suppose that a three-millisecond-long RCU-bh read-side -critical section executes during a time of heavy networking load. -There will very likely be an attempt to invoke at least one softirq -handler during that three milliseconds, but any such invocation will -be delayed until the time of the rcu_read_unlock_bh(). -This can of course make it appear at first glance as if -rcu_read_unlock_bh() was executing very slowly. - -

    -The -RCU-bh API -includes -rcu_read_lock_bh(), -rcu_read_unlock_bh(), -rcu_dereference_bh(), -rcu_dereference_bh_check(), -synchronize_rcu_bh(), -synchronize_rcu_bh_expedited(), -call_rcu_bh(), -rcu_barrier_bh(), and -rcu_read_lock_bh_held(). - -

    Sched Flavor

    - -

    -Before preemptible RCU, waiting for an RCU grace period had the -side effect of also waiting for all pre-existing interrupt -and NMI handlers. -However, there are legitimate preemptible-RCU implementations that -do not have this property, given that any point in the code outside -of an RCU read-side critical section can be a quiescent state. -Therefore, RCU-sched was created, which follows “classic” -RCU in that an RCU-sched grace period waits for for pre-existing -interrupt and NMI handlers. -In kernels built with CONFIG_PREEMPT=n, the RCU and RCU-sched -APIs have identical implementations, while kernels built with -CONFIG_PREEMPT=y provide a separate implementation for each. - -

    -Note well that in CONFIG_PREEMPT=y kernels, -rcu_read_lock_sched() and rcu_read_unlock_sched() -disable and re-enable preemption, respectively. -This means that if there was a preemption attempt during the -RCU-sched read-side critical section, rcu_read_unlock_sched() -will enter the scheduler, with all the latency and overhead entailed. -Just as with rcu_read_unlock_bh(), this can make it look -as if rcu_read_unlock_sched() was executing very slowly. -However, the highest-priority task won't be preempted, so that task -will enjoy low-overhead rcu_read_unlock_sched() invocations. - -

    -The -RCU-sched API -includes -rcu_read_lock_sched(), -rcu_read_unlock_sched(), -rcu_read_lock_sched_notrace(), -rcu_read_unlock_sched_notrace(), -rcu_dereference_sched(), -rcu_dereference_sched_check(), -synchronize_sched(), -synchronize_rcu_sched_expedited(), -call_rcu_sched(), -rcu_barrier_sched(), and -rcu_read_lock_sched_held(). -However, anything that disables preemption also marks an RCU-sched -read-side critical section, including -preempt_disable() and preempt_enable(), -local_irq_save() and local_irq_restore(), -and so on. - -

    Sleepable RCU

    - -

    -For well over a decade, someone saying “I need to block within -an RCU read-side critical section” was a reliable indication -that this someone did not understand RCU. -After all, if you are always blocking in an RCU read-side critical -section, you can probably afford to use a higher-overhead synchronization -mechanism. -However, that changed with the advent of the Linux kernel's notifiers, -whose RCU read-side critical -sections almost never sleep, but sometimes need to. -This resulted in the introduction of -sleepable RCU, -or SRCU. - -

    -SRCU allows different domains to be defined, with each such domain -defined by an instance of an srcu_struct structure. -A pointer to this structure must be passed in to each SRCU function, -for example, synchronize_srcu(&ss), where -ss is the srcu_struct structure. -The key benefit of these domains is that a slow SRCU reader in one -domain does not delay an SRCU grace period in some other domain. -That said, one consequence of these domains is that read-side code -must pass a “cookie” from srcu_read_lock() -to srcu_read_unlock(), for example, as follows: - -

    -
    - 1 int idx;
    - 2
    - 3 idx = srcu_read_lock(&ss);
    - 4 do_something();
    - 5 srcu_read_unlock(&ss, idx);
    -
    -
    - -

    -As noted above, it is legal to block within SRCU read-side critical sections, -however, with great power comes great responsibility. -If you block forever in one of a given domain's SRCU read-side critical -sections, then that domain's grace periods will also be blocked forever. -Of course, one good way to block forever is to deadlock, which can -happen if any operation in a given domain's SRCU read-side critical -section can block waiting, either directly or indirectly, for that domain's -grace period to elapse. -For example, this results in a self-deadlock: - -

    -
    - 1 int idx;
    - 2
    - 3 idx = srcu_read_lock(&ss);
    - 4 do_something();
    - 5 synchronize_srcu(&ss);
    - 6 srcu_read_unlock(&ss, idx);
    -
    -
    - -

    -However, if line 5 acquired a mutex that was held across -a synchronize_srcu() for domain ss, -deadlock would still be possible. -Furthermore, if line 5 acquired a mutex that was held across -a synchronize_srcu() for some other domain ss1, -and if an ss1-domain SRCU read-side critical section -acquired another mutex that was held across as ss-domain -synchronize_srcu(), -deadlock would again be possible. -Such a deadlock cycle could extend across an arbitrarily large number -of different SRCU domains. -Again, with great power comes great responsibility. - -

    -Unlike the other RCU flavors, SRCU read-side critical sections can -run on idle and even offline CPUs. -This ability requires that srcu_read_lock() and -srcu_read_unlock() contain memory barriers, which means -that SRCU readers will run a bit slower than would RCU readers. -It also motivates the smp_mb__after_srcu_read_unlock() -API, which, in combination with srcu_read_unlock(), -guarantees a full memory barrier. - -

    -The -SRCU API -includes -srcu_read_lock(), -srcu_read_unlock(), -srcu_dereference(), -srcu_dereference_check(), -synchronize_srcu(), -synchronize_srcu_expedited(), -call_srcu(), -srcu_barrier(), and -srcu_read_lock_held(). -It also includes -DEFINE_SRCU(), -DEFINE_STATIC_SRCU(), and -init_srcu_struct() -APIs for defining and initializing srcu_struct structures. - -

    Tasks RCU

    - -

    -Some forms of tracing use “tramopolines” to handle the -binary rewriting required to install different types of probes. -It would be good to be able to free old trampolines, which sounds -like a job for some form of RCU. -However, because it is necessary to be able to install a trace -anywhere in the code, it is not possible to use read-side markers -such as rcu_read_lock() and rcu_read_unlock(). -In addition, it does not work to have these markers in the trampoline -itself, because there would need to be instructions following -rcu_read_unlock(). -Although synchronize_rcu() would guarantee that execution -reached the rcu_read_unlock(), it would not be able to -guarantee that execution had completely left the trampoline. - -

    -The solution, in the form of -Tasks RCU, -is to have implicit -read-side critical sections that are delimited by voluntary context -switches, that is, calls to schedule(), -cond_resched_rcu_qs(), and -synchronize_rcu_tasks(). -In addition, transitions to and from userspace execution also delimit -tasks-RCU read-side critical sections. - -

    -The tasks-RCU API is quite compact, consisting only of -call_rcu_tasks(), -synchronize_rcu_tasks(), and -rcu_barrier_tasks(). - -

    Possible Future Changes

    - -

    -One of the tricks that RCU uses to attain update-side scalability is -to increase grace-period latency with increasing numbers of CPUs. -If this becomes a serious problem, it will be necessary to rework the -grace-period state machine so as to avoid the need for the additional -latency. - -

    -Expedited grace periods scan the CPUs, so their latency and overhead -increases with increasing numbers of CPUs. -If this becomes a serious problem on large systems, it will be necessary -to do some redesign to avoid this scalability problem. - -

    -RCU disables CPU hotplug in a few places, perhaps most notably in the -expedited grace-period and rcu_barrier() operations. -If there is a strong reason to use expedited grace periods in CPU-hotplug -notifiers, it will be necessary to avoid disabling CPU hotplug. -This would introduce some complexity, so there had better be a very -good reason. - -

    -The tradeoff between grace-period latency on the one hand and interruptions -of other CPUs on the other hand may need to be re-examined. -The desire is of course for zero grace-period latency as well as zero -interprocessor interrupts undertaken during an expedited grace period -operation. -While this ideal is unlikely to be achievable, it is quite possible that -further improvements can be made. - -

    -The multiprocessor implementations of RCU use a combining tree that -groups CPUs so as to reduce lock contention and increase cache locality. -However, this combining tree does not spread its memory across NUMA -nodes nor does it align the CPU groups with hardware features such -as sockets or cores. -Such spreading and alignment is currently believed to be unnecessary -because the hotpath read-side primitives do not access the combining -tree, nor does call_rcu() in the common case. -If you believe that your architecture needs such spreading and alignment, -then your architecture should also benefit from the -rcutree.rcu_fanout_leaf boot parameter, which can be set -to the number of CPUs in a socket, NUMA node, or whatever. -If the number of CPUs is too large, use a fraction of the number of -CPUs. -If the number of CPUs is a large prime number, well, that certainly -is an “interesting” architectural choice! -More flexible arrangements might be considered, but only if -rcutree.rcu_fanout_leaf has proven inadequate, and only -if the inadequacy has been demonstrated by a carefully run and -realistic system-level workload. - -

    -Please note that arrangements that require RCU to remap CPU numbers will -require extremely good demonstration of need and full exploration of -alternatives. - -

    -There is an embarrassingly large number of flavors of RCU, and this -number has been increasing over time. -Perhaps it will be possible to combine some at some future date. - -

    -RCU's various kthreads are reasonably recent additions. -It is quite likely that adjustments will be required to more gracefully -handle extreme loads. -It might also be necessary to be able to relate CPU utilization by -RCU's kthreads and softirq handlers to the code that instigated this -CPU utilization. -For example, RCU callback overhead might be charged back to the -originating call_rcu() instance, though probably not -in production kernels. - -

    Summary

    - -

    -This document has presented more than two decade's worth of RCU -requirements. -Given that the requirements keep changing, this will not be the last -word on this subject, but at least it serves to get an important -subset of the requirements set forth. - -

    Acknowledgments

    - -I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, -Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and -Andy Lutomirski for their help in rendering -this article human readable, and to Michelle Rankin for her support -of this effort. -Other contributions are acknowledged in the Linux kernel's git archive. -The cartoon is copyright (c) 2013 by Melissa Broussard, -and is provided -under the terms of the Creative Commons Attribution-Share Alike 3.0 -United States license. - -

    @@QQAL@@ - - diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh deleted file mode 100755 index d354f069559b8f9c186f87786d608c9fa773fe8e..0000000000000000000000000000000000000000 --- a/Documentation/RCU/Design/htmlqqz.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/sh -# -# Usage: sh htmlqqz.sh file -# -# Extracts and converts quick quizzes in a proto-HTML document file.htmlx. -# Commands, all of which must be on a line by themselves: -# -# "

    @@QQ@@": Start of a quick quiz. -# "

    @@QQA@@": Start of a quick-quiz answer. -# "

    @@QQE@@": End of a quick-quiz answer, and thus of the quick quiz. -# "

    @@QQAL@@": Place to put quick-quiz answer list. -# -# Places the result in file.html. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, you can access it online at -# http://www.gnu.org/licenses/gpl-2.0.html. -# -# Copyright (c) 2013 Paul E. McKenney, IBM Corporation. - -fn=$1 -if test ! -r $fn.htmlx -then - echo "Error: $fn.htmlx unreadable." - exit 1 -fi - -echo "" > $fn.html -echo "" >> $fn.html -awk < $fn.htmlx >> $fn.html ' - -state == "" && $1 != "

    @@QQ@@" && $1 != "

    @@QQAL@@" { - print $0; - if ($0 ~ /^

    @@QQ/) - print "Bad Quick Quiz command: " NR " (expected

    @@QQ@@ or

    @@QQAL@@)." > "/dev/stderr" - next; -} - -state == "" && $1 == "

    @@QQ@@" { - qqn++; - qqlineno = NR; - haveqq = 1; - state = "qq"; - print "

    Quick Quiz " qqn ":" - next; -} - -state == "qq" && $1 != "

    @@QQA@@" { - qq[qqn] = qq[qqn] $0 "\n"; - print $0 - if ($0 ~ /^

    @@QQ/) - print "Bad Quick Quiz command: " NR ". (expected

    @@QQA@@)" > "/dev/stderr" - next; -} - -state == "qq" && $1 == "

    @@QQA@@" { - state = "qqa"; - print "
    Answer" - next; -} - -state == "qqa" && $1 != "

    @@QQE@@" { - qqa[qqn] = qqa[qqn] $0 "\n"; - if ($0 ~ /^

    @@QQ/) - print "Bad Quick Quiz command: " NR " (expected

    @@QQE@@)." > "/dev/stderr" - next; -} - -state == "qqa" && $1 == "

    @@QQE@@" { - state = ""; - next; -} - -state == "" && $1 == "

    @@QQAL@@" { - haveqq = ""; - print "

    " - print "Answers to Quick Quizzes

    " - print ""; - for (i = 1; i <= qqn; i++) { - print "" - print "

    Quick Quiz " i ":" - print qq[i]; - print ""; - print "

    Answer:" - print qqa[i]; - print ""; - print "

    Back to Quick Quiz " i "." - print ""; - } - next; -} - -END { - if (state != "") - print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr" - else if (haveqq) - print "Missing \"

    @@QQAL@@\", no Quick Quiz." > "/dev/stderr" -}' diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index ec6998b1b6d04f3139ed6c066537cc059c89838d..00a3a38b375ae9946425fc2ea94fa0c2383e867c 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -237,17 +237,17 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of The output of "cat rcu/rcu_preempt/rcuexp" looks as follows: -s=21872 wd0=0 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 +s=21872 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 These fields are as follows: o "s" is the sequence number, with an odd number indicating that an expedited grace period is in progress. -o "wd0", "wd1", "wd2", and "wd3" are the number of times that an - attempt to start an expedited grace period found that someone - else had completed an expedited grace period that satisfies the - attempted request. "Our work is done." +o "wd1", "wd2", and "wd3" are the number of times that an attempt + to start an expedited grace period found that someone else had + completed an expedited grace period that satisfies the attempted + request. "Our work is done." o "n" is number of times that a concurrent CPU-hotplug operation forced a fallback to a normal grace period. diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index dc49c6712b17ff4968d3c4fdf2b304e0292fb5be..111770ffa10e7cc4c3d2dada09e1e85f8d576099 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -681,22 +681,30 @@ Although RCU can be used in many different ways, a very common use of RCU is analogous to reader-writer locking. The following unified diff shows how closely related RCU and reader-writer locking can be. + @@ -5,5 +5,5 @@ struct el { + int data; + /* Other data fields */ + }; + -rwlock_t listmutex; + +spinlock_t listmutex; + struct el head; + @@ -13,15 +14,15 @@ struct list_head *lp; struct el *p; - - read_lock(); + - read_lock(&listmutex); - list_for_each_entry(p, head, lp) { + rcu_read_lock(); + list_for_each_entry_rcu(p, head, lp) { if (p->key == key) { *result = p->data; - - read_unlock(); + - read_unlock(&listmutex); + rcu_read_unlock(); return 1; } } - - read_unlock(); + - read_unlock(&listmutex); + rcu_read_unlock(); return 0; } @@ -732,7 +740,7 @@ Or, for those who prefer a side-by-side listing: 5 int data; 5 int data; 6 /* Other data fields */ 6 /* Other data fields */ 7 }; 7 }; - 8 spinlock_t listmutex; 8 spinlock_t listmutex; + 8 rwlock_t listmutex; 8 spinlock_t listmutex; 9 struct el head; 9 struct el head; 1 int search(long key, int *result) 1 int search(long key, int *result) @@ -740,15 +748,15 @@ Or, for those who prefer a side-by-side listing: 3 struct list_head *lp; 3 struct list_head *lp; 4 struct el *p; 4 struct el *p; 5 5 - 6 read_lock(); 6 rcu_read_lock(); + 6 read_lock(&listmutex); 6 rcu_read_lock(); 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) { 8 if (p->key == key) { 8 if (p->key == key) { 9 *result = p->data; 9 *result = p->data; -10 read_unlock(); 10 rcu_read_unlock(); +10 read_unlock(&listmutex); 10 rcu_read_unlock(); 11 return 1; 11 return 1; 12 } 12 } 13 } 13 } -14 read_unlock(); 14 rcu_read_unlock(); +14 read_unlock(&listmutex); 14 rcu_read_unlock(); 15 return 0; 15 return 0; 16 } 16 } diff --git a/Documentation/devicetree/bindings/regmap/regmap.txt b/Documentation/devicetree/bindings/regmap/regmap.txt index e98a9652ccc8c4d3a2263fe5a67b9064b27d1f04..0127be360fe852d70a853026c03de1fcc8c8a7a0 100644 --- a/Documentation/devicetree/bindings/regmap/regmap.txt +++ b/Documentation/devicetree/bindings/regmap/regmap.txt @@ -1,50 +1,29 @@ -Device-Tree binding for regmap - -The endianness mode of CPU & Device scenarios: -Index Device Endianness properties ---------------------------------------------------- -1 BE 'big-endian' -2 LE 'little-endian' -3 Native 'native-endian' - -For one device driver, which will run in different scenarios above -on different SoCs using the devicetree, we need one way to simplify -this. +Devicetree binding for regmap Optional properties: -- {big,little,native}-endian: these are boolean properties, if absent - then the implementation will choose a default based on the device - being controlled. These properties are for register values and all - the buffers only. Native endian means that the CPU and device have - the same endianness. -Examples: -Scenario 1 : CPU in LE mode & device in LE mode. -dev: dev@40031000 { - compatible = "name"; - reg = <0x40031000 0x1000>; - ... -}; + little-endian, + big-endian, + native-endian: See common-properties.txt for a definition -Scenario 2 : CPU in LE mode & device in BE mode. -dev: dev@40031000 { - compatible = "name"; - reg = <0x40031000 0x1000>; - ... - big-endian; -}; +Note: +Regmap defaults to little-endian register access on MMIO based +devices, this is by far the most common setting. On CPU +architectures that typically run big-endian operating systems +(e.g. PowerPC), registers can be defined as big-endian and must +be marked that way in the devicetree. -Scenario 3 : CPU in BE mode & device in BE mode. -dev: dev@40031000 { - compatible = "name"; - reg = <0x40031000 0x1000>; - ... -}; +On SoCs that can be operated in both big-endian and little-endian +modes, with a single hardware switch controlling both the endianess +of the CPU and a byteswap for MMIO registers (e.g. many Broadcom MIPS +chips), "native-endian" is used to allow using the same device tree +blob in both cases. -Scenario 4 : CPU in BE mode & device in LE mode. +Examples: +Scenario 1 : a register set in big-endian mode. dev: dev@40031000 { - compatible = "name"; + compatible = "syscon"; reg = <0x40031000 0x1000>; + big-endian; ... - little-endian; }; diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 0b3de80ec8f69c1aef9dfaa59fb86c14395ba43e..13f89d18bc251a040617acbde0961bdc9e34a54f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -131,6 +131,7 @@ parameter is applicable: More X86-64 boot options can be found in Documentation/x86/x86_64/boot-options.txt . X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64) + X86_UV SGI UV support is enabled. XEN Xen support is enabled In addition, the following text indicates that the option: @@ -542,6 +543,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: (must be >=0) Default: 64 + bau= [X86_UV] Enable the BAU on SGI UV. The default + behavior is to disable the BAU (i.e. bau=0). + Format: { "0" | "1" } + 0 - Disable the BAU. + 1 - Enable the BAU. + unset - Disable the BAU. + baycom_epp= [HW,AX25] Format: , @@ -3284,6 +3292,44 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Lazy RCU callbacks are those which RCU can prove do nothing more than free memory. + rcuperf.gp_exp= [KNL] + Measure performance of expedited synchronous + grace-period primitives. + + rcuperf.holdoff= [KNL] + Set test-start holdoff period. The purpose of + this parameter is to delay the start of the + test until boot completes in order to avoid + interference. + + rcuperf.nreaders= [KNL] + Set number of RCU readers. The value -1 selects + N, where N is the number of CPUs. A value + "n" less than -1 selects N-n+1, where N is again + the number of CPUs. For example, -2 selects N + (the number of CPUs), -3 selects N+1, and so on. + A value of "n" less than or equal to -N selects + a single reader. + + rcuperf.nwriters= [KNL] + Set number of RCU writers. The values operate + the same as for rcuperf.nreaders. + N, where N is the number of CPUs + + rcuperf.perf_runnable= [BOOT] + Start rcuperf running at boot time. + + rcuperf.shutdown= [KNL] + Shut the system down after performance tests + complete. This is useful for hands-off automated + testing. + + rcuperf.perf_type= [KNL] + Specify the RCU implementation to test. + + rcuperf.verbose= [KNL] + Enable additional printk() statements. + rcutorture.cbflood_inter_holdoff= [KNL] Set holdoff time (jiffies) between successive callback-flood tests. diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt index 5001280e9d824d360cfb5589eb40751968b841d3..9de1c158d44c78a1d2a6ed6e116c4076d7ac5894 100644 --- a/Documentation/locking/lockdep-design.txt +++ b/Documentation/locking/lockdep-design.txt @@ -97,7 +97,7 @@ between any two lock-classes: -> -> -The first rule comes from the fact the a hardirq-safe lock could be +The first rule comes from the fact that a hardirq-safe lock could be taken by a hardirq context, interrupting a hardirq-unsafe lock - and thus could result in a lock inversion deadlock. Likewise, a softirq-safe lock could be taken by an softirq context, interrupting a softirq-unsafe @@ -220,7 +220,7 @@ calculated, which hash is unique for every lock chain. The hash value, when the chain is validated for the first time, is then put into a hash table, which hash-table can be checked in a lockfree manner. If the locking chain occurs again later on, the hash table tells us that we -dont have to validate the chain again. +don't have to validate the chain again. Troubleshooting: ---------------- diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 3729cbe60e4169340b5bc522951d9e0f40c4cb46..147ae8ec836f85666110634ff5565f4016de1d80 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -4,8 +4,40 @@ By: David Howells Paul E. McKenney + Will Deacon + Peter Zijlstra -Contents: +========== +DISCLAIMER +========== + +This document is not a specification; it is intentionally (for the sake of +brevity) and unintentionally (due to being human) incomplete. This document is +meant as a guide to using the various memory barriers provided by Linux, but +in case of any doubt (and there are many) please ask. + +To repeat, this document is not a specification of what Linux expects from +hardware. + +The purpose of this document is twofold: + + (1) to specify the minimum functionality that one can rely on for any + particular barrier, and + + (2) to provide a guide as to how to use the barriers that are available. + +Note that an architecture can provide more than the minimum requirement +for any particular barrier, but if the architecure provides less than +that, that architecture is incorrect. + +Note also that it is possible that a barrier may be a no-op for an +architecture because the way that arch works renders an explicit barrier +unnecessary in that case. + + +======== +CONTENTS +======== (*) Abstract memory access model. @@ -31,15 +63,15 @@ Contents: (*) Implicit kernel memory barriers. - - Locking functions. + - Lock acquisition functions. - Interrupt disabling functions. - Sleep and wake-up functions. - Miscellaneous functions. - (*) Inter-CPU locking barrier effects. + (*) Inter-CPU acquiring barrier effects. - - Locks vs memory accesses. - - Locks vs I/O accesses. + - Acquires vs memory accesses. + - Acquires vs I/O accesses. (*) Where are memory barriers needed? @@ -61,6 +93,7 @@ Contents: (*) The things CPUs get up to. - And then there's the Alpha. + - Virtual Machine Guests. (*) Example uses. @@ -148,7 +181,7 @@ As a further example, consider this sequence of events: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; Q = P; P = &B D = *Q; @@ -430,8 +463,9 @@ And a couple of implicit varieties: This acts as a one-way permeable barrier. It guarantees that all memory operations after the ACQUIRE operation will appear to happen after the ACQUIRE operation with respect to the other components of the system. - ACQUIRE operations include LOCK operations and smp_load_acquire() - operations. + ACQUIRE operations include LOCK operations and both smp_load_acquire() + and smp_cond_acquire() operations. The later builds the necessary ACQUIRE + semantics from relying on a control dependency and smp_rmb(). Memory operations that occur before an ACQUIRE operation may appear to happen after it completes. @@ -464,6 +498,11 @@ And a couple of implicit varieties: This means that ACQUIRE acts as a minimal "acquire" operation and RELEASE acts as a minimal "release" operation. +A subset of the atomic operations described in atomic_ops.txt have ACQUIRE +and RELEASE variants in addition to fully-ordered and relaxed (no barrier +semantics) definitions. For compound atomics performing both a load and a +store, ACQUIRE semantics apply only to the load and RELEASE semantics apply +only to the store portion of the operation. Memory barriers are only required where there's a possibility of interaction between two CPUs or between a CPU and a device. If it can be guaranteed that @@ -517,7 +556,7 @@ following sequence of events: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; WRITE_ONCE(P, &B) @@ -544,7 +583,7 @@ between the address load and the data load: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; WRITE_ONCE(P, &B); @@ -813,9 +852,10 @@ In summary: the same variable, then those stores must be ordered, either by preceding both of them with smp_mb() or by using smp_store_release() to carry out the stores. Please note that it is -not- sufficient - to use barrier() at beginning of each leg of the "if" statement, - as optimizing compilers do not necessarily respect barrier() - in this case. + to use barrier() at beginning of each leg of the "if" statement + because, as shown by the example above, optimizing compilers can + destroy the control dependency while respecting the letter of the + barrier() law. (*) Control dependencies require at least one run-time conditional between the prior load and the subsequent store, and this @@ -1731,15 +1771,15 @@ The Linux kernel has eight basic CPU memory barriers: All memory barriers except the data dependency barriers imply a compiler -barrier. Data dependencies do not impose any additional compiler ordering. +barrier. Data dependencies do not impose any additional compiler ordering. Aside: In the case of data dependencies, the compiler would be expected to issue the loads in the correct order (eg. `a[b]` would have to load the value of b before loading a[b]), however there is no guarantee in the C specification that the compiler may not speculate the value of b (eg. is equal to 1) and load a before b (eg. tmp = a[1]; if (b != 1) -tmp = a[b]; ). There is also the problem of a compiler reloading b after -having loaded a[b], thus having a newer copy of b than a[b]. A consensus +tmp = a[b]; ). There is also the problem of a compiler reloading b after +having loaded a[b], thus having a newer copy of b than a[b]. A consensus has not yet been reached about these problems, however the READ_ONCE() macro is a good place to start looking. @@ -1794,6 +1834,7 @@ There are some more advanced barrier functions: (*) lockless_dereference(); + This can be thought of as a pointer-fetch wrapper around the smp_read_barrier_depends() data-dependency barrier. @@ -1858,7 +1899,7 @@ This is a variation on the mandatory write barrier that causes writes to weakly ordered I/O regions to be partially ordered. Its effects may go beyond the CPU->Hardware interface and actually affect the hardware at some level. -See the subsection "Locks vs I/O accesses" for more information. +See the subsection "Acquires vs I/O accesses" for more information. =============================== @@ -1873,8 +1914,8 @@ provide more substantial guarantees, but these may not be relied upon outside of arch specific code. -ACQUIRING FUNCTIONS -------------------- +LOCK ACQUISITION FUNCTIONS +-------------------------- The Linux kernel has a number of locking constructs: @@ -1895,7 +1936,7 @@ for each construct. These operations all imply certain barriers: Memory operations issued before the ACQUIRE may be completed after the ACQUIRE operation has completed. An smp_mb__before_spinlock(), combined with a following ACQUIRE, orders prior stores against - subsequent loads and stores. Note that this is weaker than smp_mb()! + subsequent loads and stores. Note that this is weaker than smp_mb()! The smp_mb__before_spinlock() primitive is free on many architectures. (2) RELEASE operation implication: @@ -2090,9 +2131,9 @@ or: event_indicated = 1; wake_up_process(event_daemon); -A write memory barrier is implied by wake_up() and co. if and only if they wake -something up. The barrier occurs before the task state is cleared, and so sits -between the STORE to indicate the event and the STORE to set TASK_RUNNING: +A write memory barrier is implied by wake_up() and co. if and only if they +wake something up. The barrier occurs before the task state is cleared, and so +sits between the STORE to indicate the event and the STORE to set TASK_RUNNING: CPU 1 CPU 2 =============================== =============================== @@ -2206,7 +2247,7 @@ three CPUs; then should the following sequence of events occur: Then there is no guarantee as to what order CPU 3 will see the accesses to *A through *H occur in, other than the constraints imposed by the separate locks -on the separate CPUs. It might, for example, see: +on the separate CPUs. It might, for example, see: *E, ACQUIRE M, ACQUIRE Q, *G, *C, *F, *A, *B, RELEASE Q, *D, *H, RELEASE M @@ -2486,9 +2527,9 @@ The following operations are special locking primitives: clear_bit_unlock(); __clear_bit_unlock(); -These implement ACQUIRE-class and RELEASE-class operations. These should be used in -preference to other operations when implementing locking primitives, because -their implementations can be optimised on many architectures. +These implement ACQUIRE-class and RELEASE-class operations. These should be +used in preference to other operations when implementing locking primitives, +because their implementations can be optimised on many architectures. [!] Note that special memory barrier primitives are available for these situations because on some CPUs the atomic instructions used imply full memory @@ -2568,12 +2609,12 @@ explicit barriers are used. Normally this won't be a problem because the I/O accesses done inside such sections will include synchronous load operations on strictly ordered I/O -registers that form implicit I/O barriers. If this isn't sufficient then an +registers that form implicit I/O barriers. If this isn't sufficient then an mmiowb() may need to be used explicitly. A similar situation may occur between an interrupt routine and two routines -running on separate CPUs that communicate with each other. If such a case is +running on separate CPUs that communicate with each other. If such a case is likely, then interrupt-disabling locks should be used to guarantee ordering. @@ -2587,8 +2628,8 @@ functions: (*) inX(), outX(): These are intended to talk to I/O space rather than memory space, but - that's primarily a CPU-specific concept. The i386 and x86_64 processors do - indeed have special I/O space access cycles and instructions, but many + that's primarily a CPU-specific concept. The i386 and x86_64 processors + do indeed have special I/O space access cycles and instructions, but many CPUs don't have such a concept. The PCI bus, amongst others, defines an I/O space concept which - on such @@ -2610,7 +2651,7 @@ functions: Whether these are guaranteed to be fully ordered and uncombined with respect to each other on the issuing CPU depends on the characteristics - defined for the memory window through which they're accessing. On later + defined for the memory window through which they're accessing. On later i386 architecture machines, for example, this is controlled by way of the MTRR registers. @@ -2635,10 +2676,10 @@ functions: (*) readX_relaxed(), writeX_relaxed() These are similar to readX() and writeX(), but provide weaker memory - ordering guarantees. Specifically, they do not guarantee ordering with + ordering guarantees. Specifically, they do not guarantee ordering with respect to normal memory accesses (e.g. DMA buffers) nor do they guarantee - ordering with respect to LOCK or UNLOCK operations. If the latter is - required, an mmiowb() barrier can be used. Note that relaxed accesses to + ordering with respect to LOCK or UNLOCK operations. If the latter is + required, an mmiowb() barrier can be used. Note that relaxed accesses to the same peripheral are guaranteed to be ordered with respect to each other. @@ -3040,8 +3081,9 @@ The Alpha defines the Linux kernel's memory barrier model. See the subsection on "Cache Coherency" above. + VIRTUAL MACHINE GUESTS -------------------- +---------------------- Guests running within virtual machines might be affected by SMP effects even if the guest itself is compiled without SMP support. This is an artifact of @@ -3050,7 +3092,7 @@ barriers for this use-case would be possible but is often suboptimal. To handle this case optimally, low-level virt_mb() etc macros are available. These have the same effect as smp_mb() etc when SMP is enabled, but generate -identical code for SMP and non-SMP systems. For example, virtual machine guests +identical code for SMP and non-SMP systems. For example, virtual machine guests should use virt_mb() rather than smp_mb() when synchronizing against a (possibly SMP) host. @@ -3058,6 +3100,7 @@ These are equivalent to smp_mb() etc counterparts in all other respects, in particular, they do not control MMIO effects: to control MMIO effects, use mandatory barriers. + ============ EXAMPLE USES ============ diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index fcddfd5ded999a3ac8d5624f150439c402e8f0ed..daabdd7ee543ea7f8bfe61274869ded73f353a94 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -60,6 +60,7 @@ show up in /proc/sys/kernel: - panic_on_warn - perf_cpu_time_max_percent - perf_event_paranoid +- perf_event_max_stack - pid_max - powersave-nap [ PPC only ] - printk @@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN). The default value is 2. ============================================================== +perf_event_max_stack: + +Controls maximum number of stack frames to copy for (attr.sample_type & +PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using +'perf record -g' or 'perf trace --call-graph fp'. + +This can only be done when no events are in use that have callchains +enabled, otherwise writing to this file will return -EBUSY. + +The default value is 127. + +============================================================== + pid_max: PID allocation wrap value. When the kernel's next PID value diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index f52f297cb40627a7d5855f04399977f304272e51..9857606dd7b7118c23885d039919e4265ea55775 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -1562,12 +1562,12 @@ Doing the same with chrt -r 5 and function-trace set. -0 3dN.1 12us : menu_hrtimer_cancel <-tick_nohz_idle_exit -0 3dN.1 12us : ktime_get <-tick_nohz_idle_exit -0 3dN.1 12us : tick_do_update_jiffies64 <-tick_nohz_idle_exit - -0 3dN.1 13us : update_cpu_load_nohz <-tick_nohz_idle_exit - -0 3dN.1 13us : _raw_spin_lock <-update_cpu_load_nohz + -0 3dN.1 13us : cpu_load_update_nohz <-tick_nohz_idle_exit + -0 3dN.1 13us : _raw_spin_lock <-cpu_load_update_nohz -0 3dN.1 13us : add_preempt_count <-_raw_spin_lock - -0 3dN.2 13us : __update_cpu_load <-update_cpu_load_nohz - -0 3dN.2 14us : sched_avg_update <-__update_cpu_load - -0 3dN.2 14us : _raw_spin_unlock <-update_cpu_load_nohz + -0 3dN.2 13us : __cpu_load_update <-cpu_load_update_nohz + -0 3dN.2 14us : sched_avg_update <-__cpu_load_update + -0 3dN.2 14us : _raw_spin_unlock <-cpu_load_update_nohz -0 3dN.2 14us : sub_preempt_count <-_raw_spin_unlock -0 3dN.1 15us : calc_load_exit_idle <-tick_nohz_idle_exit -0 3dN.1 15us : touch_softlockup_watchdog <-tick_nohz_idle_exit diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt index 54944c71b819bd7b37aeb4d221ef02fffd201879..2a4ee6302122f8942ac08f6d26dbdf2f369f4e6e 100644 --- a/Documentation/x86/pat.txt +++ b/Documentation/x86/pat.txt @@ -196,3 +196,35 @@ Another, more verbose way of getting PAT related debug messages is with "debugpat" boot parameter. With this parameter, various debug messages are printed to dmesg log. +PAT Initialization +------------------ + +The following table describes how PAT is initialized under various +configurations. The PAT MSR must be updated by Linux in order to support WC +and WT attributes. Otherwise, the PAT MSR has the value programmed in it +by the firmware. Note, Xen enables WC attribute in the PAT MSR for guests. + + MTRR PAT Call Sequence PAT State PAT MSR + ========================================================= + E E MTRR -> PAT init Enabled OS + E D MTRR -> PAT init Disabled - + D E MTRR -> PAT disable Disabled BIOS + D D MTRR -> PAT disable Disabled - + - np/E PAT -> PAT disable Disabled BIOS + - np/D PAT -> PAT disable Disabled - + E !P/E MTRR -> PAT init Disabled BIOS + D !P/E MTRR -> PAT disable Disabled BIOS + !M !P/E MTRR stub -> PAT disable Disabled BIOS + + Legend + ------------------------------------------------ + E Feature enabled in CPU + D Feature disabled/unsupported in CPU + np "nopat" boot option specified + !P CONFIG_X86_PAT option unset + !M CONFIG_MTRR option unset + Enabled PAT state set to enabled + Disabled PAT state set to disabled + OS PAT initializes PAT MSR with OS setting + BIOS PAT keeps PAT MSR with BIOS setting + diff --git a/Makefile b/Makefile index acf6155421cc244913b71c6d3b95cd17690e64ff..0f9cb36d45c2c59a589670679e7fa1d25ff9ee59 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 6 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Charred Weasel # *DOCUMENTATION* diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index a83bbea62c67481442cb0bd04ba033b7ef95b848..0131a7058778ed67fd32da6595052a0287b742fa 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -63,7 +63,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) return res >= 0 ? 1 : 0; } -static inline void __down_write(struct rw_semaphore *sem) +static inline long ___down_write(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -83,10 +83,24 @@ static inline void __down_write(struct rw_semaphore *sem) :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) :"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory"); #endif - if (unlikely(oldcount)) + return oldcount; +} + +static inline void __down_write(struct rw_semaphore *sem) +{ + if (unlikely(___down_write(sem))) rwsem_down_write_failed(sem); } +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (unlikely(___down_write(sem))) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; +} + /* * trylock for writing -- returns 1 if successful, 0 if contention */ diff --git a/arch/arm/boot/dts/at91sam9x5.dtsi b/arch/arm/boot/dts/at91sam9x5.dtsi index 0827d594b1f0ef3750146690c5ce11babf1fc65b..cd0cd5fd09a33bdf17b3f609d1e3497b446c9412 100644 --- a/arch/arm/boot/dts/at91sam9x5.dtsi +++ b/arch/arm/boot/dts/at91sam9x5.dtsi @@ -106,7 +106,7 @@ pmc: pmc@fffffc00 { compatible = "atmel,at91sam9x5-pmc", "syscon"; - reg = <0xfffffc00 0x100>; + reg = <0xfffffc00 0x200>; interrupts = <1 IRQ_TYPE_LEVEL_HIGH 7>; interrupt-controller; #address-cells = <1>; diff --git a/arch/arm/boot/dts/sama5d2.dtsi b/arch/arm/boot/dts/sama5d2.dtsi index 78996bdbd3df38c30fb0a0051cd0db3e9a9249eb..9817090c1b731540a2dcd4cb3dc480608040fbfa 100644 --- a/arch/arm/boot/dts/sama5d2.dtsi +++ b/arch/arm/boot/dts/sama5d2.dtsi @@ -280,7 +280,7 @@ status = "disabled"; nfc@c0000000 { - compatible = "atmel,sama5d4-nfc"; + compatible = "atmel,sama5d3-nfc"; #address-cells = <1>; #size-cells = <1>; reg = < /* NFC Command Registers */ diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index e0eea72deb87eb1ba3967b464a8848256565ee6c..a708fa1f090579228363f8a5f0427db16d31b3a1 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -17,34 +17,28 @@ #include #include #include +#include #ifdef CONFIG_EFI void efi_init(void); int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); +int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); -#define efi_call_virt(f, ...) \ -({ \ - efi_##f##_t *__f; \ - efi_status_t __s; \ - \ - efi_virtmap_load(); \ - __f = efi.systab->runtime->f; \ - __s = __f(__VA_ARGS__); \ - efi_virtmap_unload(); \ - __s; \ -}) +#define arch_efi_call_virt_setup() efi_virtmap_load() +#define arch_efi_call_virt_teardown() efi_virtmap_unload() -#define __efi_call_virt(f, ...) \ +#define arch_efi_call_virt(f, args...) \ ({ \ efi_##f##_t *__f; \ - \ - efi_virtmap_load(); \ __f = efi.systab->runtime->f; \ - __f(__VA_ARGS__); \ - efi_virtmap_unload(); \ + __f(args); \ }) +#define ARCH_EFI_IRQ_FLAGS_MASK \ + (PSR_J_BIT | PSR_E_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | \ + PSR_T_BIT | MODE_MASK) + static inline void efi_set_pgd(struct mm_struct *mm) { check_and_switch_context(mm, NULL); @@ -59,7 +53,16 @@ void efi_virtmap_unload(void); /* arch specific definitions used by the stub code */ -#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define __efi_call_early(f, ...) f(__VA_ARGS__) +#define efi_is_64bit() (false) + +struct screen_info *alloc_screen_info(efi_system_table_t *sys_table_arg); +void free_screen_info(efi_system_table_t *sys_table, struct screen_info *si); + +static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ +} /* * A reasonable upper bound for the uncompressed kernel size is 32 MBytes, diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h index fa5b42d44985fbda00595f891450c6a60c8fd48b..3cc14dd8587c097746dc7c8138be18f64b5ace94 100644 --- a/arch/arm/include/asm/mmu_context.h +++ b/arch/arm/include/asm/mmu_context.h @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -66,6 +67,7 @@ static inline void check_and_switch_context(struct mm_struct *mm, cpu_switch_mm(mm->pgd, mm); } +#ifndef MODULE #define finish_arch_post_lock_switch \ finish_arch_post_lock_switch static inline void finish_arch_post_lock_switch(void) @@ -87,6 +89,7 @@ static inline void finish_arch_post_lock_switch(void) preempt_enable_no_resched(); } } +#endif /* !MODULE */ #endif /* CONFIG_MMU */ diff --git a/arch/arm/kernel/efi.c b/arch/arm/kernel/efi.c index ff8a9d8acfaca727517c76bbefe4bbdd4755f73a..9f43ba012d1077ef553617bff01cf002fba70613 100644 --- a/arch/arm/kernel/efi.c +++ b/arch/arm/kernel/efi.c @@ -11,6 +11,41 @@ #include #include +static int __init set_permissions(pte_t *ptep, pgtable_t token, + unsigned long addr, void *data) +{ + efi_memory_desc_t *md = data; + pte_t pte = *ptep; + + if (md->attribute & EFI_MEMORY_RO) + pte = set_pte_bit(pte, __pgprot(L_PTE_RDONLY)); + if (md->attribute & EFI_MEMORY_XP) + pte = set_pte_bit(pte, __pgprot(L_PTE_XN)); + set_pte_ext(ptep, pte, PTE_EXT_NG); + return 0; +} + +int __init efi_set_mapping_permissions(struct mm_struct *mm, + efi_memory_desc_t *md) +{ + unsigned long base, size; + + base = md->virt_addr; + size = md->num_pages << EFI_PAGE_SHIFT; + + /* + * We can only use apply_to_page_range() if we can guarantee that the + * entire region was mapped using pages. This should be the case if the + * region does not cover any naturally aligned SECTION_SIZE sized + * blocks. + */ + if (round_down(base + size, SECTION_SIZE) < + round_up(base, SECTION_SIZE) + SECTION_SIZE) + return apply_to_page_range(mm, base, size, set_permissions, md); + + return 0; +} + int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { struct map_desc desc = { @@ -34,5 +69,11 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) desc.type = MT_DEVICE; create_mapping_late(mm, &desc, true); + + /* + * If stricter permissions were specified, apply them now. + */ + if (md->attribute & (EFI_MEMORY_RO | EFI_MEMORY_XP)) + return efi_set_mapping_permissions(mm, md); return 0; } diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 6284779d64ee6394dc11b38cc32e24cefac276b1..b8df45883cf78e36aab24522d0da348335801695 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -631,7 +631,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) info->address &= ~alignment_mask; info->ctrl.len <<= offset; - if (!bp->overflow_handler) { + if (is_default_overflow_handler(bp)) { /* * Mismatch breakpoints are required for single-stepping * breakpoints. @@ -754,7 +754,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr, * mismatch breakpoint so we can single-step over the * watchpoint trigger. */ - if (!wp->overflow_handler) + if (is_default_overflow_handler(wp)) enable_single_step(wp, instruction_pointer(regs)); unlock: diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 4e02ae5950ff6463e4da472066b55ceef9a970be..27563befa8a2df5b27ea39b56bde584771945dfd 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) tail = (struct frame_tail __user *)regs->ARM_fp - 1; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && + while ((entry->nr < sysctl_perf_event_max_stack) && tail && !((unsigned long)tail & 0x3)) tail = user_backtrace(tail, entry); } diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 2c4bea39cf224f8368cca2b4dba61a5b807a3dde..7d4e2850910ce4cdbeb3e669c7846d782dfe6c8c 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -883,7 +883,8 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) request_resource(&ioport_resource, &lp2); } -#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE) +#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE) || \ + defined(CONFIG_EFI) struct screen_info screen_info = { .orig_video_lines = 30, .orig_video_cols = 80, diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index efa77c146415b64d774a9811d30268e690d354d1..521b1ec5915759f61e0ef1ab6e8328ca0cd62256 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -2,6 +2,7 @@ menu "Platform selection" config ARCH_SUNXI bool "Allwinner sunxi 64-bit SoC Family" + select GENERIC_IRQ_CHIP help This enables support for Allwinner sunxi based SoCs like the A64. diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 8e88a696c9cbcbd2c8f717ea1c929b87750f1b03..622db3c6474e2d5c51b3a1689869534cf16019ee 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #ifdef CONFIG_EFI @@ -14,32 +15,29 @@ extern void efi_init(void); int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); -#define efi_call_virt(f, ...) \ +#define efi_set_mapping_permissions efi_create_mapping + +#define arch_efi_call_virt_setup() \ ({ \ - efi_##f##_t *__f; \ - efi_status_t __s; \ - \ kernel_neon_begin(); \ efi_virtmap_load(); \ - __f = efi.systab->runtime->f; \ - __s = __f(__VA_ARGS__); \ - efi_virtmap_unload(); \ - kernel_neon_end(); \ - __s; \ }) -#define __efi_call_virt(f, ...) \ +#define arch_efi_call_virt(f, args...) \ ({ \ efi_##f##_t *__f; \ - \ - kernel_neon_begin(); \ - efi_virtmap_load(); \ __f = efi.systab->runtime->f; \ - __f(__VA_ARGS__); \ + __f(args); \ +}) + +#define arch_efi_call_virt_teardown() \ +({ \ efi_virtmap_unload(); \ kernel_neon_end(); \ }) +#define ARCH_EFI_IRQ_FLAGS_MASK (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT) + /* arch specific definitions used by the stub code */ /* @@ -50,7 +48,16 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); #define EFI_FDT_ALIGN SZ_2M /* used by allocate_new_fdt_and_exit_boot() */ #define MAX_FDT_OFFSET SZ_512M -#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define __efi_call_early(f, ...) f(__VA_ARGS__) +#define efi_is_64bit() (true) + +#define alloc_screen_info(x...) &screen_info +#define free_screen_info(x...) + +static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ +} #define EFI_ALLOC_ALIGN SZ_64K diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index b6abc852f2a142123150662bc6dd6bf5c3de62af..78f52488f9ff82461f2933689b44e7b0fcf74b80 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -17,22 +17,51 @@ #include -int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +/* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. Also take the new (optional) RO/XP bits into account. + */ +static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) { - pteval_t prot_val; + u64 attr = md->attribute; + u32 type = md->type; - /* - * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be - * executable, everything else can be mapped with the XN bits - * set. - */ - if ((md->attribute & EFI_MEMORY_WB) == 0) - prot_val = PROT_DEVICE_nGnRE; - else if (md->type == EFI_RUNTIME_SERVICES_CODE || - !PAGE_ALIGNED(md->phys_addr)) - prot_val = pgprot_val(PAGE_KERNEL_EXEC); - else - prot_val = pgprot_val(PAGE_KERNEL); + if (type == EFI_MEMORY_MAPPED_IO) + return PROT_DEVICE_nGnRE; + + if (WARN_ONCE(!PAGE_ALIGNED(md->phys_addr), + "UEFI Runtime regions are not aligned to 64 KB -- buggy firmware?")) + /* + * If the region is not aligned to the page size of the OS, we + * can not use strict permissions, since that would also affect + * the mapping attributes of the adjacent regions. + */ + return pgprot_val(PAGE_KERNEL_EXEC); + + /* R-- */ + if ((attr & (EFI_MEMORY_XP | EFI_MEMORY_RO)) == + (EFI_MEMORY_XP | EFI_MEMORY_RO)) + return pgprot_val(PAGE_KERNEL_RO); + + /* R-X */ + if (attr & EFI_MEMORY_RO) + return pgprot_val(PAGE_KERNEL_ROX); + + /* RW- */ + if (attr & EFI_MEMORY_XP || type != EFI_RUNTIME_SERVICES_CODE) + return pgprot_val(PAGE_KERNEL); + + /* RWX */ + return pgprot_val(PAGE_KERNEL_EXEC); +} + +/* we will fill this structure from the stub, so don't put it in .bss */ +struct screen_info screen_info __section(.data); + +int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +{ + pteval_t prot_val = create_mapping_protection(md); create_pgd_mapping(mm, md->phys_addr, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index b45c95d34b8323e74992e0a4a56e6da0e1257c60..4ef5373f9a7620dbbcdd4087a125b6f5a7f56c60 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -616,7 +616,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr, perf_bp_event(bp, regs); /* Do we need to handle the stepping? */ - if (!bp->overflow_handler) + if (is_default_overflow_handler(bp)) step = 1; unlock: rcu_read_unlock(); @@ -712,7 +712,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, perf_bp_event(wp, regs); /* Do we need to handle the stepping? */ - if (!wp->overflow_handler) + if (is_default_overflow_handler(wp)) step = 1; unlock: diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 5e360ce88f10ba4110697552a47ea8c42053a02f..1428849aece8f98a52b4661c68c14ea555fe0000 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -112,6 +112,7 @@ __efistub___memset = KALLSYMS_HIDE(__pi_memset); __efistub__text = KALLSYMS_HIDE(_text); __efistub__end = KALLSYMS_HIDE(_end); __efistub__edata = KALLSYMS_HIDE(_edata); +__efistub_screen_info = KALLSYMS_HIDE(screen_info); #endif diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index ff4665462a025d4ec2655ca30d49732a63194e53..32c3c6e70119f4e123498b85f1bc28398e333b13 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct frame_tail __user *)regs->regs[29]; - while (entry->nr < PERF_MAX_STACK_DEPTH && + while (entry->nr < sysctl_perf_event_max_stack && tail && !((unsigned long)tail & 0xf)) tail = user_backtrace(tail, entry); } else { @@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && + while ((entry->nr < sysctl_perf_event_max_stack) && tail && !((unsigned long)tail & 0x3)) tail = compat_user_backtrace(tail, entry); #endif diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index a34420a5df9a2e5134beb26964f08c57d97c0d16..b405bbb544319e2a9a66ced51c12eaabb5621d25 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -476,6 +476,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) case BPF_JGE: jmp_cond = A64_COND_CS; break; + case BPF_JSET: case BPF_JNE: jmp_cond = A64_COND_NE; break; diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h index 105c93b00b1bc53ce22f04ffd249595b5001c4d2..1d1212901ae70138466b7b32a6e04538d8f73a9d 100644 --- a/arch/ia64/include/asm/iommu.h +++ b/arch/ia64/include/asm/iommu.h @@ -1,7 +1,6 @@ #ifndef _ASM_IA64_IOMMU_H #define _ASM_IA64_IOMMU_H 1 -#define cpu_has_x2apic 0 /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT (((cycles_t) local_cpu_data->itc_freq)*10) diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index ce112472bdd65a10072eecf995ef5053cbae2e41..8b23e070b8440e0ce0989ec07bb416c55ad86e35 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -49,8 +49,8 @@ __down_read (struct rw_semaphore *sem) /* * lock for writing */ -static inline void -__down_write (struct rw_semaphore *sem) +static inline long +___down_write (struct rw_semaphore *sem) { long old, new; @@ -59,10 +59,26 @@ __down_write (struct rw_semaphore *sem) new = old + RWSEM_ACTIVE_WRITE_BIAS; } while (cmpxchg_acq(&sem->count, old, new) != old); - if (old != 0) + return old; +} + +static inline void +__down_write (struct rw_semaphore *sem) +{ + if (___down_write(sem)) rwsem_down_write_failed(sem); } +static inline int +__down_write_killable (struct rw_semaphore *sem) +{ + if (___down_write(sem)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; +} + /* * unlock after reading */ diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 300dac3702f11a13460d1954843bfcbd1ddc2034..bf0865cd438a4a25b2c516274ffed969a1dc64b3 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -531,8 +531,6 @@ efi_init (void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - palo_phys = EFI_INVALID_TABLE_ADDR; if (efi_config_init(arch_tables) != 0) diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c index 315633461a94537c51c96b7ba92420b9715fd523..252abc12a5a31f6221b106b060ffd6926a8a6b5b 100644 --- a/arch/metag/kernel/perf_callchain.c +++ b/arch/metag/kernel/perf_callchain.c @@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) --frame; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame) + while ((entry->nr < sysctl_perf_event_max_stack) && frame) frame = user_backtrace(frame, entry); } diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index c1cf9c6c3f7705b9c50281d196633d91c8e788e5..5021c546ad07d3e28b7d0ac1969448c32b9a93e2 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry, addr = *sp++; if (__kernel_text_address(addr)) { perf_callchain_store(entry, addr); - if (entry->nr >= PERF_MAX_STACK_DEPTH) + if (entry->nr >= sysctl_perf_event_max_stack) break; } } @@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } do { perf_callchain_store(entry, pc); - if (entry->nr >= PERF_MAX_STACK_DEPTH) + if (entry->nr >= sysctl_perf_event_max_stack) break; pc = unwind_stack(current, &sp, pc, &ra); } while (pc); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 8cac1eb414661ad6e3340a8469361fe9a885d117..55c924b65f71aa4dfc7c58f9e4a446669c5416e0 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -565,7 +565,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) smp_ops->give_timebase(); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu) || !cpu_active(cpu)) + while (!cpu_online(cpu)) cpu_relax(); return 0; diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index e04a6752b39991bbdf5ba389aef524182511fa9c..22d9015c1acc80dea12e78c1ac53f8fcd3b19f0a 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { fp = (unsigned long __user *) sp; if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) return; @@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { fp = (unsigned int __user *) (unsigned long) sp; if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) return; diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index fead491dfc28522017b1be01b1160ce788d052cb..c75e4471e618826a385c6aef2955b79c757ab9e7 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -90,7 +90,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline long ___down_write(struct rw_semaphore *sem) { signed long old, new, tmp; @@ -104,13 +104,23 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) : "=&d" (old), "=&d" (new), "=Q" (sem->count) : "Q" (sem->count), "m" (tmp) : "cc", "memory"); - if (old != 0) - rwsem_down_write_failed(sem); + + return old; } static inline void __down_write(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + if (___down_write(sem)) + rwsem_down_write_failed(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (___down_write(sem)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; } /* diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 40a6b4f9c36cedd5400898572f32173c90bf0240..7b89a757210031e1b312603b1630b44216531347 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -832,7 +832,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) pcpu_attach_task(pcpu, tidle); pcpu_start_fn(pcpu, smp_start_secondary, NULL); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu) || !cpu_active(cpu)) + while (!cpu_online(cpu)) cpu_relax(); return 0; } diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index a319745a7b635e16561fe33a061486372abf2004..751c3373a92c8882331fa8dfa6e567c4c4be7584 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -26,6 +26,7 @@ generic-y += percpu.h generic-y += poll.h generic-y += preempt.h generic-y += resource.h +generic-y += rwsem.h generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h deleted file mode 100644 index edab57265293936cc3b6ecaed0cfc8fa609eb983..0000000000000000000000000000000000000000 --- a/arch/sh/include/asm/rwsem.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * include/asm-sh/rwsem.h: R/W semaphores for SH using the stuff - * in lib/rwsem.c. - */ - -#ifndef _ASM_SH_RWSEM_H -#define _ASM_SH_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ - -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (atomic_inc_return((atomic_t *)(&sem->count)) > 0) - smp_wmb(); - else - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - int tmp; - - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - smp_wmb(); - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)); - if (tmp == RWSEM_ACTIVE_WRITE_BIAS) - smp_wmb(); - else - rwsem_down_write_failed(sem); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - int tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - smp_wmb(); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_dec_return((atomic_t *)(&sem->count)); - if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - smp_wmb(); - if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)) < 0) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) -{ - atomic_add(delta, (atomic_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); - if (tmp < 0) - rwsem_downgrade_wake(sem); -} - -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) -{ - __down_write(sem); -} - -/* - * implement exchange and add functionality - */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) -{ - smp_mb(); - return atomic_add_return(delta, (atomic_t *)(&sem->count)); -} - -#endif /* __KERNEL__ */ -#endif /* _ASM_SH_RWSEM_H */ diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index e928618838bc53c83555cc7b275073fc8c745b3d..6024c26c058565c76f20ebe98a3c3b0ad5a96120 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += mm-arch-hooks.h generic-y += module.h generic-y += mutex.h generic-y += preempt.h +generic-y += rwsem.h generic-y += serial.h generic-y += trace_clock.h generic-y += types.h diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h deleted file mode 100644 index 069bf4d663a119f90aedeb484a3603e2a8b6f2b7..0000000000000000000000000000000000000000 --- a/arch/sparc/include/asm/rwsem.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * rwsem.h: R/W semaphores implemented using CAS - * - * Written by David S. Miller (davem@redhat.com), 2001. - * Derived from asm-i386/rwsem.h - */ -#ifndef _SPARC64_RWSEM_H -#define _SPARC64_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ - -#define RWSEM_UNLOCKED_VALUE 0x00000000L -#define RWSEM_ACTIVE_BIAS 0x00000001L -#define RWSEM_ACTIVE_MASK 0xffffffffL -#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (unlikely(atomic64_inc_return((atomic64_t *)(&sem->count)) <= 0L)) - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - long tmp; - - while ((tmp = sem->count) >= 0L) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) -{ - long tmp; - - tmp = atomic64_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic64_t *)(&sem->count)); - if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) - rwsem_down_write_failed(sem); -} - -static inline void __down_write(struct rw_semaphore *sem) -{ - __down_write_nested(sem, 0); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - long tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic64_dec_return((atomic64_t *)(&sem->count)); - if (unlikely(tmp < -1L && (tmp & RWSEM_ACTIVE_MASK) == 0L)) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - if (unlikely(atomic64_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic64_t *)(&sem->count)) < 0L)) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) -{ - atomic64_add(delta, (atomic64_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic64_add_return(-RWSEM_WAITING_BIAS, (atomic64_t *)(&sem->count)); - if (tmp < 0L) - rwsem_downgrade_wake(sem); -} - -/* - * implement exchange and add functionality - */ -static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) -{ - return atomic64_add_return(delta, (atomic64_t *)(&sem->count)); -} - -#endif /* __KERNEL__ */ - -#endif /* _SPARC64_RWSEM_H */ diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 6596f66ce1126fa487b7ce8eec19ef17dc4912e7..a4b8b5aed21c7b0fa83b30c7f3783e5c109ccf61 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } } #endif - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } static inline int @@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, pc = sf.callers_pc; ufp = (unsigned long)sf.fp + STACK_BIAS; perf_callchain_store(entry, pc); - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } static void perf_callchain_user_32(struct perf_callchain_entry *entry, @@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, ufp = (unsigned long)sf.fp; } perf_callchain_store(entry, pc); - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } void diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2dc18605831f6e88fd45c2863fc3d1fc7e6b6622..7bb15747fea251ff5470085ed454655398f9d60f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -164,10 +164,6 @@ config INSTRUCTION_DECODER def_bool y depends on KPROBES || PERF_EVENTS || UPROBES -config PERF_EVENTS_INTEL_UNCORE - def_bool y - depends on PERF_EVENTS && CPU_SUP_INTEL && PCI - config OUTPUT_FORMAT string default "elf32-i386" if X86_32 @@ -1046,6 +1042,8 @@ config X86_THERMAL_VECTOR def_bool y depends on X86_MCE_INTEL +source "arch/x86/events/Kconfig" + config X86_LEGACY_VM86 bool "Legacy VM86 support" default n @@ -1210,15 +1208,6 @@ config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE -config PERF_EVENTS_AMD_POWER - depends on PERF_EVENTS && CPU_SUP_AMD - tristate "AMD Processor Power Reporting Mechanism" - ---help--- - Provide power reporting mechanism support for AMD processors. - Currently, it leverages X86_FEATURE_ACC_POWER - (CPUID Fn8000_0007_EDX[12]) interface to calculate the - average power consumption on Family 15h processors. - config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" ---help--- @@ -1932,54 +1921,38 @@ config RELOCATABLE (CONFIG_PHYSICAL_START) is used as the minimum location. config RANDOMIZE_BASE - bool "Randomize the address of the kernel image" + bool "Randomize the address of the kernel image (KASLR)" depends on RELOCATABLE default n ---help--- - Randomizes the physical and virtual address at which the - kernel image is decompressed, as a security feature that - deters exploit attempts relying on knowledge of the location - of kernel internals. + In support of Kernel Address Space Layout Randomization (KASLR), + this randomizes the physical address at which the kernel image + is decompressed and the virtual address where the kernel + image is mapped, as a security feature that deters exploit + attempts relying on knowledge of the location of kernel + code internals. + + The kernel physical and virtual address can be randomized + from 16MB up to 1GB on 64-bit and 512MB on 32-bit. (Note that + using RANDOMIZE_BASE reduces the memory space available to + kernel modules from 1.5GB to 1GB.) + + Entropy is generated using the RDRAND instruction if it is + supported. If RDTSC is supported, its value is mixed into + the entropy pool as well. If neither RDRAND nor RDTSC are + supported, then entropy is read from the i8254 timer. + + Since the kernel is built using 2GB addressing, and + PHYSICAL_ALIGN must be at a minimum of 2MB, only 10 bits of + entropy is theoretically possible. Currently, with the + default value for PHYSICAL_ALIGN and due to page table + layouts, 64-bit uses 9 bits of entropy and 32-bit uses 8 bits. + + If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot + time. To enable it, boot with "kaslr" on the kernel command + line (which will also disable hibernation). - Entropy is generated using the RDRAND instruction if it is - supported. If RDTSC is supported, it is used as well. If - neither RDRAND nor RDTSC are supported, then randomness is - read from the i8254 timer. - - The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET, - and aligned according to PHYSICAL_ALIGN. Since the kernel is - built using 2GiB addressing, and PHYSICAL_ALGIN must be at a - minimum of 2MiB, only 10 bits of entropy is theoretically - possible. At best, due to page table layouts, 64-bit can use - 9 bits of entropy and 32-bit uses 8 bits. - - If unsure, say N. - -config RANDOMIZE_BASE_MAX_OFFSET - hex "Maximum kASLR offset allowed" if EXPERT - depends on RANDOMIZE_BASE - range 0x0 0x20000000 if X86_32 - default "0x20000000" if X86_32 - range 0x0 0x40000000 if X86_64 - default "0x40000000" if X86_64 - ---help--- - The lesser of RANDOMIZE_BASE_MAX_OFFSET and available physical - memory is used to determine the maximal offset in bytes that will - be applied to the kernel when kernel Address Space Layout - Randomization (kASLR) is active. This must be a multiple of - PHYSICAL_ALIGN. - - On 32-bit this is limited to 512MiB by page table layouts. The - default is 512MiB. - - On 64-bit this is limited by how the kernel fixmap page table is - positioned, so this cannot be larger than 1GiB currently. Without - RANDOMIZE_BASE, there is a 512MiB to 1.5GiB split between kernel - and modules. When RANDOMIZE_BASE_MAX_OFFSET is above 512MiB, the - modules area will shrink to compensate, up to the current maximum - 1GiB to 1GiB split. The default is 1GiB. - - If unsure, leave at the default value. + If unsure, say N. # Relocation on x86 needs some additional build support config X86_NEED_RELOCS diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4086abca0b32345c92207fa75468cc98ebe97da4..6fce7f096b889f4ca63daae060bee63eccfee7ec 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -208,7 +208,8 @@ endif head-y := arch/x86/kernel/head_$(BITS).o head-y += arch/x86/kernel/head$(BITS).o -head-y += arch/x86/kernel/head.o +head-y += arch/x86/kernel/ebda.o +head-y += arch/x86/kernel/platform-quirks.o libs-y += arch/x86/lib/ diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index b1ef9e48908474bf95dbe6f577cc1fab1a5ea669..700a9c6e6159362d9b62631d124d6703a6554ba5 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -86,16 +86,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p' - -quiet_cmd_voffset = VOFFSET $@ - cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ - -targets += voffset.h -$(obj)/voffset.h: vmlinux FORCE - $(call if_changed,voffset) - -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ @@ -106,7 +97,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE AFLAGS_header.o += -I$(obj) -$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h +$(obj)/header.o: $(obj)/zoffset.h LDFLAGS_setup.elf := -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 8774cb23064fe417cf35935c33e876eb9faa9670..cfdd8c3f8af2e095b0c9e3388c17af834e0589b9 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -57,12 +57,27 @@ LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' + +quiet_cmd_voffset = VOFFSET $@ + cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ + +targets += ../voffset.h + +$(obj)/../voffset.h: vmlinux FORCE + $(call if_changed,voffset) + +$(obj)/misc.o: $(obj)/../voffset.h + vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ - $(obj)/string.o $(obj)/cmdline.o \ + $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \ $(obj)/piggy.o $(obj)/cpuflags.o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o -vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o +vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o +ifdef CONFIG_X86_64 + vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o +endif $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone @@ -109,10 +124,8 @@ suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_LZ4) := lz4 -RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \ - $(CONFIG_SHELL) $(srctree)/arch/x86/tools/calc_run_size.sh) quiet_cmd_mkpiggy = MKPIGGY $@ - cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false ) + cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) targets += piggy.S $(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c deleted file mode 100644 index 6a9b96b4624d2ed93586b44d05848738bbd80178..0000000000000000000000000000000000000000 --- a/arch/x86/boot/compressed/aslr.c +++ /dev/null @@ -1,339 +0,0 @@ -#include "misc.h" - -#include -#include -#include - -#include -#include -#include -#include -#include - -/* Simplified build-specific string for starting entropy. */ -static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" - LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; - -#define I8254_PORT_CONTROL 0x43 -#define I8254_PORT_COUNTER0 0x40 -#define I8254_CMD_READBACK 0xC0 -#define I8254_SELECT_COUNTER0 0x02 -#define I8254_STATUS_NOTREADY 0x40 -static inline u16 i8254(void) -{ - u16 status, timer; - - do { - outb(I8254_PORT_CONTROL, - I8254_CMD_READBACK | I8254_SELECT_COUNTER0); - status = inb(I8254_PORT_COUNTER0); - timer = inb(I8254_PORT_COUNTER0); - timer |= inb(I8254_PORT_COUNTER0) << 8; - } while (status & I8254_STATUS_NOTREADY); - - return timer; -} - -static unsigned long rotate_xor(unsigned long hash, const void *area, - size_t size) -{ - size_t i; - unsigned long *ptr = (unsigned long *)area; - - for (i = 0; i < size / sizeof(hash); i++) { - /* Rotate by odd number of bits and XOR. */ - hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); - hash ^= ptr[i]; - } - - return hash; -} - -/* Attempt to create a simple but unpredictable starting entropy. */ -static unsigned long get_random_boot(void) -{ - unsigned long hash = 0; - - hash = rotate_xor(hash, build_str, sizeof(build_str)); - hash = rotate_xor(hash, real_mode, sizeof(*real_mode)); - - return hash; -} - -static unsigned long get_random_long(void) -{ -#ifdef CONFIG_X86_64 - const unsigned long mix_const = 0x5d6008cbf3848dd3UL; -#else - const unsigned long mix_const = 0x3f39e593UL; -#endif - unsigned long raw, random = get_random_boot(); - bool use_i8254 = true; - - debug_putstr("KASLR using"); - - if (has_cpuflag(X86_FEATURE_RDRAND)) { - debug_putstr(" RDRAND"); - if (rdrand_long(&raw)) { - random ^= raw; - use_i8254 = false; - } - } - - if (has_cpuflag(X86_FEATURE_TSC)) { - debug_putstr(" RDTSC"); - raw = rdtsc(); - - random ^= raw; - use_i8254 = false; - } - - if (use_i8254) { - debug_putstr(" i8254"); - random ^= i8254(); - } - - /* Circular multiply for better bit diffusion */ - asm("mul %3" - : "=a" (random), "=d" (raw) - : "a" (random), "rm" (mix_const)); - random += raw; - - debug_putstr("...\n"); - - return random; -} - -struct mem_vector { - unsigned long start; - unsigned long size; -}; - -#define MEM_AVOID_MAX 5 -static struct mem_vector mem_avoid[MEM_AVOID_MAX]; - -static bool mem_contains(struct mem_vector *region, struct mem_vector *item) -{ - /* Item at least partially before region. */ - if (item->start < region->start) - return false; - /* Item at least partially after region. */ - if (item->start + item->size > region->start + region->size) - return false; - return true; -} - -static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) -{ - /* Item one is entirely before item two. */ - if (one->start + one->size <= two->start) - return false; - /* Item one is entirely after item two. */ - if (one->start >= two->start + two->size) - return false; - return true; -} - -static void mem_avoid_init(unsigned long input, unsigned long input_size, - unsigned long output, unsigned long output_size) -{ - u64 initrd_start, initrd_size; - u64 cmd_line, cmd_line_size; - unsigned long unsafe, unsafe_len; - char *ptr; - - /* - * Avoid the region that is unsafe to overlap during - * decompression (see calculations at top of misc.c). - */ - unsafe_len = (output_size >> 12) + 32768 + 18; - unsafe = (unsigned long)input + input_size - unsafe_len; - mem_avoid[0].start = unsafe; - mem_avoid[0].size = unsafe_len; - - /* Avoid initrd. */ - initrd_start = (u64)real_mode->ext_ramdisk_image << 32; - initrd_start |= real_mode->hdr.ramdisk_image; - initrd_size = (u64)real_mode->ext_ramdisk_size << 32; - initrd_size |= real_mode->hdr.ramdisk_size; - mem_avoid[1].start = initrd_start; - mem_avoid[1].size = initrd_size; - - /* Avoid kernel command line. */ - cmd_line = (u64)real_mode->ext_cmd_line_ptr << 32; - cmd_line |= real_mode->hdr.cmd_line_ptr; - /* Calculate size of cmd_line. */ - ptr = (char *)(unsigned long)cmd_line; - for (cmd_line_size = 0; ptr[cmd_line_size++]; ) - ; - mem_avoid[2].start = cmd_line; - mem_avoid[2].size = cmd_line_size; - - /* Avoid heap memory. */ - mem_avoid[3].start = (unsigned long)free_mem_ptr; - mem_avoid[3].size = BOOT_HEAP_SIZE; - - /* Avoid stack memory. */ - mem_avoid[4].start = (unsigned long)free_mem_end_ptr; - mem_avoid[4].size = BOOT_STACK_SIZE; -} - -/* Does this memory vector overlap a known avoided area? */ -static bool mem_avoid_overlap(struct mem_vector *img) -{ - int i; - struct setup_data *ptr; - - for (i = 0; i < MEM_AVOID_MAX; i++) { - if (mem_overlaps(img, &mem_avoid[i])) - return true; - } - - /* Avoid all entries in the setup_data linked list. */ - ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data; - while (ptr) { - struct mem_vector avoid; - - avoid.start = (unsigned long)ptr; - avoid.size = sizeof(*ptr) + ptr->len; - - if (mem_overlaps(img, &avoid)) - return true; - - ptr = (struct setup_data *)(unsigned long)ptr->next; - } - - return false; -} - -static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN]; -static unsigned long slot_max; - -static void slots_append(unsigned long addr) -{ - /* Overflowing the slots list should be impossible. */ - if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN) - return; - - slots[slot_max++] = addr; -} - -static unsigned long slots_fetch_random(void) -{ - /* Handle case of no slots stored. */ - if (slot_max == 0) - return 0; - - return slots[get_random_long() % slot_max]; -} - -static void process_e820_entry(struct e820entry *entry, - unsigned long minimum, - unsigned long image_size) -{ - struct mem_vector region, img; - - /* Skip non-RAM entries. */ - if (entry->type != E820_RAM) - return; - - /* Ignore entries entirely above our maximum. */ - if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - return; - - /* Ignore entries entirely below our minimum. */ - if (entry->addr + entry->size < minimum) - return; - - region.start = entry->addr; - region.size = entry->size; - - /* Potentially raise address to minimum location. */ - if (region.start < minimum) - region.start = minimum; - - /* Potentially raise address to meet alignment requirements. */ - region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); - - /* Did we raise the address above the bounds of this e820 region? */ - if (region.start > entry->addr + entry->size) - return; - - /* Reduce size by any delta from the original address. */ - region.size -= region.start - entry->addr; - - /* Reduce maximum size to fit end of image within maximum limit. */ - if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start; - - /* Walk each aligned slot and check for avoided areas. */ - for (img.start = region.start, img.size = image_size ; - mem_contains(®ion, &img) ; - img.start += CONFIG_PHYSICAL_ALIGN) { - if (mem_avoid_overlap(&img)) - continue; - slots_append(img.start); - } -} - -static unsigned long find_random_addr(unsigned long minimum, - unsigned long size) -{ - int i; - unsigned long addr; - - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); - - /* Verify potential e820 positions, appending to slots list. */ - for (i = 0; i < real_mode->e820_entries; i++) { - process_e820_entry(&real_mode->e820_map[i], minimum, size); - } - - return slots_fetch_random(); -} - -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, - unsigned long input_size, - unsigned char *output, - unsigned long output_size) -{ - unsigned long choice = (unsigned long)output; - unsigned long random; - -#ifdef CONFIG_HIBERNATION - if (!cmdline_find_option_bool("kaslr")) { - debug_putstr("KASLR disabled by default...\n"); - goto out; - } -#else - if (cmdline_find_option_bool("nokaslr")) { - debug_putstr("KASLR disabled by cmdline...\n"); - goto out; - } -#endif - - boot_params->hdr.loadflags |= KASLR_FLAG; - - /* Record the various known unsafe memory ranges. */ - mem_avoid_init((unsigned long)input, input_size, - (unsigned long)output, output_size); - - /* Walk e820 and find a random address. */ - random = find_random_addr(choice, output_size); - if (!random) { - debug_putstr("KASLR could not find suitable E820 region...\n"); - goto out; - } - - /* Always enforce the minimum. */ - if (random < choice) - goto out; - - choice = random; -out: - return (unsigned char *)choice; -} diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index b68e3033e6b9bd6fa76f89e3fbaa034823790ad7..73ccf63b0f48c6c9195cf336789da050ababef27 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -15,9 +15,9 @@ static inline char rdfs8(addr_t addr) #include "../cmdline.c" static unsigned long get_cmd_line_ptr(void) { - unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr; + unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr; - cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32; + cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32; return cmd_line_ptr; } diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 583d539a41977a2c7397b9fd38e0839d3c41dbb9..52fef606bc54258b7095aa6b4b16eda8fbf16244 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -571,312 +571,6 @@ static void setup_efi_pci(struct boot_params *params) efi_call_early(free_pool, pci_handle); } -static void -setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, - struct efi_pixel_bitmask pixel_info, int pixel_format) -{ - if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 0; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 16; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 16; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 0; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BIT_MASK) { - find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); - find_bits(pixel_info.green_mask, &si->green_pos, - &si->green_size); - find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); - find_bits(pixel_info.reserved_mask, &si->rsvd_pos, - &si->rsvd_size); - si->lfb_depth = si->red_size + si->green_size + - si->blue_size + si->rsvd_size; - si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; - } else { - si->lfb_depth = 4; - si->lfb_linelength = si->lfb_width / 2; - si->red_size = 0; - si->red_pos = 0; - si->green_size = 0; - si->green_pos = 0; - si->blue_size = 0; - si->blue_pos = 0; - si->rsvd_size = 0; - si->rsvd_pos = 0; - } -} - -static efi_status_t -__gop_query32(struct efi_graphics_output_protocol_32 *gop32, - struct efi_graphics_output_mode_info **info, - unsigned long *size, u64 *fb_base) -{ - struct efi_graphics_output_protocol_mode_32 *mode; - efi_status_t status; - unsigned long m; - - m = gop32->mode; - mode = (struct efi_graphics_output_protocol_mode_32 *)m; - - status = efi_early->call(gop32->query_mode, gop32, - mode->mode, size, info); - if (status != EFI_SUCCESS) - return status; - - *fb_base = mode->frame_buffer_base; - return status; -} - -static efi_status_t -setup_gop32(struct screen_info *si, efi_guid_t *proto, - unsigned long size, void **gop_handle) -{ - struct efi_graphics_output_protocol_32 *gop32, *first_gop; - unsigned long nr_gops; - u16 width, height; - u32 pixels_per_scan_line; - u32 ext_lfb_base; - u64 fb_base; - struct efi_pixel_bitmask pixel_info; - int pixel_format; - efi_status_t status; - u32 *handles = (u32 *)(unsigned long)gop_handle; - int i; - - first_gop = NULL; - gop32 = NULL; - - nr_gops = size / sizeof(u32); - for (i = 0; i < nr_gops; i++) { - struct efi_graphics_output_mode_info *info = NULL; - efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; - bool conout_found = false; - void *dummy = NULL; - u32 h = handles[i]; - u64 current_fb_base; - - status = efi_call_early(handle_protocol, h, - proto, (void **)&gop32); - if (status != EFI_SUCCESS) - continue; - - status = efi_call_early(handle_protocol, h, - &conout_proto, &dummy); - if (status == EFI_SUCCESS) - conout_found = true; - - status = __gop_query32(gop32, &info, &size, ¤t_fb_base); - if (status == EFI_SUCCESS && (!first_gop || conout_found)) { - /* - * Systems that use the UEFI Console Splitter may - * provide multiple GOP devices, not all of which are - * backed by real hardware. The workaround is to search - * for a GOP implementing the ConOut protocol, and if - * one isn't found, to just fall back to the first GOP. - */ - width = info->horizontal_resolution; - height = info->vertical_resolution; - pixel_format = info->pixel_format; - pixel_info = info->pixel_information; - pixels_per_scan_line = info->pixels_per_scan_line; - fb_base = current_fb_base; - - /* - * Once we've found a GOP supporting ConOut, - * don't bother looking any further. - */ - first_gop = gop32; - if (conout_found) - break; - } - } - - /* Did we find any GOPs? */ - if (!first_gop) - goto out; - - /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_width = width; - si->lfb_height = height; - si->lfb_base = fb_base; - - ext_lfb_base = (u64)(unsigned long)fb_base >> 32; - if (ext_lfb_base) { - si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; - si->ext_lfb_base = ext_lfb_base; - } - - si->pages = 1; - - setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); - - si->lfb_size = si->lfb_linelength * si->lfb_height; - - si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; -out: - return status; -} - -static efi_status_t -__gop_query64(struct efi_graphics_output_protocol_64 *gop64, - struct efi_graphics_output_mode_info **info, - unsigned long *size, u64 *fb_base) -{ - struct efi_graphics_output_protocol_mode_64 *mode; - efi_status_t status; - unsigned long m; - - m = gop64->mode; - mode = (struct efi_graphics_output_protocol_mode_64 *)m; - - status = efi_early->call(gop64->query_mode, gop64, - mode->mode, size, info); - if (status != EFI_SUCCESS) - return status; - - *fb_base = mode->frame_buffer_base; - return status; -} - -static efi_status_t -setup_gop64(struct screen_info *si, efi_guid_t *proto, - unsigned long size, void **gop_handle) -{ - struct efi_graphics_output_protocol_64 *gop64, *first_gop; - unsigned long nr_gops; - u16 width, height; - u32 pixels_per_scan_line; - u32 ext_lfb_base; - u64 fb_base; - struct efi_pixel_bitmask pixel_info; - int pixel_format; - efi_status_t status; - u64 *handles = (u64 *)(unsigned long)gop_handle; - int i; - - first_gop = NULL; - gop64 = NULL; - - nr_gops = size / sizeof(u64); - for (i = 0; i < nr_gops; i++) { - struct efi_graphics_output_mode_info *info = NULL; - efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; - bool conout_found = false; - void *dummy = NULL; - u64 h = handles[i]; - u64 current_fb_base; - - status = efi_call_early(handle_protocol, h, - proto, (void **)&gop64); - if (status != EFI_SUCCESS) - continue; - - status = efi_call_early(handle_protocol, h, - &conout_proto, &dummy); - if (status == EFI_SUCCESS) - conout_found = true; - - status = __gop_query64(gop64, &info, &size, ¤t_fb_base); - if (status == EFI_SUCCESS && (!first_gop || conout_found)) { - /* - * Systems that use the UEFI Console Splitter may - * provide multiple GOP devices, not all of which are - * backed by real hardware. The workaround is to search - * for a GOP implementing the ConOut protocol, and if - * one isn't found, to just fall back to the first GOP. - */ - width = info->horizontal_resolution; - height = info->vertical_resolution; - pixel_format = info->pixel_format; - pixel_info = info->pixel_information; - pixels_per_scan_line = info->pixels_per_scan_line; - fb_base = current_fb_base; - - /* - * Once we've found a GOP supporting ConOut, - * don't bother looking any further. - */ - first_gop = gop64; - if (conout_found) - break; - } - } - - /* Did we find any GOPs? */ - if (!first_gop) - goto out; - - /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_width = width; - si->lfb_height = height; - si->lfb_base = fb_base; - - ext_lfb_base = (u64)(unsigned long)fb_base >> 32; - if (ext_lfb_base) { - si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; - si->ext_lfb_base = ext_lfb_base; - } - - si->pages = 1; - - setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); - - si->lfb_size = si->lfb_linelength * si->lfb_height; - - si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; -out: - return status; -} - -/* - * See if we have Graphics Output Protocol - */ -static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, - unsigned long size) -{ - efi_status_t status; - void **gop_handle = NULL; - - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - size, (void **)&gop_handle); - if (status != EFI_SUCCESS) - return status; - - status = efi_call_early(locate_handle, - EFI_LOCATE_BY_PROTOCOL, - proto, NULL, &size, gop_handle); - if (status != EFI_SUCCESS) - goto free_handle; - - if (efi_early->is64) - status = setup_gop64(si, proto, size, gop_handle); - else - status = setup_gop32(si, proto, size, gop_handle); - -free_handle: - efi_call_early(free_pool, gop_handle); - return status; -} - static efi_status_t setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) { @@ -1038,7 +732,7 @@ void setup_graphics(struct boot_params *boot_params) EFI_LOCATE_BY_PROTOCOL, &graphics_proto, NULL, &size, gop_handle); if (status == EFI_BUFFER_TOO_SMALL) - status = setup_gop(si, &graphics_proto, size); + status = efi_setup_gop(NULL, si, &graphics_proto, size); if (status != EFI_SUCCESS) { size = 0; diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index d487e727f1ec7347ea960b038428998d43a1d6bb..c0223f1a89d71229021ed2d15f99c50ac8c82166 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -11,80 +11,6 @@ #define DESC_TYPE_CODE_DATA (1 << 0) -#define EFI_CONSOLE_OUT_DEVICE_GUID \ - EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x0, 0x90, 0x27, \ - 0x3f, 0xc1, 0x4d) - -#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 -#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 -#define PIXEL_BIT_MASK 2 -#define PIXEL_BLT_ONLY 3 -#define PIXEL_FORMAT_MAX 4 - -struct efi_pixel_bitmask { - u32 red_mask; - u32 green_mask; - u32 blue_mask; - u32 reserved_mask; -}; - -struct efi_graphics_output_mode_info { - u32 version; - u32 horizontal_resolution; - u32 vertical_resolution; - int pixel_format; - struct efi_pixel_bitmask pixel_information; - u32 pixels_per_scan_line; -} __packed; - -struct efi_graphics_output_protocol_mode_32 { - u32 max_mode; - u32 mode; - u32 info; - u32 size_of_info; - u64 frame_buffer_base; - u32 frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_mode_64 { - u32 max_mode; - u32 mode; - u64 info; - u64 size_of_info; - u64 frame_buffer_base; - u64 frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_mode { - u32 max_mode; - u32 mode; - unsigned long info; - unsigned long size_of_info; - u64 frame_buffer_base; - unsigned long frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_32 { - u32 query_mode; - u32 set_mode; - u32 blt; - u32 mode; -}; - -struct efi_graphics_output_protocol_64 { - u64 query_mode; - u64 set_mode; - u64 blt; - u64 mode; -}; - -struct efi_graphics_output_protocol { - void *query_mode; - unsigned long set_mode; - unsigned long blt; - struct efi_graphics_output_protocol_mode *mode; -}; - struct efi_uga_draw_protocol_32 { u32 get_mode; u32 set_mode; diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c new file mode 100644 index 0000000000000000000000000000000000000000..6248740b68b5a0c71bddf6eb29e4c6898902d471 --- /dev/null +++ b/arch/x86/boot/compressed/error.c @@ -0,0 +1,22 @@ +/* + * Callers outside of misc.c need access to the error reporting routines, + * but the *_putstr() functions need to stay in misc.c because of how + * memcpy() and memmove() are defined for the compressed boot environment. + */ +#include "misc.h" + +void warn(char *m) +{ + error_putstr("\n\n"); + error_putstr(m); + error_putstr("\n\n"); +} + +void error(char *m) +{ + warn(m); + error_putstr(" -- System halted"); + + while (1) + asm("hlt"); +} diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h new file mode 100644 index 0000000000000000000000000000000000000000..2e59dac07f9e051ad34e1061c3ae7b9a8e111480 --- /dev/null +++ b/arch/x86/boot/compressed/error.h @@ -0,0 +1,7 @@ +#ifndef BOOT_COMPRESSED_ERROR_H +#define BOOT_COMPRESSED_ERROR_H + +void warn(char *m); +void error(char *m); + +#endif /* BOOT_COMPRESSED_ERROR_H */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 0256064da8da38c69098cbccc75a19cc0493ca82..1038524270e7e4bf5f74b8b959d94f7f2818efc9 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -176,7 +176,9 @@ preferred_addr: 1: /* Target address to relocate to for decompression */ - addl $z_extract_offset, %ebx + movl BP_init_size(%esi), %eax + subl $_end, %eax + addl %eax, %ebx /* Set up the stack */ leal boot_stack_end(%ebx), %esp @@ -233,24 +235,28 @@ relocated: 2: /* - * Do the decompression, and jump to the new kernel.. + * Do the extraction, and jump to the new kernel.. */ - /* push arguments for decompress_kernel: */ - pushl $z_run_size /* size of kernel with .bss and .brk */ + /* push arguments for extract_kernel: */ pushl $z_output_len /* decompressed length, end of relocs */ - leal z_extract_offset_negative(%ebx), %ebp + + movl BP_init_size(%esi), %eax + subl $_end, %eax + movl %ebx, %ebp + subl %eax, %ebp pushl %ebp /* output address */ + pushl $z_input_len /* input_len */ leal input_data(%ebx), %eax pushl %eax /* input_data */ leal boot_heap(%ebx), %eax pushl %eax /* heap area */ pushl %esi /* real mode pointer */ - call decompress_kernel /* returns kernel location in %eax */ - addl $28, %esp + call extract_kernel /* returns kernel location in %eax */ + addl $24, %esp /* - * Jump to the decompressed kernel. + * Jump to the extracted kernel. */ xorl %ebx, %ebx jmp *%eax diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 86558a1991393c509bc3005c021c2ab490b26b2a..0d80a7ad65cd74d1624db66ec13b91e8a2a4aca9 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -110,7 +110,9 @@ ENTRY(startup_32) 1: /* Target address to relocate to for decompression */ - addl $z_extract_offset, %ebx + movl BP_init_size(%esi), %eax + subl $_end, %eax + addl %eax, %ebx /* * Prepare for entering 64 bit mode @@ -132,7 +134,7 @@ ENTRY(startup_32) /* Initialize Page tables to 0 */ leal pgtable(%ebx), %edi xorl %eax, %eax - movl $((4096*6)/4), %ecx + movl $(BOOT_INIT_PGT_SIZE/4), %ecx rep stosl /* Build Level 4 */ @@ -338,7 +340,9 @@ preferred_addr: 1: /* Target address to relocate to for decompression */ - leaq z_extract_offset(%rbp), %rbx + movl BP_init_size(%rsi), %ebx + subl $_end, %ebx + addq %rbp, %rbx /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp @@ -408,19 +412,16 @@ relocated: 2: /* - * Do the decompression, and jump to the new kernel.. + * Do the extraction, and jump to the new kernel.. */ pushq %rsi /* Save the real mode argument */ - movq $z_run_size, %r9 /* size of kernel with .bss and .brk */ - pushq %r9 movq %rsi, %rdi /* real mode address */ leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ leaq input_data(%rip), %rdx /* input_data */ movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ movq $z_output_len, %r9 /* decompressed length, end of relocs */ - call decompress_kernel /* returns kernel location in %rax */ - popq %r9 + call extract_kernel /* returns kernel location in %rax */ popq %rsi /* @@ -485,4 +486,4 @@ boot_stack_end: .section ".pgtable","a",@nobits .balign 4096 pgtable: - .fill 6*4096, 1, 0 + .fill BOOT_PGT_SIZE, 1, 0 diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c new file mode 100644 index 0000000000000000000000000000000000000000..cfeb0259ed81aeb9341d071cb98e3ff129d06426 --- /dev/null +++ b/arch/x86/boot/compressed/kaslr.c @@ -0,0 +1,510 @@ +/* + * kaslr.c + * + * This contains the routines needed to generate a reasonable level of + * entropy to choose a randomized kernel base address offset in support + * of Kernel Address Space Layout Randomization (KASLR). Additionally + * handles walking the physical memory maps (and tracking memory regions + * to avoid) in order to select a physical memory location that can + * contain the entire properly aligned running kernel image. + * + */ +#include "misc.h" +#include "error.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +/* Simplified build-specific string for starting entropy. */ +static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; + +#define I8254_PORT_CONTROL 0x43 +#define I8254_PORT_COUNTER0 0x40 +#define I8254_CMD_READBACK 0xC0 +#define I8254_SELECT_COUNTER0 0x02 +#define I8254_STATUS_NOTREADY 0x40 +static inline u16 i8254(void) +{ + u16 status, timer; + + do { + outb(I8254_PORT_CONTROL, + I8254_CMD_READBACK | I8254_SELECT_COUNTER0); + status = inb(I8254_PORT_COUNTER0); + timer = inb(I8254_PORT_COUNTER0); + timer |= inb(I8254_PORT_COUNTER0) << 8; + } while (status & I8254_STATUS_NOTREADY); + + return timer; +} + +static unsigned long rotate_xor(unsigned long hash, const void *area, + size_t size) +{ + size_t i; + unsigned long *ptr = (unsigned long *)area; + + for (i = 0; i < size / sizeof(hash); i++) { + /* Rotate by odd number of bits and XOR. */ + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); + hash ^= ptr[i]; + } + + return hash; +} + +/* Attempt to create a simple but unpredictable starting entropy. */ +static unsigned long get_random_boot(void) +{ + unsigned long hash = 0; + + hash = rotate_xor(hash, build_str, sizeof(build_str)); + hash = rotate_xor(hash, boot_params, sizeof(*boot_params)); + + return hash; +} + +static unsigned long get_random_long(const char *purpose) +{ +#ifdef CONFIG_X86_64 + const unsigned long mix_const = 0x5d6008cbf3848dd3UL; +#else + const unsigned long mix_const = 0x3f39e593UL; +#endif + unsigned long raw, random = get_random_boot(); + bool use_i8254 = true; + + debug_putstr(purpose); + debug_putstr(" KASLR using"); + + if (has_cpuflag(X86_FEATURE_RDRAND)) { + debug_putstr(" RDRAND"); + if (rdrand_long(&raw)) { + random ^= raw; + use_i8254 = false; + } + } + + if (has_cpuflag(X86_FEATURE_TSC)) { + debug_putstr(" RDTSC"); + raw = rdtsc(); + + random ^= raw; + use_i8254 = false; + } + + if (use_i8254) { + debug_putstr(" i8254"); + random ^= i8254(); + } + + /* Circular multiply for better bit diffusion */ + asm("mul %3" + : "=a" (random), "=d" (raw) + : "a" (random), "rm" (mix_const)); + random += raw; + + debug_putstr("...\n"); + + return random; +} + +struct mem_vector { + unsigned long start; + unsigned long size; +}; + +enum mem_avoid_index { + MEM_AVOID_ZO_RANGE = 0, + MEM_AVOID_INITRD, + MEM_AVOID_CMDLINE, + MEM_AVOID_BOOTPARAMS, + MEM_AVOID_MAX, +}; + +static struct mem_vector mem_avoid[MEM_AVOID_MAX]; + +static bool mem_contains(struct mem_vector *region, struct mem_vector *item) +{ + /* Item at least partially before region. */ + if (item->start < region->start) + return false; + /* Item at least partially after region. */ + if (item->start + item->size > region->start + region->size) + return false; + return true; +} + +static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) +{ + /* Item one is entirely before item two. */ + if (one->start + one->size <= two->start) + return false; + /* Item one is entirely after item two. */ + if (one->start >= two->start + two->size) + return false; + return true; +} + +/* + * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). + * The mem_avoid array is used to store the ranges that need to be avoided + * when KASLR searches for an appropriate random address. We must avoid any + * regions that are unsafe to overlap with during decompression, and other + * things like the initrd, cmdline and boot_params. This comment seeks to + * explain mem_avoid as clearly as possible since incorrect mem_avoid + * memory ranges lead to really hard to debug boot failures. + * + * The initrd, cmdline, and boot_params are trivial to identify for + * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and + * MEM_AVOID_BOOTPARAMS respectively below. + * + * What is not obvious how to avoid is the range of memory that is used + * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover + * the compressed kernel (ZO) and its run space, which is used to extract + * the uncompressed kernel (VO) and relocs. + * + * ZO's full run size sits against the end of the decompression buffer, so + * we can calculate where text, data, bss, etc of ZO are positioned more + * easily. + * + * For additional background, the decompression calculations can be found + * in header.S, and the memory diagram is based on the one found in misc.c. + * + * The following conditions are already enforced by the image layouts and + * associated code: + * - input + input_size >= output + output_size + * - kernel_total_size <= init_size + * - kernel_total_size <= output_size (see Note below) + * - output + init_size >= output + output_size + * + * (Note that kernel_total_size and output_size have no fundamental + * relationship, but output_size is passed to choose_random_location + * as a maximum of the two. The diagram is showing a case where + * kernel_total_size is larger than output_size, but this case is + * handled by bumping output_size.) + * + * The above conditions can be illustrated by a diagram: + * + * 0 output input input+input_size output+init_size + * | | | | | + * | | | | | + * |-----|--------|--------|--------------|-----------|--|-------------| + * | | | + * | | | + * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size + * + * [output, output+init_size) is the entire memory range used for + * extracting the compressed image. + * + * [output, output+kernel_total_size) is the range needed for the + * uncompressed kernel (VO) and its run size (bss, brk, etc). + * + * [output, output+output_size) is VO plus relocs (i.e. the entire + * uncompressed payload contained by ZO). This is the area of the buffer + * written to during decompression. + * + * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case + * range of the copied ZO and decompression code. (i.e. the range + * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.) + * + * [input, input+input_size) is the original copied compressed image (ZO) + * (i.e. it does not include its run size). This range must be avoided + * because it contains the data used for decompression. + * + * [input+input_size, output+init_size) is [_text, _end) for ZO. This + * range includes ZO's heap and stack, and must be avoided since it + * performs the decompression. + * + * Since the above two ranges need to be avoided and they are adjacent, + * they can be merged, resulting in: [input, output+init_size) which + * becomes the MEM_AVOID_ZO_RANGE below. + */ +static void mem_avoid_init(unsigned long input, unsigned long input_size, + unsigned long output) +{ + unsigned long init_size = boot_params->hdr.init_size; + u64 initrd_start, initrd_size; + u64 cmd_line, cmd_line_size; + char *ptr; + + /* + * Avoid the region that is unsafe to overlap during + * decompression. + */ + mem_avoid[MEM_AVOID_ZO_RANGE].start = input; + mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; + add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start, + mem_avoid[MEM_AVOID_ZO_RANGE].size); + + /* Avoid initrd. */ + initrd_start = (u64)boot_params->ext_ramdisk_image << 32; + initrd_start |= boot_params->hdr.ramdisk_image; + initrd_size = (u64)boot_params->ext_ramdisk_size << 32; + initrd_size |= boot_params->hdr.ramdisk_size; + mem_avoid[MEM_AVOID_INITRD].start = initrd_start; + mem_avoid[MEM_AVOID_INITRD].size = initrd_size; + /* No need to set mapping for initrd, it will be handled in VO. */ + + /* Avoid kernel command line. */ + cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32; + cmd_line |= boot_params->hdr.cmd_line_ptr; + /* Calculate size of cmd_line. */ + ptr = (char *)(unsigned long)cmd_line; + for (cmd_line_size = 0; ptr[cmd_line_size++]; ) + ; + mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; + mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; + add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start, + mem_avoid[MEM_AVOID_CMDLINE].size); + + /* Avoid boot parameters. */ + mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; + mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); + add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start, + mem_avoid[MEM_AVOID_BOOTPARAMS].size); + + /* We don't need to set a mapping for setup_data. */ + +#ifdef CONFIG_X86_VERBOSE_BOOTUP + /* Make sure video RAM can be used. */ + add_identity_map(0, PMD_SIZE); +#endif +} + +/* + * Does this memory vector overlap a known avoided area? If so, record the + * overlap region with the lowest address. + */ +static bool mem_avoid_overlap(struct mem_vector *img, + struct mem_vector *overlap) +{ + int i; + struct setup_data *ptr; + unsigned long earliest = img->start + img->size; + bool is_overlapping = false; + + for (i = 0; i < MEM_AVOID_MAX; i++) { + if (mem_overlaps(img, &mem_avoid[i]) && + mem_avoid[i].start < earliest) { + *overlap = mem_avoid[i]; + is_overlapping = true; + } + } + + /* Avoid all entries in the setup_data linked list. */ + ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; + while (ptr) { + struct mem_vector avoid; + + avoid.start = (unsigned long)ptr; + avoid.size = sizeof(*ptr) + ptr->len; + + if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { + *overlap = avoid; + is_overlapping = true; + } + + ptr = (struct setup_data *)(unsigned long)ptr->next; + } + + return is_overlapping; +} + +static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN]; + +struct slot_area { + unsigned long addr; + int num; +}; + +#define MAX_SLOT_AREA 100 + +static struct slot_area slot_areas[MAX_SLOT_AREA]; + +static unsigned long slot_max; + +static unsigned long slot_area_index; + +static void store_slot_info(struct mem_vector *region, unsigned long image_size) +{ + struct slot_area slot_area; + + if (slot_area_index == MAX_SLOT_AREA) + return; + + slot_area.addr = region->start; + slot_area.num = (region->size - image_size) / + CONFIG_PHYSICAL_ALIGN + 1; + + if (slot_area.num > 0) { + slot_areas[slot_area_index++] = slot_area; + slot_max += slot_area.num; + } +} + +static void slots_append(unsigned long addr) +{ + /* Overflowing the slots list should be impossible. */ + if (slot_max >= KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN) + return; + + slots[slot_max++] = addr; +} + +static unsigned long slots_fetch_random(void) +{ + /* Handle case of no slots stored. */ + if (slot_max == 0) + return 0; + + return slots[get_random_long("Physical") % slot_max]; +} + +static void process_e820_entry(struct e820entry *entry, + unsigned long minimum, + unsigned long image_size) +{ + struct mem_vector region, img, overlap; + + /* Skip non-RAM entries. */ + if (entry->type != E820_RAM) + return; + + /* Ignore entries entirely above our maximum. */ + if (entry->addr >= KERNEL_IMAGE_SIZE) + return; + + /* Ignore entries entirely below our minimum. */ + if (entry->addr + entry->size < minimum) + return; + + region.start = entry->addr; + region.size = entry->size; + + /* Potentially raise address to minimum location. */ + if (region.start < minimum) + region.start = minimum; + + /* Potentially raise address to meet alignment requirements. */ + region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); + + /* Did we raise the address above the bounds of this e820 region? */ + if (region.start > entry->addr + entry->size) + return; + + /* Reduce size by any delta from the original address. */ + region.size -= region.start - entry->addr; + + /* Reduce maximum size to fit end of image within maximum limit. */ + if (region.start + region.size > KERNEL_IMAGE_SIZE) + region.size = KERNEL_IMAGE_SIZE - region.start; + + /* Walk each aligned slot and check for avoided areas. */ + for (img.start = region.start, img.size = image_size ; + mem_contains(®ion, &img) ; + img.start += CONFIG_PHYSICAL_ALIGN) { + if (mem_avoid_overlap(&img, &overlap)) + continue; + slots_append(img.start); + } +} + +static unsigned long find_random_phys_addr(unsigned long minimum, + unsigned long image_size) +{ + int i; + unsigned long addr; + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + + /* Verify potential e820 positions, appending to slots list. */ + for (i = 0; i < boot_params->e820_entries; i++) { + process_e820_entry(&boot_params->e820_map[i], minimum, + image_size); + } + + return slots_fetch_random(); +} + +static unsigned long find_random_virt_addr(unsigned long minimum, + unsigned long image_size) +{ + unsigned long slots, random_addr; + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + /* Align image_size for easy slot calculations. */ + image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN); + + /* + * There are how many CONFIG_PHYSICAL_ALIGN-sized slots + * that can hold image_size within the range of minimum to + * KERNEL_IMAGE_SIZE? + */ + slots = (KERNEL_IMAGE_SIZE - minimum - image_size) / + CONFIG_PHYSICAL_ALIGN + 1; + + random_addr = get_random_long("Virtual") % slots; + + return random_addr * CONFIG_PHYSICAL_ALIGN + minimum; +} + +/* + * Since this function examines addresses much more numerically, + * it takes the input and output pointers as 'unsigned long'. + */ +unsigned char *choose_random_location(unsigned long input, + unsigned long input_size, + unsigned long output, + unsigned long output_size) +{ + unsigned long choice = output; + unsigned long random_addr; + +#ifdef CONFIG_HIBERNATION + if (!cmdline_find_option_bool("kaslr")) { + warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected)."); + goto out; + } +#else + if (cmdline_find_option_bool("nokaslr")) { + warn("KASLR disabled: 'nokaslr' on cmdline."); + goto out; + } +#endif + + boot_params->hdr.loadflags |= KASLR_FLAG; + + /* Record the various known unsafe memory ranges. */ + mem_avoid_init(input, input_size, output); + + /* Walk e820 and find a random address. */ + random_addr = find_random_phys_addr(output, output_size); + if (!random_addr) { + warn("KASLR disabled: could not find suitable E820 region!"); + goto out; + } + + /* Always enforce the minimum. */ + if (random_addr < choice) + goto out; + + choice = random_addr; + + add_identity_map(choice, output_size); + + /* This actually loads the identity pagetable on x86_64. */ + finalize_identity_maps(); +out: + return (unsigned char *)choice; +} diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 79dac1758e7c00d8c062be2e3c2b054bc4dfc475..f14db4e21654401940aab6a06c75192e05af721d 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -1,8 +1,10 @@ /* * misc.c * - * This is a collection of several routines from gzip-1.0.3 - * adapted for Linux. + * This is a collection of several routines used to extract the kernel + * which includes KASLR relocation, decompression, ELF parsing, and + * relocation processing. Additionally included are the screen and serial + * output functions and related debugging support functions. * * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 * puts by Nick Holloway 1993, better puts by Martin Mares 1995 @@ -10,111 +12,37 @@ */ #include "misc.h" +#include "error.h" #include "../string.h" - -/* WARNING!! - * This code is compiled with -fPIC and it is relocated dynamically - * at run time, but no relocation processing is performed. - * This means that it is not safe to place pointers in static structures. - */ +#include "../voffset.h" /* - * Getting to provable safe in place decompression is hard. - * Worst case behaviours need to be analyzed. - * Background information: - * - * The file layout is: - * magic[2] - * method[1] - * flags[1] - * timestamp[4] - * extraflags[1] - * os[1] - * compressed data blocks[N] - * crc[4] orig_len[4] - * - * resulting in 18 bytes of non compressed data overhead. - * - * Files divided into blocks - * 1 bit (last block flag) - * 2 bits (block type) - * - * 1 block occurs every 32K -1 bytes or when there 50% compression - * has been achieved. The smallest block type encoding is always used. - * - * stored: - * 32 bits length in bytes. - * - * fixed: - * magic fixed tree. - * symbols. - * - * dynamic: - * dynamic tree encoding. - * symbols. - * - * - * The buffer for decompression in place is the length of the - * uncompressed data, plus a small amount extra to keep the algorithm safe. - * The compressed data is placed at the end of the buffer. The output - * pointer is placed at the start of the buffer and the input pointer - * is placed where the compressed data starts. Problems will occur - * when the output pointer overruns the input pointer. - * - * The output pointer can only overrun the input pointer if the input - * pointer is moving faster than the output pointer. A condition only - * triggered by data whose compressed form is larger than the uncompressed - * form. - * - * The worst case at the block level is a growth of the compressed data - * of 5 bytes per 32767 bytes. - * - * The worst case internal to a compressed block is very hard to figure. - * The worst case can at least be boundined by having one bit that represents - * 32764 bytes and then all of the rest of the bytes representing the very - * very last byte. - * - * All of which is enough to compute an amount of extra data that is required - * to be safe. To avoid problems at the block level allocating 5 extra bytes - * per 32767 bytes of data is sufficient. To avoind problems internal to a - * block adding an extra 32767 bytes (the worst case uncompressed block size) - * is sufficient, to ensure that in the worst case the decompressed data for - * block will stop the byte before the compressed data for a block begins. - * To avoid problems with the compressed data's meta information an extra 18 - * bytes are needed. Leading to the formula: - * - * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. - * - * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. - * Adding 32768 instead of 32767 just makes for round numbers. - * Adding the decompressor_size is necessary as it musht live after all - * of the data as well. Last I measured the decompressor is about 14K. - * 10K of actual data and 4K of bss. - * + * WARNING!! + * This code is compiled with -fPIC and it is relocated dynamically at + * run time, but no relocation processing is performed. This means that + * it is not safe to place pointers in static structures. */ -/* - * gzip declarations - */ +/* Macros used by the included decompressor code below. */ #define STATIC static -#undef memcpy - /* - * Use a normal definition of memset() from string.c. There are already + * Use normal definitions of mem*() from string.c. There are already * included header files which expect a definition of memset() and by * the time we define memset macro, it is too late. */ +#undef memcpy #undef memset #define memzero(s, n) memset((s), 0, (n)) +#define memmove memmove - -static void error(char *m); +/* Functions used by the included decompressor code below. */ +void *memmove(void *dest, const void *src, size_t n); /* * This is set up by the setup-routine at boot-time */ -struct boot_params *real_mode; /* Pointer to real-mode data */ +struct boot_params *boot_params; memptr free_mem_ptr; memptr free_mem_end_ptr; @@ -146,12 +74,16 @@ static int lines, cols; #ifdef CONFIG_KERNEL_LZ4 #include "../../../../lib/decompress_unlz4.c" #endif +/* + * NOTE: When adding a new decompressor, please update the analysis in + * ../header.S. + */ static void scroll(void) { int i; - memcpy(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); + memmove(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); for (i = (lines - 1) * cols * 2; i < lines * cols * 2; i += 2) vidmem[i] = ' '; } @@ -184,12 +116,12 @@ void __putstr(const char *s) } } - if (real_mode->screen_info.orig_video_mode == 0 && + if (boot_params->screen_info.orig_video_mode == 0 && lines == 0 && cols == 0) return; - x = real_mode->screen_info.orig_x; - y = real_mode->screen_info.orig_y; + x = boot_params->screen_info.orig_x; + y = boot_params->screen_info.orig_y; while ((c = *s++) != '\0') { if (c == '\n') { @@ -210,8 +142,8 @@ void __putstr(const char *s) } } - real_mode->screen_info.orig_x = x; - real_mode->screen_info.orig_y = y; + boot_params->screen_info.orig_x = x; + boot_params->screen_info.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ outb(14, vidport); @@ -237,23 +169,13 @@ void __puthex(unsigned long value) } } -static void error(char *x) -{ - error_putstr("\n\n"); - error_putstr(x); - error_putstr("\n\n -- System halted"); - - while (1) - asm("hlt"); -} - #if CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len) { int *reloc; unsigned long delta, map, ptr; unsigned long min_addr = (unsigned long)output; - unsigned long max_addr = min_addr + output_len; + unsigned long max_addr = min_addr + (VO___bss_start - VO__text); /* * Calculate the delta between where vmlinux was linked to load @@ -295,7 +217,7 @@ static void handle_relocations(void *output, unsigned long output_len) * So we work backwards from the end of the decompressed image. */ for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) { - int extended = *reloc; + long extended = *reloc; extended += map; ptr = (unsigned long)extended; @@ -372,9 +294,7 @@ static void parse_elf(void *output) #else dest = (void *)(phdr->p_paddr); #endif - memcpy(dest, - output + phdr->p_offset, - phdr->p_filesz); + memmove(dest, output + phdr->p_offset, phdr->p_filesz); break; default: /* Ignore other PT_* */ break; } @@ -383,23 +303,41 @@ static void parse_elf(void *output) free(phdrs); } -asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, +/* + * The compressed kernel image (ZO), has been moved so that its position + * is against the end of the buffer used to hold the uncompressed kernel + * image (VO) and the execution environment (.bss, .brk), which makes sure + * there is room to do the in-place decompression. (See header.S for the + * calculations.) + * + * |-----compressed kernel image------| + * V V + * 0 extract_offset +INIT_SIZE + * |-----------|---------------|-------------------------|--------| + * | | | | + * VO__text startup_32 of ZO VO__end ZO__end + * ^ ^ + * |-------uncompressed kernel image---------| + * + */ +asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, - unsigned long output_len, - unsigned long run_size) + unsigned long output_len) { + const unsigned long kernel_total_size = VO__end - VO__text; unsigned char *output_orig = output; - real_mode = rmode; + /* Retain x86 boot parameters pointer passed from startup_32/64. */ + boot_params = rmode; - /* Clear it for solely in-kernel use */ - real_mode->hdr.loadflags &= ~KASLR_FLAG; + /* Clear flags intended for solely in-kernel use. */ + boot_params->hdr.loadflags &= ~KASLR_FLAG; - sanitize_boot_params(real_mode); + sanitize_boot_params(boot_params); - if (real_mode->screen_info.orig_video_mode == 7) { + if (boot_params->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; } else { @@ -407,11 +345,11 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, vidport = 0x3d4; } - lines = real_mode->screen_info.orig_video_lines; - cols = real_mode->screen_info.orig_video_cols; + lines = boot_params->screen_info.orig_video_lines; + cols = boot_params->screen_info.orig_video_cols; console_init(); - debug_putstr("early console in decompress_kernel\n"); + debug_putstr("early console in extract_kernel\n"); free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; @@ -421,16 +359,16 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, debug_putaddr(input_len); debug_putaddr(output); debug_putaddr(output_len); - debug_putaddr(run_size); + debug_putaddr(kernel_total_size); /* * The memory hole needed for the kernel is the larger of either * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(real_mode, input_data, input_len, output, - output_len > run_size ? output_len - : run_size); + output = choose_random_location((unsigned long)input_data, input_len, + (unsigned long)output, + max(output_len, kernel_total_size)); /* Validate memory location choices. */ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3783dc3e10b31b8598eb7621a50edab6c0fc0f41..b6fec1ff10e442bac58885af6bc54223f695480c 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -32,7 +32,7 @@ /* misc.c */ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; -extern struct boot_params *real_mode; /* Pointer to real-mode data */ +extern struct boot_params *boot_params; void __putstr(const char *s); void __puthex(unsigned long value); #define error_putstr(__x) __putstr(__x) @@ -66,26 +66,35 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE -/* aslr.c */ -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +/* kaslr.c */ +unsigned char *choose_random_location(unsigned long input_ptr, unsigned long input_size, - unsigned char *output, + unsigned long output_ptr, unsigned long output_size); /* cpuflags.c */ bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +unsigned char *choose_random_location(unsigned long input_ptr, unsigned long input_size, - unsigned char *output, + unsigned long output_ptr, unsigned long output_size) { - return output; + return (unsigned char *)output_ptr; } #endif +#ifdef CONFIG_X86_64 +void add_identity_map(unsigned long start, unsigned long size); +void finalize_identity_maps(void); +extern unsigned char _pgtable[]; +#else +static inline void add_identity_map(unsigned long start, unsigned long size) +{ } +static inline void finalize_identity_maps(void) +{ } +#endif + #ifdef CONFIG_EARLY_PRINTK /* early_serial_console.c */ extern int early_serial_base; diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index d8222f213182f120c6f1d4c43e9ac477b2c2ca24..72bad2c8debe3bc2dcb2fccc493b76d58eb67e83 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -18,11 +18,10 @@ * * H. Peter Anvin * - * ----------------------------------------------------------------------- */ - -/* - * Compute the desired load offset from a compressed program; outputs - * a small assembly wrapper with the appropriate symbols defined. + * ----------------------------------------------------------------------- + * + * Outputs a small assembly wrapper with the appropriate symbols defined. + * */ #include @@ -35,14 +34,11 @@ int main(int argc, char *argv[]) { uint32_t olen; long ilen; - unsigned long offs; - unsigned long run_size; FILE *f = NULL; int retval = 1; - if (argc < 3) { - fprintf(stderr, "Usage: %s compressed_file run_size\n", - argv[0]); + if (argc < 2) { + fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); goto bail; } @@ -67,29 +63,11 @@ int main(int argc, char *argv[]) ilen = ftell(f); olen = get_unaligned_le32(&olen); - /* - * Now we have the input (compressed) and output (uncompressed) - * sizes, compute the necessary decompression offset... - */ - - offs = (olen > ilen) ? olen - ilen : 0; - offs += olen >> 12; /* Add 8 bytes for each 32K block */ - offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */ - offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ - run_size = atoi(argv[2]); - printf(".section \".rodata..compressed\",\"a\",@progbits\n"); printf(".globl z_input_len\n"); printf("z_input_len = %lu\n", ilen); printf(".globl z_output_len\n"); printf("z_output_len = %lu\n", (unsigned long)olen); - printf(".globl z_extract_offset\n"); - printf("z_extract_offset = 0x%lx\n", offs); - /* z_extract_offset_negative allows simplification of head_32.S */ - printf(".globl z_extract_offset_negative\n"); - printf("z_extract_offset_negative = -0x%lx\n", offs); - printf(".globl z_run_size\n"); - printf("z_run_size = %lu\n", run_size); printf(".globl input_data, input_data_end\n"); printf("input_data:\n"); diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c new file mode 100644 index 0000000000000000000000000000000000000000..34b95df14e694d419432cb4543f2a7bfa082b0e6 --- /dev/null +++ b/arch/x86/boot/compressed/pagetable.c @@ -0,0 +1,129 @@ +/* + * This code is used on x86_64 to create page table identity mappings on + * demand by building up a new set of page tables (or appending to the + * existing ones), and then switching over to them when ready. + */ + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +#include "misc.h" + +/* These actually do the work of building the kernel identity maps. */ +#include +#include +#include "../../mm/ident_map.c" + +/* Used by pgtable.h asm code to force instruction serialization. */ +unsigned long __force_order; + +/* Used to track our page table allocation area. */ +struct alloc_pgt_data { + unsigned char *pgt_buf; + unsigned long pgt_buf_size; + unsigned long pgt_buf_offset; +}; + +/* + * Allocates space for a page table entry, using struct alloc_pgt_data + * above. Besides the local callers, this is used as the allocation + * callback in mapping_info below. + */ +static void *alloc_pgt_page(void *context) +{ + struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; + unsigned char *entry; + + /* Validate there is space available for a new page. */ + if (pages->pgt_buf_offset >= pages->pgt_buf_size) { + debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); + debug_putaddr(pages->pgt_buf_offset); + debug_putaddr(pages->pgt_buf_size); + return NULL; + } + + entry = pages->pgt_buf + pages->pgt_buf_offset; + pages->pgt_buf_offset += PAGE_SIZE; + + return entry; +} + +/* Used to track our allocated page tables. */ +static struct alloc_pgt_data pgt_data; + +/* The top level page table entry pointer. */ +static unsigned long level4p; + +/* Locates and clears a region for a new top level page table. */ +static void prepare_level4(void) +{ + /* + * It should be impossible for this not to already be true, + * but since calling this a second time would rewind the other + * counters, let's just make sure this is reset too. + */ + pgt_data.pgt_buf_offset = 0; + + /* + * If we came here via startup_32(), cr3 will be _pgtable already + * and we must append to the existing area instead of entirely + * overwriting it. + */ + level4p = read_cr3(); + if (level4p == (unsigned long)_pgtable) { + debug_putstr("booted via startup_32()\n"); + pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + } else { + debug_putstr("booted via startup_64()\n"); + pgt_data.pgt_buf = _pgtable; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + level4p = (unsigned long)alloc_pgt_page(&pgt_data); + } +} + +/* + * Adds the specified range to what will become the new identity mappings. + * Once all ranges have been added, the new mapping is activated by calling + * finalize_identity_maps() below. + */ +void add_identity_map(unsigned long start, unsigned long size) +{ + struct x86_mapping_info mapping_info = { + .alloc_pgt_page = alloc_pgt_page, + .context = &pgt_data, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + }; + unsigned long end = start + size; + + /* Make sure we have a top level page table ready to use. */ + if (!level4p) + prepare_level4(); + + /* Align boundary to 2M. */ + start = round_down(start, PMD_SIZE); + end = round_up(end, PMD_SIZE); + if (start >= end) + return; + + /* Build the mapping. */ + kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p, + start, end); +} + +/* + * This switches the page tables to the new level4 that has been built + * via calls to add_identity_map() above. If booted via startup_32(), + * this is effectively a no-op. + */ +void finalize_identity_maps(void) +{ + write_cr3(level4p); +} diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 00e788be1db943360830701be69fdcbf41294b49..cea140ce6b42302deac808ba0c643cf619753781 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,7 +1,16 @@ +/* + * This provides an optimized implementation of memcpy, and a simplified + * implementation of memset and memmove. These are used here because the + * standard kernel runtime versions are not yet available and we don't + * trust the gcc built-in implementations as they may do unexpected things + * (e.g. FPU ops) in the minimal decompression stub execution environment. + */ +#include "error.h" + #include "../string.c" #ifdef CONFIG_X86_32 -void *memcpy(void *dest, const void *src, size_t n) +static void *__memcpy(void *dest, const void *src, size_t n) { int d0, d1, d2; asm volatile( @@ -15,7 +24,7 @@ void *memcpy(void *dest, const void *src, size_t n) return dest; } #else -void *memcpy(void *dest, const void *src, size_t n) +static void *__memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile( @@ -39,3 +48,27 @@ void *memset(void *s, int c, size_t n) ss[i] = c; return s; } + +void *memmove(void *dest, const void *src, size_t n) +{ + unsigned char *d = dest; + const unsigned char *s = src; + + if (d <= s || d - s >= n) + return __memcpy(dest, src, n); + + while (n-- > 0) + d[n] = s[n]; + + return dest; +} + +/* Detect and warn about potential overlaps, but handle them with memmove. */ +void *memcpy(void *dest, const void *src, size_t n) +{ + if (dest > src && dest - src < n) { + warn("Avoiding potentially unsafe overlapping memcpy()!"); + return memmove(dest, src, n); + } + return __memcpy(dest, src, n); +} diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 34d047c982848439e7ed907cbea7b4398bf860cd..e24e0a0c90c9b18702f051de7e64825f5170534c 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -70,5 +70,6 @@ SECTIONS _epgtable = . ; } #endif + . = ALIGN(PAGE_SIZE); /* keep ZO size page aligned */ _end = .; } diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c index 45a07684bbabf3b617dcbd5b53ac5710bcb10e57..f0b8d6d93164d8de8a7879ac6ef89343ad199a47 100644 --- a/arch/x86/boot/early_serial_console.c +++ b/arch/x86/boot/early_serial_console.c @@ -1,3 +1,7 @@ +/* + * Serial port routines for use during early boot reporting. This code is + * included from both the compressed kernel and the regular kernel. + */ #include "boot.h" #define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 6236b9ec4b764cfd988fb47f08cc000a653fbed4..3dd5be33aaa7b091fce5c21c3c8105cbbef5f707 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -440,13 +440,116 @@ setup_data: .quad 0 # 64-bit physical pointer to pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr -#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) +# +# Getting to provably safe in-place decompression is hard. Worst case +# behaviours need to be analyzed. Here let's take the decompression of +# a gzip-compressed kernel as example, to illustrate it: +# +# The file layout of gzip compressed kernel is: +# +# magic[2] +# method[1] +# flags[1] +# timestamp[4] +# extraflags[1] +# os[1] +# compressed data blocks[N] +# crc[4] orig_len[4] +# +# ... resulting in +18 bytes overhead of uncompressed data. +# +# (For more information, please refer to RFC 1951 and RFC 1952.) +# +# Files divided into blocks +# 1 bit (last block flag) +# 2 bits (block type) +# +# 1 block occurs every 32K -1 bytes or when there 50% compression +# has been achieved. The smallest block type encoding is always used. +# +# stored: +# 32 bits length in bytes. +# +# fixed: +# magic fixed tree. +# symbols. +# +# dynamic: +# dynamic tree encoding. +# symbols. +# +# +# The buffer for decompression in place is the length of the uncompressed +# data, plus a small amount extra to keep the algorithm safe. The +# compressed data is placed at the end of the buffer. The output pointer +# is placed at the start of the buffer and the input pointer is placed +# where the compressed data starts. Problems will occur when the output +# pointer overruns the input pointer. +# +# The output pointer can only overrun the input pointer if the input +# pointer is moving faster than the output pointer. A condition only +# triggered by data whose compressed form is larger than the uncompressed +# form. +# +# The worst case at the block level is a growth of the compressed data +# of 5 bytes per 32767 bytes. +# +# The worst case internal to a compressed block is very hard to figure. +# The worst case can at least be bounded by having one bit that represents +# 32764 bytes and then all of the rest of the bytes representing the very +# very last byte. +# +# All of which is enough to compute an amount of extra data that is required +# to be safe. To avoid problems at the block level allocating 5 extra bytes +# per 32767 bytes of data is sufficient. To avoid problems internal to a +# block adding an extra 32767 bytes (the worst case uncompressed block size) +# is sufficient, to ensure that in the worst case the decompressed data for +# block will stop the byte before the compressed data for a block begins. +# To avoid problems with the compressed data's meta information an extra 18 +# bytes are needed. Leading to the formula: +# +# extra_bytes = (uncompressed_size >> 12) + 32768 + 18 +# +# Adding 8 bytes per 32K is a bit excessive but much easier to calculate. +# Adding 32768 instead of 32767 just makes for round numbers. +# +# Above analysis is for decompressing gzip compressed kernel only. Up to +# now 6 different decompressor are supported all together. And among them +# xz stores data in chunks and has maximum chunk of 64K. Hence safety +# margin should be updated to cover all decompressors so that we don't +# need to deal with each of them separately. Please check +# the description in lib/decompressor_xxx.c for specific information. +# +# extra_bytes = (uncompressed_size >> 12) + 65536 + 128 + +#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128) +#if ZO_z_output_len > ZO_z_input_len +# define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \ + ZO_z_input_len) +#else +# define ZO_z_extract_offset ZO_z_extra_bytes +#endif + +/* + * The extract_offset has to be bigger than ZO head section. Otherwise when + * the head code is running to move ZO to the end of the buffer, it will + * overwrite the head code itself. + */ +#if (ZO__ehead - ZO_startup_32) > ZO_z_extract_offset +# define ZO_z_min_extract_offset ((ZO__ehead - ZO_startup_32 + 4095) & ~4095) +#else +# define ZO_z_min_extract_offset ((ZO_z_extract_offset + 4095) & ~4095) +#endif + +#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_min_extract_offset) + #define VO_INIT_SIZE (VO__end - VO__text) #if ZO_INIT_SIZE > VO_INIT_SIZE -#define INIT_SIZE ZO_INIT_SIZE +# define INIT_SIZE ZO_INIT_SIZE #else -#define INIT_SIZE VO_INIT_SIZE +# define INIT_SIZE VO_INIT_SIZE #endif + init_size: .long INIT_SIZE # kernel initialization size handover_offset: .long 0 # Filled in by build.c diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 4f404a64681b879b3e1365af9bb4488562d654dd..0c8d7963483ced9814c6864058aecb272f4cf9f4 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -173,6 +173,7 @@ CONFIG_TIGON3=y CONFIG_NET_TULIP=y CONFIG_E100=y CONFIG_E1000=y +CONFIG_E1000E=y CONFIG_SKY2=y CONFIG_FORCEDETH=y CONFIG_8139TOO=y diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 064c7e2bd7c8e8b9c10e709273217625a17ac6fe..5b7fa14710073bdc6902de2775d27c3f3ed53bb3 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1477,7 +1477,7 @@ static int __init aesni_init(void) } aesni_ctr_enc_tfm = aesni_ctr_enc; #ifdef CONFIG_AS_AVX - if (cpu_has_avx) { + if (boot_cpu_has(X86_FEATURE_AVX)) { /* optimize performance of ctr mode encryption transform */ aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm; pr_info("AES CTR mode by8 optimization enabled\n"); diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index d844569245633845da1fe485dca9c44043be680f..60907c139c4e2a7842f412ebcbfd744085cb6020 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -562,7 +562,10 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 or AES-NI instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 93d8f295784e399c2fc363d8b5602fede9039529..d96429da88eb8bf274620b79de372b2ac72adec4 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -554,7 +554,9 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX or AES-NI instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 8baaff5af0b572b27e9c488083d2d751216775d9..2d5c2e0bd939b9e267102adf3577a91e53abfce1 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -129,7 +129,8 @@ static int __init chacha20_simd_mod_init(void) return -ENODEV; #ifdef CONFIG_AS_AVX2 - chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 && + chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); #endif return crypto_register_alg(&alg); diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 4264a3d595894b1a8e8bbba635f477878b0461dd..e32142bc071d9344533f39d2a6ee5326845b62f2 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -179,11 +179,12 @@ static struct shash_alg alg = { static int __init poly1305_simd_mod_init(void) { - if (!cpu_has_xmm2) + if (!boot_cpu_has(X86_FEATURE_XMM2)) return -ENODEV; #ifdef CONFIG_AS_AVX2 - poly1305_use_avx2 = cpu_has_avx && cpu_has_avx2 && + poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); alg.descsize = sizeof(struct poly1305_simd_desc_ctx); if (poly1305_use_avx2) diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 6d198342e2de4951635c46aa3486c88cbc793274..870f6d812a2dd251392498ce10fabef37a3826ff 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -538,7 +538,7 @@ static int __init init(void) { const char *feature_name; - if (!cpu_has_avx2 || !cpu_has_osxsave) { + if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 8943407e8917a343658c0a3c81ed9ec0dc6b1735..644f97ab8cace2910d91cf9abb798f044aef0c1f 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -600,7 +600,7 @@ static struct crypto_alg serpent_algs[10] = { { static int __init serpent_sse2_init(void) { - if (!cpu_has_xmm2) { + if (!boot_cpu_has(X86_FEATURE_XMM2)) { printk(KERN_INFO "SSE2 instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index 081255cea1ee5d442a75529172e097afce7f396c..9c5af331a956f67bf868a717c9e7a82389c0cda6 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -102,14 +102,14 @@ static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)(struct sha1_mb_mgr *st static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)(struct sha1_mb_mgr *state); static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)(struct sha1_mb_mgr *state); -inline void sha1_init_digest(uint32_t *digest) +static inline void sha1_init_digest(uint32_t *digest) { static const uint32_t initial_digest[SHA1_DIGEST_LENGTH] = {SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }; memcpy(digest, initial_digest, sizeof(initial_digest)); } -inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], +static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint32_t total_len) { uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1); diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index dd14616b773970d13c2886f255c0f76b4eb58450..1024e378a358f69ef49864a0014cd6af5aed3523 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -166,7 +166,7 @@ static struct shash_alg sha1_avx_alg = { static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (cpu_has_avx) + if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 5f4d6086dc5913be7d680882ab94d2286ad3e230..3ae0f43ebd376527ec450f8a2dc16eee98f17fe9 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -201,7 +201,7 @@ static struct shash_alg sha256_avx_algs[] = { { static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (cpu_has_avx) + if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 34e5083d6f36540e967dc755384012ca35afd714..0b17c83d027ddcac6c211b2848486096a84cdbea 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -151,7 +151,7 @@ asmlinkage void sha512_transform_avx(u64 *digest, const char *data, static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (cpu_has_avx) + if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index e79d93d44ecd9c66b1e29078a11aa1ee405c0fa2..ec138e538c44f9acf7f829827d14a3991473763f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -191,7 +191,7 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, long syscall_trace_enter(struct pt_regs *regs) { - u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); if (phase1_result == 0) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 10868aa734dc07e9a438bcdd3d4c7b4aae0106d3..983e5d3a0d271c387e24371ddc3fcd6e7110a27d 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -207,10 +207,7 @@ ENTRY(ret_from_fork) pushl %eax call schedule_tail - GET_THREAD_INFO(%ebp) popl %eax - pushl $0x0202 # Reset kernel eflags - popfl /* When we fork, we trace the syscall return in the child, too. */ movl %esp, %eax @@ -221,10 +218,7 @@ END(ret_from_fork) ENTRY(ret_from_kernel_thread) pushl %eax call schedule_tail - GET_THREAD_INFO(%ebp) popl %eax - pushl $0x0202 # Reset kernel eflags - popfl movl PT_EBP(%esp), %eax call *PT_EBX(%esp) movl $0, PT_EAX(%esp) @@ -251,7 +245,6 @@ ENDPROC(ret_from_kernel_thread) ret_from_exception: preempt_stop(CLBR_ANY) ret_from_intr: - GET_THREAD_INFO(%ebp) #ifdef CONFIG_VM86 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS movb PT_CS(%esp), %al diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 858b555e274b8d763d97d9b9cf14998125bce563..9ee0da1807edff462536e3628e1db217bb835cd8 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -372,9 +372,6 @@ END(ptregs_\func) ENTRY(ret_from_fork) LOCK ; btr $TIF_FORK, TI_flags(%r8) - pushq $0x0002 - popfq /* reset kernel eflags */ - call schedule_tail /* rdi: 'prev' task parameter */ testb $3, CS(%rsp) /* from kernel_thread? */ @@ -781,19 +778,25 @@ ENTRY(native_load_gs_index) pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS -gs_change: +.Lgs_change: movl %edi, %gs -2: mfence /* workaround */ +2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS popfq ret END(native_load_gs_index) - _ASM_EXTABLE(gs_change, bad_gs) + _ASM_EXTABLE(.Lgs_change, bad_gs) .section .fixup, "ax" /* running with kernelgs */ bad_gs: SWAPGS /* switch back to user gs */ +.macro ZAP_GS + /* This can't be a string because the preprocessor needs to see it. */ + movl $__USER_DS, %eax + movl %eax, %gs +.endm + ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG xorl %eax, %eax movl %eax, %gs jmp 2b @@ -1019,13 +1022,13 @@ ENTRY(error_entry) movl %ecx, %eax /* zero extend */ cmpq %rax, RIP+8(%rsp) je .Lbstep_iret - cmpq $gs_change, RIP+8(%rsp) + cmpq $.Lgs_change, RIP+8(%rsp) jne .Lerror_entry_done /* - * hack: gs_change can fail with user gsbase. If this happens, fix up + * hack: .Lgs_change can fail with user gsbase. If this happens, fix up * gsbase and proceed. We'll fix up the exception and land in - * gs_change's error handler with kernel gsbase. + * .Lgs_change's error handler with kernel gsbase. */ jmp .Lerror_entry_from_usermode_swapgs diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 847f2f0c31e50d1029a2e39600ed86e772ef8847..e1721dafbcb13fab9230cc20d598b18ebef8306b 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -72,24 +72,23 @@ ENTRY(entry_SYSENTER_compat) pushfq /* pt_regs->flags (except IF = 0) */ orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ pushq $__USER32_CS /* pt_regs->cs */ - xorq %r8,%r8 - pushq %r8 /* pt_regs->ip = 0 (placeholder) */ + pushq $0 /* pt_regs->ip = 0 (placeholder) */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 = 0 */ - pushq %r8 /* pt_regs->r9 = 0 */ - pushq %r8 /* pt_regs->r10 = 0 */ - pushq %r8 /* pt_regs->r11 = 0 */ + pushq $0 /* pt_regs->r8 = 0 */ + pushq $0 /* pt_regs->r9 = 0 */ + pushq $0 /* pt_regs->r10 = 0 */ + pushq $0 /* pt_regs->r11 = 0 */ pushq %rbx /* pt_regs->rbx */ pushq %rbp /* pt_regs->rbp (will be overwritten) */ - pushq %r8 /* pt_regs->r12 = 0 */ - pushq %r8 /* pt_regs->r13 = 0 */ - pushq %r8 /* pt_regs->r14 = 0 */ - pushq %r8 /* pt_regs->r15 = 0 */ + pushq $0 /* pt_regs->r12 = 0 */ + pushq $0 /* pt_regs->r13 = 0 */ + pushq $0 /* pt_regs->r14 = 0 */ + pushq $0 /* pt_regs->r15 = 0 */ cld /* @@ -205,17 +204,16 @@ ENTRY(entry_SYSCALL_compat) pushq %rdx /* pt_regs->dx */ pushq %rbp /* pt_regs->cx (stashed in bp) */ pushq $-ENOSYS /* pt_regs->ax */ - xorq %r8,%r8 - pushq %r8 /* pt_regs->r8 = 0 */ - pushq %r8 /* pt_regs->r9 = 0 */ - pushq %r8 /* pt_regs->r10 = 0 */ - pushq %r8 /* pt_regs->r11 = 0 */ + pushq $0 /* pt_regs->r8 = 0 */ + pushq $0 /* pt_regs->r9 = 0 */ + pushq $0 /* pt_regs->r10 = 0 */ + pushq $0 /* pt_regs->r11 = 0 */ pushq %rbx /* pt_regs->rbx */ pushq %rbp /* pt_regs->rbp (will be overwritten) */ - pushq %r8 /* pt_regs->r12 = 0 */ - pushq %r8 /* pt_regs->r13 = 0 */ - pushq %r8 /* pt_regs->r14 = 0 */ - pushq %r8 /* pt_regs->r15 = 0 */ + pushq $0 /* pt_regs->r12 = 0 */ + pushq $0 /* pt_regs->r13 = 0 */ + pushq $0 /* pt_regs->r14 = 0 */ + pushq $0 /* pt_regs->r15 = 0 */ /* * User mode is traced as though IRQs are on, and SYSENTER @@ -316,11 +314,10 @@ ENTRY(entry_INT80_compat) pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - xorq %r8,%r8 - pushq %r8 /* pt_regs->r8 = 0 */ - pushq %r8 /* pt_regs->r9 = 0 */ - pushq %r8 /* pt_regs->r10 = 0 */ - pushq %r8 /* pt_regs->r11 = 0 */ + pushq $0 /* pt_regs->r8 = 0 */ + pushq $0 /* pt_regs->r9 = 0 */ + pushq $0 /* pt_regs->r10 = 0 */ + pushq $0 /* pt_regs->r11 = 0 */ pushq %rbx /* pt_regs->rbx */ pushq %rbp /* pt_regs->rbp */ pushq %r12 /* pt_regs->r12 */ diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index cac6d17ce5db000ea008d63d5905acbd6fea21e0..555263e385c9210af5f70e08dd27871005c5a865 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -374,3 +374,5 @@ 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat/ptregs +534 x32 preadv2 compat_sys_preadv2 +535 x32 pwritev2 compat_sys_pwritev2 diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 03c3eb77bfcebce765b838271b6d7789365ff3de..2f02d23a05ef4b20c8a620c9bef4e7943e02ab1f 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include @@ -28,16 +27,6 @@ extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); -#ifdef CONFIG_HPET_TIMER -extern u8 hpet_page - __attribute__((visibility("hidden"))); - -static notrace cycle_t vread_hpet(void) -{ - return *(const volatile u32 *)(&hpet_page + HPET_COUNTER); -} -#endif - #ifdef CONFIG_PARAVIRT_CLOCK extern u8 pvclock_page __attribute__((visibility("hidden"))); @@ -195,10 +184,6 @@ notrace static inline u64 vgetsns(int *mode) if (gtod->vclock_mode == VCLOCK_TSC) cycles = vread_tsc(); -#ifdef CONFIG_HPET_TIMER - else if (gtod->vclock_mode == VCLOCK_HPET) - cycles = vread_hpet(); -#endif #ifdef CONFIG_PARAVIRT_CLOCK else if (gtod->vclock_mode == VCLOCK_PVCLOCK) cycles = vread_pvclock(mode); diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 4158acc17df07c355f0d86e19fd3b4f9b43e590e..a708aa90b507fce048e05676f073f7b9f3347e59 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -25,7 +25,7 @@ SECTIONS * segment. */ - vvar_start = . - 3 * PAGE_SIZE; + vvar_start = . - 2 * PAGE_SIZE; vvar_page = vvar_start; /* Place all vvars at the offsets in asm/vvar.h. */ @@ -35,8 +35,7 @@ SECTIONS #undef __VVAR_KERNEL_LDS #undef EMIT_VVAR - hpet_page = vvar_start + PAGE_SIZE; - pvclock_page = vvar_start + 2 * PAGE_SIZE; + pvclock_page = vvar_start + PAGE_SIZE; . = SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 10f704584922653fd208646cac11c4f8a9cd776b..b3cf81333a54edf146c26a747db201111b7863d3 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -129,16 +128,6 @@ static int vvar_fault(const struct vm_special_mapping *sm, if (sym_offset == image->sym_vvar_page) { ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); - } else if (sym_offset == image->sym_hpet_page) { -#ifdef CONFIG_HPET_TIMER - if (hpet_address && vclock_was_used(VCLOCK_HPET)) { - ret = vm_insert_pfn_prot( - vma, - (unsigned long)vmf->virtual_address, - hpet_address >> PAGE_SHIFT, - pgprot_noncached(PAGE_READONLY)); - } -#endif } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = pvclock_pvti_cpu0_va(); diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..98397db5ceaece5e6531da82db5328ecff28033a --- /dev/null +++ b/arch/x86/events/Kconfig @@ -0,0 +1,36 @@ +menu "Performance monitoring" + +config PERF_EVENTS_INTEL_UNCORE + tristate "Intel uncore performance events" + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + default y + ---help--- + Include support for Intel uncore performance events. These are + available on NehalemEX and more modern processors. + +config PERF_EVENTS_INTEL_RAPL + tristate "Intel rapl performance events" + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + default y + ---help--- + Include support for Intel rapl performance events for power + monitoring on modern processors. + +config PERF_EVENTS_INTEL_CSTATE + tristate "Intel cstate performance events" + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + default y + ---help--- + Include support for Intel cstate performance events for power + monitoring on modern processors. + +config PERF_EVENTS_AMD_POWER + depends on PERF_EVENTS && CPU_SUP_AMD + tristate "AMD Processor Power Reporting Mechanism" + ---help--- + Provide power reporting mechanism support for AMD processors. + Currently, it leverages X86_FEATURE_ACC_POWER + (CPUID Fn8000_0007_EDX[12]) interface to calculate the + average power consumption on Family 15h processors. + +endmenu diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index f59618a3999058515027f413e7691fa2f7cdbb02..1d392c39fe560a782ae7fd71296ee0cc64f47fe5 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -6,9 +6,6 @@ obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif -obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o msr.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o + +obj-$(CONFIG_CPU_SUP_INTEL) += msr.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/ diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 3db9569e658c86a794df30454904e9b06de4ea4c..98ac57381bf9bb1b2bea222dd1c6b265e9f40a01 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -263,6 +263,7 @@ static const struct attribute_group *amd_uncore_attr_groups[] = { }; static struct pmu amd_nb_pmu = { + .task_ctx_nr = perf_invalid_context, .attr_groups = amd_uncore_attr_groups, .name = "amd_nb", .event_init = amd_uncore_event_init, @@ -274,6 +275,7 @@ static struct pmu amd_nb_pmu = { }; static struct pmu amd_l2_pmu = { + .task_ctx_nr = perf_invalid_context, .attr_groups = amd_uncore_attr_groups, .name = "amd_l2", .event_init = amd_uncore_event_init, diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 041e442a3e2806ed884584758cb8e62abd809e36..73a75aa5a66db39d69a6c4f60e091f32fd570c04 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -360,6 +360,9 @@ int x86_add_exclusive(unsigned int what) { int i; + if (x86_pmu.lbr_pt_coexist) + return 0; + if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { mutex_lock(&pmc_reserve_mutex); for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { @@ -380,6 +383,9 @@ int x86_add_exclusive(unsigned int what) void x86_del_exclusive(unsigned int what) { + if (x86_pmu.lbr_pt_coexist) + return; + atomic_dec(&x86_pmu.lbr_exclusive[what]); atomic_dec(&active_events); } @@ -1518,7 +1524,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) static void __init pmu_check_apic(void) { - if (cpu_has_apic) + if (boot_cpu_has(X86_FEATURE_APIC)) return; x86_pmu.apic = 0; @@ -2177,7 +2183,7 @@ void arch_perf_update_userpage(struct perf_event *event, * cap_user_time_zero doesn't make sense when we're using a different * time base for the records. */ - if (event->clock == &local_clock) { + if (!event->attr.use_clockid) { userpg->cap_user_time_zero = 1; userpg->time_zero = data->cyc2ns_offset; } @@ -2277,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; @@ -2337,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..3660b2cf245ad7d22eaf0f9d1cf2602d270b62c2 --- /dev/null +++ b/arch/x86/events/intel/Makefile @@ -0,0 +1,9 @@ +obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o +obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o +obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl.o +intel-rapl-objs := rapl.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o +intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o +obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o +intel-cstate-objs := cstate.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index b99dc9258c0f9ccc319a15e4d6ec170b0ea13e17..0a6e393a2e6298bb9240831714d5915ab26abc5a 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -171,18 +171,6 @@ static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head) memset(page_address(phys->page) + index, 0, phys->size - index); } -static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts) -{ - if (buf->snapshot) - return false; - - if (local_read(&buf->data_size) >= bts->handle.size || - bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE) - return true; - - return false; -} - static void bts_update(struct bts_ctx *bts) { int cpu = raw_smp_processor_id(); @@ -213,18 +201,15 @@ static void bts_update(struct bts_ctx *bts) } } +static int +bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); + static void __bts_event_start(struct perf_event *event) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); struct bts_buffer *buf = perf_get_aux(&bts->handle); u64 config = 0; - if (!buf || bts_buffer_is_full(buf, bts)) - return; - - event->hw.itrace_started = 1; - event->hw.state = 0; - if (!buf->snapshot) config |= ARCH_PERFMON_EVENTSEL_INT; if (!event->attr.exclude_kernel) @@ -241,16 +226,41 @@ static void __bts_event_start(struct perf_event *event) wmb(); intel_pmu_enable_bts(config); + } static void bts_event_start(struct perf_event *event, int flags) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf; + + buf = perf_aux_output_begin(&bts->handle, event); + if (!buf) + goto fail_stop; + + if (bts_buffer_reset(buf, &bts->handle)) + goto fail_end_stop; + + bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; + bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; + bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; + + event->hw.itrace_started = 1; + event->hw.state = 0; __bts_event_start(event); /* PMI handler: this counter is running and likely generating PMIs */ ACCESS_ONCE(bts->started) = 1; + + return; + +fail_end_stop: + perf_aux_output_end(&bts->handle, 0, false); + +fail_stop: + event->hw.state = PERF_HES_STOPPED; } static void __bts_event_stop(struct perf_event *event) @@ -269,15 +279,32 @@ static void __bts_event_stop(struct perf_event *event) static void bts_event_stop(struct perf_event *event, int flags) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf = perf_get_aux(&bts->handle); /* PMI handler: don't restart this counter */ ACCESS_ONCE(bts->started) = 0; __bts_event_stop(event); - if (flags & PERF_EF_UPDATE) + if (flags & PERF_EF_UPDATE) { bts_update(bts); + + if (buf) { + if (buf->snapshot) + bts->handle.head = + local_xchg(&buf->data_size, + buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), + !!local_xchg(&buf->lost, 0)); + } + + cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; + cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; + } } void intel_bts_enable_local(void) @@ -417,34 +444,14 @@ int intel_bts_interrupt(void) static void bts_event_del(struct perf_event *event, int mode) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); - bts_event_stop(event, PERF_EF_UPDATE); - - if (buf) { - if (buf->snapshot) - bts->handle.head = - local_xchg(&buf->data_size, - buf->nr_pages << PAGE_SHIFT); - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); - } - - cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; - cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; } static int bts_event_add(struct perf_event *event, int mode) { - struct bts_buffer *buf; struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int ret = -EBUSY; event->hw.state = PERF_HES_STOPPED; @@ -454,26 +461,10 @@ static int bts_event_add(struct perf_event *event, int mode) if (bts->handle.event) return -EBUSY; - buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return -EINVAL; - - ret = bts_buffer_reset(buf, &bts->handle); - if (ret) { - perf_aux_output_end(&bts->handle, 0, false); - return ret; - } - - bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; - bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; - bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; - if (mode & PERF_EF_START) { bts_event_start(event, 0); - if (hwc->state & PERF_HES_STOPPED) { - bts_event_del(event, 0); - return -EBUSY; - } + if (hwc->state & PERF_HES_STOPPED) + return -EINVAL; } return 0; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a6fd4dbcf820abf727b6118c0084a6877ec0340d..7c666958a6250354aa204d24e73f94670264ffee 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1465,6 +1465,140 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; +static struct extra_reg intel_glm_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x360005ffbfull, RSP_1), + EVENT_EXTRA_END +}; + +#define GLM_DEMAND_DATA_RD BIT_ULL(0) +#define GLM_DEMAND_RFO BIT_ULL(1) +#define GLM_ANY_RESPONSE BIT_ULL(16) +#define GLM_SNP_NONE_OR_MISS BIT_ULL(33) +#define GLM_DEMAND_READ GLM_DEMAND_DATA_RD +#define GLM_DEMAND_WRITE GLM_DEMAND_RFO +#define GLM_DEMAND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) +#define GLM_LLC_ACCESS GLM_ANY_RESPONSE +#define GLM_SNP_ANY (GLM_SNP_NONE_OR_MISS|SNB_NO_FWD|SNB_HITM) +#define GLM_LLC_MISS (GLM_SNP_ANY|SNB_NON_DRAM) + +static __initconst const u64 glm_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */ + [C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + }, + [C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */ + [C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + }, + [C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + }, +}; + +static __initconst const u64 glm_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_READ| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_READ| + GLM_LLC_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_WRITE| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_WRITE| + GLM_LLC_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_PREFETCH| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_PREFETCH| + GLM_LLC_MISS, + }, + }, +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -3447,7 +3581,7 @@ __init int intel_pmu_init(void) memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - intel_pmu_lbr_init_atom(); + intel_pmu_lbr_init_slm(); x86_pmu.event_constraints = intel_slm_event_constraints; x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; @@ -3456,6 +3590,30 @@ __init int intel_pmu_init(void) pr_cont("Silvermont events, "); break; + case 92: /* 14nm Atom "Goldmont" */ + case 95: /* 14nm Atom "Goldmont Denverton" */ + memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_glm_pebs_event_constraints; + x86_pmu.extra_regs = intel_glm_extra_regs; + /* + * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS + * for precise cycles. + * :pp is identical to :ppp + */ + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.lbr_pt_coexist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + pr_cont("Goldmont events, "); + break; + case 37: /* 32nm Westmere */ case 44: /* 32nm Westmere-EP */ case 47: /* 32nm Westmere-EX */ @@ -3708,7 +3866,7 @@ __init int intel_pmu_init(void) c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; } c->idxmsk64 &= - ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); + ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); c->weight = hweight64(c->idxmsk64); } } diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 7946c4231169ff81ed2c22cd538cdf289e8930f2..9ba4e4136a1539ba791052b726c12343694ea85f 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -91,6 +91,8 @@ #include #include "../perf_event.h" +MODULE_LICENSE("GPL"); + #define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ struct kobj_attribute *attr, \ @@ -106,22 +108,27 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf); +/* Model -> events mapping */ +struct cstate_model { + unsigned long core_events; + unsigned long pkg_events; + unsigned long quirks; +}; + +/* Quirk flags */ +#define SLM_PKG_C6_USE_C7_MSR (1UL << 0) + struct perf_cstate_msr { u64 msr; struct perf_pmu_events_attr *attr; - bool (*test)(int idx); }; /* cstate_core PMU */ - static struct pmu cstate_core_pmu; static bool has_cstate_core; -enum perf_cstate_core_id { - /* - * cstate_core events - */ +enum perf_cstate_core_events { PERF_CSTATE_CORE_C1_RES = 0, PERF_CSTATE_CORE_C3_RES, PERF_CSTATE_CORE_C6_RES, @@ -130,69 +137,16 @@ enum perf_cstate_core_id { PERF_CSTATE_CORE_EVENT_MAX, }; -bool test_core(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C1_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - } - - return false; -} - PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); static struct perf_cstate_msr core_msr[] = { - [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, }, - [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, }, - [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, }, - [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, }, + [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1 }, + [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3 }, + [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6 }, + [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7 }, }; static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { @@ -234,18 +188,11 @@ static const struct attribute_group *core_attr_groups[] = { NULL, }; -/* cstate_core PMU end */ - - /* cstate_pkg PMU */ - static struct pmu cstate_pkg_pmu; static bool has_cstate_pkg; -enum perf_cstate_pkg_id { - /* - * cstate_pkg events - */ +enum perf_cstate_pkg_events { PERF_CSTATE_PKG_C2_RES = 0, PERF_CSTATE_PKG_C3_RES, PERF_CSTATE_PKG_C6_RES, @@ -257,69 +204,6 @@ enum perf_cstate_pkg_id { PERF_CSTATE_PKG_EVENT_MAX, }; -bool test_pkg(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == PERF_CSTATE_PKG_C3_RES || - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 69: /* 22nm Haswell ULT */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == PERF_CSTATE_PKG_C3_RES || - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES || - idx == PERF_CSTATE_PKG_C8_RES || - idx == PERF_CSTATE_PKG_C9_RES || - idx == PERF_CSTATE_PKG_C10_RES) - return true; - break; - } - - return false; -} - PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); @@ -329,13 +213,13 @@ PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); static struct perf_cstate_msr pkg_msr[] = { - [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, }, - [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, }, - [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, }, - [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, }, - [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, }, - [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, }, - [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, }, + [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2 }, + [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3 }, + [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6 }, + [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7 }, + [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8 }, + [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9 }, + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10 }, }; static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { @@ -366,8 +250,6 @@ static const struct attribute_group *pkg_attr_groups[] = { NULL, }; -/* cstate_pkg PMU end*/ - static ssize_t cstate_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) @@ -385,7 +267,7 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev, static int cstate_pmu_event_init(struct perf_event *event) { u64 cfg = event->attr.config; - int ret = 0; + int cpu; if (event->attr.type != event->pmu->type) return -ENOENT; @@ -400,26 +282,36 @@ static int cstate_pmu_event_init(struct perf_event *event) event->attr.sample_period) /* no sampling */ return -EINVAL; + if (event->cpu < 0) + return -EINVAL; + if (event->pmu == &cstate_core_pmu) { if (cfg >= PERF_CSTATE_CORE_EVENT_MAX) return -EINVAL; if (!core_msr[cfg].attr) return -EINVAL; event->hw.event_base = core_msr[cfg].msr; + cpu = cpumask_any_and(&cstate_core_cpu_mask, + topology_sibling_cpumask(event->cpu)); } else if (event->pmu == &cstate_pkg_pmu) { if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) return -EINVAL; if (!pkg_msr[cfg].attr) return -EINVAL; event->hw.event_base = pkg_msr[cfg].msr; - } else + cpu = cpumask_any_and(&cstate_pkg_cpu_mask, + topology_core_cpumask(event->cpu)); + } else { return -ENOENT; + } + + if (cpu >= nr_cpu_ids) + return -ENODEV; - /* must be done before validate_group */ + event->cpu = cpu; event->hw.config = cfg; event->hw.idx = -1; - - return ret; + return 0; } static inline u64 cstate_pmu_read_counter(struct perf_event *event) @@ -469,172 +361,91 @@ static int cstate_pmu_event_add(struct perf_event *event, int mode) return 0; } +/* + * Check if exiting cpu is the designated reader. If so migrate the + * events when there is a valid target available + */ static void cstate_cpu_exit(int cpu) { - int i, id, target; + unsigned int target; - /* cpu exit for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_core_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0) + if (has_cstate_core && + cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) { + + target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); + /* Migrate events if there is a valid target */ + if (target < nr_cpu_ids) { cpumask_set_cpu(target, &cstate_core_cpu_mask); - WARN_ON(cpumask_empty(&cstate_core_cpu_mask)); - if (target >= 0) perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); + } } - /* cpu exit for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_physical_package_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0) + if (has_cstate_pkg && + cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) { + + target = cpumask_any_but(topology_core_cpumask(cpu), cpu); + /* Migrate events if there is a valid target */ + if (target < nr_cpu_ids) { cpumask_set_cpu(target, &cstate_pkg_cpu_mask); - WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask)); - if (target >= 0) perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); + } } } static void cstate_cpu_init(int cpu) { - int i, id; + unsigned int target; - /* cpu init for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - for_each_cpu(i, &cstate_core_cpu_mask) { - if (id == topology_core_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_core_cpu_mask); - } + /* + * If this is the first online thread of that core, set it in + * the core cpu mask as the designated reader. + */ + target = cpumask_any_and(&cstate_core_cpu_mask, + topology_sibling_cpumask(cpu)); - /* cpu init for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - for_each_cpu(i, &cstate_pkg_cpu_mask) { - if (id == topology_physical_package_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); - } + if (has_cstate_core && target >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_core_cpu_mask); + + /* + * If this is the first online thread of that package, set it + * in the package cpu mask as the designated reader. + */ + target = cpumask_any_and(&cstate_pkg_cpu_mask, + topology_core_cpumask(cpu)); + if (has_cstate_pkg && target >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); } static int cstate_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) + unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - break; case CPU_STARTING: cstate_cpu_init(cpu); break; - case CPU_UP_CANCELED: - case CPU_DYING: - break; - case CPU_ONLINE: - case CPU_DEAD: - break; case CPU_DOWN_PREPARE: cstate_cpu_exit(cpu); break; default: break; } - return NOTIFY_OK; } -/* - * Probe the cstate events and insert the available one into sysfs attrs - * Return false if there is no available events. - */ -static bool cstate_probe_msr(struct perf_cstate_msr *msr, - struct attribute **events_attrs, - int max_event_nr) -{ - int i, j = 0; - u64 val; - - /* Probe the cstate events. */ - for (i = 0; i < max_event_nr; i++) { - if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) - msr[i].attr = NULL; - } - - /* List remaining events in the sysfs attrs. */ - for (i = 0; i < max_event_nr; i++) { - if (msr[i].attr) - events_attrs[j++] = &msr[i].attr->attr.attr; - } - events_attrs[j] = NULL; - - return (j > 0) ? true : false; -} - -static int __init cstate_init(void) -{ - /* SLM has different MSR for PKG C6 */ - switch (boot_cpu_data.x86_model) { - case 55: - case 76: - case 77: - pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; - } - - if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX)) - has_cstate_core = true; - - if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX)) - has_cstate_pkg = true; - - return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; -} - -static void __init cstate_cpumask_init(void) -{ - int cpu; - - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) - cstate_cpu_init(cpu); - - __perf_cpu_notifier(cstate_cpu_notifier); - - cpu_notifier_register_done(); -} +static struct notifier_block cstate_cpu_nb = { + .notifier_call = cstate_cpu_notifier, + .priority = CPU_PRI_PERF + 1, +}; static struct pmu cstate_core_pmu = { .attr_groups = core_attr_groups, .name = "cstate_core", .task_ctx_nr = perf_invalid_context, .event_init = cstate_pmu_event_init, - .add = cstate_pmu_event_add, /* must have */ - .del = cstate_pmu_event_del, /* must have */ + .add = cstate_pmu_event_add, + .del = cstate_pmu_event_del, .start = cstate_pmu_event_start, .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, @@ -646,49 +457,203 @@ static struct pmu cstate_pkg_pmu = { .name = "cstate_pkg", .task_ctx_nr = perf_invalid_context, .event_init = cstate_pmu_event_init, - .add = cstate_pmu_event_add, /* must have */ - .del = cstate_pmu_event_del, /* must have */ + .add = cstate_pmu_event_add, + .del = cstate_pmu_event_del, .start = cstate_pmu_event_start, .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, }; -static void __init cstate_pmus_register(void) +static const struct cstate_model nhm_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES), +}; + +static const struct cstate_model snb_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES), +}; + +static const struct cstate_model hswult_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES) | + BIT(PERF_CSTATE_PKG_C8_RES) | + BIT(PERF_CSTATE_PKG_C9_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + +static const struct cstate_model slm_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES), + .quirks = SLM_PKG_C6_USE_C7_MSR, +}; + +#define X86_CSTATES_MODEL(model, states) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) } + +static const struct x86_cpu_id intel_cstates_match[] __initconst = { + X86_CSTATES_MODEL(30, nhm_cstates), /* 45nm Nehalem */ + X86_CSTATES_MODEL(26, nhm_cstates), /* 45nm Nehalem-EP */ + X86_CSTATES_MODEL(46, nhm_cstates), /* 45nm Nehalem-EX */ + + X86_CSTATES_MODEL(37, nhm_cstates), /* 32nm Westmere */ + X86_CSTATES_MODEL(44, nhm_cstates), /* 32nm Westmere-EP */ + X86_CSTATES_MODEL(47, nhm_cstates), /* 32nm Westmere-EX */ + + X86_CSTATES_MODEL(42, snb_cstates), /* 32nm SandyBridge */ + X86_CSTATES_MODEL(45, snb_cstates), /* 32nm SandyBridge-E/EN/EP */ + + X86_CSTATES_MODEL(58, snb_cstates), /* 22nm IvyBridge */ + X86_CSTATES_MODEL(62, snb_cstates), /* 22nm IvyBridge-EP/EX */ + + X86_CSTATES_MODEL(60, snb_cstates), /* 22nm Haswell Core */ + X86_CSTATES_MODEL(63, snb_cstates), /* 22nm Haswell Server */ + X86_CSTATES_MODEL(70, snb_cstates), /* 22nm Haswell + GT3e */ + + X86_CSTATES_MODEL(69, hswult_cstates), /* 22nm Haswell ULT */ + + X86_CSTATES_MODEL(55, slm_cstates), /* 22nm Atom Silvermont */ + X86_CSTATES_MODEL(77, slm_cstates), /* 22nm Atom Avoton/Rangely */ + X86_CSTATES_MODEL(76, slm_cstates), /* 22nm Atom Airmont */ + + X86_CSTATES_MODEL(61, snb_cstates), /* 14nm Broadwell Core-M */ + X86_CSTATES_MODEL(86, snb_cstates), /* 14nm Broadwell Xeon D */ + X86_CSTATES_MODEL(71, snb_cstates), /* 14nm Broadwell + GT3e */ + X86_CSTATES_MODEL(79, snb_cstates), /* 14nm Broadwell Server */ + + X86_CSTATES_MODEL(78, snb_cstates), /* 14nm Skylake Mobile */ + X86_CSTATES_MODEL(94, snb_cstates), /* 14nm Skylake Desktop */ + { }, +}; +MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); + +/* + * Probe the cstate events and insert the available one into sysfs attrs + * Return false if there are no available events. + */ +static bool __init cstate_probe_msr(const unsigned long evmsk, int max, + struct perf_cstate_msr *msr, + struct attribute **attrs) { - int err; + bool found = false; + unsigned int bit; + u64 val; + + for (bit = 0; bit < max; bit++) { + if (test_bit(bit, &evmsk) && !rdmsrl_safe(msr[bit].msr, &val)) { + *attrs++ = &msr[bit].attr->attr.attr; + found = true; + } else { + msr[bit].attr = NULL; + } + } + *attrs = NULL; + + return found; +} + +static int __init cstate_probe(const struct cstate_model *cm) +{ + /* SLM has different MSR for PKG C6 */ + if (cm->quirks & SLM_PKG_C6_USE_C7_MSR) + pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; + + has_cstate_core = cstate_probe_msr(cm->core_events, + PERF_CSTATE_CORE_EVENT_MAX, + core_msr, core_events_attrs); + + has_cstate_pkg = cstate_probe_msr(cm->pkg_events, + PERF_CSTATE_PKG_EVENT_MAX, + pkg_msr, pkg_events_attrs); + + return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; +} + +static inline void cstate_cleanup(void) +{ + if (has_cstate_core) + perf_pmu_unregister(&cstate_core_pmu); + + if (has_cstate_pkg) + perf_pmu_unregister(&cstate_pkg_pmu); +} + +static int __init cstate_init(void) +{ + int cpu, err; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + cstate_cpu_init(cpu); if (has_cstate_core) { err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_core_pmu.name, err); + if (err) { + has_cstate_core = false; + pr_info("Failed to register cstate core pmu\n"); + goto out; + } } if (has_cstate_pkg) { err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_pkg_pmu.name, err); + if (err) { + has_cstate_pkg = false; + pr_info("Failed to register cstate pkg pmu\n"); + cstate_cleanup(); + goto out; + } } + __register_cpu_notifier(&cstate_cpu_nb); +out: + cpu_notifier_register_done(); + return err; } static int __init cstate_pmu_init(void) { + const struct x86_cpu_id *id; int err; - if (cpu_has_hypervisor) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return -ENODEV; + + id = x86_match_cpu(intel_cstates_match); + if (!id) return -ENODEV; - err = cstate_init(); + err = cstate_probe((const struct cstate_model *) id->driver_data); if (err) return err; - cstate_cpumask_init(); - - cstate_pmus_register(); - - return 0; + return cstate_init(); } +module_init(cstate_pmu_init); -device_initcall(cstate_pmu_init); +static void __exit cstate_pmu_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&cstate_cpu_nb); + cstate_cleanup(); + cpu_notifier_register_done(); +} +module_exit(cstate_pmu_exit); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8584b90d8e0bb08e8e9ddc3dd7b6f38dae5363c2..7ce9f3f669e63d2bd4bbf15db801cf8bc5fa7565 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -645,6 +645,12 @@ struct event_constraint intel_slm_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_glm_pebs_event_constraints[] = { + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), + EVENT_CONSTRAINT_END +}; + struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 1ca5d1e7d4f253429fc1c44968d8219dff4086cd..9e2b40cdb05f8c68061e43a14b04aa3aace3429c 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -14,7 +14,8 @@ enum { LBR_FORMAT_EIP_FLAGS = 0x03, LBR_FORMAT_EIP_FLAGS2 = 0x04, LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO, + LBR_FORMAT_TIME = 0x06, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, }; static enum { @@ -464,6 +465,16 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) abort = !!(info & LBR_INFO_ABORT); cycles = (info & LBR_INFO_CYCLES); } + + if (lbr_format == LBR_FORMAT_TIME) { + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; + skip = 1; + cycles = ((to >> 48) & LBR_INFO_CYCLES); + + to = (u64)((((s64)to) << 16) >> 16); + } + if (lbr_flags & LBR_EIP_FLAGS) { mis = !!(from & LBR_FROM_FLAG_MISPRED); pred = !mis; @@ -1049,6 +1060,24 @@ void __init intel_pmu_lbr_init_atom(void) pr_cont("8-deep LBR, "); } +/* slm */ +void __init intel_pmu_lbr_init_slm(void) +{ + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = nhm_lbr_sel_map; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("8-deep LBR, "); +} + /* Knights Landing */ void intel_pmu_lbr_init_knl(void) { diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 09a77dbc73c93110a40d2afbf2dedc86e50ea44f..04bb5fb5a8d7a13308fdcc20d2c1a5930146f07f 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -67,11 +67,13 @@ static struct pt_cap_desc { PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), + PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)), PT_CAP(mtc, 0, CR_EBX, BIT(3)), PT_CAP(topa_output, 0, CR_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), + PT_CAP(num_address_ranges, 1, CR_EAX, 0x3), PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), @@ -125,9 +127,46 @@ static struct attribute_group pt_format_group = { .attrs = pt_formats_attr, }; +static ssize_t +pt_timing_attr_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_attr, attr); + + switch (pmu_attr->id) { + case 0: + return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio); + case 1: + return sprintf(page, "%u:%u\n", + pt_pmu.tsc_art_num, + pt_pmu.tsc_art_den); + default: + break; + } + + return -EINVAL; +} + +PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0, + pt_timing_attr_show); +PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1, + pt_timing_attr_show); + +static struct attribute *pt_timing_attr[] = { + &timing_attr_max_nonturbo_ratio.attr.attr, + &timing_attr_tsc_art_ratio.attr.attr, + NULL, +}; + +static struct attribute_group pt_timing_group = { + .attrs = pt_timing_attr, +}; + static const struct attribute_group *pt_attr_groups[] = { &pt_cap_group, &pt_format_group, + &pt_timing_group, NULL, }; @@ -140,6 +179,23 @@ static int __init pt_pmu_hw_init(void) int ret; long i; + rdmsrl(MSR_PLATFORM_INFO, reg); + pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8; + + /* + * if available, read in TSC to core crystal clock ratio, + * otherwise, zero for numerator stands for "not enumerated" + * as per SDM + */ + if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) { + u32 eax, ebx, ecx, edx; + + cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx); + + pt_pmu.tsc_art_num = ebx; + pt_pmu.tsc_art_den = eax; + } + if (boot_cpu_has(X86_FEATURE_VMX)) { /* * Intel SDM, 36.5 "Tracing post-VMXON" says that @@ -263,6 +319,75 @@ static bool pt_event_valid(struct perf_event *event) * These all are cpu affine and operate on a local PT */ +/* Address ranges and their corresponding msr configuration registers */ +static const struct pt_address_range { + unsigned long msr_a; + unsigned long msr_b; + unsigned int reg_off; +} pt_address_ranges[] = { + { + .msr_a = MSR_IA32_RTIT_ADDR0_A, + .msr_b = MSR_IA32_RTIT_ADDR0_B, + .reg_off = RTIT_CTL_ADDR0_OFFSET, + }, + { + .msr_a = MSR_IA32_RTIT_ADDR1_A, + .msr_b = MSR_IA32_RTIT_ADDR1_B, + .reg_off = RTIT_CTL_ADDR1_OFFSET, + }, + { + .msr_a = MSR_IA32_RTIT_ADDR2_A, + .msr_b = MSR_IA32_RTIT_ADDR2_B, + .reg_off = RTIT_CTL_ADDR2_OFFSET, + }, + { + .msr_a = MSR_IA32_RTIT_ADDR3_A, + .msr_b = MSR_IA32_RTIT_ADDR3_B, + .reg_off = RTIT_CTL_ADDR3_OFFSET, + } +}; + +static u64 pt_config_filters(struct perf_event *event) +{ + struct pt_filters *filters = event->hw.addr_filters; + struct pt *pt = this_cpu_ptr(&pt_ctx); + unsigned int range = 0; + u64 rtit_ctl = 0; + + if (!filters) + return 0; + + perf_event_addr_filters_sync(event); + + for (range = 0; range < filters->nr_filters; range++) { + struct pt_filter *filter = &filters->filter[range]; + + /* + * Note, if the range has zero start/end addresses due + * to its dynamic object not being loaded yet, we just + * go ahead and program zeroed range, which will simply + * produce no data. Note^2: if executable code at 0x0 + * is a concern, we can set up an "invalid" configuration + * such as msr_b < msr_a. + */ + + /* avoid redundant msr writes */ + if (pt->filters.filter[range].msr_a != filter->msr_a) { + wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a); + pt->filters.filter[range].msr_a = filter->msr_a; + } + + if (pt->filters.filter[range].msr_b != filter->msr_b) { + wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b); + pt->filters.filter[range].msr_b = filter->msr_b; + } + + rtit_ctl |= filter->config << pt_address_ranges[range].reg_off; + } + + return rtit_ctl; +} + static void pt_config(struct perf_event *event) { u64 reg; @@ -272,7 +397,8 @@ static void pt_config(struct perf_event *event) wrmsrl(MSR_IA32_RTIT_STATUS, 0); } - reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; + reg = pt_config_filters(event); + reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; if (!event->attr.exclude_kernel) reg |= RTIT_CTL_OS; @@ -709,6 +835,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, /* clear STOP and INT from current entry */ buf->topa_index[buf->stop_pos]->stop = 0; + buf->topa_index[buf->stop_pos]->intr = 0; buf->topa_index[buf->intr_pos]->intr = 0; /* how many pages till the STOP marker */ @@ -733,6 +860,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, buf->intr_pos = idx; buf->topa_index[buf->stop_pos]->stop = 1; + buf->topa_index[buf->stop_pos]->intr = 1; buf->topa_index[buf->intr_pos]->intr = 1; return 0; @@ -919,24 +1047,80 @@ static void pt_buffer_free_aux(void *data) kfree(buf); } -/** - * pt_buffer_is_full() - check if the buffer is full - * @buf: PT buffer. - * @pt: Per-cpu pt handle. - * - * If the user hasn't read data from the output region that aux_head - * points to, the buffer is considered full: the user needs to read at - * least this region and update aux_tail to point past it. - */ -static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt) +static int pt_addr_filters_init(struct perf_event *event) { - if (buf->snapshot) - return false; + struct pt_filters *filters; + int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu); + + if (!pt_cap_get(PT_CAP_num_address_ranges)) + return 0; + + filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node); + if (!filters) + return -ENOMEM; + + if (event->parent) + memcpy(filters, event->parent->hw.addr_filters, + sizeof(*filters)); + + event->hw.addr_filters = filters; + + return 0; +} + +static void pt_addr_filters_fini(struct perf_event *event) +{ + kfree(event->hw.addr_filters); + event->hw.addr_filters = NULL; +} + +static int pt_event_addr_filters_validate(struct list_head *filters) +{ + struct perf_addr_filter *filter; + int range = 0; + + list_for_each_entry(filter, filters, entry) { + /* PT doesn't support single address triggers */ + if (!filter->range) + return -EOPNOTSUPP; + + if (!filter->inode && !kernel_ip(filter->offset)) + return -EINVAL; + + if (++range > pt_cap_get(PT_CAP_num_address_ranges)) + return -EOPNOTSUPP; + } + + return 0; +} + +static void pt_event_addr_filters_sync(struct perf_event *event) +{ + struct perf_addr_filters_head *head = perf_event_addr_filters(event); + unsigned long msr_a, msr_b, *offs = event->addr_filters_offs; + struct pt_filters *filters = event->hw.addr_filters; + struct perf_addr_filter *filter; + int range = 0; + + if (!filters) + return; - if (local_read(&buf->data_size) >= pt->handle.size) - return true; + list_for_each_entry(filter, &head->list, entry) { + if (filter->inode && !offs[range]) { + msr_a = msr_b = 0; + } else { + /* apply the offset */ + msr_a = filter->offset + offs[range]; + msr_b = filter->size + msr_a; + } + + filters->filter[range].msr_a = msr_a; + filters->filter[range].msr_b = msr_b; + filters->filter[range].config = filter->filter ? 1 : 2; + range++; + } - return false; + filters->nr_filters = range; } /** @@ -953,7 +1137,7 @@ void intel_pt_interrupt(void) * after PT has been disabled by pt_event_stop(). Make sure we don't * do anything (particularly, re-enable) for this event here. */ - if (!ACCESS_ONCE(pt->handle_nmi)) + if (!READ_ONCE(pt->handle_nmi)) return; /* @@ -1038,23 +1222,36 @@ EXPORT_SYMBOL_GPL(intel_pt_handle_vmx); static void pt_event_start(struct perf_event *event, int mode) { + struct hw_perf_event *hwc = &event->hw; struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf = perf_get_aux(&pt->handle); + struct pt_buffer *buf; if (READ_ONCE(pt->vmx_on)) return; - if (!buf || pt_buffer_is_full(buf, pt)) { - event->hw.state = PERF_HES_STOPPED; - return; + buf = perf_aux_output_begin(&pt->handle, event); + if (!buf) + goto fail_stop; + + pt_buffer_reset_offsets(buf, pt->handle.head); + if (!buf->snapshot) { + if (pt_buffer_reset_markers(buf, &pt->handle)) + goto fail_end_stop; } - ACCESS_ONCE(pt->handle_nmi) = 1; - event->hw.state = 0; + WRITE_ONCE(pt->handle_nmi, 1); + hwc->state = 0; pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); pt_config(event); + + return; + +fail_end_stop: + perf_aux_output_end(&pt->handle, 0, true); +fail_stop: + hwc->state = PERF_HES_STOPPED; } static void pt_event_stop(struct perf_event *event, int mode) @@ -1065,7 +1262,7 @@ static void pt_event_stop(struct perf_event *event, int mode) * Protect against the PMI racing with disabling wrmsr, * see comment in intel_pt_interrupt(). */ - ACCESS_ONCE(pt->handle_nmi) = 0; + WRITE_ONCE(pt->handle_nmi, 0); pt_config_stop(event); @@ -1088,19 +1285,7 @@ static void pt_event_stop(struct perf_event *event, int mode) pt_handle_status(pt); pt_update_head(pt); - } -} - -static void pt_event_del(struct perf_event *event, int mode) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf; - pt_event_stop(event, PERF_EF_UPDATE); - - buf = perf_get_aux(&pt->handle); - - if (buf) { if (buf->snapshot) pt->handle.head = local_xchg(&buf->data_size, @@ -1110,9 +1295,13 @@ static void pt_event_del(struct perf_event *event, int mode) } } +static void pt_event_del(struct perf_event *event, int mode) +{ + pt_event_stop(event, PERF_EF_UPDATE); +} + static int pt_event_add(struct perf_event *event, int mode) { - struct pt_buffer *buf; struct pt *pt = this_cpu_ptr(&pt_ctx); struct hw_perf_event *hwc = &event->hw; int ret = -EBUSY; @@ -1120,34 +1309,18 @@ static int pt_event_add(struct perf_event *event, int mode) if (pt->handle.event) goto fail; - buf = perf_aux_output_begin(&pt->handle, event); - ret = -EINVAL; - if (!buf) - goto fail_stop; - - pt_buffer_reset_offsets(buf, pt->handle.head); - if (!buf->snapshot) { - ret = pt_buffer_reset_markers(buf, &pt->handle); - if (ret) - goto fail_end_stop; - } - if (mode & PERF_EF_START) { pt_event_start(event, 0); - ret = -EBUSY; + ret = -EINVAL; if (hwc->state == PERF_HES_STOPPED) - goto fail_end_stop; + goto fail; } else { hwc->state = PERF_HES_STOPPED; } - return 0; - -fail_end_stop: - perf_aux_output_end(&pt->handle, 0, true); -fail_stop: - hwc->state = PERF_HES_STOPPED; + ret = 0; fail: + return ret; } @@ -1157,6 +1330,7 @@ static void pt_event_read(struct perf_event *event) static void pt_event_destroy(struct perf_event *event) { + pt_addr_filters_fini(event); x86_del_exclusive(x86_lbr_exclusive_pt); } @@ -1171,6 +1345,11 @@ static int pt_event_init(struct perf_event *event) if (x86_add_exclusive(x86_lbr_exclusive_pt)) return -EBUSY; + if (pt_addr_filters_init(event)) { + x86_del_exclusive(x86_lbr_exclusive_pt); + return -ENOMEM; + } + event->destroy = pt_event_destroy; return 0; @@ -1190,7 +1369,7 @@ static __init int pt_init(void) BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); - if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) + if (!boot_cpu_has(X86_FEATURE_INTEL_PT)) return -ENODEV; get_online_cpus(); @@ -1224,16 +1403,21 @@ static __init int pt_init(void) PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; - pt_pmu.pmu.attr_groups = pt_attr_groups; - pt_pmu.pmu.task_ctx_nr = perf_sw_context; - pt_pmu.pmu.event_init = pt_event_init; - pt_pmu.pmu.add = pt_event_add; - pt_pmu.pmu.del = pt_event_del; - pt_pmu.pmu.start = pt_event_start; - pt_pmu.pmu.stop = pt_event_stop; - pt_pmu.pmu.read = pt_event_read; - pt_pmu.pmu.setup_aux = pt_buffer_setup_aux; - pt_pmu.pmu.free_aux = pt_buffer_free_aux; + pt_pmu.pmu.attr_groups = pt_attr_groups; + pt_pmu.pmu.task_ctx_nr = perf_sw_context; + pt_pmu.pmu.event_init = pt_event_init; + pt_pmu.pmu.add = pt_event_add; + pt_pmu.pmu.del = pt_event_del; + pt_pmu.pmu.start = pt_event_start; + pt_pmu.pmu.stop = pt_event_stop; + pt_pmu.pmu.read = pt_event_read; + pt_pmu.pmu.setup_aux = pt_buffer_setup_aux; + pt_pmu.pmu.free_aux = pt_buffer_free_aux; + pt_pmu.pmu.addr_filters_sync = pt_event_addr_filters_sync; + pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate; + pt_pmu.pmu.nr_addr_filters = + pt_cap_get(PT_CAP_num_address_ranges); + ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); return ret; diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 3abb5f5cccc87d0a00cd4103cdad0f2674ca8423..efffa4a09f687f9578a6d8b99a89b3d2595ded4c 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -19,6 +19,40 @@ #ifndef __INTEL_PT_H__ #define __INTEL_PT_H__ +/* + * PT MSR bit definitions + */ +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) +#define RTIT_CTL_OS BIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_EN BIT(9) +#define RTIT_CTL_TSC_EN BIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) +#define RTIT_CTL_ADDR0_OFFSET 32 +#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) +#define RTIT_CTL_ADDR1_OFFSET 36 +#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) +#define RTIT_CTL_ADDR2_OFFSET 40 +#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) +#define RTIT_CTL_ADDR3_OFFSET 44 +#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) +#define RTIT_STATUS_FILTEREN BIT(0) +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_BUFFOVF BIT(3) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPED BIT(5) + /* * Single-entry ToPA: when this close to region boundary, switch * buffers to avoid losing data. @@ -48,15 +82,20 @@ struct topa_entry { #define PT_CPUID_LEAVES 2 #define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ +/* TSC to Core Crystal Clock Ratio */ +#define CPUID_TSC_LEAF 0x15 + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, PT_CAP_psb_cyc, + PT_CAP_ip_filtering, PT_CAP_mtc, PT_CAP_topa_output, PT_CAP_topa_multiple_entries, PT_CAP_single_range_output, PT_CAP_payloads_lip, + PT_CAP_num_address_ranges, PT_CAP_mtc_periods, PT_CAP_cycle_thresholds, PT_CAP_psb_periods, @@ -66,6 +105,9 @@ struct pt_pmu { struct pmu pmu; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; bool vmx; + unsigned long max_nonturbo_ratio; + unsigned int tsc_art_num; + unsigned int tsc_art_den; }; /** @@ -104,14 +146,40 @@ struct pt_buffer { struct topa_entry *topa_index[0]; }; +#define PT_FILTERS_NUM 4 + +/** + * struct pt_filter - IP range filter configuration + * @msr_a: range start, goes to RTIT_ADDRn_A + * @msr_b: range end, goes to RTIT_ADDRn_B + * @config: 4-bit field in RTIT_CTL + */ +struct pt_filter { + unsigned long msr_a; + unsigned long msr_b; + unsigned long config; +}; + +/** + * struct pt_filters - IP range filtering context + * @filter: filters defined for this context + * @nr_filters: number of defined filters in the @filter array + */ +struct pt_filters { + struct pt_filter filter[PT_FILTERS_NUM]; + unsigned int nr_filters; +}; + /** * struct pt - per-cpu pt context * @handle: perf output handle + * @filters: last configured filters * @handle_nmi: do handle PT PMI on this cpu, there's an active event * @vmx_on: 1 if VMX is ON on this cpu */ struct pt { struct perf_output_handle handle; + struct pt_filters filters; int handle_nmi; int vmx_on; }; diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 1705c9d75e4477e1c246d608fa1e9ebb36f6f50b..99c4bab123cdae71b14ab92241d81dbf15616fe1 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -27,10 +27,14 @@ * event: rapl_energy_dram * perf code: 0x3 * - * dram counter: consumption of the builtin-gpu domain (client only) + * gpu counter: consumption of the builtin-gpu domain (client only) * event: rapl_energy_gpu * perf code: 0x4 * + * psys counter: consumption of the builtin-psys domain (client only) + * event: rapl_energy_psys + * perf code: 0x5 + * * We manage those counters as free running (read-only). They may be * use simultaneously by other tools, such as turbostat. * @@ -53,6 +57,8 @@ #include #include "../perf_event.h" +MODULE_LICENSE("GPL"); + /* * RAPL energy status counters */ @@ -64,13 +70,16 @@ #define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ #define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ +#define RAPL_IDX_PSYS_NRG_STAT 4 /* psys */ +#define INTEL_RAPL_PSYS 0x5 /* pseudo-encoding */ -#define NR_RAPL_DOMAINS 0x4 +#define NR_RAPL_DOMAINS 0x5 static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { "pp0-core", "package", "dram", "pp1-gpu", + "psys", }; /* Clients have PP0, PKG */ @@ -89,6 +98,13 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { 1<driver_data; + apply_quirk = rapl_init->apply_quirk; + rapl_cntr_mask = rapl_init->cntr_mask; + rapl_pmu_events_group.attrs = rapl_init->attrs; ret = rapl_check_hw_unit(apply_quirk); if (ret) @@ -756,7 +845,7 @@ static int __init rapl_pmu_init(void) if (ret) goto out; - __perf_cpu_notifier(rapl_cpu_notifier); + __register_cpu_notifier(&rapl_cpu_nb); cpu_notifier_register_done(); rapl_advertise(); return 0; @@ -767,4 +856,14 @@ static int __init rapl_pmu_init(void) cpu_notifier_register_done(); return ret; } -device_initcall(rapl_pmu_init); +module_init(rapl_pmu_init); + +static void __exit intel_rapl_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&rapl_cpu_nb); + perf_pmu_unregister(&rapl_pmus->pmu); + cleanup_rapl_pmus(); + cpu_notifier_register_done(); +} +module_exit(intel_rapl_exit); diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7012d18bb293073e7cf0021cab79c50e3e42289c..16c1789164122b70f6d2cd4ad16ae18b741236aa 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1,3 +1,4 @@ +#include #include "uncore.h" static struct intel_uncore_type *empty_uncore[] = { NULL, }; @@ -21,6 +22,8 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); +MODULE_LICENSE("GPL"); + static int uncore_pcibus_to_physid(struct pci_bus *bus) { struct pci2phy_map *map; @@ -754,7 +757,7 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) pmu->registered = false; } -static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) +static void __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) { struct intel_uncore_pmu *pmu = type->pmus; struct intel_uncore_box *box; @@ -770,7 +773,7 @@ static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) } } -static void __init uncore_exit_boxes(void *dummy) +static void uncore_exit_boxes(void *dummy) { struct intel_uncore_type **types; @@ -787,7 +790,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu) kfree(pmu->boxes); } -static void __init uncore_type_exit(struct intel_uncore_type *type) +static void uncore_type_exit(struct intel_uncore_type *type) { struct intel_uncore_pmu *pmu = type->pmus; int i; @@ -804,7 +807,7 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) type->events_group = NULL; } -static void __init uncore_types_exit(struct intel_uncore_type **types) +static void uncore_types_exit(struct intel_uncore_type **types) { for (; *types; types++) uncore_type_exit(*types); @@ -989,46 +992,6 @@ static int __init uncore_pci_init(void) size_t size; int ret; - switch (boot_cpu_data.x86_model) { - case 45: /* Sandy Bridge-EP */ - ret = snbep_uncore_pci_init(); - break; - case 62: /* Ivy Bridge-EP */ - ret = ivbep_uncore_pci_init(); - break; - case 63: /* Haswell-EP */ - ret = hswep_uncore_pci_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - ret = bdx_uncore_pci_init(); - break; - case 42: /* Sandy Bridge */ - ret = snb_uncore_pci_init(); - break; - case 58: /* Ivy Bridge */ - ret = ivb_uncore_pci_init(); - break; - case 60: /* Haswell */ - case 69: /* Haswell Celeron */ - ret = hsw_uncore_pci_init(); - break; - case 61: /* Broadwell */ - ret = bdw_uncore_pci_init(); - break; - case 87: /* Knights Landing */ - ret = knl_uncore_pci_init(); - break; - case 94: /* SkyLake */ - ret = skl_uncore_pci_init(); - break; - default: - return -ENODEV; - } - - if (ret) - return ret; - size = max_packages * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { @@ -1060,7 +1023,7 @@ static int __init uncore_pci_init(void) return ret; } -static void __init uncore_pci_exit(void) +static void uncore_pci_exit(void) { if (pcidrv_registered) { pcidrv_registered = false; @@ -1287,46 +1250,6 @@ static int __init uncore_cpu_init(void) { int ret; - switch (boot_cpu_data.x86_model) { - case 26: /* Nehalem */ - case 30: - case 37: /* Westmere */ - case 44: - nhm_uncore_cpu_init(); - break; - case 42: /* Sandy Bridge */ - case 58: /* Ivy Bridge */ - case 60: /* Haswell */ - case 69: /* Haswell */ - case 70: /* Haswell */ - case 61: /* Broadwell */ - case 71: /* Broadwell */ - snb_uncore_cpu_init(); - break; - case 45: /* Sandy Bridge-EP */ - snbep_uncore_cpu_init(); - break; - case 46: /* Nehalem-EX */ - case 47: /* Westmere-EX aka. Xeon E7 */ - nhmex_uncore_cpu_init(); - break; - case 62: /* Ivy Bridge-EP */ - ivbep_uncore_cpu_init(); - break; - case 63: /* Haswell-EP */ - hswep_uncore_cpu_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - bdx_uncore_cpu_init(); - break; - case 87: /* Knights Landing */ - knl_uncore_cpu_init(); - break; - default: - return -ENODEV; - } - ret = uncore_types_init(uncore_msr_uncores, true); if (ret) goto err; @@ -1376,20 +1299,123 @@ static int __init uncore_cpumask_init(bool msr) return 0; } +#define X86_UNCORE_MODEL_MATCH(model, init) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } + +struct intel_uncore_init_fun { + void (*cpu_init)(void); + int (*pci_init)(void); +}; + +static const struct intel_uncore_init_fun nhm_uncore_init __initconst = { + .cpu_init = nhm_uncore_cpu_init, +}; + +static const struct intel_uncore_init_fun snb_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = snb_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun ivb_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = ivb_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun hsw_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = hsw_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun bdw_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = bdw_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun snbep_uncore_init __initconst = { + .cpu_init = snbep_uncore_cpu_init, + .pci_init = snbep_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun nhmex_uncore_init __initconst = { + .cpu_init = nhmex_uncore_cpu_init, +}; + +static const struct intel_uncore_init_fun ivbep_uncore_init __initconst = { + .cpu_init = ivbep_uncore_cpu_init, + .pci_init = ivbep_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun hswep_uncore_init __initconst = { + .cpu_init = hswep_uncore_cpu_init, + .pci_init = hswep_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun bdx_uncore_init __initconst = { + .cpu_init = bdx_uncore_cpu_init, + .pci_init = bdx_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun knl_uncore_init __initconst = { + .cpu_init = knl_uncore_cpu_init, + .pci_init = knl_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun skl_uncore_init __initconst = { + .pci_init = skl_uncore_pci_init, +}; + +static const struct x86_cpu_id intel_uncore_match[] __initconst = { + X86_UNCORE_MODEL_MATCH(26, nhm_uncore_init), /* Nehalem */ + X86_UNCORE_MODEL_MATCH(30, nhm_uncore_init), + X86_UNCORE_MODEL_MATCH(37, nhm_uncore_init), /* Westmere */ + X86_UNCORE_MODEL_MATCH(44, nhm_uncore_init), + X86_UNCORE_MODEL_MATCH(42, snb_uncore_init), /* Sandy Bridge */ + X86_UNCORE_MODEL_MATCH(58, ivb_uncore_init), /* Ivy Bridge */ + X86_UNCORE_MODEL_MATCH(60, hsw_uncore_init), /* Haswell */ + X86_UNCORE_MODEL_MATCH(69, hsw_uncore_init), /* Haswell Celeron */ + X86_UNCORE_MODEL_MATCH(70, hsw_uncore_init), /* Haswell */ + X86_UNCORE_MODEL_MATCH(61, bdw_uncore_init), /* Broadwell */ + X86_UNCORE_MODEL_MATCH(71, bdw_uncore_init), /* Broadwell */ + X86_UNCORE_MODEL_MATCH(45, snbep_uncore_init), /* Sandy Bridge-EP */ + X86_UNCORE_MODEL_MATCH(46, nhmex_uncore_init), /* Nehalem-EX */ + X86_UNCORE_MODEL_MATCH(47, nhmex_uncore_init), /* Westmere-EX aka. Xeon E7 */ + X86_UNCORE_MODEL_MATCH(62, ivbep_uncore_init), /* Ivy Bridge-EP */ + X86_UNCORE_MODEL_MATCH(63, hswep_uncore_init), /* Haswell-EP */ + X86_UNCORE_MODEL_MATCH(79, bdx_uncore_init), /* BDX-EP */ + X86_UNCORE_MODEL_MATCH(86, bdx_uncore_init), /* BDX-DE */ + X86_UNCORE_MODEL_MATCH(87, knl_uncore_init), /* Knights Landing */ + X86_UNCORE_MODEL_MATCH(94, skl_uncore_init), /* SkyLake */ + {}, +}; + +MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); + static int __init intel_uncore_init(void) { - int pret, cret, ret; + const struct x86_cpu_id *id; + struct intel_uncore_init_fun *uncore_init; + int pret = 0, cret = 0, ret; - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + id = x86_match_cpu(intel_uncore_match); + if (!id) return -ENODEV; - if (cpu_has_hypervisor) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; max_packages = topology_max_packages(); - pret = uncore_pci_init(); - cret = uncore_cpu_init(); + uncore_init = (struct intel_uncore_init_fun *)id->driver_data; + if (uncore_init->pci_init) { + pret = uncore_init->pci_init(); + if (!pret) + pret = uncore_pci_init(); + } + + if (uncore_init->cpu_init) { + uncore_init->cpu_init(); + cret = uncore_cpu_init(); + } if (cret && pret) return -ENODEV; @@ -1409,4 +1435,14 @@ static int __init intel_uncore_init(void) cpu_notifier_register_done(); return ret; } -device_initcall(intel_uncore_init); +module_init(intel_uncore_init); + +static void __exit intel_uncore_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&uncore_cpu_nb); + uncore_types_exit(uncore_msr_uncores); + uncore_pci_exit(); + cpu_notifier_register_done(); +} +module_exit(intel_uncore_exit); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index ab2bcaaebe38d464ab7863c901ac41fc7b847711..b2625867ebd17543401cf6dbc5bf9e76fcd6ee6c 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -219,6 +219,9 @@ #define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff #define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18) #define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32) +#define KNL_CHA_MSR_PMON_BOX_FILTER_REMOTE_NODE (0x1ULL << 32) +#define KNL_CHA_MSR_PMON_BOX_FILTER_LOCAL_NODE (0x1ULL << 33) +#define KNL_CHA_MSR_PMON_BOX_FILTER_NNC (0x1ULL << 37) /* KNL EDC/MC UCLK */ #define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400 @@ -1902,6 +1905,10 @@ static int knl_cha_hw_config(struct intel_uncore_box *box, reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx; reg1->config = event->attr.config1 & knl_cha_filter_mask(idx); + + reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_REMOTE_NODE; + reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_LOCAL_NODE; + reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_NNC; reg1->idx = idx; } return 0; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index ec863b9a9f780c7507634353d64f9c2f76f1a0e1..85ef3c2e80e0450350f347bfa08e3036c459e857 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -6,6 +6,8 @@ enum perf_msr_id { PERF_MSR_MPERF = 2, PERF_MSR_PPERF = 3, PERF_MSR_SMI = 4, + PERF_MSR_PTSC = 5, + PERF_MSR_IRPERF = 6, PERF_MSR_EVENT_MAX, }; @@ -15,6 +17,16 @@ static bool test_aperfmperf(int idx) return boot_cpu_has(X86_FEATURE_APERFMPERF); } +static bool test_ptsc(int idx) +{ + return boot_cpu_has(X86_FEATURE_PTSC); +} + +static bool test_irperf(int idx) +{ + return boot_cpu_has(X86_FEATURE_IRPERF); +} + static bool test_intel(int idx) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || @@ -69,18 +81,22 @@ struct perf_msr { bool (*test)(int idx); }; -PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); -PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); -PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); -PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); -PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); +PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); +PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); +PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05"); +PMU_EVENT_ATTR_STRING(irperf, evattr_irperf, "event=0x06"); static struct perf_msr msr[] = { - [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, - [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, - [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, - [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, - [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, + [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, + [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, + [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, + [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &evattr_ptsc, test_ptsc, }, + [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &evattr_irperf, test_irperf, }, }; static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { @@ -166,7 +182,7 @@ static void msr_event_update(struct perf_event *event) if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) delta = sign_extend64(delta, 31); - local64_add(now - prev, &event->count); + local64_add(delta, &event->count); } static void msr_event_start(struct perf_event *event, int flags) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ad4dc7ffffb5eb44eeb79c08e924697547f14ebd..8bd764df815d36287ca4b29effb969ede50735a6 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -601,6 +601,7 @@ struct x86_pmu { u64 lbr_sel_mask; /* LBR_SELECT valid bits */ const int *lbr_sel_map; /* lbr_select mappings */ bool lbr_double_abort; /* duplicated lbr aborts */ + bool lbr_pt_coexist; /* LBR may coexist with PT */ /* * Intel PT/LBR/BTS are exclusive @@ -859,6 +860,8 @@ extern struct event_constraint intel_atom_pebs_event_constraints[]; extern struct event_constraint intel_slm_pebs_event_constraints[]; +extern struct event_constraint intel_glm_pebs_event_constraints[]; + extern struct event_constraint intel_nehalem_pebs_event_constraints[]; extern struct event_constraint intel_westmere_pebs_event_constraints[]; @@ -907,6 +910,8 @@ void intel_pmu_lbr_init_nhm(void); void intel_pmu_lbr_init_atom(void); +void intel_pmu_lbr_init_slm(void); + void intel_pmu_lbr_init_snb(void); void intel_pmu_lbr_init_hsw(void); diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 0552884da18db24c7910fa8166a1385f56b6d4a9..2f29f4e407c315114b6866e496b04e8d4ab51a1d 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -357,7 +357,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); /* Create the ucontext. */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); else put_user_ex(0, &frame->uc.uc_flags); diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 99afb665a004cb8ea82c1751f6ef29b12d116cc6..e77a6443104ff1d2b3162e17b1958b753c95be39 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -1,11 +1,12 @@ #ifndef _ASM_X86_ALTERNATIVE_H #define _ASM_X86_ALTERNATIVE_H +#ifndef __ASSEMBLY__ + #include #include #include #include -#include /* * Alternative inline assembly for SMP. @@ -233,36 +234,6 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr -struct paravirt_patch_site; -#ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end); -#else -static inline void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) -{} -#define __parainstructions NULL -#define __parainstructions_end NULL -#endif - -extern void *text_poke_early(void *addr, const void *opcode, size_t len); - -/* - * Clear and restore the kernel write-protection flag on the local CPU. - * Allows the kernel to edit read-only pages. - * Side-effect: any interrupt handler running between save and restore will have - * the ability to write to read-only pages. - * - * Warning: - * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and - * no thread can be preempted in the instructions being modified (no iret to an - * invalid instruction possible) or if the instructions are changed from a - * consistent state to another consistent state atomically. - * On the local CPU you need to be protected again NMI or MCE handlers seeing an - * inconsistent instruction while you patch. - */ -extern void *text_poke(void *addr, const void *opcode, size_t len); -extern int poke_int3_handler(struct pt_regs *regs); -extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 98f25bbafac4c52c10e87dc8cd85e6e3914fa4d6..bc27611fa58f1b0b522beefb3117afb1f5c250e6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -239,10 +239,10 @@ extern void __init check_x2apic(void); extern void x2apic_setup(void); static inline int x2apic_enabled(void) { - return cpu_has_x2apic && apic_is_x2apic_enabled(); + return boot_cpu_has(X86_FEATURE_X2APIC) && apic_is_x2apic_enabled(); } -#define x2apic_supported() (cpu_has_x2apic) +#define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC)) #else /* !CONFIG_X86_X2APIC */ static inline void check_x2apic(void) { } static inline void x2apic_setup(void) { } diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h index aa6a3170ab5ad9368749ba711e0a3cd5919035c5..2b00c776f223af3e05e35a7c53096e77f724e892 100644 --- a/arch/x86/include/asm/bios_ebda.h +++ b/arch/x86/include/asm/bios_ebda.h @@ -17,27 +17,6 @@ static inline unsigned int get_bios_ebda(void) return address; /* 0 means none */ } -/* - * Return the sanitized length of the EBDA in bytes, if it exists. - */ -static inline unsigned int get_bios_ebda_length(void) -{ - unsigned int address; - unsigned int length; - - address = get_bios_ebda(); - if (!address) - return 0; - - /* EBDA length is byte 0 of the EBDA (stored in KiB) */ - length = *(unsigned char *)phys_to_virt(address); - length <<= 10; - - /* Trim the length if it extends beyond 640KiB */ - length = min_t(unsigned int, (640 * 1024) - address, length); - return length; -} - void reserve_ebda_region(void); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 6b8d6e8cd4494ad4938a444ed48816c3089e495f..abd06b19ddd26871872c5f786c17d1f97e4618e1 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -12,29 +12,46 @@ /* Minimum kernel alignment, as a power of two */ #ifdef CONFIG_X86_64 -#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT +# define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else -#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER) +# define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER) #endif #define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) #if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \ (CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN) -#error "Invalid value for CONFIG_PHYSICAL_ALIGN" +# error "Invalid value for CONFIG_PHYSICAL_ALIGN" #endif #ifdef CONFIG_KERNEL_BZIP2 -#define BOOT_HEAP_SIZE 0x400000 +# define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ - -#define BOOT_HEAP_SIZE 0x10000 - -#endif /* !CONFIG_KERNEL_BZIP2 */ +# define BOOT_HEAP_SIZE 0x10000 +#endif #ifdef CONFIG_X86_64 -#define BOOT_STACK_SIZE 0x4000 -#else -#define BOOT_STACK_SIZE 0x1000 +# define BOOT_STACK_SIZE 0x4000 + +# define BOOT_INIT_PGT_SIZE (6*4096) +# ifdef CONFIG_RANDOMIZE_BASE +/* + * Assuming all cross the 512GB boundary: + * 1 page for level4 + * (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel + * 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP). + * Total is 19 pages. + */ +# ifdef CONFIG_X86_VERBOSE_BOOTUP +# define BOOT_PGT_SIZE (19*4096) +# else /* !CONFIG_X86_VERBOSE_BOOTUP */ +# define BOOT_PGT_SIZE (17*4096) +# endif +# else /* !CONFIG_RANDOMIZE_BASE */ +# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE +# endif + +#else /* !CONFIG_X86_64 */ +# define BOOT_STACK_SIZE 0x1000 #endif #endif /* _ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index d194266acb28e52d237c19c21291633c15d99c9e..eae33c7170c868993ffc3b861ca30ddc14285aa7 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -3,11 +3,10 @@ #ifndef _ASM_X86_CLOCKSOURCE_H #define _ASM_X86_CLOCKSOURCE_H -#define VCLOCK_NONE 0 /* No vDSO clock available. */ -#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ -#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ -#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ -#define VCLOCK_MAX 3 +#define VCLOCK_NONE 0 /* No vDSO clock available. */ +#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ +#define VCLOCK_PVCLOCK 2 /* vDSO should use vread_pvclock. */ +#define VCLOCK_MAX 2 struct arch_clocksource_data { int vclock_mode; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index ebb102e1bbc7ad84cff580de23380becbe900ed1..5a3b2c119ed0eb70137bff968607f5d44481f223 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -307,7 +307,7 @@ static inline void __user *arch_compat_alloc_user_space(long len) return (void __user *)round_down(sp - len, 16); } -static inline bool is_x32_task(void) +static inline bool in_x32_syscall(void) { #ifdef CONFIG_X86_X32_ABI if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) @@ -318,7 +318,7 @@ static inline bool is_x32_task(void) static inline bool in_compat_syscall(void) { - return is_ia32_task() || is_x32_task(); + return in_ia32_syscall() || in_x32_syscall(); } #define in_compat_syscall in_compat_syscall /* override the generic impl */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index aeab4793293324cc6b8ad0d45fa625f080759362..483fb547e3c048afe850993b94dc2be2bc841d96 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -27,6 +27,7 @@ enum cpuid_leafs CPUID_6_EAX, CPUID_8000_000A_EDX, CPUID_7_ECX, + CPUID_8000_0007_EBX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -118,31 +119,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) -#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) -#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE) -#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC) -#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE) -#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) -#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) -#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) -#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) -#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) -#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) -#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) -#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH) -#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) -#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) -#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) -#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) -#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) -#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) -#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) -#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) -/* - * Do not add any more of those clumsy macros - use static_cpu_has() for - * fast paths and boot_cpu_has() otherwise! - */ - #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) /* * Static testing of CPU features. Used the same as boot_cpu_has(). diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8f9afefd2dc5ab8159131ae1815f9ae122d7024b..4a413485f9eb8ef58ec71c77ff2594f4300c8ea6 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -12,7 +12,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 17 /* N 32-bit words worth of info */ +#define NCAPINTS 18 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -177,6 +177,7 @@ #define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ #define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ @@ -250,6 +251,7 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ @@ -280,6 +282,11 @@ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ +#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ + /* * BUG word(s) */ @@ -294,6 +301,9 @@ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(10) /* SWAPGS without input dep on GS */ + #ifdef CONFIG_X86_32 /* diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 53748c45e4885f574531f8cd6cf143f6dbc264d6..78d1e7467eae9fb9bd200cb04205c22322b01b4f 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -3,6 +3,7 @@ #include #include +#include #include /* @@ -28,33 +29,22 @@ #define MAX_CMDLINE_ADDRESS UINT_MAX -#ifdef CONFIG_X86_32 +#define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF +#ifdef CONFIG_X86_32 extern unsigned long asmlinkage efi_call_phys(void *, ...); +#define arch_efi_call_virt_setup() kernel_fpu_begin() +#define arch_efi_call_virt_teardown() kernel_fpu_end() + /* * Wrap all the virtual calls in a way that forces the parameters on the stack. */ - -/* Use this macro if your virtual returns a non-void value */ -#define efi_call_virt(f, args...) \ +#define arch_efi_call_virt(f, args...) \ ({ \ - efi_status_t __s; \ - kernel_fpu_begin(); \ - __s = ((efi_##f##_t __attribute__((regparm(0)))*) \ - efi.systab->runtime->f)(args); \ - kernel_fpu_end(); \ - __s; \ -}) - -/* Use this macro if your virtual call does not return any value */ -#define __efi_call_virt(f, args...) \ -({ \ - kernel_fpu_begin(); \ ((efi_##f##_t __attribute__((regparm(0)))*) \ efi.systab->runtime->f)(args); \ - kernel_fpu_end(); \ }) #define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) @@ -78,10 +68,8 @@ struct efi_scratch { u64 phys_stack; } __packed; -#define efi_call_virt(f, ...) \ +#define arch_efi_call_virt_setup() \ ({ \ - efi_status_t __s; \ - \ efi_sync_low_kernel_mappings(); \ preempt_disable(); \ __kernel_fpu_begin(); \ @@ -91,9 +79,13 @@ struct efi_scratch { write_cr3((unsigned long)efi_scratch.efi_pgt); \ __flush_tlb_all(); \ } \ - \ - __s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__); \ - \ +}) + +#define arch_efi_call_virt(f, args...) \ + efi_call((void *)efi.systab->runtime->f, args) \ + +#define arch_efi_call_virt_teardown() \ +({ \ if (efi_scratch.use_pgd) { \ write_cr3(efi_scratch.prev_cr3); \ __flush_tlb_all(); \ @@ -101,15 +93,8 @@ struct efi_scratch { \ __kernel_fpu_end(); \ preempt_enable(); \ - __s; \ }) -/* - * All X86_64 virt calls return non-void values. Thus, use non-void call for - * virt calls that would be void on X86_32. - */ -#define __efi_call_virt(f, args...) efi_call_virt(f, args) - extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); @@ -180,6 +165,8 @@ static inline bool efi_runtime_supported(void) extern struct console early_efi_console; extern void parse_efi_setup(u64 phys_addr, u32 data_len); +extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt); + #ifdef CONFIG_EFI_MIXED extern void efi_thunk_runtime_setup(void); extern efi_status_t efi_thunk_set_virtual_address_map( @@ -225,6 +212,11 @@ __pure const struct efi_config *__efi_early(void); #define efi_call_early(f, ...) \ __efi_early()->call(__efi_early()->f, __VA_ARGS__); +#define __efi_call_early(f, ...) \ + __efi_early()->call((unsigned long)f, __VA_ARGS__); + +#define efi_is_64bit() __efi_early()->is64 + extern bool efi_reboot_required(void); #else diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 15340e36ddcb3364e16eb63cd61c61a42676d756..fea7724141a04be703c86032db79cc29d37148ca 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -176,7 +176,7 @@ static inline void elf_common_init(struct thread_struct *t, regs->si = regs->di = regs->bp = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; - t->fs = t->gs = 0; + t->fsbase = t->gsbase = 0; t->fsindex = t->gsindex = 0; t->ds = t->es = ds; } @@ -226,8 +226,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fs; \ - (pr_reg)[22] = current->thread.gs; \ + (pr_reg)[21] = current->thread.fsbase; \ + (pr_reg)[22] = current->thread.gsbase; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index e6a8613fbfb0ea19f8d507c4ce6ae1f0d4a35be6..3a106165e03ad035484df5fb97989265798263df 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -4,7 +4,7 @@ #include #include -#define hugepages_supported() cpu_has_pse +#define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE) static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index d0afb05c84fc1ffd434e85f6267326b9091d2595..f70604125286b90d820bc14be0b65ecccb4f7a44 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -5,7 +5,7 @@ static inline bool arch_irq_work_has_interrupt(void) { - return cpu_has_apic; + return boot_cpu_has(X86_FEATURE_APIC); } #endif /* _ASM_IRQ_WORK_H */ diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 332f98c9111f41d92def3e02f0771783c8dce100..22a8537eb780b231f24cdec6b2b1797cd7cd5868 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -6,6 +6,8 @@ * Copyright (C) 2008 Wind River Systems, Inc. */ +#include + /* * BUFMAX defines the maximum number of characters in inbound/outbound * buffers at least NUMREGBYTES*2 are needed for register packets diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 79327e9483a34ec33c3ba71bbdc94397f2456f46..0ccb26dda126da6d5876c84264c290d1fa203d58 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -8,40 +8,6 @@ #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) - -/* - * Make sure the compiler doesn't do anything stupid with the - * arguments on the stack - they are owned by the *caller*, not - * the callee. This just fools gcc into not spilling into them, - * and keeps it from doing tailcall recursion and/or using the - * stack slots for temporaries, since they are live and "used" - * all the way to the end of the function. - * - * NOTE! On x86-64, all the arguments are in registers, so this - * only matters on a 32-bit kernel. - */ -#define asmlinkage_protect(n, ret, args...) \ - __asmlinkage_protect##n(ret, ##args) -#define __asmlinkage_protect_n(ret, args...) \ - __asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), ##args) -#define __asmlinkage_protect0(ret) \ - __asmlinkage_protect_n(ret) -#define __asmlinkage_protect1(ret, arg1) \ - __asmlinkage_protect_n(ret, "m" (arg1)) -#define __asmlinkage_protect2(ret, arg1, arg2) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2)) -#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3)) -#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \ - "m" (arg4)) -#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \ - "m" (arg4), "m" (arg5)) -#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \ - "m" (arg4), "m" (arg5), "m" (arg6)) - #endif /* CONFIG_X86_32 */ #ifdef __ASSEMBLY__ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 92b6f651fa4fcc7e58d5764f317f9db8b0a5616b..8bf766ef0e188f1ed6c8798f47bb3f59c898a14d 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -104,13 +104,23 @@ #define MCE_LOG_SIGNATURE "MACHINECHECK" /* AMD Scalable MCA */ +#define MSR_AMD64_SMCA_MC0_CTL 0xc0002000 +#define MSR_AMD64_SMCA_MC0_STATUS 0xc0002001 +#define MSR_AMD64_SMCA_MC0_ADDR 0xc0002002 #define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003 #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 #define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 +#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008 +#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009 #define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a +#define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) /* @@ -168,9 +178,18 @@ struct mce_vendor_flags { __reserved_0 : 61; }; + +struct mca_msr_regs { + u32 (*ctl) (int bank); + u32 (*status) (int bank); + u32 (*addr) (int bank); + u32 (*misc) (int bank); +}; + extern struct mce_vendor_flags mce_flags; extern struct mca_config mca_cfg; +extern struct mca_msr_regs msr_ops; extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 84280029cafd73a83e64efd3133a8f9d9575bcb3..396348196aa779aeb55bea2b1d9b0782475e3558 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -115,103 +115,12 @@ static inline void destroy_context(struct mm_struct *mm) destroy_context_ldt(mm); } -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - unsigned cpu = smp_processor_id(); +extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); - if (likely(prev != next)) { -#ifdef CONFIG_SMP - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - this_cpu_write(cpu_tlbstate.active_mm, next); -#endif - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - * - */ - load_cr3(next->pgd); - - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - - /* Stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); - - /* Load per-mm CR4 state */ - load_mm_cr4(next); - -#ifdef CONFIG_MODIFY_LDT_SYSCALL - /* - * Load the LDT, if the LDT is different. - * - * It's possible that prev->context.ldt doesn't match - * the LDT register. This can happen if leave_mm(prev) - * was called and then modify_ldt changed - * prev->context.ldt but suppressed an IPI to this CPU. - * In this case, prev->context.ldt != NULL, because we - * never set context.ldt to NULL while the mm still - * exists. That means that next->context.ldt != - * prev->context.ldt, because mms never share an LDT. - */ - if (unlikely(prev->context.ldt != next->context.ldt)) - load_mm_ldt(next); -#endif - } -#ifdef CONFIG_SMP - else { - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - - if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { - /* - * On established mms, the mm_cpumask is only changed - * from irq context, from ptep_clear_flush() while in - * lazy tlb mode, and here. Irqs are blocked during - * schedule, protecting us from simultaneous changes. - */ - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * We were in lazy tlb mode and leave_mm disabled - * tlb flush IPI delivery. We must reload CR3 - * to make sure to use no freed page tables. - * - * As above, load_cr3() is serializing and orders TLB - * fills with respect to the mm_cpumask write. - */ - load_cr3(next->pgd); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - load_mm_cr4(next); - load_mm_ldt(next); - } - } -#endif -} +extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); +#define switch_mm_irqs_off switch_mm_irqs_off #define activate_mm(prev, next) \ do { \ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5b3c9a55f51cbeda86cb52dedac1f3f494042e55..5a73a9c62c392f676bc786b58a27e50e3476a123 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -89,27 +89,16 @@ #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 #define MSR_IA32_RTIT_CTL 0x00000570 -#define RTIT_CTL_TRACEEN BIT(0) -#define RTIT_CTL_CYCLEACC BIT(1) -#define RTIT_CTL_OS BIT(2) -#define RTIT_CTL_USR BIT(3) -#define RTIT_CTL_CR3EN BIT(7) -#define RTIT_CTL_TOPA BIT(8) -#define RTIT_CTL_MTC_EN BIT(9) -#define RTIT_CTL_TSC_EN BIT(10) -#define RTIT_CTL_DISRETC BIT(11) -#define RTIT_CTL_BRANCH_EN BIT(13) -#define RTIT_CTL_MTC_RANGE_OFFSET 14 -#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) -#define RTIT_CTL_CYC_THRESH_OFFSET 19 -#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) -#define RTIT_CTL_PSB_FREQ_OFFSET 24 -#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) #define MSR_IA32_RTIT_STATUS 0x00000571 -#define RTIT_STATUS_CONTEXTEN BIT(1) -#define RTIT_STATUS_TRIGGEREN BIT(2) -#define RTIT_STATUS_ERROR BIT(4) -#define RTIT_STATUS_STOPPED BIT(5) +#define MSR_IA32_RTIT_STATUS 0x00000571 +#define MSR_IA32_RTIT_ADDR0_A 0x00000580 +#define MSR_IA32_RTIT_ADDR0_B 0x00000581 +#define MSR_IA32_RTIT_ADDR1_A 0x00000582 +#define MSR_IA32_RTIT_ADDR1_B 0x00000583 +#define MSR_IA32_RTIT_ADDR2_A 0x00000584 +#define MSR_IA32_RTIT_ADDR2_B 0x00000585 +#define MSR_IA32_RTIT_ADDR3_A 0x00000586 +#define MSR_IA32_RTIT_ADDR3_B 0x00000587 #define MSR_IA32_RTIT_CR3_MATCH 0x00000572 #define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560 #define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561 @@ -205,6 +194,8 @@ #define MSR_CONFIG_TDP_CONTROL 0x0000064B #define MSR_TURBO_ACTIVATION_RATIO 0x0000064C +#define MSR_PLATFORM_ENERGY_STATUS 0x0000064D + #define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 #define MSR_PKG_ANY_CORE_C0_RES 0x00000659 #define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A @@ -315,6 +306,9 @@ #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +/* Fam 17h MSRs */ +#define MSR_F17H_IRPERF 0xc00000e9 + /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 @@ -328,6 +322,7 @@ #define MSR_F15H_PERF_CTR 0xc0010201 #define MSR_F15H_NB_PERF_CTL 0xc0010240 #define MSR_F15H_NB_PERF_CTR 0xc0010241 +#define MSR_F15H_PTSC 0xc0010280 #define MSR_F15H_IC_CFG 0xc0011021 /* Fam 10h MSRs */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 7a79ee2778b3b5067fa816e541bf93bc26cf782d..7dc1d8fef7fdec6a633f5cd3f7ada4d2ba09e8fe 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -84,7 +84,10 @@ static inline unsigned long long native_read_msr(unsigned int msr) { DECLARE_ARGS(val, low, high); - asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr)); + asm volatile("1: rdmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe) + : EAX_EDX_RET(val, low, high) : "c" (msr)); if (msr_tracepoint_active(__tracepoint_read_msr)) do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0); return EAX_EDX_VAL(val, low, high); @@ -98,7 +101,10 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, asm volatile("2: rdmsr ; xor %[err],%[err]\n" "1:\n\t" ".section .fixup,\"ax\"\n\t" - "3: mov %[fault],%[err] ; jmp 1b\n\t" + "3: mov %[fault],%[err]\n\t" + "xorl %%eax, %%eax\n\t" + "xorl %%edx, %%edx\n\t" + "jmp 1b\n\t" ".previous\n\t" _ASM_EXTABLE(2b, 3b) : [err] "=r" (*err), EAX_EDX_RET(val, low, high) @@ -108,10 +114,14 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, return EAX_EDX_VAL(val, low, high); } -static inline void native_write_msr(unsigned int msr, - unsigned low, unsigned high) +/* Can be uninlined because referenced by paravirt */ +notrace static inline void native_write_msr(unsigned int msr, + unsigned low, unsigned high) { - asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) + : : "c" (msr), "a"(low), "d" (high) : "memory"); if (msr_tracepoint_active(__tracepoint_read_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index b94f6f64e23d0cf7e630c190fe48518b47e819ed..dbff1456d2152a6993ba5f381f6a7ef838b52f28 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -24,6 +24,7 @@ #define _ASM_X86_MTRR_H #include +#include /* @@ -83,9 +84,12 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } +static inline void mtrr_bp_init(void) +{ + pat_disable("MTRRs disabled, skipping PAT initialization too."); +} #define mtrr_ap_init() do {} while (0) -#define mtrr_bp_init() do {} while (0) #define set_mtrr_aps_delayed_init() do {} while (0) #define mtrr_aps_init() do {} while (0) #define mtrr_bp_restore() do {} while (0) diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 802dde30c92877588be2d068a692cf9ed4418b7d..cf8f619b305fd87a085423b075118d3fcd92d222 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -37,7 +37,10 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) +#endif + #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ @@ -51,7 +54,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, #define __pa_symbol(x) \ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) +#ifndef __va #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) +#endif #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 4928cf0d5af0fefef7240a6112c67f19e42059d3..d5c2f8b40faabf3f52386e7d14199c362bf2373a 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -47,12 +47,10 @@ * are fully set up. If kernel ASLR is configured, it can extend the * kernel page table mapping, reducing the size of the modules area. */ -#define KERNEL_IMAGE_SIZE_DEFAULT (512 * 1024 * 1024) -#if defined(CONFIG_RANDOMIZE_BASE) && \ - CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT -#define KERNEL_IMAGE_SIZE CONFIG_RANDOMIZE_BASE_MAX_OFFSET +#if defined(CONFIG_RANDOMIZE_BASE) +#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024) #else -#define KERNEL_IMAGE_SIZE KERNEL_IMAGE_SIZE_DEFAULT +#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) #endif #endif /* _ASM_X86_PAGE_64_DEFS_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 601f1b8f9961af35f113ad456b1ab90f1aceaced..2970d22d77663f5299a8d49af690c3cf1aa7df60 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -15,17 +15,6 @@ #include #include -static inline int paravirt_enabled(void) -{ - return pv_info.paravirt_enabled; -} - -static inline int paravirt_has_feature(unsigned int feature) -{ - WARN_ON_ONCE(!pv_info.paravirt_enabled); - return (pv_info.features & feature); -} - static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -130,21 +119,31 @@ static inline void wbinvd(void) #define get_kernel_rpl() (pv_info.kernel_rpl) -static inline u64 paravirt_read_msr(unsigned msr, int *err) +static inline u64 paravirt_read_msr(unsigned msr) +{ + return PVOP_CALL1(u64, pv_cpu_ops.read_msr, msr); +} + +static inline void paravirt_write_msr(unsigned msr, + unsigned low, unsigned high) +{ + return PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high); +} + +static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) { - return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); + return PVOP_CALL2(u64, pv_cpu_ops.read_msr_safe, msr, err); } -static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) +static inline int paravirt_write_msr_safe(unsigned msr, + unsigned low, unsigned high) { - return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); + return PVOP_CALL3(int, pv_cpu_ops.write_msr_safe, msr, low, high); } -/* These should all do BUG_ON(_err), but our headers are too tangled. */ #define rdmsr(msr, val1, val2) \ do { \ - int _err; \ - u64 _l = paravirt_read_msr(msr, &_err); \ + u64 _l = paravirt_read_msr(msr); \ val1 = (u32)_l; \ val2 = _l >> 32; \ } while (0) @@ -156,8 +155,7 @@ do { \ #define rdmsrl(msr, val) \ do { \ - int _err; \ - val = paravirt_read_msr(msr, &_err); \ + val = paravirt_read_msr(msr); \ } while (0) static inline void wrmsrl(unsigned msr, u64 val) @@ -165,23 +163,23 @@ static inline void wrmsrl(unsigned msr, u64 val) wrmsr(msr, (u32)val, (u32)(val>>32)); } -#define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b) +#define wrmsr_safe(msr, a, b) paravirt_write_msr_safe(msr, a, b) /* rdmsr with exception handling */ -#define rdmsr_safe(msr, a, b) \ -({ \ - int _err; \ - u64 _l = paravirt_read_msr(msr, &_err); \ - (*a) = (u32)_l; \ - (*b) = _l >> 32; \ - _err; \ +#define rdmsr_safe(msr, a, b) \ +({ \ + int _err; \ + u64 _l = paravirt_read_msr_safe(msr, &_err); \ + (*a) = (u32)_l; \ + (*b) = _l >> 32; \ + _err; \ }) static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; - *p = paravirt_read_msr(msr, &err); + *p = paravirt_read_msr_safe(msr, &err); return err; } diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index e8c2326478c8fabc4e03c04534c599072e2598fe..7fa9e7740ba3db18d04c8405f549183edf60e8a5 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -69,15 +69,9 @@ struct pv_info { u16 extra_user_64bit_cs; /* __USER_CS if none */ #endif - int paravirt_enabled; - unsigned int features; /* valid only if paravirt_enabled is set */ const char *name; }; -#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x) -/* Supported features */ -#define PV_SUPPORTED_RTC (1<<0) - struct pv_init_ops { /* * Patch may replace one of the defined code sequences with @@ -155,10 +149,16 @@ struct pv_cpu_ops { void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); - /* MSR, PMC and TSR operations. - err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ - u64 (*read_msr)(unsigned int msr, int *err); - int (*write_msr)(unsigned int msr, unsigned low, unsigned high); + /* Unsafe MSR operations. These will warn or panic on failure. */ + u64 (*read_msr)(unsigned int msr); + void (*write_msr)(unsigned int msr, unsigned low, unsigned high); + + /* + * Safe MSR operations. + * read sets err to 0 or -EIO. write returns 0 or -EIO. + */ + u64 (*read_msr_safe)(unsigned int msr, int *err); + int (*write_msr_safe)(unsigned int msr, unsigned low, unsigned high); u64 (*read_pmc)(int counter); diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index ca6c228d5e62837be88984b652bb436949295d03..0b1ff4c1c14e782c0375027ce99cab09e96a04fb 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -5,8 +5,8 @@ #include bool pat_enabled(void); +void pat_disable(const char *reason); extern void pat_init(void); -void pat_init_cache_modes(u64); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 97f3242e133ccc9c2866baade8ca4f82ade65f04..f86491a7bc9dd1c8c96f52f28b9befd0ff59d6ea 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -183,7 +183,7 @@ static inline int pmd_trans_huge(pmd_t pmd) static inline int has_transparent_hugepage(void) { - return cpu_has_pse; + return boot_cpu_has(X86_FEATURE_PSE); } #ifdef __HAVE_ARCH_PTE_DEVMAP diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9264476f3d578e8fa346411aad4900e85afcb41a..62c6cc3cc5d32f5490b56e4b6590fbdf74af76d9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -388,9 +388,16 @@ struct thread_struct { unsigned long ip; #endif #ifdef CONFIG_X86_64 - unsigned long fs; + unsigned long fsbase; + unsigned long gsbase; +#else + /* + * XXX: this could presumably be unsigned short. Alternatively, + * 32-bit kernels could be taught to use fsindex instead. + */ + unsigned long fs; + unsigned long gs; #endif - unsigned long gs; /* Save middle states of ptrace breakpoints */ struct perf_event *ptrace_bps[HBP_NUM]; @@ -473,8 +480,6 @@ static inline unsigned long current_top_of_stack(void) #include #else #define __cpuid native_cpuid -#define paravirt_enabled() 0 -#define paravirt_has(x) 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index ceec86eb68e963d9b6d51ca015d8753ee8bd5144..453744c1d34752c20988cf513ff1eef75c3fa657 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -99,26 +99,36 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +#define ____down_write(sem, slow_path) \ +({ \ + long tmp; \ + struct rw_semaphore* ret; \ + asm volatile("# beginning down_write\n\t" \ + LOCK_PREFIX " xadd %1,(%3)\n\t" \ + /* adds 0xffff0001, returns the old value */ \ + " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \ + /* was the active mask 0 before? */\ + " jz 1f\n" \ + " call " slow_path "\n" \ + "1:\n" \ + "# ending down_write" \ + : "+m" (sem->count), "=d" (tmp), "=a" (ret) \ + : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \ + : "memory", "cc"); \ + ret; \ +}) + +static inline void __down_write(struct rw_semaphore *sem) { - long tmp; - asm volatile("# beginning down_write\n\t" - LOCK_PREFIX " xadd %1,(%2)\n\t" - /* adds 0xffff0001, returns the old value */ - " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" - /* was the active mask 0 before? */ - " jz 1f\n" - " call call_rwsem_down_write_failed\n" - "1:\n" - "# ending down_write" - : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc"); + ____down_write(sem, "call_rwsem_down_write_failed"); } -static inline void __down_write(struct rw_semaphore *sem) +static inline int __down_write_killable(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + if (IS_ERR(____down_write(sem, "call_rwsem_down_write_failed_killable"))) + return -EINTR; + + return 0; } /* diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 7d5a1929d76b31bba69295e533e460ed50904cfd..1549caa098f0828e6c533f714f22a7f97532427e 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -2,6 +2,7 @@ #define _ASM_X86_SEGMENT_H #include +#include /* * Constructor for a conventional segment GDT (or LDT) entry. @@ -207,13 +208,6 @@ #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3) -/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */ -#define FS_TLS 0 -#define GS_TLS 1 - -#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) -#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) - #endif #ifndef CONFIG_PARAVIRT @@ -249,10 +243,13 @@ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDL #endif /* - * Load a segment. Fall back on loading the zero - * segment if something goes wrong.. + * Load a segment. Fall back on loading the zero segment if something goes + * wrong. This variant assumes that loading zero fully clears the segment. + * This is always the case on Intel CPUs and, even on 64-bit AMD CPUs, any + * failure to fully clear the cached descriptor is only observable for + * FS and GS. */ -#define loadsegment(seg, value) \ +#define __loadsegment_simple(seg, value) \ do { \ unsigned short __val = (value); \ \ @@ -269,6 +266,38 @@ do { \ : "+r" (__val) : : "memory"); \ } while (0) +#define __loadsegment_ss(value) __loadsegment_simple(ss, (value)) +#define __loadsegment_ds(value) __loadsegment_simple(ds, (value)) +#define __loadsegment_es(value) __loadsegment_simple(es, (value)) + +#ifdef CONFIG_X86_32 + +/* + * On 32-bit systems, the hidden parts of FS and GS are unobservable if + * the selector is NULL, so there's no funny business here. + */ +#define __loadsegment_fs(value) __loadsegment_simple(fs, (value)) +#define __loadsegment_gs(value) __loadsegment_simple(gs, (value)) + +#else + +static inline void __loadsegment_fs(unsigned short value) +{ + asm volatile(" \n" + "1: movw %0, %%fs \n" + "2: \n" + + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_clear_fs) + + : : "rm" (value) : "memory"); +} + +/* __loadsegment_gs is intentionally undefined. Use load_gs_index instead. */ + +#endif + +#define loadsegment(seg, value) __loadsegment_ ## seg (value) + /* * Save a segment register away: */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 11af24e09c8a667911bf9f0b4a3e17b7c75acf16..ac1d5da1473429b930ca38dc68a7875221d0c589 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -6,6 +6,7 @@ #define COMMAND_LINE_SIZE 2048 #include +#include #ifdef __i386__ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 751bf4b7bf114da12231a56f4217c2583ddeafb2..8f321a1b03a1aaa0e87c4c1182d2b2f282efa1e4 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -39,8 +39,7 @@ do { \ */ \ unsigned long ebx, ecx, edx, esi, edi; \ \ - asm volatile("pushfl\n\t" /* save flags */ \ - "pushl %%ebp\n\t" /* save EBP */ \ + asm volatile("pushl %%ebp\n\t" /* save EBP */ \ "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ @@ -49,7 +48,6 @@ do { \ "jmp __switch_to\n" /* regparm call */ \ "1:\t" \ "popl %%ebp\n\t" /* restore EBP */ \ - "popfl\n" /* restore flags */ \ \ /* output parameters */ \ : [prev_sp] "=m" (prev->thread.sp), \ diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h new file mode 100644 index 0000000000000000000000000000000000000000..90395063383ccedc28ccd903558a9910f1b804bd --- /dev/null +++ b/arch/x86/include/asm/text-patching.h @@ -0,0 +1,40 @@ +#ifndef _ASM_X86_TEXT_PATCHING_H +#define _ASM_X86_TEXT_PATCHING_H + +#include +#include +#include + +struct paravirt_patch_site; +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end); +#else +static inline void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) +{} +#define __parainstructions NULL +#define __parainstructions_end NULL +#endif + +extern void *text_poke_early(void *addr, const void *opcode, size_t len); + +/* + * Clear and restore the kernel write-protection flag on the local CPU. + * Allows the kernel to edit read-only pages. + * Side-effect: any interrupt handler running between save and restore will have + * the ability to write to read-only pages. + * + * Warning: + * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and + * no thread can be preempted in the instructions being modified (no iret to an + * invalid instruction possible) or if the instructions are changed from a + * consistent state to another consistent state atomically. + * On the local CPU you need to be protected again NMI or MCE handlers seeing an + * inconsistent instruction while you patch. + */ +extern void *text_poke(void *addr, const void *opcode, size_t len); +extern int poke_int3_handler(struct pt_regs *regs); +extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); + +#endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ffae84df8a9313cd3cc7b12953c0c24a809cd792..30c133ac05cd86d6ddba68c8993160296cafde04 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -255,7 +255,7 @@ static inline bool test_and_clear_restore_sigmask(void) return true; } -static inline bool is_ia32_task(void) +static inline bool in_ia32_syscall(void) { #ifdef CONFIG_X86_32 return true; diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 1fde8d580a5bac371c7a56cf60e688c49d2299f6..4e5be94e079a6c64353bd327c9fe4ef9796e16b1 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -181,7 +181,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) static inline void __flush_tlb_all(void) { - if (cpu_has_pge) + if (static_cpu_has(X86_FEATURE_PGE)) __flush_tlb_global(); else __flush_tlb(); diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 174c4212780afde12200fd23a1dbc17a08f40309..7428697c5b8df1d6856ca63fdda62cb047083f28 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -22,7 +22,7 @@ extern void disable_TSC(void); static inline cycles_t get_cycles(void) { #ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) + if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; #endif diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 2e7513d1f1f45eb3bb7ec4ebe459aec2fa76b46d..12f9653bde8d968c3374e6f13a2fbc2abacccda0 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -118,7 +118,7 @@ struct exception_table_entry { extern int fixup_exception(struct pt_regs *regs, int trapnr); extern bool ex_has_fault_handler(unsigned long ip); -extern int early_fixup_exception(unsigned long *ip); +extern void early_fixup_exception(struct pt_regs *regs, int trapnr); /* * These are the main single-value transfer routines. They automatically diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 71605c7d5c5c52989d64a3c9305f58b091cd15eb..c852590254d5f4191609f92fce8ee488896efe8b 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -51,15 +51,66 @@ enum { BIOS_STATUS_UNAVAIL = -EBUSY }; +/* Address map parameters */ +struct uv_gam_parameters { + u64 mmr_base; + u64 gru_base; + u8 mmr_shift; /* Convert PNode to MMR space offset */ + u8 gru_shift; /* Convert PNode to GRU space offset */ + u8 gpa_shift; /* Size of offset field in GRU phys addr */ + u8 unused1; +}; + +/* UV_TABLE_GAM_RANGE_ENTRY values */ +#define UV_GAM_RANGE_TYPE_UNUSED 0 /* End of table */ +#define UV_GAM_RANGE_TYPE_RAM 1 /* Normal RAM */ +#define UV_GAM_RANGE_TYPE_NVRAM 2 /* Non-volatile memory */ +#define UV_GAM_RANGE_TYPE_NV_WINDOW 3 /* NVMDIMM block window */ +#define UV_GAM_RANGE_TYPE_NV_MAILBOX 4 /* NVMDIMM mailbox */ +#define UV_GAM_RANGE_TYPE_HOLE 5 /* Unused address range */ +#define UV_GAM_RANGE_TYPE_MAX 6 + +/* The structure stores PA bits 56:26, for 64MB granularity */ +#define UV_GAM_RANGE_SHFT 26 /* 64MB */ + +struct uv_gam_range_entry { + char type; /* Entry type: GAM_RANGE_TYPE_UNUSED, etc. */ + char unused1; + u16 nasid; /* HNasid */ + u16 sockid; /* Socket ID, high bits of APIC ID */ + u16 pnode; /* Index to MMR and GRU spaces */ + u32 pxm; /* ACPI proximity domain number */ + u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */ +}; + +#define UV_SYSTAB_SIG "UVST" +#define UV_SYSTAB_VERSION_1 1 /* UV1/2/3 BIOS version */ +#define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */ +#define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */ +#define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */ +#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_2 + +#define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */ +#define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */ +#define UV_SYSTAB_TYPE_GAM_RNG_TBL 2 /* GAM entry table */ +#define UV_SYSTAB_TYPE_MAX 3 + /* * The UV system table describes specific firmware * capabilities available to the Linux kernel at runtime. */ struct uv_systab { - char signature[4]; /* must be "UVST" */ + char signature[4]; /* must be UV_SYSTAB_SIG */ u32 revision; /* distinguish different firmware revs */ u64 function; /* BIOS runtime callback function ptr */ + u32 size; /* systab size (starting with _VERSION_UV4) */ + struct { + u32 type:8; /* type of entry */ + u32 offset:24; /* byte offset from struct start to entry */ + } entry[1]; /* additional entries follow */ }; +extern struct uv_systab *uv_systab; +/* (... end of definitions from UV BIOS ...) */ enum { BIOS_FREQ_BASE_PLATFORM = 0, @@ -99,7 +150,11 @@ extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); +#ifdef CONFIG_EFI extern void uv_bios_init(void); +#else +void uv_bios_init(void) { } +#endif extern unsigned long sn_rtc_cycles_per_second; extern int uv_type; @@ -107,7 +162,7 @@ extern long sn_partition_id; extern long sn_coherency_id; extern long sn_region_size; extern long system_serial_number; -#define partition_coherence_id() (sn_coherency_id) +#define uv_partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index fc808b83fccb28916a27ce757c4c63f3cd53704c..cc44d926c17e326cad73c4f127c72a05d077960a 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -598,7 +598,7 @@ struct bau_control { int timeout_tries; int ipi_attempts; int conseccompletes; - short nobau; + bool nobau; short baudisabled; short cpu; short osnode; diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index ea7074784cc4717935e579610b814971127f6102..097b80c989c4a915f543106f670034a8b99c56d7 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -16,9 +16,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -103,7 +105,6 @@ * processor APICID register. */ - /* * Maximum number of bricks in all partitions and in all coherency domains. * This is the total number of bricks accessible in the numalink fabric. It @@ -127,6 +128,7 @@ */ #define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2) +/* System Controller Interface Reg info */ struct uv_scir_s { struct timer_list timer; unsigned long offset; @@ -137,71 +139,173 @@ struct uv_scir_s { unsigned char enabled; }; +/* GAM (globally addressed memory) range table */ +struct uv_gam_range_s { + u32 limit; /* PA bits 56:26 (GAM_RANGE_SHFT) */ + u16 nasid; /* node's global physical address */ + s8 base; /* entry index of node's base addr */ + u8 reserved; +}; + /* * The following defines attributes of the HUB chip. These attributes are - * frequently referenced and are kept in the per-cpu data areas of each cpu. - * They are kept together in a struct to minimize cache misses. + * frequently referenced and are kept in a common per hub struct. + * After setup, the struct is read only, so it should be readily + * available in the L3 cache on the cpu socket for the node. */ struct uv_hub_info_s { unsigned long global_mmr_base; + unsigned long global_mmr_shift; unsigned long gpa_mask; - unsigned int gnode_extra; + unsigned short *socket_to_node; + unsigned short *socket_to_pnode; + unsigned short *pnode_to_socket; + struct uv_gam_range_s *gr_table; + unsigned short min_socket; + unsigned short min_pnode; + unsigned char m_val; + unsigned char n_val; + unsigned char gr_table_len; unsigned char hub_revision; unsigned char apic_pnode_shift; + unsigned char gpa_shift; unsigned char m_shift; unsigned char n_lshift; + unsigned int gnode_extra; unsigned long gnode_upper; unsigned long lowmem_remap_top; unsigned long lowmem_remap_base; + unsigned long global_gru_base; + unsigned long global_gru_shift; unsigned short pnode; unsigned short pnode_mask; unsigned short coherency_domain_number; unsigned short numa_blade_id; - unsigned char blade_processor_id; - unsigned char m_val; - unsigned char n_val; + unsigned short nr_possible_cpus; + unsigned short nr_online_cpus; + short memory_nid; +}; + +/* CPU specific info with a pointer to the hub common info struct */ +struct uv_cpu_info_s { + void *p_uv_hub_info; + unsigned char blade_cpu_id; struct uv_scir_s scir; }; +DECLARE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info); + +#define uv_cpu_info this_cpu_ptr(&__uv_cpu_info) +#define uv_cpu_info_per(cpu) (&per_cpu(__uv_cpu_info, cpu)) + +#define uv_scir_info (&uv_cpu_info->scir) +#define uv_cpu_scir_info(cpu) (&uv_cpu_info_per(cpu)->scir) + +/* Node specific hub common info struct */ +extern void **__uv_hub_info_list; +static inline struct uv_hub_info_s *uv_hub_info_list(int node) +{ + return (struct uv_hub_info_s *)__uv_hub_info_list[node]; +} + +static inline struct uv_hub_info_s *_uv_hub_info(void) +{ + return (struct uv_hub_info_s *)uv_cpu_info->p_uv_hub_info; +} +#define uv_hub_info _uv_hub_info() -DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); -#define uv_hub_info this_cpu_ptr(&__uv_hub_info) -#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) +static inline struct uv_hub_info_s *uv_cpu_hub_info(int cpu) +{ + return (struct uv_hub_info_s *)uv_cpu_info_per(cpu)->p_uv_hub_info; +} + +#define UV_HUB_INFO_VERSION 0x7150 +extern int uv_hub_info_version(void); +static inline int uv_hub_info_check(int version) +{ + if (uv_hub_info_version() == version) + return 0; + + pr_crit("UV: uv_hub_info version(%x) mismatch, expecting(%x)\n", + uv_hub_info_version(), version); + + BUG(); /* Catastrophic - cannot continue on unknown UV system */ +} +#define _uv_hub_info_check() uv_hub_info_check(UV_HUB_INFO_VERSION) /* - * Hub revisions less than UV2_HUB_REVISION_BASE are UV1 hubs. All UV2 - * hubs have revision numbers greater than or equal to UV2_HUB_REVISION_BASE. + * HUB revision ranges for each UV HUB architecture. * This is a software convention - NOT the hardware revision numbers in * the hub chip. */ #define UV1_HUB_REVISION_BASE 1 #define UV2_HUB_REVISION_BASE 3 #define UV3_HUB_REVISION_BASE 5 +#define UV4_HUB_REVISION_BASE 7 +#ifdef UV1_HUB_IS_SUPPORTED static inline int is_uv1_hub(void) { return uv_hub_info->hub_revision < UV2_HUB_REVISION_BASE; } +#else +static inline int is_uv1_hub(void) +{ + return 0; +} +#endif +#ifdef UV2_HUB_IS_SUPPORTED static inline int is_uv2_hub(void) { return ((uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE) && (uv_hub_info->hub_revision < UV3_HUB_REVISION_BASE)); } +#else +static inline int is_uv2_hub(void) +{ + return 0; +} +#endif +#ifdef UV3_HUB_IS_SUPPORTED +static inline int is_uv3_hub(void) +{ + return ((uv_hub_info->hub_revision >= UV3_HUB_REVISION_BASE) && + (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE)); +} +#else static inline int is_uv3_hub(void) { - return uv_hub_info->hub_revision >= UV3_HUB_REVISION_BASE; + return 0; } +#endif -static inline int is_uv_hub(void) +#ifdef UV4_HUB_IS_SUPPORTED +static inline int is_uv4_hub(void) { - return uv_hub_info->hub_revision; + return uv_hub_info->hub_revision >= UV4_HUB_REVISION_BASE; } +#else +static inline int is_uv4_hub(void) +{ + return 0; +} +#endif -/* code common to uv2 and uv3 only */ static inline int is_uvx_hub(void) { - return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE; + if (uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE) + return uv_hub_info->hub_revision; + + return 0; +} + +static inline int is_uv_hub(void) +{ +#ifdef UV1_HUB_IS_SUPPORTED + return uv_hub_info->hub_revision; +#endif + return is_uvx_hub(); } union uvh_apicid { @@ -243,24 +347,42 @@ union uvh_apicid { #define UV3_LOCAL_MMR_SIZE (32UL * 1024 * 1024) #define UV3_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024) -#define UV_LOCAL_MMR_BASE (is_uv1_hub() ? UV1_LOCAL_MMR_BASE : \ - (is_uv2_hub() ? UV2_LOCAL_MMR_BASE : \ - UV3_LOCAL_MMR_BASE)) -#define UV_GLOBAL_MMR32_BASE (is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE :\ - (is_uv2_hub() ? UV2_GLOBAL_MMR32_BASE :\ - UV3_GLOBAL_MMR32_BASE)) -#define UV_LOCAL_MMR_SIZE (is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \ - (is_uv2_hub() ? UV2_LOCAL_MMR_SIZE : \ - UV3_LOCAL_MMR_SIZE)) -#define UV_GLOBAL_MMR32_SIZE (is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE :\ - (is_uv2_hub() ? UV2_GLOBAL_MMR32_SIZE :\ - UV3_GLOBAL_MMR32_SIZE)) +#define UV4_LOCAL_MMR_BASE 0xfa000000UL +#define UV4_GLOBAL_MMR32_BASE 0xfc000000UL +#define UV4_LOCAL_MMR_SIZE (32UL * 1024 * 1024) +#define UV4_GLOBAL_MMR32_SIZE (16UL * 1024 * 1024) + +#define UV_LOCAL_MMR_BASE ( \ + is_uv1_hub() ? UV1_LOCAL_MMR_BASE : \ + is_uv2_hub() ? UV2_LOCAL_MMR_BASE : \ + is_uv3_hub() ? UV3_LOCAL_MMR_BASE : \ + /*is_uv4_hub*/ UV4_LOCAL_MMR_BASE) + +#define UV_GLOBAL_MMR32_BASE ( \ + is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE : \ + is_uv2_hub() ? UV2_GLOBAL_MMR32_BASE : \ + is_uv3_hub() ? UV3_GLOBAL_MMR32_BASE : \ + /*is_uv4_hub*/ UV4_GLOBAL_MMR32_BASE) + +#define UV_LOCAL_MMR_SIZE ( \ + is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \ + is_uv2_hub() ? UV2_LOCAL_MMR_SIZE : \ + is_uv3_hub() ? UV3_LOCAL_MMR_SIZE : \ + /*is_uv4_hub*/ UV4_LOCAL_MMR_SIZE) + +#define UV_GLOBAL_MMR32_SIZE ( \ + is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE : \ + is_uv2_hub() ? UV2_GLOBAL_MMR32_SIZE : \ + is_uv3_hub() ? UV3_GLOBAL_MMR32_SIZE : \ + /*is_uv4_hub*/ UV4_GLOBAL_MMR32_SIZE) + #define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base) #define UV_GLOBAL_GRU_MMR_BASE 0x4000000 #define UV_GLOBAL_MMR32_PNODE_SHIFT 15 -#define UV_GLOBAL_MMR64_PNODE_SHIFT 26 +#define _UV_GLOBAL_MMR64_PNODE_SHIFT 26 +#define UV_GLOBAL_MMR64_PNODE_SHIFT (uv_hub_info->global_mmr_shift) #define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) @@ -307,18 +429,74 @@ union uvh_apicid { * between socket virtual and socket physical addresses. */ +/* global bits offset - number of local address bits in gpa for this UV arch */ +static inline unsigned int uv_gpa_shift(void) +{ + return uv_hub_info->gpa_shift; +} +#define _uv_gpa_shift + +/* Find node that has the address range that contains global address */ +static inline struct uv_gam_range_s *uv_gam_range(unsigned long pa) +{ + struct uv_gam_range_s *gr = uv_hub_info->gr_table; + unsigned long pal = (pa & uv_hub_info->gpa_mask) >> UV_GAM_RANGE_SHFT; + int i, num = uv_hub_info->gr_table_len; + + if (gr) { + for (i = 0; i < num; i++, gr++) { + if (pal < gr->limit) + return gr; + } + } + pr_crit("UV: GAM Range for 0x%lx not found at %p!\n", pa, gr); + BUG(); +} + +/* Return base address of node that contains global address */ +static inline unsigned long uv_gam_range_base(unsigned long pa) +{ + struct uv_gam_range_s *gr = uv_gam_range(pa); + int base = gr->base; + + if (base < 0) + return 0UL; + + return uv_hub_info->gr_table[base].limit; +} + +/* socket phys RAM --> UV global NASID (UV4+) */ +static inline unsigned long uv_soc_phys_ram_to_nasid(unsigned long paddr) +{ + return uv_gam_range(paddr)->nasid; +} +#define _uv_soc_phys_ram_to_nasid + +/* socket virtual --> UV global NASID (UV4+) */ +static inline unsigned long uv_gpa_nasid(void *v) +{ + return uv_soc_phys_ram_to_nasid(__pa(v)); +} + /* socket phys RAM --> UV global physical address */ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) { + unsigned int m_val = uv_hub_info->m_val; + if (paddr < uv_hub_info->lowmem_remap_top) paddr |= uv_hub_info->lowmem_remap_base; paddr |= uv_hub_info->gnode_upper; - paddr = ((paddr << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | - ((paddr >> uv_hub_info->m_val) << uv_hub_info->n_lshift); + if (m_val) + paddr = ((paddr << uv_hub_info->m_shift) + >> uv_hub_info->m_shift) | + ((paddr >> uv_hub_info->m_val) + << uv_hub_info->n_lshift); + else + paddr |= uv_soc_phys_ram_to_nasid(paddr) + << uv_hub_info->gpa_shift; return paddr; } - /* socket virtual --> UV global physical address */ static inline unsigned long uv_gpa(void *v) { @@ -338,54 +516,89 @@ static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa) unsigned long paddr; unsigned long remap_base = uv_hub_info->lowmem_remap_base; unsigned long remap_top = uv_hub_info->lowmem_remap_top; + unsigned int m_val = uv_hub_info->m_val; + + if (m_val) + gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | + ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val); - gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | - ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val); paddr = gpa & uv_hub_info->gpa_mask; if (paddr >= remap_base && paddr < remap_base + remap_top) paddr -= remap_base; return paddr; } - -/* gpa -> pnode */ +/* gpa -> gnode */ static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) { - return gpa >> uv_hub_info->n_lshift; + unsigned int n_lshift = uv_hub_info->n_lshift; + + if (n_lshift) + return gpa >> n_lshift; + + return uv_gam_range(gpa)->nasid >> 1; } /* gpa -> pnode */ static inline int uv_gpa_to_pnode(unsigned long gpa) { - unsigned long n_mask = (1UL << uv_hub_info->n_val) - 1; - - return uv_gpa_to_gnode(gpa) & n_mask; + return uv_gpa_to_gnode(gpa) & uv_hub_info->pnode_mask; } -/* gpa -> node offset*/ +/* gpa -> node offset */ static inline unsigned long uv_gpa_to_offset(unsigned long gpa) { - return (gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift; + unsigned int m_shift = uv_hub_info->m_shift; + + if (m_shift) + return (gpa << m_shift) >> m_shift; + + return (gpa & uv_hub_info->gpa_mask) - uv_gam_range_base(gpa); +} + +/* Convert socket to node */ +static inline int _uv_socket_to_node(int socket, unsigned short *s2nid) +{ + return s2nid ? s2nid[socket - uv_hub_info->min_socket] : socket; +} + +static inline int uv_socket_to_node(int socket) +{ + return _uv_socket_to_node(socket, uv_hub_info->socket_to_node); } /* pnode, offset --> socket virtual */ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) { - return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset); -} + unsigned int m_val = uv_hub_info->m_val; + unsigned long base; + unsigned short sockid, node, *p2s; + if (m_val) + return __va(((unsigned long)pnode << m_val) | offset); -/* - * Extract a PNODE from an APICID (full apicid, not processor subset) - */ + p2s = uv_hub_info->pnode_to_socket; + sockid = p2s ? p2s[pnode - uv_hub_info->min_pnode] : pnode; + node = uv_socket_to_node(sockid); + + /* limit address of previous socket is our base, except node 0 is 0 */ + if (!node) + return __va((unsigned long)offset); + + base = (unsigned long)(uv_hub_info->gr_table[node - 1].limit); + return __va(base << UV_GAM_RANGE_SHFT | offset); +} + +/* Extract/Convert a PNODE from an APICID (full apicid, not processor subset) */ static inline int uv_apicid_to_pnode(int apicid) { - return (apicid >> uv_hub_info->apic_pnode_shift); + int pnode = apicid >> uv_hub_info->apic_pnode_shift; + unsigned short *s2pn = uv_hub_info->socket_to_pnode; + + return s2pn ? s2pn[pnode - uv_hub_info->min_socket] : pnode; } -/* - * Convert an apicid to the socket number on the blade - */ +/* Convert an apicid to the socket number on the blade */ static inline int uv_apicid_to_socket(int apicid) { if (is_uv1_hub()) @@ -434,16 +647,6 @@ static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset return readq(uv_global_mmr64_address(pnode, offset)); } -/* - * Global MMR space addresses when referenced by the GRU. (GRU does - * NOT use socket addressing). - */ -static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset) -{ - return UV_GLOBAL_GRU_MMR_BASE | offset | - ((unsigned long)pnode << uv_hub_info->m_val); -} - static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val) { writeb(val, uv_global_mmr64_address(pnode, offset)); @@ -483,27 +686,23 @@ static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val) writeb(val, uv_local_mmr_address(offset)); } -/* - * Structures and definitions for converting between cpu, node, pnode, and blade - * numbers. - */ -struct uv_blade_info { - unsigned short nr_possible_cpus; - unsigned short nr_online_cpus; - unsigned short pnode; - short memory_nid; - spinlock_t nmi_lock; /* obsolete, see uv_hub_nmi */ - unsigned long nmi_count; /* obsolete, see uv_hub_nmi */ -}; -extern struct uv_blade_info *uv_blade_info; -extern short *uv_node_to_blade; -extern short *uv_cpu_to_blade; -extern short uv_possible_blades; - /* Blade-local cpu number of current cpu. Numbered 0 .. <# cpus on the blade> */ static inline int uv_blade_processor_id(void) { - return uv_hub_info->blade_processor_id; + return uv_cpu_info->blade_cpu_id; +} + +/* Blade-local cpu number of cpu N. Numbered 0 .. <# cpus on the blade> */ +static inline int uv_cpu_blade_processor_id(int cpu) +{ + return uv_cpu_info_per(cpu)->blade_cpu_id; +} +#define _uv_cpu_blade_processor_id 1 /* indicate function available */ + +/* Blade number to Node number (UV1..UV4 is 1:1) */ +static inline int uv_blade_to_node(int blade) +{ + return blade; } /* Blade number of current cpu. Numnbered 0 .. <#blades -1> */ @@ -512,55 +711,60 @@ static inline int uv_numa_blade_id(void) return uv_hub_info->numa_blade_id; } -/* Convert a cpu number to the the UV blade number */ -static inline int uv_cpu_to_blade_id(int cpu) +/* + * Convert linux node number to the UV blade number. + * .. Currently for UV1 thru UV4 the node and the blade are identical. + * .. If this changes then you MUST check references to this function! + */ +static inline int uv_node_to_blade_id(int nid) { - return uv_cpu_to_blade[cpu]; + return nid; } -/* Convert linux node number to the UV blade number */ -static inline int uv_node_to_blade_id(int nid) +/* Convert a cpu number to the the UV blade number */ +static inline int uv_cpu_to_blade_id(int cpu) { - return uv_node_to_blade[nid]; + return uv_node_to_blade_id(cpu_to_node(cpu)); } /* Convert a blade id to the PNODE of the blade */ static inline int uv_blade_to_pnode(int bid) { - return uv_blade_info[bid].pnode; + return uv_hub_info_list(uv_blade_to_node(bid))->pnode; } /* Nid of memory node on blade. -1 if no blade-local memory */ static inline int uv_blade_to_memory_nid(int bid) { - return uv_blade_info[bid].memory_nid; + return uv_hub_info_list(uv_blade_to_node(bid))->memory_nid; } /* Determine the number of possible cpus on a blade */ static inline int uv_blade_nr_possible_cpus(int bid) { - return uv_blade_info[bid].nr_possible_cpus; + return uv_hub_info_list(uv_blade_to_node(bid))->nr_possible_cpus; } /* Determine the number of online cpus on a blade */ static inline int uv_blade_nr_online_cpus(int bid) { - return uv_blade_info[bid].nr_online_cpus; + return uv_hub_info_list(uv_blade_to_node(bid))->nr_online_cpus; } /* Convert a cpu id to the PNODE of the blade containing the cpu */ static inline int uv_cpu_to_pnode(int cpu) { - return uv_blade_info[uv_cpu_to_blade_id(cpu)].pnode; + return uv_cpu_hub_info(cpu)->pnode; } /* Convert a linux node number to the PNODE of the blade */ static inline int uv_node_to_pnode(int nid) { - return uv_blade_info[uv_node_to_blade_id(nid)].pnode; + return uv_hub_info_list(nid)->pnode; } /* Maximum possible number of blades */ +extern short uv_possible_blades; static inline int uv_num_possible_blades(void) { return uv_possible_blades; @@ -578,9 +782,7 @@ extern void uv_nmi_setup(void); /* Newer SMM NMI handler, not present in all systems */ #define UVH_NMI_MMRX UVH_EVENT_OCCURRED0 #define UVH_NMI_MMRX_CLEAR UVH_EVENT_OCCURRED0_ALIAS -#define UVH_NMI_MMRX_SHIFT (is_uv1_hub() ? \ - UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT :\ - UVXH_EVENT_OCCURRED0_EXTIO_INT0_SHFT) +#define UVH_NMI_MMRX_SHIFT UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT #define UVH_NMI_MMRX_TYPE "EXTIO_INT0" /* Non-zero indicates newer SMM NMI handler present */ @@ -622,9 +824,9 @@ DECLARE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); /* Update SCIR state */ static inline void uv_set_scir_bits(unsigned char value) { - if (uv_hub_info->scir.state != value) { - uv_hub_info->scir.state = value; - uv_write_local_mmr8(uv_hub_info->scir.offset, value); + if (uv_scir_info->state != value) { + uv_scir_info->state = value; + uv_write_local_mmr8(uv_scir_info->offset, value); } } @@ -635,10 +837,10 @@ static inline unsigned long uv_scir_offset(int apicid) static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) { - if (uv_cpu_hub_info(cpu)->scir.state != value) { + if (uv_cpu_scir_info(cpu)->state != value) { uv_write_global_mmr8(uv_cpu_to_pnode(cpu), - uv_cpu_hub_info(cpu)->scir.offset, value); - uv_cpu_hub_info(cpu)->scir.state = value; + uv_cpu_scir_info(cpu)->offset, value); + uv_cpu_scir_info(cpu)->state = value; } } @@ -666,10 +868,7 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) /* * Get the minimum revision number of the hub chips within the partition. - * 1 - UV1 rev 1.0 initial silicon - * 2 - UV1 rev 2.0 production silicon - * 3 - UV2 rev 1.0 initial silicon - * 5 - UV3 rev 1.0 initial silicon + * (See UVx_HUB_REVISION_BASE above for specific values.) */ static inline int uv_get_min_hub_revision_id(void) { diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index ddd8db6b6e70c9e44065410d81710867ded2e156..548d684a7960d39b5afef2d18694dc5b9c0b52d8 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -5,7 +5,7 @@ * * SGI UV MMR definitions * - * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2016 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_MMRS_H @@ -18,10 +18,11 @@ * grouped by architecture types. * * UVH - definitions common to all UV hub types. - * UVXH - definitions common to all UV eXtended hub types (currently 2 & 3). + * UVXH - definitions common to all UV eXtended hub types (currently 2, 3, 4). * UV1H - definitions specific to UV type 1 hub. * UV2H - definitions specific to UV type 2 hub. * UV3H - definitions specific to UV type 3 hub. + * UV4H - definitions specific to UV type 4 hub. * * So in general, MMR addresses and structures are identical on all hubs types. * These MMRs are identified as: @@ -32,19 +33,25 @@ * } s; * }; * - * If the MMR exists on all hub types but have different addresses: + * If the MMR exists on all hub types but have different addresses, + * use a conditional operator to define the value at runtime. * #define UV1Hxxx a * #define UV2Hxxx b * #define UV3Hxxx c + * #define UV4Hxxx d * #define UVHxxx (is_uv1_hub() ? UV1Hxxx : * (is_uv2_hub() ? UV2Hxxx : - * UV3Hxxx)) + * (is_uv3_hub() ? UV3Hxxx : + * UV4Hxxx)) * - * If the MMR exists on all hub types > 1 but have different addresses: + * If the MMR exists on all hub types > 1 but have different addresses, the + * variation using "UVX" as the prefix exists. * #define UV2Hxxx b * #define UV3Hxxx c - * #define UVXHxxx (is_uv2_hub() ? UV2Hxxx : - * UV3Hxxx)) + * #define UV4Hxxx d + * #define UVHxxx (is_uv2_hub() ? UV2Hxxx : + * (is_uv3_hub() ? UV3Hxxx : + * UV4Hxxx)) * * union uvh_xxx { * unsigned long v; @@ -56,6 +63,8 @@ * } s2; * struct uv3h_xxx_s { # Full UV3 definition (*) * } s3; + * struct uv4h_xxx_s { # Full UV4 definition (*) + * } s4; * }; * (* - if present and different than the common struct) * @@ -73,7 +82,7 @@ * } sn; * }; * - * (GEN Flags: mflags_opt= undefs=0 UV23=UVXH) + * (GEN Flags: mflags_opt= undefs=function UV234=UVXH) */ #define UV_MMR_ENABLE (1UL << 63) @@ -83,20 +92,36 @@ #define UV2_HUB_PART_NUMBER_X 0x1111 #define UV3_HUB_PART_NUMBER 0x9578 #define UV3_HUB_PART_NUMBER_X 0x4321 +#define UV4_HUB_PART_NUMBER 0x99a1 /* Compat: Indicate which UV Hubs are supported. */ +#define UV1_HUB_IS_SUPPORTED 1 #define UV2_HUB_IS_SUPPORTED 1 #define UV3_HUB_IS_SUPPORTED 1 +#define UV4_HUB_IS_SUPPORTED 1 + +/* Error function to catch undefined references */ +extern unsigned long uv_undefined(char *str); /* ========================================================================= */ /* UVH_BAU_DATA_BROADCAST */ /* ========================================================================= */ #define UVH_BAU_DATA_BROADCAST 0x61688UL -#define UVH_BAU_DATA_BROADCAST_32 0x440 + +#define UV1H_BAU_DATA_BROADCAST_32 0x440 +#define UV2H_BAU_DATA_BROADCAST_32 0x440 +#define UV3H_BAU_DATA_BROADCAST_32 0x440 +#define UV4H_BAU_DATA_BROADCAST_32 0x360 +#define UVH_BAU_DATA_BROADCAST_32 ( \ + is_uv1_hub() ? UV1H_BAU_DATA_BROADCAST_32 : \ + is_uv2_hub() ? UV2H_BAU_DATA_BROADCAST_32 : \ + is_uv3_hub() ? UV3H_BAU_DATA_BROADCAST_32 : \ + /*is_uv4_hub*/ UV4H_BAU_DATA_BROADCAST_32) #define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0 #define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL + union uvh_bau_data_broadcast_u { unsigned long v; struct uvh_bau_data_broadcast_s { @@ -109,7 +134,16 @@ union uvh_bau_data_broadcast_u { /* UVH_BAU_DATA_CONFIG */ /* ========================================================================= */ #define UVH_BAU_DATA_CONFIG 0x61680UL -#define UVH_BAU_DATA_CONFIG_32 0x438 + +#define UV1H_BAU_DATA_CONFIG_32 0x438 +#define UV2H_BAU_DATA_CONFIG_32 0x438 +#define UV3H_BAU_DATA_CONFIG_32 0x438 +#define UV4H_BAU_DATA_CONFIG_32 0x358 +#define UVH_BAU_DATA_CONFIG_32 ( \ + is_uv1_hub() ? UV1H_BAU_DATA_CONFIG_32 : \ + is_uv2_hub() ? UV2H_BAU_DATA_CONFIG_32 : \ + is_uv3_hub() ? UV3H_BAU_DATA_CONFIG_32 : \ + /*is_uv4_hub*/ UV4H_BAU_DATA_CONFIG_32) #define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 #define UVH_BAU_DATA_CONFIG_DM_SHFT 8 @@ -128,6 +162,7 @@ union uvh_bau_data_broadcast_u { #define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL #define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_bau_data_config_u { unsigned long v; struct uvh_bau_data_config_s { @@ -266,7 +301,6 @@ union uvh_bau_data_config_u { #define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL #define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL -#define UVXH_EVENT_OCCURRED0_QP_HCERR_SHFT 1 #define UVXH_EVENT_OCCURRED0_RH_HCERR_SHFT 2 #define UVXH_EVENT_OCCURRED0_LH0_HCERR_SHFT 3 #define UVXH_EVENT_OCCURRED0_LH1_HCERR_SHFT 4 @@ -275,55 +309,11 @@ union uvh_bau_data_config_u { #define UVXH_EVENT_OCCURRED0_NI0_HCERR_SHFT 7 #define UVXH_EVENT_OCCURRED0_NI1_HCERR_SHFT 8 #define UVXH_EVENT_OCCURRED0_LB_AOERR0_SHFT 9 -#define UVXH_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 #define UVXH_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12 #define UVXH_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13 #define UVXH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14 #define UVXH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15 #define UVXH_EVENT_OCCURRED0_XB_AOERR0_SHFT 16 -#define UVXH_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 -#define UVXH_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 -#define UVXH_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 -#define UVXH_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 -#define UVXH_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 -#define UVXH_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 -#define UVXH_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 -#define UVXH_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 -#define UVXH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 -#define UVXH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 -#define UVXH_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 -#define UVXH_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 -#define UVXH_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 -#define UVXH_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 -#define UVXH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 -#define UVXH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 -#define UVXH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 -#define UVXH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 -#define UVXH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 -#define UVXH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 -#define UVXH_EVENT_OCCURRED0_IPI_INT_SHFT 53 -#define UVXH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 -#define UVXH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 -#define UVXH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 -#define UVXH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 -#define UVXH_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 -#define UVXH_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL #define UVXH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL #define UVXH_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL #define UVXH_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL @@ -332,54 +322,294 @@ union uvh_bau_data_config_u { #define UVXH_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL #define UVXH_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL #define UVXH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL -#define UVXH_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL #define UVXH_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL #define UVXH_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL #define UVXH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL #define UVXH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL #define UVXH_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL -#define UVXH_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL -#define UVXH_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL -#define UVXH_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL -#define UVXH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL -#define UVXH_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL -#define UVXH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL -#define UVXH_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL -#define UVXH_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL -#define UVXH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL -#define UVXH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL -#define UVXH_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL -#define UVXH_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL -#define UVXH_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL -#define UVXH_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL -#define UVXH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL -#define UVXH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL -#define UVXH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL -#define UVXH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL -#define UVXH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL -#define UVXH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL -#define UVXH_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL -#define UVXH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL -#define UVXH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL -#define UVXH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL -#define UVXH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL -#define UVXH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL + +#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 +#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 +#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 +#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 +#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 +#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 +#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 +#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 +#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 +#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 +#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 +#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 +#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 +#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 +#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 +#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 +#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 +#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 +#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 +#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 +#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 +#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 +#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 +#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 +#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL +#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL +#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL +#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL +#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL +#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL +#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL +#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL +#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL +#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL +#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL +#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL +#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL +#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL +#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL +#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL +#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL +#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL +#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL +#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL +#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL +#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL +#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL +#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL + +#define UV3H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 +#define UV3H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 +#define UV3H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 +#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 +#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 +#define UV3H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 +#define UV3H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 +#define UV3H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 +#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 +#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 +#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 +#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 +#define UV3H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 +#define UV3H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 +#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 +#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 +#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 +#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 +#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 +#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 +#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 +#define UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 +#define UV3H_EVENT_OCCURRED0_IPI_INT_SHFT 53 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 +#define UV3H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 +#define UV3H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL +#define UV3H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL +#define UV3H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL +#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL +#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL +#define UV3H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL +#define UV3H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL +#define UV3H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL +#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL +#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL +#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL +#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL +#define UV3H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL +#define UV3H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL +#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL +#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL +#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL +#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL +#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL +#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL +#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL +#define UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL +#define UV3H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL +#define UV3H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL + +#define UV4H_EVENT_OCCURRED0_KT_HCERR_SHFT 1 +#define UV4H_EVENT_OCCURRED0_KT_AOERR0_SHFT 10 +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR0_SHFT 17 +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR0_SHFT 18 +#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR0_SHFT 19 +#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR0_SHFT 20 +#define UV4H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 21 +#define UV4H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 22 +#define UV4H_EVENT_OCCURRED0_LB_AOERR1_SHFT 23 +#define UV4H_EVENT_OCCURRED0_KT_AOERR1_SHFT 24 +#define UV4H_EVENT_OCCURRED0_RH_AOERR1_SHFT 25 +#define UV4H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 26 +#define UV4H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 27 +#define UV4H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 28 +#define UV4H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 29 +#define UV4H_EVENT_OCCURRED0_XB_AOERR1_SHFT 30 +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR1_SHFT 31 +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR1_SHFT 32 +#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR1_SHFT 33 +#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR1_SHFT 34 +#define UV4H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 35 +#define UV4H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 36 +#define UV4H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 37 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 38 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 39 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 40 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 41 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 42 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 43 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 44 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 45 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 46 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 47 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 48 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 49 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 50 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 51 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 52 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 53 +#define UV4H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 54 +#define UV4H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 55 +#define UV4H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 56 +#define UV4H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 57 +#define UV4H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 58 +#define UV4H_EVENT_OCCURRED0_IPI_INT_SHFT 59 +#define UV4H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 60 +#define UV4H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 61 +#define UV4H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 62 +#define UV4H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 63 +#define UV4H_EVENT_OCCURRED0_KT_HCERR_MASK 0x0000000000000002UL +#define UV4H_EVENT_OCCURRED0_KT_AOERR0_MASK 0x0000000000000400UL +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR0_MASK 0x0000000000020000UL +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR0_MASK 0x0000000000040000UL +#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR0_MASK 0x0000000000080000UL +#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR0_MASK 0x0000000000100000UL +#define UV4H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000200000UL +#define UV4H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000400000UL +#define UV4H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000800000UL +#define UV4H_EVENT_OCCURRED0_KT_AOERR1_MASK 0x0000000001000000UL +#define UV4H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000002000000UL +#define UV4H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000004000000UL +#define UV4H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000008000000UL +#define UV4H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000010000000UL +#define UV4H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000020000000UL +#define UV4H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000040000000UL +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR1_MASK 0x0000000080000000UL +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR1_MASK 0x0000000100000000UL +#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR1_MASK 0x0000000200000000UL +#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR1_MASK 0x0000000400000000UL +#define UV4H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000800000000UL +#define UV4H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000001000000000UL +#define UV4H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000002000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000004000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000008000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000010000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000020000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000040000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000080000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000100000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000200000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000400000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000800000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0001000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0002000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0004000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0008000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0010000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0020000000000000UL +#define UV4H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0040000000000000UL +#define UV4H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0080000000000000UL +#define UV4H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0100000000000000UL +#define UV4H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0200000000000000UL +#define UV4H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0400000000000000UL +#define UV4H_EVENT_OCCURRED0_IPI_INT_MASK 0x0800000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x1000000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x2000000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x4000000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x8000000000000000UL + +#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT ( \ + is_uv1_hub() ? UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ + is_uv2_hub() ? UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ + is_uv3_hub() ? UV3H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ + /*is_uv4_hub*/ UV4H_EVENT_OCCURRED0_EXTIO_INT0_SHFT) union uvh_event_occurred0_u { unsigned long v; @@ -391,7 +621,7 @@ union uvh_event_occurred0_u { } s; struct uvxh_event_occurred0_s { unsigned long lb_hcerr:1; /* RW */ - unsigned long qp_hcerr:1; /* RW */ + unsigned long rsvd_1:1; unsigned long rh_hcerr:1; /* RW */ unsigned long lh0_hcerr:1; /* RW */ unsigned long lh1_hcerr:1; /* RW */ @@ -400,25 +630,51 @@ union uvh_event_occurred0_u { unsigned long ni0_hcerr:1; /* RW */ unsigned long ni1_hcerr:1; /* RW */ unsigned long lb_aoerr0:1; /* RW */ - unsigned long qp_aoerr0:1; /* RW */ + unsigned long rsvd_10:1; unsigned long rh_aoerr0:1; /* RW */ unsigned long lh0_aoerr0:1; /* RW */ unsigned long lh1_aoerr0:1; /* RW */ unsigned long gr0_aoerr0:1; /* RW */ unsigned long gr1_aoerr0:1; /* RW */ unsigned long xb_aoerr0:1; /* RW */ - unsigned long rt_aoerr0:1; /* RW */ + unsigned long rsvd_17_63:47; + } sx; + struct uv4h_event_occurred0_s { + unsigned long lb_hcerr:1; /* RW */ + unsigned long kt_hcerr:1; /* RW */ + unsigned long rh_hcerr:1; /* RW */ + unsigned long lh0_hcerr:1; /* RW */ + unsigned long lh1_hcerr:1; /* RW */ + unsigned long gr0_hcerr:1; /* RW */ + unsigned long gr1_hcerr:1; /* RW */ + unsigned long ni0_hcerr:1; /* RW */ + unsigned long ni1_hcerr:1; /* RW */ + unsigned long lb_aoerr0:1; /* RW */ + unsigned long kt_aoerr0:1; /* RW */ + unsigned long rh_aoerr0:1; /* RW */ + unsigned long lh0_aoerr0:1; /* RW */ + unsigned long lh1_aoerr0:1; /* RW */ + unsigned long gr0_aoerr0:1; /* RW */ + unsigned long gr1_aoerr0:1; /* RW */ + unsigned long xb_aoerr0:1; /* RW */ + unsigned long rtq0_aoerr0:1; /* RW */ + unsigned long rtq1_aoerr0:1; /* RW */ + unsigned long rtq2_aoerr0:1; /* RW */ + unsigned long rtq3_aoerr0:1; /* RW */ unsigned long ni0_aoerr0:1; /* RW */ unsigned long ni1_aoerr0:1; /* RW */ unsigned long lb_aoerr1:1; /* RW */ - unsigned long qp_aoerr1:1; /* RW */ + unsigned long kt_aoerr1:1; /* RW */ unsigned long rh_aoerr1:1; /* RW */ unsigned long lh0_aoerr1:1; /* RW */ unsigned long lh1_aoerr1:1; /* RW */ unsigned long gr0_aoerr1:1; /* RW */ unsigned long gr1_aoerr1:1; /* RW */ unsigned long xb_aoerr1:1; /* RW */ - unsigned long rt_aoerr1:1; /* RW */ + unsigned long rtq0_aoerr1:1; /* RW */ + unsigned long rtq1_aoerr1:1; /* RW */ + unsigned long rtq2_aoerr1:1; /* RW */ + unsigned long rtq3_aoerr1:1; /* RW */ unsigned long ni0_aoerr1:1; /* RW */ unsigned long ni1_aoerr1:1; /* RW */ unsigned long system_shutdown_int:1; /* RW */ @@ -448,9 +704,7 @@ union uvh_event_occurred0_u { unsigned long extio_int1:1; /* RW */ unsigned long extio_int2:1; /* RW */ unsigned long extio_int3:1; /* RW */ - unsigned long profile_int:1; /* RW */ - unsigned long rsvd_59_63:5; - } sx; + } s4; }; /* ========================================================================= */ @@ -464,11 +718,21 @@ union uvh_event_occurred0_u { /* UVH_EXTIO_INT0_BROADCAST */ /* ========================================================================= */ #define UVH_EXTIO_INT0_BROADCAST 0x61448UL -#define UVH_EXTIO_INT0_BROADCAST_32 0x3f0 + +#define UV1H_EXTIO_INT0_BROADCAST_32 0x3f0 +#define UV2H_EXTIO_INT0_BROADCAST_32 0x3f0 +#define UV3H_EXTIO_INT0_BROADCAST_32 0x3f0 +#define UV4H_EXTIO_INT0_BROADCAST_32 0x310 +#define UVH_EXTIO_INT0_BROADCAST_32 ( \ + is_uv1_hub() ? UV1H_EXTIO_INT0_BROADCAST_32 : \ + is_uv2_hub() ? UV2H_EXTIO_INT0_BROADCAST_32 : \ + is_uv3_hub() ? UV3H_EXTIO_INT0_BROADCAST_32 : \ + /*is_uv4_hub*/ UV4H_EXTIO_INT0_BROADCAST_32) #define UVH_EXTIO_INT0_BROADCAST_ENABLE_SHFT 0 #define UVH_EXTIO_INT0_BROADCAST_ENABLE_MASK 0x0000000000000001UL + union uvh_extio_int0_broadcast_u { unsigned long v; struct uvh_extio_int0_broadcast_s { @@ -499,6 +763,7 @@ union uvh_extio_int0_broadcast_u { #define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL #define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_gr0_tlb_int0_config_u { unsigned long v; struct uvh_gr0_tlb_int0_config_s { @@ -537,6 +802,7 @@ union uvh_gr0_tlb_int0_config_u { #define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL #define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_gr0_tlb_int1_config_u { unsigned long v; struct uvh_gr0_tlb_int1_config_s { @@ -559,19 +825,18 @@ union uvh_gr0_tlb_int1_config_u { #define UV1H_GR0_TLB_MMR_CONTROL 0x401080UL #define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL #define UV3H_GR0_TLB_MMR_CONTROL 0xc01080UL -#define UVH_GR0_TLB_MMR_CONTROL \ - (is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL : \ - (is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL : \ - UV3H_GR0_TLB_MMR_CONTROL)) +#define UV4H_GR0_TLB_MMR_CONTROL 0x601080UL +#define UVH_GR0_TLB_MMR_CONTROL ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL) #define UVH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 #define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 #define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL #define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL #define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL #define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL @@ -601,14 +866,11 @@ union uvh_gr0_tlb_int1_config_u { #define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL #define UVXH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVXH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 #define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 #define UVXH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 #define UVXH_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UVXH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UVXH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL #define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL #define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL #define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL @@ -651,12 +913,45 @@ union uvh_gr0_tlb_int1_config_u { #define UV3H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL #define UV3H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV4H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 13 +#define UV4H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV4H_GR0_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UV4H_GR0_TLB_MMR_CONTROL_PAGE_SIZE_SHFT 59 +#define UV4H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000001fffUL +#define UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000006000UL +#define UV4H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV4H_GR0_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV4H_GR0_TLB_MMR_CONTROL_PAGE_SIZE_MASK 0xf800000000000000UL + +#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_INDEX_MASK) +#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK) +#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT) + union uvh_gr0_tlb_mmr_control_u { unsigned long v; struct uvh_gr0_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; + unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ unsigned long rsvd_17_19:3; unsigned long mmr_hash_index_en:1; /* RW */ @@ -690,9 +985,7 @@ union uvh_gr0_tlb_mmr_control_u { unsigned long rsvd_61_63:3; } s1; struct uvxh_gr0_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; + unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ unsigned long rsvd_17_19:3; unsigned long mmr_hash_index_en:1; /* RW */ @@ -703,8 +996,7 @@ union uvh_gr0_tlb_mmr_control_u { unsigned long rsvd_33_47:15; unsigned long rsvd_48:1; unsigned long rsvd_49_51:3; - unsigned long rsvd_52:1; - unsigned long rsvd_53_63:11; + unsigned long rsvd_52_63:12; } sx; struct uv2h_gr0_tlb_mmr_control_s { unsigned long index:12; /* RW */ @@ -741,6 +1033,24 @@ union uvh_gr0_tlb_mmr_control_u { unsigned long undef_52:1; /* Undefined */ unsigned long rsvd_53_63:11; } s3; + struct uv4h_gr0_tlb_mmr_control_s { + unsigned long index:13; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_15:1; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long ecc_sel:1; /* RW */ + unsigned long rsvd_22_29:8; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long undef_48:1; /* Undefined */ + unsigned long rsvd_49_51:3; + unsigned long rsvd_52_58:7; + unsigned long page_size:5; /* RW */ + } s4; }; /* ========================================================================= */ @@ -749,19 +1059,14 @@ union uvh_gr0_tlb_mmr_control_u { #define UV1H_GR0_TLB_MMR_READ_DATA_HI 0x4010a0UL #define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL #define UV3H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL -#define UVH_GR0_TLB_MMR_READ_DATA_HI \ - (is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_HI : \ - (is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_HI : \ - UV3H_GR0_TLB_MMR_READ_DATA_HI)) +#define UV4H_GR0_TLB_MMR_READ_DATA_HI 0x6010a0UL +#define UVH_GR0_TLB_MMR_READ_DATA_HI ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_HI : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_HI : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_READ_DATA_HI : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_READ_DATA_HI) #define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UV1H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV1H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -773,13 +1078,6 @@ union uvh_gr0_tlb_mmr_control_u { #define UV1H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV2H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -803,15 +1101,24 @@ union uvh_gr0_tlb_mmr_control_u { #define UV3H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL #define UV3H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PNID_SHFT 34 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 49 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 51 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 52 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 53 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x00000003ffffffffUL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PNID_MASK 0x0001fffc00000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0006000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0008000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0010000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0020000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL + + union uvh_gr0_tlb_mmr_read_data_hi_u { unsigned long v; - struct uvh_gr0_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } s; struct uv1h_gr0_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -819,13 +1126,6 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { unsigned long larger:1; /* RO */ unsigned long rsvd_45_63:19; } s1; - struct uvxh_gr0_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } sx; struct uv2h_gr0_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -842,6 +1142,16 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { unsigned long undef_46_54:9; /* Undefined */ unsigned long way_ecc:9; /* RO */ } s3; + struct uv4h_gr0_tlb_mmr_read_data_hi_s { + unsigned long pfn:34; /* RO */ + unsigned long pnid:15; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long aa_ext:1; /* RO */ + unsigned long undef_54:1; /* Undefined */ + unsigned long way_ecc:9; /* RO */ + } s4; }; /* ========================================================================= */ @@ -850,10 +1160,12 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { #define UV1H_GR0_TLB_MMR_READ_DATA_LO 0x4010a8UL #define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL #define UV3H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL -#define UVH_GR0_TLB_MMR_READ_DATA_LO \ - (is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_LO : \ - (is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_LO : \ - UV3H_GR0_TLB_MMR_READ_DATA_LO)) +#define UV4H_GR0_TLB_MMR_READ_DATA_LO 0x6010a8UL +#define UVH_GR0_TLB_MMR_READ_DATA_LO ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_LO : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_LO : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_READ_DATA_LO : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_READ_DATA_LO) #define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 #define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 @@ -890,6 +1202,14 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { #define UV3H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL #define UV3H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + + union uvh_gr0_tlb_mmr_read_data_lo_u { unsigned long v; struct uvh_gr0_tlb_mmr_read_data_lo_s { @@ -917,12 +1237,25 @@ union uvh_gr0_tlb_mmr_read_data_lo_u { unsigned long asid:24; /* RO */ unsigned long valid:1; /* RO */ } s3; + struct uv4h_gr0_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s4; }; /* ========================================================================= */ /* UVH_GR1_TLB_INT0_CONFIG */ /* ========================================================================= */ -#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL +#define UV1H_GR1_TLB_INT0_CONFIG 0x61f00UL +#define UV2H_GR1_TLB_INT0_CONFIG 0x61f00UL +#define UV3H_GR1_TLB_INT0_CONFIG 0x61f00UL +#define UV4H_GR1_TLB_INT0_CONFIG 0x62100UL +#define UVH_GR1_TLB_INT0_CONFIG ( \ + is_uv1_hub() ? UV1H_GR1_TLB_INT0_CONFIG : \ + is_uv2_hub() ? UV2H_GR1_TLB_INT0_CONFIG : \ + is_uv3_hub() ? UV3H_GR1_TLB_INT0_CONFIG : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_INT0_CONFIG) #define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0 #define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8 @@ -941,6 +1274,7 @@ union uvh_gr0_tlb_mmr_read_data_lo_u { #define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL #define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_gr1_tlb_int0_config_u { unsigned long v; struct uvh_gr1_tlb_int0_config_s { @@ -960,7 +1294,15 @@ union uvh_gr1_tlb_int0_config_u { /* ========================================================================= */ /* UVH_GR1_TLB_INT1_CONFIG */ /* ========================================================================= */ -#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL +#define UV1H_GR1_TLB_INT1_CONFIG 0x61f40UL +#define UV2H_GR1_TLB_INT1_CONFIG 0x61f40UL +#define UV3H_GR1_TLB_INT1_CONFIG 0x61f40UL +#define UV4H_GR1_TLB_INT1_CONFIG 0x62140UL +#define UVH_GR1_TLB_INT1_CONFIG ( \ + is_uv1_hub() ? UV1H_GR1_TLB_INT1_CONFIG : \ + is_uv2_hub() ? UV2H_GR1_TLB_INT1_CONFIG : \ + is_uv3_hub() ? UV3H_GR1_TLB_INT1_CONFIG : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_INT1_CONFIG) #define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0 #define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8 @@ -979,6 +1321,7 @@ union uvh_gr1_tlb_int0_config_u { #define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL #define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_gr1_tlb_int1_config_u { unsigned long v; struct uvh_gr1_tlb_int1_config_s { @@ -1001,19 +1344,18 @@ union uvh_gr1_tlb_int1_config_u { #define UV1H_GR1_TLB_MMR_CONTROL 0x801080UL #define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL #define UV3H_GR1_TLB_MMR_CONTROL 0x1001080UL -#define UVH_GR1_TLB_MMR_CONTROL \ - (is_uv1_hub() ? UV1H_GR1_TLB_MMR_CONTROL : \ - (is_uv2_hub() ? UV2H_GR1_TLB_MMR_CONTROL : \ - UV3H_GR1_TLB_MMR_CONTROL)) +#define UV4H_GR1_TLB_MMR_CONTROL 0x701080UL +#define UVH_GR1_TLB_MMR_CONTROL ( \ + is_uv1_hub() ? UV1H_GR1_TLB_MMR_CONTROL : \ + is_uv2_hub() ? UV2H_GR1_TLB_MMR_CONTROL : \ + is_uv3_hub() ? UV3H_GR1_TLB_MMR_CONTROL : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_CONTROL) #define UVH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 #define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 #define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UVH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL #define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL #define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL #define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL @@ -1043,14 +1385,11 @@ union uvh_gr1_tlb_int1_config_u { #define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL #define UVXH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVXH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 #define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 #define UVXH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 #define UVXH_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UVXH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UVXH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL #define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL #define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL #define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL @@ -1093,12 +1432,30 @@ union uvh_gr1_tlb_int1_config_u { #define UV3H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL #define UV3H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV4H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV4H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 13 +#define UV4H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV4H_GR1_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UV4H_GR1_TLB_MMR_CONTROL_PAGE_SIZE_SHFT 59 +#define UV4H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000001fffUL +#define UV4H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000006000UL +#define UV4H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV4H_GR1_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV4H_GR1_TLB_MMR_CONTROL_PAGE_SIZE_MASK 0xf800000000000000UL + + union uvh_gr1_tlb_mmr_control_u { unsigned long v; struct uvh_gr1_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; + unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ unsigned long rsvd_17_19:3; unsigned long mmr_hash_index_en:1; /* RW */ @@ -1132,9 +1489,7 @@ union uvh_gr1_tlb_mmr_control_u { unsigned long rsvd_61_63:3; } s1; struct uvxh_gr1_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; + unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ unsigned long rsvd_17_19:3; unsigned long mmr_hash_index_en:1; /* RW */ @@ -1145,8 +1500,7 @@ union uvh_gr1_tlb_mmr_control_u { unsigned long rsvd_33_47:15; unsigned long rsvd_48:1; unsigned long rsvd_49_51:3; - unsigned long rsvd_52:1; - unsigned long rsvd_53_63:11; + unsigned long rsvd_52_63:12; } sx; struct uv2h_gr1_tlb_mmr_control_s { unsigned long index:12; /* RW */ @@ -1183,6 +1537,24 @@ union uvh_gr1_tlb_mmr_control_u { unsigned long undef_52:1; /* Undefined */ unsigned long rsvd_53_63:11; } s3; + struct uv4h_gr1_tlb_mmr_control_s { + unsigned long index:13; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_15:1; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long ecc_sel:1; /* RW */ + unsigned long rsvd_22_29:8; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long undef_48:1; /* Undefined */ + unsigned long rsvd_49_51:3; + unsigned long rsvd_52_58:7; + unsigned long page_size:5; /* RW */ + } s4; }; /* ========================================================================= */ @@ -1191,19 +1563,14 @@ union uvh_gr1_tlb_mmr_control_u { #define UV1H_GR1_TLB_MMR_READ_DATA_HI 0x8010a0UL #define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL #define UV3H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL -#define UVH_GR1_TLB_MMR_READ_DATA_HI \ - (is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_HI : \ - (is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_HI : \ - UV3H_GR1_TLB_MMR_READ_DATA_HI)) +#define UV4H_GR1_TLB_MMR_READ_DATA_HI 0x7010a0UL +#define UVH_GR1_TLB_MMR_READ_DATA_HI ( \ + is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_HI : \ + is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_HI : \ + is_uv3_hub() ? UV3H_GR1_TLB_MMR_READ_DATA_HI : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_READ_DATA_HI) #define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UV1H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV1H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -1215,13 +1582,6 @@ union uvh_gr1_tlb_mmr_control_u { #define UV1H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV2H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -1245,15 +1605,24 @@ union uvh_gr1_tlb_mmr_control_u { #define UV3H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL #define UV3H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PNID_SHFT 34 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 49 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 51 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 52 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 53 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x00000003ffffffffUL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PNID_MASK 0x0001fffc00000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0006000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0008000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0010000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0020000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL + + union uvh_gr1_tlb_mmr_read_data_hi_u { unsigned long v; - struct uvh_gr1_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } s; struct uv1h_gr1_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -1261,13 +1630,6 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { unsigned long larger:1; /* RO */ unsigned long rsvd_45_63:19; } s1; - struct uvxh_gr1_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } sx; struct uv2h_gr1_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -1284,6 +1646,16 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { unsigned long undef_46_54:9; /* Undefined */ unsigned long way_ecc:9; /* RO */ } s3; + struct uv4h_gr1_tlb_mmr_read_data_hi_s { + unsigned long pfn:34; /* RO */ + unsigned long pnid:15; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long aa_ext:1; /* RO */ + unsigned long undef_54:1; /* Undefined */ + unsigned long way_ecc:9; /* RO */ + } s4; }; /* ========================================================================= */ @@ -1292,10 +1664,12 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { #define UV1H_GR1_TLB_MMR_READ_DATA_LO 0x8010a8UL #define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL #define UV3H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL -#define UVH_GR1_TLB_MMR_READ_DATA_LO \ - (is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_LO : \ - (is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_LO : \ - UV3H_GR1_TLB_MMR_READ_DATA_LO)) +#define UV4H_GR1_TLB_MMR_READ_DATA_LO 0x7010a8UL +#define UVH_GR1_TLB_MMR_READ_DATA_LO ( \ + is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_LO : \ + is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_LO : \ + is_uv3_hub() ? UV3H_GR1_TLB_MMR_READ_DATA_LO : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_READ_DATA_LO) #define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 #define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 @@ -1332,6 +1706,14 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { #define UV3H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL #define UV3H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + + union uvh_gr1_tlb_mmr_read_data_lo_u { unsigned long v; struct uvh_gr1_tlb_mmr_read_data_lo_s { @@ -1359,6 +1741,11 @@ union uvh_gr1_tlb_mmr_read_data_lo_u { unsigned long asid:24; /* RO */ unsigned long valid:1; /* RO */ } s3; + struct uv4h_gr1_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s4; }; /* ========================================================================= */ @@ -1369,6 +1756,7 @@ union uvh_gr1_tlb_mmr_read_data_lo_u { #define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0 #define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL + union uvh_int_cmpb_u { unsigned long v; struct uvh_int_cmpb_s { @@ -1382,12 +1770,14 @@ union uvh_int_cmpb_u { /* ========================================================================= */ #define UVH_INT_CMPC 0x22100UL + #define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0 #define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL #define UVXH_INT_CMPC_REAL_TIME_CMP_2_SHFT 0 #define UVXH_INT_CMPC_REAL_TIME_CMP_2_MASK 0x00ffffffffffffffUL + union uvh_int_cmpc_u { unsigned long v; struct uvh_int_cmpc_s { @@ -1401,12 +1791,14 @@ union uvh_int_cmpc_u { /* ========================================================================= */ #define UVH_INT_CMPD 0x22180UL + #define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT 0 #define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL #define UVXH_INT_CMPD_REAL_TIME_CMP_3_SHFT 0 #define UVXH_INT_CMPD_REAL_TIME_CMP_3_MASK 0x00ffffffffffffffUL + union uvh_int_cmpd_u { unsigned long v; struct uvh_int_cmpd_s { @@ -1419,7 +1811,16 @@ union uvh_int_cmpd_u { /* UVH_IPI_INT */ /* ========================================================================= */ #define UVH_IPI_INT 0x60500UL -#define UVH_IPI_INT_32 0x348 + +#define UV1H_IPI_INT_32 0x348 +#define UV2H_IPI_INT_32 0x348 +#define UV3H_IPI_INT_32 0x348 +#define UV4H_IPI_INT_32 0x268 +#define UVH_IPI_INT_32 ( \ + is_uv1_hub() ? UV1H_IPI_INT_32 : \ + is_uv2_hub() ? UV2H_IPI_INT_32 : \ + is_uv3_hub() ? UV3H_IPI_INT_32 : \ + /*is_uv4_hub*/ UV4H_IPI_INT_32) #define UVH_IPI_INT_VECTOR_SHFT 0 #define UVH_IPI_INT_DELIVERY_MODE_SHFT 8 @@ -1432,6 +1833,7 @@ union uvh_int_cmpd_u { #define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL #define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL + union uvh_ipi_int_u { unsigned long v; struct uvh_ipi_int_s { @@ -1448,103 +1850,269 @@ union uvh_ipi_int_u { /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST") +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ + is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST) #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL + +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL + + +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL + +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL + union uvh_lb_bau_intd_payload_queue_first_u { unsigned long v; - struct uvh_lb_bau_intd_payload_queue_first_s { + struct uv1h_lb_bau_intd_payload_queue_first_s { unsigned long rsvd_0_3:4; unsigned long address:39; /* RW */ unsigned long rsvd_43_48:6; unsigned long node_id:14; /* RW */ unsigned long rsvd_63:1; - } s; + } s1; + struct uv2h_lb_bau_intd_payload_queue_first_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_48:6; + unsigned long node_id:14; /* RW */ + unsigned long rsvd_63:1; + } s2; + struct uv3h_lb_bau_intd_payload_queue_first_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_48:6; + unsigned long node_id:14; /* RW */ + unsigned long rsvd_63:1; + } s3; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST") +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ + is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST) #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL + +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL + + +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL + +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL + union uvh_lb_bau_intd_payload_queue_last_u { unsigned long v; - struct uvh_lb_bau_intd_payload_queue_last_s { + struct uv1h_lb_bau_intd_payload_queue_last_s { unsigned long rsvd_0_3:4; unsigned long address:39; /* RW */ unsigned long rsvd_43_63:21; - } s; + } s1; + struct uv2h_lb_bau_intd_payload_queue_last_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s2; + struct uv3h_lb_bau_intd_payload_queue_last_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s3; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL") +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ + is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL) #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + + +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + union uvh_lb_bau_intd_payload_queue_tail_u { unsigned long v; - struct uvh_lb_bau_intd_payload_queue_tail_s { + struct uv1h_lb_bau_intd_payload_queue_tail_s { unsigned long rsvd_0_3:4; unsigned long address:39; /* RW */ unsigned long rsvd_43_63:21; - } s; + } s1; + struct uv2h_lb_bau_intd_payload_queue_tail_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s2; + struct uv3h_lb_bau_intd_payload_queue_tail_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s3; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE uv_undefined("UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE") +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ + is_uv3_hub() ? UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE) #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + + +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + union uvh_lb_bau_intd_software_acknowledge_u { unsigned long v; - struct uvh_lb_bau_intd_software_acknowledge_s { + struct uv1h_lb_bau_intd_software_acknowledge_s { unsigned long pending_0:1; /* RW, W1C */ unsigned long pending_1:1; /* RW, W1C */ unsigned long pending_2:1; /* RW, W1C */ @@ -1562,27 +2130,84 @@ union uvh_lb_bau_intd_software_acknowledge_u { unsigned long timeout_6:1; /* RW, W1C */ unsigned long timeout_7:1; /* RW, W1C */ unsigned long rsvd_16_63:48; - } s; + } s1; + struct uv2h_lb_bau_intd_software_acknowledge_s { + unsigned long pending_0:1; /* RW */ + unsigned long pending_1:1; /* RW */ + unsigned long pending_2:1; /* RW */ + unsigned long pending_3:1; /* RW */ + unsigned long pending_4:1; /* RW */ + unsigned long pending_5:1; /* RW */ + unsigned long pending_6:1; /* RW */ + unsigned long pending_7:1; /* RW */ + unsigned long timeout_0:1; /* RW */ + unsigned long timeout_1:1; /* RW */ + unsigned long timeout_2:1; /* RW */ + unsigned long timeout_3:1; /* RW */ + unsigned long timeout_4:1; /* RW */ + unsigned long timeout_5:1; /* RW */ + unsigned long timeout_6:1; /* RW */ + unsigned long timeout_7:1; /* RW */ + unsigned long rsvd_16_63:48; + } s2; + struct uv3h_lb_bau_intd_software_acknowledge_s { + unsigned long pending_0:1; /* RW */ + unsigned long pending_1:1; /* RW */ + unsigned long pending_2:1; /* RW */ + unsigned long pending_3:1; /* RW */ + unsigned long pending_4:1; /* RW */ + unsigned long pending_5:1; /* RW */ + unsigned long pending_6:1; /* RW */ + unsigned long pending_7:1; /* RW */ + unsigned long timeout_0:1; /* RW */ + unsigned long timeout_1:1; /* RW */ + unsigned long timeout_2:1; /* RW */ + unsigned long timeout_3:1; /* RW */ + unsigned long timeout_4:1; /* RW */ + unsigned long timeout_5:1; /* RW */ + unsigned long timeout_6:1; /* RW */ + unsigned long timeout_7:1; /* RW */ + unsigned long rsvd_16_63:48; + } s3; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS uv_undefined("UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS") +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ + is_uv3_hub() ? UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS) #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70 /* ========================================================================= */ /* UVH_LB_BAU_MISC_CONTROL */ /* ========================================================================= */ -#define UVH_LB_BAU_MISC_CONTROL 0x320170UL #define UV1H_LB_BAU_MISC_CONTROL 0x320170UL #define UV2H_LB_BAU_MISC_CONTROL 0x320170UL #define UV3H_LB_BAU_MISC_CONTROL 0x320170UL -#define UVH_LB_BAU_MISC_CONTROL_32 0xa10 -#define UV1H_LB_BAU_MISC_CONTROL_32 0x320170UL -#define UV2H_LB_BAU_MISC_CONTROL_32 0x320170UL -#define UV3H_LB_BAU_MISC_CONTROL_32 0x320170UL +#define UV4H_LB_BAU_MISC_CONTROL 0xc8170UL +#define UVH_LB_BAU_MISC_CONTROL ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL) + +#define UV1H_LB_BAU_MISC_CONTROL_32 0xa10 +#define UV2H_LB_BAU_MISC_CONTROL_32 0xa10 +#define UV3H_LB_BAU_MISC_CONTROL_32 0xa10 +#define UV4H_LB_BAU_MISC_CONTROL_32 0xa18 +#define UVH_LB_BAU_MISC_CONTROL_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_32 : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_32) #define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 #define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 @@ -1590,8 +2215,6 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 #define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 #define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 -#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 #define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 @@ -1606,8 +2229,6 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL #define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL #define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL -#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL #define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL @@ -1656,8 +2277,6 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 #define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 #define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 -#define UVXH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 #define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 #define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 #define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 @@ -1679,8 +2298,6 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL #define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL #define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL -#define UVXH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL #define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL #define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL #define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL @@ -1797,6 +2414,88 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UV3H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_MASK 0x00003fc000000000UL #define UV3H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UV4H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UV4H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 +#define UV4H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 +#define UV4H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 +#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_15_19_SHFT 15 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 +#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 +#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 +#define UV4H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 +#define UV4H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 +#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 +#define UV4H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_SHFT 36 +#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_37_SHFT 37 +#define UV4H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_SHFT 38 +#define UV4H_LB_BAU_MISC_CONTROL_ADDRESS_INTERLEAVE_SELECT_SHFT 46 +#define UV4H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define UV4H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UV4H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UV4H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UV4H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UV4H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_15_19_MASK 0x00000000000f8000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UV4H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UV4H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL +#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL +#define UV4H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_MASK 0x0000001000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_37_MASK 0x0000002000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_MASK 0x00003fc000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ADDRESS_INTERLEAVE_SELECT_MASK 0x0000400000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL + +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK \ + uv_undefined("UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK") +#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK) +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT \ + uv_undefined("UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT") +#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT) +#define UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK \ + uv_undefined("UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK") +#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK) +#define UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT \ + uv_undefined("UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT") +#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT) + union uvh_lb_bau_misc_control_u { unsigned long v; struct uvh_lb_bau_misc_control_s { @@ -1806,8 +2505,7 @@ union uvh_lb_bau_misc_control_u { unsigned long force_lock_nop:1; /* RW */ unsigned long qpi_agent_presence_vector:3; /* RW */ unsigned long descriptor_fetch_mode:1; /* RW */ - unsigned long enable_intd_soft_ack_mode:1; /* RW */ - unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long rsvd_15_19:5; unsigned long enable_dual_mapping_mode:1; /* RW */ unsigned long vga_io_port_decode_enable:1; /* RW */ unsigned long vga_io_port_16_bit_decode:1; /* RW */ @@ -1844,8 +2542,7 @@ union uvh_lb_bau_misc_control_u { unsigned long force_lock_nop:1; /* RW */ unsigned long qpi_agent_presence_vector:3; /* RW */ unsigned long descriptor_fetch_mode:1; /* RW */ - unsigned long enable_intd_soft_ack_mode:1; /* RW */ - unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long rsvd_15_19:5; unsigned long enable_dual_mapping_mode:1; /* RW */ unsigned long vga_io_port_decode_enable:1; /* RW */ unsigned long vga_io_port_16_bit_decode:1; /* RW */ @@ -1918,13 +2615,59 @@ union uvh_lb_bau_misc_control_u { unsigned long rsvd_46_47:2; unsigned long fun:16; /* RW */ } s3; + struct uv4h_lb_bau_misc_control_s { + unsigned long rejection_delay:8; /* RW */ + unsigned long apic_mode:1; /* RW */ + unsigned long force_broadcast:1; /* RW */ + unsigned long force_lock_nop:1; /* RW */ + unsigned long qpi_agent_presence_vector:3; /* RW */ + unsigned long descriptor_fetch_mode:1; /* RW */ + unsigned long rsvd_15_19:5; + unsigned long enable_dual_mapping_mode:1; /* RW */ + unsigned long vga_io_port_decode_enable:1; /* RW */ + unsigned long vga_io_port_16_bit_decode:1; /* RW */ + unsigned long suppress_dest_registration:1; /* RW */ + unsigned long programmed_initial_priority:3; /* RW */ + unsigned long use_incoming_priority:1; /* RW */ + unsigned long enable_programmed_initial_priority:1;/* RW */ + unsigned long enable_automatic_apic_mode_selection:1;/* RW */ + unsigned long apic_mode_status:1; /* RO */ + unsigned long suppress_interrupts_to_self:1; /* RW */ + unsigned long enable_lock_based_system_flush:1;/* RW */ + unsigned long enable_extended_sb_status:1; /* RW */ + unsigned long suppress_int_prio_udt_to_self:1;/* RW */ + unsigned long use_legacy_descriptor_formats:1;/* RW */ + unsigned long suppress_quiesce_msgs_to_qpi:1; /* RW */ + unsigned long rsvd_37:1; + unsigned long thread_kill_timebase:8; /* RW */ + unsigned long address_interleave_select:1; /* RW */ + unsigned long rsvd_47:1; + unsigned long fun:16; /* RW */ + } s4; }; /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UV1H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL +#define UV2H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL +#define UV3H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL +#define UV4H_LB_BAU_SB_ACTIVATION_CONTROL 0xc8020UL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_CONTROL : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_CONTROL : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_CONTROL : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_CONTROL) + +#define UV1H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UV2H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UV3H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UV4H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9c8 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_CONTROL_32) #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62 @@ -1933,6 +2676,7 @@ union uvh_lb_bau_misc_control_u { #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL + union uvh_lb_bau_sb_activation_control_u { unsigned long v; struct uvh_lb_bau_sb_activation_control_s { @@ -1946,12 +2690,30 @@ union uvh_lb_bau_sb_activation_control_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_0 0xc8030UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_0) + +#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9d0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_0_32) #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL + union uvh_lb_bau_sb_activation_status_0_u { unsigned long v; struct uvh_lb_bau_sb_activation_status_0_s { @@ -1962,12 +2724,30 @@ union uvh_lb_bau_sb_activation_status_0_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_1 0xc8040UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_1) + +#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9d8 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_1_32) #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL + union uvh_lb_bau_sb_activation_status_1_u { unsigned long v; struct uvh_lb_bau_sb_activation_status_1_s { @@ -1978,23 +2758,55 @@ union uvh_lb_bau_sb_activation_status_1_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL +#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL +#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL +#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE 0xc8010UL +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE) + +#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9c0 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_32) #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL +#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL + + +#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL + +#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL + +#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x00003ffffffff000UL + + union uvh_lb_bau_sb_descriptor_base_u { unsigned long v; struct uvh_lb_bau_sb_descriptor_base_s { unsigned long rsvd_0_11:12; - unsigned long page_address:31; /* RW */ - unsigned long rsvd_43_48:6; + unsigned long rsvd_12_48:37; unsigned long node_id:14; /* RW */ unsigned long rsvd_63:1; } s; + struct uv4h_lb_bau_sb_descriptor_base_s { + unsigned long rsvd_0_11:12; + unsigned long page_address:34; /* RW */ + unsigned long rsvd_46_48:3; + unsigned long node_id:14; /* RW */ + unsigned long rsvd_63:1; + } s4; }; /* ========================================================================= */ @@ -2004,6 +2816,7 @@ union uvh_lb_bau_sb_descriptor_base_u { #define UV1H_NODE_ID 0x0UL #define UV2H_NODE_ID 0x0UL #define UV3H_NODE_ID 0x0UL +#define UV4H_NODE_ID 0x0UL #define UVH_NODE_ID_FORCE1_SHFT 0 #define UVH_NODE_ID_MANUFACTURER_SHFT 1 @@ -2080,6 +2893,26 @@ union uvh_lb_bau_sb_descriptor_base_u { #define UV3H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL #define UV3H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL +#define UV4H_NODE_ID_FORCE1_SHFT 0 +#define UV4H_NODE_ID_MANUFACTURER_SHFT 1 +#define UV4H_NODE_ID_PART_NUMBER_SHFT 12 +#define UV4H_NODE_ID_REVISION_SHFT 28 +#define UV4H_NODE_ID_NODE_ID_SHFT 32 +#define UV4H_NODE_ID_ROUTER_SELECT_SHFT 48 +#define UV4H_NODE_ID_RESERVED_2_SHFT 49 +#define UV4H_NODE_ID_NODES_PER_BIT_SHFT 50 +#define UV4H_NODE_ID_NI_PORT_SHFT 57 +#define UV4H_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UV4H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UV4H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UV4H_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UV4H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UV4H_NODE_ID_ROUTER_SELECT_MASK 0x0001000000000000UL +#define UV4H_NODE_ID_RESERVED_2_MASK 0x0002000000000000UL +#define UV4H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL +#define UV4H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL + + union uvh_node_id_u { unsigned long v; struct uvh_node_id_s { @@ -2137,17 +2970,40 @@ union uvh_node_id_u { unsigned long ni_port:5; /* RO */ unsigned long rsvd_62_63:2; } s3; + struct uv4h_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:15; /* RW */ + unsigned long rsvd_47:1; + unsigned long router_select:1; /* RO */ + unsigned long rsvd_49:1; + unsigned long nodes_per_bit:7; /* RO */ + unsigned long ni_port:5; /* RO */ + unsigned long rsvd_62_63:2; + } s4; }; /* ========================================================================= */ /* UVH_NODE_PRESENT_TABLE */ /* ========================================================================= */ #define UVH_NODE_PRESENT_TABLE 0x1400UL -#define UVH_NODE_PRESENT_TABLE_DEPTH 16 + +#define UV1H_NODE_PRESENT_TABLE_DEPTH 16 +#define UV2H_NODE_PRESENT_TABLE_DEPTH 16 +#define UV3H_NODE_PRESENT_TABLE_DEPTH 16 +#define UV4H_NODE_PRESENT_TABLE_DEPTH 4 +#define UVH_NODE_PRESENT_TABLE_DEPTH ( \ + is_uv1_hub() ? UV1H_NODE_PRESENT_TABLE_DEPTH : \ + is_uv2_hub() ? UV2H_NODE_PRESENT_TABLE_DEPTH : \ + is_uv3_hub() ? UV3H_NODE_PRESENT_TABLE_DEPTH : \ + /*is_uv4_hub*/ UV4H_NODE_PRESENT_TABLE_DEPTH) #define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0 #define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL + union uvh_node_present_table_u { unsigned long v; struct uvh_node_present_table_s { @@ -2158,7 +3014,15 @@ union uvh_node_present_table_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x4800c8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR) #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 @@ -2167,6 +3031,7 @@ union uvh_node_present_table_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_alias210_overlay_config_0_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_overlay_config_0_mmr_s { @@ -2182,7 +3047,15 @@ union uvh_rh_gam_alias210_overlay_config_0_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x4800d8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR) #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 @@ -2191,6 +3064,7 @@ union uvh_rh_gam_alias210_overlay_config_0_mmr_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_alias210_overlay_config_1_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_overlay_config_1_mmr_s { @@ -2206,7 +3080,15 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x4800e8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR) #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 @@ -2215,6 +3097,7 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_alias210_overlay_config_2_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_overlay_config_2_mmr_s { @@ -2230,11 +3113,20 @@ union uvh_rh_gam_alias210_overlay_config_2_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x4800d0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR) #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL + union uvh_rh_gam_alias210_redirect_config_0_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_redirect_config_0_mmr_s { @@ -2247,11 +3139,20 @@ union uvh_rh_gam_alias210_redirect_config_0_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x4800e0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR) #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL + union uvh_rh_gam_alias210_redirect_config_1_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_redirect_config_1_mmr_s { @@ -2264,11 +3165,20 @@ union uvh_rh_gam_alias210_redirect_config_1_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x4800f0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR) #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL + union uvh_rh_gam_alias210_redirect_config_2_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_redirect_config_2_mmr_s { @@ -2281,14 +3191,17 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL #define UV1H_RH_GAM_CONFIG_MMR 0x1600000UL #define UV2H_RH_GAM_CONFIG_MMR 0x1600000UL #define UV3H_RH_GAM_CONFIG_MMR 0x1600000UL +#define UV4H_RH_GAM_CONFIG_MMR 0x480000UL +#define UVH_RH_GAM_CONFIG_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_CONFIG_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_CONFIG_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_CONFIG_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_CONFIG_MMR) -#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 #define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL #define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL #define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 @@ -2298,9 +3211,7 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { #define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL #define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL -#define UVXH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 #define UVXH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UVXH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL #define UVXH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL #define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 @@ -2313,10 +3224,14 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { #define UV3H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL #define UV3H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL +#define UV4H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UV4H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL + + union uvh_rh_gam_config_mmr_u { unsigned long v; struct uvh_rh_gam_config_mmr_s { - unsigned long m_skt:6; /* RW */ + unsigned long rsvd_0_5:6; unsigned long n_skt:4; /* RW */ unsigned long rsvd_10_63:54; } s; @@ -2328,7 +3243,7 @@ union uvh_rh_gam_config_mmr_u { unsigned long rsvd_13_63:51; } s1; struct uvxh_rh_gam_config_mmr_s { - unsigned long m_skt:6; /* RW */ + unsigned long rsvd_0_5:6; unsigned long n_skt:4; /* RW */ unsigned long rsvd_10_63:54; } sx; @@ -2342,20 +3257,28 @@ union uvh_rh_gam_config_mmr_u { unsigned long n_skt:4; /* RW */ unsigned long rsvd_10_63:54; } s3; + struct uv4h_rh_gam_config_mmr_s { + unsigned long rsvd_0_5:6; + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } s4; }; /* ========================================================================= */ /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x480010UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR) -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL @@ -2368,10 +3291,8 @@ union uvh_rh_gam_config_mmr_u { #define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL #define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL -#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL @@ -2391,12 +3312,28 @@ union uvh_rh_gam_config_mmr_u { #define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_MODE_MASK 0x4000000000000000UL #define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK ( \ + is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ + is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ + is_uv3_hub() ? UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ + /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK) +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT ( \ + is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ + is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ + is_uv3_hub() ? UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ + /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT) + union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long v; struct uvh_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_27:28; - unsigned long base:18; /* RW */ - unsigned long rsvd_46_51:6; + unsigned long rsvd_0_51:52; unsigned long n_gru:4; /* RW */ unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ @@ -2412,8 +3349,7 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long enable:1; /* RW */ } s1; struct uvxh_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_27:28; - unsigned long base:18; /* RW */ + unsigned long rsvd_0_45:46; unsigned long rsvd_46_51:6; unsigned long n_gru:4; /* RW */ unsigned long rsvd_56_62:7; @@ -2436,6 +3372,15 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long mode:1; /* RW */ unsigned long enable:1; /* RW */ } s3; + struct uv4h_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_24:25; + unsigned long undef_25:1; /* Undefined */ + unsigned long base:20; /* RW */ + unsigned long rsvd_46_51:6; + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s4; }; /* ========================================================================= */ @@ -2443,6 +3388,14 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { /* ========================================================================= */ #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR uv_undefined("UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR") +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR uv_undefined("UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR") +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR) + #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 @@ -2453,6 +3406,7 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27 #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 @@ -2462,6 +3416,7 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_mmioh_overlay_config_mmr_u { unsigned long v; struct uv1h_rh_gam_mmioh_overlay_config_mmr_s { @@ -2485,10 +3440,15 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL #define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL #define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL #define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x480028UL +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR) #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 @@ -2517,6 +3477,12 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { #define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL #define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + + union uvh_rh_gam_mmr_overlay_config_mmr_u { unsigned long v; struct uvh_rh_gam_mmr_overlay_config_mmr_s { @@ -2550,16 +3516,31 @@ union uvh_rh_gam_mmr_overlay_config_mmr_u { unsigned long rsvd_46_62:17; unsigned long enable:1; /* RW */ } s3; + struct uv4h_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long rsvd_46_62:17; + unsigned long enable:1; /* RW */ + } s4; }; /* ========================================================================= */ /* UVH_RTC */ /* ========================================================================= */ -#define UVH_RTC 0x340000UL +#define UV1H_RTC 0x340000UL +#define UV2H_RTC 0x340000UL +#define UV3H_RTC 0x340000UL +#define UV4H_RTC 0xe0000UL +#define UVH_RTC ( \ + is_uv1_hub() ? UV1H_RTC : \ + is_uv2_hub() ? UV2H_RTC : \ + is_uv3_hub() ? UV3H_RTC : \ + /*is_uv4_hub*/ UV4H_RTC) #define UVH_RTC_REAL_TIME_CLOCK_SHFT 0 #define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL + union uvh_rtc_u { unsigned long v; struct uvh_rtc_s { @@ -2590,6 +3571,7 @@ union uvh_rtc_u { #define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL #define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_rtc1_int_config_u { unsigned long v; struct uvh_rtc1_int_config_s { @@ -2609,12 +3591,30 @@ union uvh_rtc1_int_config_u { /* ========================================================================= */ /* UVH_SCRATCH5 */ /* ========================================================================= */ -#define UVH_SCRATCH5 0x2d0200UL -#define UVH_SCRATCH5_32 0x778 +#define UV1H_SCRATCH5 0x2d0200UL +#define UV2H_SCRATCH5 0x2d0200UL +#define UV3H_SCRATCH5 0x2d0200UL +#define UV4H_SCRATCH5 0xb0200UL +#define UVH_SCRATCH5 ( \ + is_uv1_hub() ? UV1H_SCRATCH5 : \ + is_uv2_hub() ? UV2H_SCRATCH5 : \ + is_uv3_hub() ? UV3H_SCRATCH5 : \ + /*is_uv4_hub*/ UV4H_SCRATCH5) + +#define UV1H_SCRATCH5_32 0x778 +#define UV2H_SCRATCH5_32 0x778 +#define UV3H_SCRATCH5_32 0x778 +#define UV4H_SCRATCH5_32 0x798 +#define UVH_SCRATCH5_32 ( \ + is_uv1_hub() ? UV1H_SCRATCH5_32 : \ + is_uv2_hub() ? UV2H_SCRATCH5_32 : \ + is_uv3_hub() ? UV3H_SCRATCH5_32 : \ + /*is_uv4_hub*/ UV4H_SCRATCH5_32) #define UVH_SCRATCH5_SCRATCH5_SHFT 0 #define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL + union uvh_scratch5_u { unsigned long v; struct uvh_scratch5_s { @@ -2625,14 +3625,39 @@ union uvh_scratch5_u { /* ========================================================================= */ /* UVH_SCRATCH5_ALIAS */ /* ========================================================================= */ -#define UVH_SCRATCH5_ALIAS 0x2d0208UL -#define UVH_SCRATCH5_ALIAS_32 0x780 +#define UV1H_SCRATCH5_ALIAS 0x2d0208UL +#define UV2H_SCRATCH5_ALIAS 0x2d0208UL +#define UV3H_SCRATCH5_ALIAS 0x2d0208UL +#define UV4H_SCRATCH5_ALIAS 0xb0208UL +#define UVH_SCRATCH5_ALIAS ( \ + is_uv1_hub() ? UV1H_SCRATCH5_ALIAS : \ + is_uv2_hub() ? UV2H_SCRATCH5_ALIAS : \ + is_uv3_hub() ? UV3H_SCRATCH5_ALIAS : \ + /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS) + +#define UV1H_SCRATCH5_ALIAS_32 0x780 +#define UV2H_SCRATCH5_ALIAS_32 0x780 +#define UV3H_SCRATCH5_ALIAS_32 0x780 +#define UV4H_SCRATCH5_ALIAS_32 0x7a0 +#define UVH_SCRATCH5_ALIAS_32 ( \ + is_uv1_hub() ? UV1H_SCRATCH5_ALIAS_32 : \ + is_uv2_hub() ? UV2H_SCRATCH5_ALIAS_32 : \ + is_uv3_hub() ? UV3H_SCRATCH5_ALIAS_32 : \ + /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS_32) /* ========================================================================= */ /* UVH_SCRATCH5_ALIAS_2 */ /* ========================================================================= */ -#define UVH_SCRATCH5_ALIAS_2 0x2d0210UL +#define UV1H_SCRATCH5_ALIAS_2 0x2d0210UL +#define UV2H_SCRATCH5_ALIAS_2 0x2d0210UL +#define UV3H_SCRATCH5_ALIAS_2 0x2d0210UL +#define UV4H_SCRATCH5_ALIAS_2 0xb0210UL +#define UVH_SCRATCH5_ALIAS_2 ( \ + is_uv1_hub() ? UV1H_SCRATCH5_ALIAS_2 : \ + is_uv2_hub() ? UV2H_SCRATCH5_ALIAS_2 : \ + is_uv3_hub() ? UV3H_SCRATCH5_ALIAS_2 : \ + /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS_2) #define UVH_SCRATCH5_ALIAS_2_32 0x788 @@ -2640,76 +3665,255 @@ union uvh_scratch5_u { /* UVXH_EVENT_OCCURRED2 */ /* ========================================================================= */ #define UVXH_EVENT_OCCURRED2 0x70100UL -#define UVXH_EVENT_OCCURRED2_32 0xb68 - -#define UVXH_EVENT_OCCURRED2_RTC_0_SHFT 0 -#define UVXH_EVENT_OCCURRED2_RTC_1_SHFT 1 -#define UVXH_EVENT_OCCURRED2_RTC_2_SHFT 2 -#define UVXH_EVENT_OCCURRED2_RTC_3_SHFT 3 -#define UVXH_EVENT_OCCURRED2_RTC_4_SHFT 4 -#define UVXH_EVENT_OCCURRED2_RTC_5_SHFT 5 -#define UVXH_EVENT_OCCURRED2_RTC_6_SHFT 6 -#define UVXH_EVENT_OCCURRED2_RTC_7_SHFT 7 -#define UVXH_EVENT_OCCURRED2_RTC_8_SHFT 8 -#define UVXH_EVENT_OCCURRED2_RTC_9_SHFT 9 -#define UVXH_EVENT_OCCURRED2_RTC_10_SHFT 10 -#define UVXH_EVENT_OCCURRED2_RTC_11_SHFT 11 -#define UVXH_EVENT_OCCURRED2_RTC_12_SHFT 12 -#define UVXH_EVENT_OCCURRED2_RTC_13_SHFT 13 -#define UVXH_EVENT_OCCURRED2_RTC_14_SHFT 14 -#define UVXH_EVENT_OCCURRED2_RTC_15_SHFT 15 -#define UVXH_EVENT_OCCURRED2_RTC_16_SHFT 16 -#define UVXH_EVENT_OCCURRED2_RTC_17_SHFT 17 -#define UVXH_EVENT_OCCURRED2_RTC_18_SHFT 18 -#define UVXH_EVENT_OCCURRED2_RTC_19_SHFT 19 -#define UVXH_EVENT_OCCURRED2_RTC_20_SHFT 20 -#define UVXH_EVENT_OCCURRED2_RTC_21_SHFT 21 -#define UVXH_EVENT_OCCURRED2_RTC_22_SHFT 22 -#define UVXH_EVENT_OCCURRED2_RTC_23_SHFT 23 -#define UVXH_EVENT_OCCURRED2_RTC_24_SHFT 24 -#define UVXH_EVENT_OCCURRED2_RTC_25_SHFT 25 -#define UVXH_EVENT_OCCURRED2_RTC_26_SHFT 26 -#define UVXH_EVENT_OCCURRED2_RTC_27_SHFT 27 -#define UVXH_EVENT_OCCURRED2_RTC_28_SHFT 28 -#define UVXH_EVENT_OCCURRED2_RTC_29_SHFT 29 -#define UVXH_EVENT_OCCURRED2_RTC_30_SHFT 30 -#define UVXH_EVENT_OCCURRED2_RTC_31_SHFT 31 -#define UVXH_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL -#define UVXH_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL -#define UVXH_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL -#define UVXH_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL -#define UVXH_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL -#define UVXH_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL -#define UVXH_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL -#define UVXH_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL -#define UVXH_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL -#define UVXH_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL -#define UVXH_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL -#define UVXH_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL -#define UVXH_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL -#define UVXH_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL -#define UVXH_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL -#define UVXH_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL -#define UVXH_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL -#define UVXH_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL -#define UVXH_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL -#define UVXH_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL -#define UVXH_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL -#define UVXH_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL -#define UVXH_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL -#define UVXH_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL -#define UVXH_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL -#define UVXH_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL -#define UVXH_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL -#define UVXH_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL -#define UVXH_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL -#define UVXH_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL -#define UVXH_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL -#define UVXH_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL - -union uvxh_event_occurred2_u { + +#define UV2H_EVENT_OCCURRED2_32 0xb68 +#define UV3H_EVENT_OCCURRED2_32 0xb68 +#define UV4H_EVENT_OCCURRED2_32 0x608 +#define UVH_EVENT_OCCURRED2_32 ( \ + is_uv2_hub() ? UV2H_EVENT_OCCURRED2_32 : \ + is_uv3_hub() ? UV3H_EVENT_OCCURRED2_32 : \ + /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_32) + + +#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0 +#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1 +#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2 +#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3 +#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4 +#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5 +#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6 +#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7 +#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8 +#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9 +#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10 +#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11 +#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12 +#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13 +#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14 +#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15 +#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16 +#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17 +#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18 +#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19 +#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20 +#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21 +#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22 +#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23 +#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24 +#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25 +#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26 +#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27 +#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28 +#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29 +#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30 +#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31 +#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL +#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL +#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL +#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL +#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL +#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL +#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL +#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL +#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL +#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL +#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL +#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL +#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL +#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL +#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL +#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL +#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL +#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL +#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL +#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL +#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL +#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL +#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL +#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL +#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL +#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL +#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL +#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL +#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL +#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL +#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL +#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL + +#define UV3H_EVENT_OCCURRED2_RTC_0_SHFT 0 +#define UV3H_EVENT_OCCURRED2_RTC_1_SHFT 1 +#define UV3H_EVENT_OCCURRED2_RTC_2_SHFT 2 +#define UV3H_EVENT_OCCURRED2_RTC_3_SHFT 3 +#define UV3H_EVENT_OCCURRED2_RTC_4_SHFT 4 +#define UV3H_EVENT_OCCURRED2_RTC_5_SHFT 5 +#define UV3H_EVENT_OCCURRED2_RTC_6_SHFT 6 +#define UV3H_EVENT_OCCURRED2_RTC_7_SHFT 7 +#define UV3H_EVENT_OCCURRED2_RTC_8_SHFT 8 +#define UV3H_EVENT_OCCURRED2_RTC_9_SHFT 9 +#define UV3H_EVENT_OCCURRED2_RTC_10_SHFT 10 +#define UV3H_EVENT_OCCURRED2_RTC_11_SHFT 11 +#define UV3H_EVENT_OCCURRED2_RTC_12_SHFT 12 +#define UV3H_EVENT_OCCURRED2_RTC_13_SHFT 13 +#define UV3H_EVENT_OCCURRED2_RTC_14_SHFT 14 +#define UV3H_EVENT_OCCURRED2_RTC_15_SHFT 15 +#define UV3H_EVENT_OCCURRED2_RTC_16_SHFT 16 +#define UV3H_EVENT_OCCURRED2_RTC_17_SHFT 17 +#define UV3H_EVENT_OCCURRED2_RTC_18_SHFT 18 +#define UV3H_EVENT_OCCURRED2_RTC_19_SHFT 19 +#define UV3H_EVENT_OCCURRED2_RTC_20_SHFT 20 +#define UV3H_EVENT_OCCURRED2_RTC_21_SHFT 21 +#define UV3H_EVENT_OCCURRED2_RTC_22_SHFT 22 +#define UV3H_EVENT_OCCURRED2_RTC_23_SHFT 23 +#define UV3H_EVENT_OCCURRED2_RTC_24_SHFT 24 +#define UV3H_EVENT_OCCURRED2_RTC_25_SHFT 25 +#define UV3H_EVENT_OCCURRED2_RTC_26_SHFT 26 +#define UV3H_EVENT_OCCURRED2_RTC_27_SHFT 27 +#define UV3H_EVENT_OCCURRED2_RTC_28_SHFT 28 +#define UV3H_EVENT_OCCURRED2_RTC_29_SHFT 29 +#define UV3H_EVENT_OCCURRED2_RTC_30_SHFT 30 +#define UV3H_EVENT_OCCURRED2_RTC_31_SHFT 31 +#define UV3H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL +#define UV3H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL +#define UV3H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL +#define UV3H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL +#define UV3H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL +#define UV3H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL +#define UV3H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL +#define UV3H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL +#define UV3H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL +#define UV3H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL +#define UV3H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL +#define UV3H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL +#define UV3H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL +#define UV3H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL +#define UV3H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL +#define UV3H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL +#define UV3H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL +#define UV3H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL +#define UV3H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL +#define UV3H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL +#define UV3H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL +#define UV3H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL +#define UV3H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL +#define UV3H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL +#define UV3H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL +#define UV3H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL +#define UV3H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL +#define UV3H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL +#define UV3H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL +#define UV3H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL +#define UV3H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL +#define UV3H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL + +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_SHFT 0 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_SHFT 1 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_SHFT 2 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_SHFT 3 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_SHFT 4 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_SHFT 5 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_SHFT 6 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_SHFT 7 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_SHFT 8 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_SHFT 9 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_SHFT 10 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_SHFT 11 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_SHFT 12 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_SHFT 13 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_SHFT 14 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_SHFT 15 +#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_SHFT 16 +#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_SHFT 17 +#define UV4H_EVENT_OCCURRED2_RTC_0_SHFT 18 +#define UV4H_EVENT_OCCURRED2_RTC_1_SHFT 19 +#define UV4H_EVENT_OCCURRED2_RTC_2_SHFT 20 +#define UV4H_EVENT_OCCURRED2_RTC_3_SHFT 21 +#define UV4H_EVENT_OCCURRED2_RTC_4_SHFT 22 +#define UV4H_EVENT_OCCURRED2_RTC_5_SHFT 23 +#define UV4H_EVENT_OCCURRED2_RTC_6_SHFT 24 +#define UV4H_EVENT_OCCURRED2_RTC_7_SHFT 25 +#define UV4H_EVENT_OCCURRED2_RTC_8_SHFT 26 +#define UV4H_EVENT_OCCURRED2_RTC_9_SHFT 27 +#define UV4H_EVENT_OCCURRED2_RTC_10_SHFT 28 +#define UV4H_EVENT_OCCURRED2_RTC_11_SHFT 29 +#define UV4H_EVENT_OCCURRED2_RTC_12_SHFT 30 +#define UV4H_EVENT_OCCURRED2_RTC_13_SHFT 31 +#define UV4H_EVENT_OCCURRED2_RTC_14_SHFT 32 +#define UV4H_EVENT_OCCURRED2_RTC_15_SHFT 33 +#define UV4H_EVENT_OCCURRED2_RTC_16_SHFT 34 +#define UV4H_EVENT_OCCURRED2_RTC_17_SHFT 35 +#define UV4H_EVENT_OCCURRED2_RTC_18_SHFT 36 +#define UV4H_EVENT_OCCURRED2_RTC_19_SHFT 37 +#define UV4H_EVENT_OCCURRED2_RTC_20_SHFT 38 +#define UV4H_EVENT_OCCURRED2_RTC_21_SHFT 39 +#define UV4H_EVENT_OCCURRED2_RTC_22_SHFT 40 +#define UV4H_EVENT_OCCURRED2_RTC_23_SHFT 41 +#define UV4H_EVENT_OCCURRED2_RTC_24_SHFT 42 +#define UV4H_EVENT_OCCURRED2_RTC_25_SHFT 43 +#define UV4H_EVENT_OCCURRED2_RTC_26_SHFT 44 +#define UV4H_EVENT_OCCURRED2_RTC_27_SHFT 45 +#define UV4H_EVENT_OCCURRED2_RTC_28_SHFT 46 +#define UV4H_EVENT_OCCURRED2_RTC_29_SHFT 47 +#define UV4H_EVENT_OCCURRED2_RTC_30_SHFT 48 +#define UV4H_EVENT_OCCURRED2_RTC_31_SHFT 49 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_MASK 0x0000000000000001UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_MASK 0x0000000000000002UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_MASK 0x0000000000000004UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_MASK 0x0000000000000008UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_MASK 0x0000000000000010UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_MASK 0x0000000000000020UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_MASK 0x0000000000000040UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_MASK 0x0000000000000080UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_MASK 0x0000000000000100UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_MASK 0x0000000000000200UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_MASK 0x0000000000000400UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_MASK 0x0000000000000800UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_MASK 0x0000000000001000UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_MASK 0x0000000000002000UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_MASK 0x0000000000004000UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_MASK 0x0000000000008000UL +#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_MASK 0x0000000000010000UL +#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_MASK 0x0000000000020000UL +#define UV4H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000040000UL +#define UV4H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000080000UL +#define UV4H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000100000UL +#define UV4H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000200000UL +#define UV4H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000400000UL +#define UV4H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000800000UL +#define UV4H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000001000000UL +#define UV4H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000002000000UL +#define UV4H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000004000000UL +#define UV4H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000008000000UL +#define UV4H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000010000000UL +#define UV4H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000020000000UL +#define UV4H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000040000000UL +#define UV4H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000080000000UL +#define UV4H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000100000000UL +#define UV4H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000200000000UL +#define UV4H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000400000000UL +#define UV4H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000800000000UL +#define UV4H_EVENT_OCCURRED2_RTC_18_MASK 0x0000001000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_19_MASK 0x0000002000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_20_MASK 0x0000004000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_21_MASK 0x0000008000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_22_MASK 0x0000010000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_23_MASK 0x0000020000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_24_MASK 0x0000040000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_25_MASK 0x0000080000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_26_MASK 0x0000100000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_27_MASK 0x0000200000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_28_MASK 0x0000400000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_29_MASK 0x0000800000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_30_MASK 0x0001000000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_31_MASK 0x0002000000000000UL + +#define UVXH_EVENT_OCCURRED2_RTC_1_MASK ( \ + is_uv2_hub() ? UV2H_EVENT_OCCURRED2_RTC_1_MASK : \ + is_uv3_hub() ? UV3H_EVENT_OCCURRED2_RTC_1_MASK : \ + /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_RTC_1_MASK) + +union uvh_event_occurred2_u { unsigned long v; - struct uvxh_event_occurred2_s { + struct uv2h_event_occurred2_s { unsigned long rtc_0:1; /* RW */ unsigned long rtc_1:1; /* RW */ unsigned long rtc_2:1; /* RW */ @@ -2743,25 +3947,129 @@ union uvxh_event_occurred2_u { unsigned long rtc_30:1; /* RW */ unsigned long rtc_31:1; /* RW */ unsigned long rsvd_32_63:32; - } sx; + } s2; + struct uv3h_event_occurred2_s { + unsigned long rtc_0:1; /* RW */ + unsigned long rtc_1:1; /* RW */ + unsigned long rtc_2:1; /* RW */ + unsigned long rtc_3:1; /* RW */ + unsigned long rtc_4:1; /* RW */ + unsigned long rtc_5:1; /* RW */ + unsigned long rtc_6:1; /* RW */ + unsigned long rtc_7:1; /* RW */ + unsigned long rtc_8:1; /* RW */ + unsigned long rtc_9:1; /* RW */ + unsigned long rtc_10:1; /* RW */ + unsigned long rtc_11:1; /* RW */ + unsigned long rtc_12:1; /* RW */ + unsigned long rtc_13:1; /* RW */ + unsigned long rtc_14:1; /* RW */ + unsigned long rtc_15:1; /* RW */ + unsigned long rtc_16:1; /* RW */ + unsigned long rtc_17:1; /* RW */ + unsigned long rtc_18:1; /* RW */ + unsigned long rtc_19:1; /* RW */ + unsigned long rtc_20:1; /* RW */ + unsigned long rtc_21:1; /* RW */ + unsigned long rtc_22:1; /* RW */ + unsigned long rtc_23:1; /* RW */ + unsigned long rtc_24:1; /* RW */ + unsigned long rtc_25:1; /* RW */ + unsigned long rtc_26:1; /* RW */ + unsigned long rtc_27:1; /* RW */ + unsigned long rtc_28:1; /* RW */ + unsigned long rtc_29:1; /* RW */ + unsigned long rtc_30:1; /* RW */ + unsigned long rtc_31:1; /* RW */ + unsigned long rsvd_32_63:32; + } s3; + struct uv4h_event_occurred2_s { + unsigned long message_accelerator_int0:1; /* RW */ + unsigned long message_accelerator_int1:1; /* RW */ + unsigned long message_accelerator_int2:1; /* RW */ + unsigned long message_accelerator_int3:1; /* RW */ + unsigned long message_accelerator_int4:1; /* RW */ + unsigned long message_accelerator_int5:1; /* RW */ + unsigned long message_accelerator_int6:1; /* RW */ + unsigned long message_accelerator_int7:1; /* RW */ + unsigned long message_accelerator_int8:1; /* RW */ + unsigned long message_accelerator_int9:1; /* RW */ + unsigned long message_accelerator_int10:1; /* RW */ + unsigned long message_accelerator_int11:1; /* RW */ + unsigned long message_accelerator_int12:1; /* RW */ + unsigned long message_accelerator_int13:1; /* RW */ + unsigned long message_accelerator_int14:1; /* RW */ + unsigned long message_accelerator_int15:1; /* RW */ + unsigned long rtc_interval_int:1; /* RW */ + unsigned long bau_dashboard_int:1; /* RW */ + unsigned long rtc_0:1; /* RW */ + unsigned long rtc_1:1; /* RW */ + unsigned long rtc_2:1; /* RW */ + unsigned long rtc_3:1; /* RW */ + unsigned long rtc_4:1; /* RW */ + unsigned long rtc_5:1; /* RW */ + unsigned long rtc_6:1; /* RW */ + unsigned long rtc_7:1; /* RW */ + unsigned long rtc_8:1; /* RW */ + unsigned long rtc_9:1; /* RW */ + unsigned long rtc_10:1; /* RW */ + unsigned long rtc_11:1; /* RW */ + unsigned long rtc_12:1; /* RW */ + unsigned long rtc_13:1; /* RW */ + unsigned long rtc_14:1; /* RW */ + unsigned long rtc_15:1; /* RW */ + unsigned long rtc_16:1; /* RW */ + unsigned long rtc_17:1; /* RW */ + unsigned long rtc_18:1; /* RW */ + unsigned long rtc_19:1; /* RW */ + unsigned long rtc_20:1; /* RW */ + unsigned long rtc_21:1; /* RW */ + unsigned long rtc_22:1; /* RW */ + unsigned long rtc_23:1; /* RW */ + unsigned long rtc_24:1; /* RW */ + unsigned long rtc_25:1; /* RW */ + unsigned long rtc_26:1; /* RW */ + unsigned long rtc_27:1; /* RW */ + unsigned long rtc_28:1; /* RW */ + unsigned long rtc_29:1; /* RW */ + unsigned long rtc_30:1; /* RW */ + unsigned long rtc_31:1; /* RW */ + unsigned long rsvd_50_63:14; + } s4; }; /* ========================================================================= */ /* UVXH_EVENT_OCCURRED2_ALIAS */ /* ========================================================================= */ #define UVXH_EVENT_OCCURRED2_ALIAS 0x70108UL -#define UVXH_EVENT_OCCURRED2_ALIAS_32 0xb70 + +#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70 +#define UV3H_EVENT_OCCURRED2_ALIAS_32 0xb70 +#define UV4H_EVENT_OCCURRED2_ALIAS_32 0x610 +#define UVH_EVENT_OCCURRED2_ALIAS_32 ( \ + is_uv2_hub() ? UV2H_EVENT_OCCURRED2_ALIAS_32 : \ + is_uv3_hub() ? UV3H_EVENT_OCCURRED2_ALIAS_32 : \ + /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_ALIAS_32) /* ========================================================================= */ /* UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 */ /* ========================================================================= */ -#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL #define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL #define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL -#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x320130UL -#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x320130UL +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2 0xc8130UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_2 ( \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_2 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_2) + +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0xa10 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_2_32 ( \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_32) #define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 #define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL @@ -2772,6 +4080,10 @@ union uvxh_event_occurred2_u { #define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 #define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL + + union uvxh_lb_bau_sb_activation_status_2_u { unsigned long v; struct uvxh_lb_bau_sb_activation_status_2_s { @@ -2783,6 +4095,9 @@ union uvxh_lb_bau_sb_activation_status_2_u { struct uv3h_lb_bau_sb_activation_status_2_s { unsigned long aux_error:64; /* RW */ } s3; + struct uv4h_lb_bau_sb_activation_status_2_s { + unsigned long aux_error:64; /* RW */ + } s4; }; /* ========================================================================= */ @@ -2822,26 +4137,6 @@ union uv3h_gr0_gam_gr_config_u { } s3; }; -/* ========================================================================= */ -/* UV3H_GR1_GAM_GR_CONFIG */ -/* ========================================================================= */ -#define UV3H_GR1_GAM_GR_CONFIG 0x1000028UL - -#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_SHFT 0 -#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_SHFT 10 -#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL -#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL - -union uv3h_gr1_gam_gr_config_u { - unsigned long v; - struct uv3h_gr1_gam_gr_config_s { - unsigned long m_skt:6; /* RW */ - unsigned long undef_6_9:4; /* Undefined */ - unsigned long subspace:1; /* RW */ - unsigned long reserved:53; - } s3; -}; - /* ========================================================================= */ /* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */ /* ========================================================================= */ @@ -2924,5 +4219,67 @@ union uv3h_rh_gam_mmioh_redirect_config1_mmr_u { } s3; }; +/* ========================================================================= */ +/* UV4H_LB_PROC_INTD_QUEUE_FIRST */ +/* ========================================================================= */ +#define UV4H_LB_PROC_INTD_QUEUE_FIRST 0xa4100UL + +#define UV4H_LB_PROC_INTD_QUEUE_FIRST_FIRST_PAYLOAD_ADDRESS_SHFT 6 +#define UV4H_LB_PROC_INTD_QUEUE_FIRST_FIRST_PAYLOAD_ADDRESS_MASK 0x00003fffffffffc0UL + +union uv4h_lb_proc_intd_queue_first_u { + unsigned long v; + struct uv4h_lb_proc_intd_queue_first_s { + unsigned long undef_0_5:6; /* Undefined */ + unsigned long first_payload_address:40; /* RW */ + } s4; +}; + +/* ========================================================================= */ +/* UV4H_LB_PROC_INTD_QUEUE_LAST */ +/* ========================================================================= */ +#define UV4H_LB_PROC_INTD_QUEUE_LAST 0xa4108UL + +#define UV4H_LB_PROC_INTD_QUEUE_LAST_LAST_PAYLOAD_ADDRESS_SHFT 5 +#define UV4H_LB_PROC_INTD_QUEUE_LAST_LAST_PAYLOAD_ADDRESS_MASK 0x00003fffffffffe0UL + +union uv4h_lb_proc_intd_queue_last_u { + unsigned long v; + struct uv4h_lb_proc_intd_queue_last_s { + unsigned long undef_0_4:5; /* Undefined */ + unsigned long last_payload_address:41; /* RW */ + } s4; +}; + +/* ========================================================================= */ +/* UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR */ +/* ========================================================================= */ +#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR 0xa4118UL + +#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR_SOFT_ACK_PENDING_FLAGS_SHFT 0 +#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR_SOFT_ACK_PENDING_FLAGS_MASK 0x00000000000000ffUL + +union uv4h_lb_proc_intd_soft_ack_clear_u { + unsigned long v; + struct uv4h_lb_proc_intd_soft_ack_clear_s { + unsigned long soft_ack_pending_flags:8; /* WP */ + } s4; +}; + +/* ========================================================================= */ +/* UV4H_LB_PROC_INTD_SOFT_ACK_PENDING */ +/* ========================================================================= */ +#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING 0xa4110UL + +#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING_SOFT_ACK_FLAGS_SHFT 0 +#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING_SOFT_ACK_FLAGS_MASK 0x00000000000000ffUL + +union uv4h_lb_proc_intd_soft_ack_pending_u { + unsigned long v; + struct uv4h_lb_proc_intd_soft_ack_pending_s { + unsigned long soft_ack_flags:8; /* RW */ + } s4; +}; + #endif /* _ASM_X86_UV_UV_MMRS_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 1ae89a2721d6f5a120650630d317913c3ee2c32a..4dcdf74dfed8606ebd0a948bd733a4d721c0d1b8 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -141,6 +141,44 @@ struct x86_cpuinit_ops { struct timespec; +/** + * struct x86_legacy_devices - legacy x86 devices + * + * @pnpbios: this platform can have a PNPBIOS. If this is disabled the platform + * is known to never have a PNPBIOS. + * + * These are devices known to require LPC or ISA bus. The definition of legacy + * devices adheres to the ACPI 5.2.9.3 IA-PC Boot Architecture flag + * ACPI_FADT_LEGACY_DEVICES. These devices consist of user visible devices on + * the LPC or ISA bus. User visible devices are devices that have end-user + * accessible connectors (for example, LPT parallel port). Legacy devices on + * the LPC bus consist for example of serial and parallel ports, PS/2 keyboard + * / mouse, and the floppy disk controller. A system that lacks all known + * legacy devices can assume all devices can be detected exclusively via + * standard device enumeration mechanisms including the ACPI namespace. + * + * A system which has does not have ACPI_FADT_LEGACY_DEVICES enabled must not + * have any of the legacy devices enumerated below present. + */ +struct x86_legacy_devices { + int pnpbios; +}; + +/** + * struct x86_legacy_features - legacy x86 features + * + * @rtc: this device has a CMOS real-time clock present + * @ebda_search: it's safe to search for the EBDA signature in the hardware's + * low RAM + * @devices: legacy x86 devices, refer to struct x86_legacy_devices + * documentation for further details. + */ +struct x86_legacy_features { + int rtc; + int ebda_search; + struct x86_legacy_devices devices; +}; + /** * struct x86_platform_ops - platform specific runtime functions * @calibrate_tsc: calibrate TSC @@ -152,6 +190,14 @@ struct timespec; * @save_sched_clock_state: save state for sched_clock() on suspend * @restore_sched_clock_state: restore state for sched_clock() on resume * @apic_post_init: adjust apic if neeeded + * @legacy: legacy features + * @set_legacy_features: override legacy features. Use of this callback + * is highly discouraged. You should only need + * this if your hardware platform requires further + * custom fine tuning far beyong what may be + * possible in x86_early_init_platform_quirks() by + * only using the current x86_hardware_subarch + * semantics. */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -165,6 +211,8 @@ struct x86_platform_ops { void (*save_sched_clock_state)(void); void (*restore_sched_clock_state)(void); void (*apic_post_init)(void); + struct x86_legacy_features legacy; + void (*set_legacy_features)(void); }; struct pci_dev; @@ -186,6 +234,8 @@ extern struct x86_cpuinit_ops x86_cpuinit; extern struct x86_platform_ops x86_platform; extern struct x86_msi_ops x86_msi; extern struct x86_io_apic_ops x86_io_apic_ops; + +extern void x86_early_init_platform_quirks(void); extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index c54beb44c4c1f20e4dda33fb901d17b2850fecf5..635eac54392293c6e3250a91aedfe30dee37a976 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -550,7 +550,7 @@ static struct xor_block_template xor_block_pIII_sse = { #define XOR_TRY_TEMPLATES \ do { \ AVX_XOR_SPEED; \ - if (cpu_has_xmm) { \ + if (boot_cpu_has(X86_FEATURE_XMM)) { \ xor_speed(&xor_block_pIII_sse); \ xor_speed(&xor_block_sse_pf64); \ } else if (boot_cpu_has(X86_FEATURE_MMX)) { \ diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index 7c0a517ec7511a667166c216df8357087ff3e7b0..22a7b1870a31801be3a28513f6f595fb483b36e1 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -167,12 +167,12 @@ static struct xor_block_template xor_block_avx = { #define AVX_XOR_SPEED \ do { \ - if (cpu_has_avx && cpu_has_osxsave) \ + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \ xor_speed(&xor_block_avx); \ } while (0) #define AVX_SELECT(FASTEST) \ - (cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST) + (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST) #else diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 329254373479ad750779dedd696c7d41b6f50952..c18ce67495fadffb504d30031594ebe00acb6210 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -157,7 +157,46 @@ struct boot_params { __u8 _pad9[276]; /* 0xeec */ } __attribute__((packed)); -enum { +/** + * enum x86_hardware_subarch - x86 hardware subarchitecture + * + * The x86 hardware_subarch and hardware_subarch_data were added as of the x86 + * boot protocol 2.07 to help distinguish and support custom x86 boot + * sequences. This enum represents accepted values for the x86 + * hardware_subarch. Custom x86 boot sequences (not X86_SUBARCH_PC) do not + * have or simply *cannot* make use of natural stubs like BIOS or EFI, the + * hardware_subarch can be used on the Linux entry path to revector to a + * subarchitecture stub when needed. This subarchitecture stub can be used to + * set up Linux boot parameters or for special care to account for nonstandard + * handling of page tables. + * + * These enums should only ever be used by x86 code, and the code that uses + * it should be well contained and compartamentalized. + * + * KVM and Xen HVM do not have a subarch as these are expected to follow + * standard x86 boot entries. If there is a genuine need for "hypervisor" type + * that should be considered separately in the future. Future guest types + * should seriously consider working with standard x86 boot stubs such as + * the BIOS or EFI boot stubs. + * + * WARNING: this enum is only used for legacy hacks, for platform features that + * are not easily enumerated or discoverable. You should not ever use + * this for new features. + * + * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard + * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, + * which start at asm startup_xen() entry point and later jump to the C + * xen_start_kernel() entry point. Both domU and dom0 type of guests are + * currently supportd through this PV boot path. + * @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform + * systems which do not have the PCI legacy interfaces. + * @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC for + * for settop boxes and media devices, the use of a subarch for CE4100 + * is more of a hack... + */ +enum x86_hardware_subarch { X86_SUBARCH_PC = 0, X86_SUBARCH_LGUEST, X86_SUBARCH_XEN, diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 616ebd22ef9a2eee183dde7fae5bac7d03e28351..9abf8551c7e4d3cec1d8312f484ecad78f3d78cc 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -2,7 +2,11 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds +extra-y := head_$(BITS).o +extra-y += head$(BITS).o +extra-y += ebda.o +extra-y += platform-quirks.o +extra-y += vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8c2f1ef6ca236ea16b9ae25a390b32d33ef566d4..f115a58f7c8438c70a0cad71ec92bb29a1e0ab85 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -136,7 +136,7 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) { struct acpi_table_madt *madt = NULL; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -EINVAL; madt = (struct acpi_table_madt *)table; @@ -913,6 +913,15 @@ late_initcall(hpet_insert_resource); static int __init acpi_parse_fadt(struct acpi_table_header *table) { + if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) { + pr_debug("ACPI: no legacy devices present\n"); + x86_platform.legacy.devices.pnpbios = 0; + } + + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { + pr_debug("ACPI: not registering RTC platform device\n"); + x86_platform.legacy.rtc = 0; + } #ifdef CONFIG_X86_PM_TIMER /* detect the location of the ACPI PM Timer */ @@ -951,7 +960,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) { int count; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; /* @@ -979,7 +988,7 @@ static int __init acpi_parse_madt_lapic_entries(void) int ret; struct acpi_subtable_proc madt_proc[2]; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; /* @@ -1125,7 +1134,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) if (acpi_disabled || acpi_noirq) return -ENODEV; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; /* diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 25f909362b7a89c42f32c8ebe74fb91239df3ca8..5cb272a7a5a32eccbbee61a23c50ab9b5a80a0fb 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d356987a04e97ba6537b2895d45114851e7b705b..60078a67d7e36064e667abfd679ed569db3a8173 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -607,7 +607,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) long tapic = apic_read(APIC_TMCCT); unsigned long pm = acpi_pm_read_early(); - if (cpu_has_tsc) + if (boot_cpu_has(X86_FEATURE_TSC)) tsc = rdtsc(); switch (lapic_cal_loops++) { @@ -668,7 +668,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) *delta = (long)res; /* Correct the tsc counter value */ - if (cpu_has_tsc) { + if (boot_cpu_has(X86_FEATURE_TSC)) { res = (((u64)(*deltatsc)) * pm_100ms); do_div(res, deltapm); apic_printk(APIC_VERBOSE, "TSC delta adjusted to " @@ -760,7 +760,7 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", lapic_timer_frequency); - if (cpu_has_tsc) { + if (boot_cpu_has(X86_FEATURE_TSC)) { apic_printk(APIC_VERBOSE, "..... CPU clock speed is " "%ld.%04ld MHz.\n", (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), @@ -1085,7 +1085,7 @@ void lapic_shutdown(void) { unsigned long flags; - if (!cpu_has_apic && !apic_from_smp_config()) + if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config()) return; local_irq_save(flags); @@ -1134,7 +1134,7 @@ void __init init_bsp_APIC(void) * Don't do the setup now if we have a SMP BIOS as the * through-I/O-APIC virtual wire mode might be active. */ - if (smp_found_config || !cpu_has_apic) + if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC)) return; /* @@ -1227,7 +1227,7 @@ void setup_local_APIC(void) unsigned long long tsc = 0, ntsc; long long max_loops = cpu_khz ? cpu_khz : 1000000; - if (cpu_has_tsc) + if (boot_cpu_has(X86_FEATURE_TSC)) tsc = rdtsc(); if (disable_apic) { @@ -1311,7 +1311,7 @@ void setup_local_APIC(void) break; } if (queued) { - if (cpu_has_tsc && cpu_khz) { + if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) { ntsc = rdtsc(); max_loops = (cpu_khz << 10) - (ntsc - tsc); } else @@ -1445,7 +1445,7 @@ static void __x2apic_disable(void) { u64 msr; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return; rdmsrl(MSR_IA32_APICBASE, msr); @@ -1561,7 +1561,7 @@ void __init check_x2apic(void) pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n"); x2apic_mode = 1; x2apic_state = X2APIC_ON; - } else if (!cpu_has_x2apic) { + } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { x2apic_state = X2APIC_DISABLED; } } @@ -1632,7 +1632,7 @@ void __init enable_IR_x2apic(void) */ static int __init detect_init_APIC(void) { - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { pr_info("No local APIC present\n"); return -1; } @@ -1711,14 +1711,14 @@ static int __init detect_init_APIC(void) goto no_apic; case X86_VENDOR_INTEL: if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || - (boot_cpu_data.x86 == 5 && cpu_has_apic)) + (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC))) break; goto no_apic; default: goto no_apic; } - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { /* * Over-ride BIOS and try to enable the local APIC only if * "lapic" specified. @@ -2233,19 +2233,19 @@ int __init APIC_init_uniprocessor(void) return -1; } #ifdef CONFIG_X86_64 - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { disable_apic = 1; pr_info("Apic disabled by BIOS\n"); return -1; } #else - if (!smp_found_config && !cpu_has_apic) + if (!smp_found_config && !boot_cpu_has(X86_FEATURE_APIC)) return -1; /* * Complain if the BIOS pretends there is one. */ - if (!cpu_has_apic && + if (!boot_cpu_has(X86_FEATURE_APIC) && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); @@ -2426,7 +2426,7 @@ static void apic_pm_activate(void) static int __init init_lapic_sysfs(void) { /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - if (cpu_has_apic) + if (boot_cpu_has(X86_FEATURE_APIC)) register_syscore_ops(&lapic_syscore_ops); return 0; diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 331a7a07c48fefe0313f3089c1bb497dc48cd7e4..13d19ed585142eae225625bd808c1c1acba0b790 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,13 +100,13 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, static u32 noop_apic_read(u32 reg) { - WARN_ON_ONCE((cpu_has_apic && !disable_apic)); + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); return 0; } static void noop_apic_write(u32 reg, u32 v) { - WARN_ON_ONCE(cpu_has_apic && !disable_apic); + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); } struct apic apic_noop = { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fdb0fbfb1197a4cf4e485c4f330b3a399b3a6d5f..84e33ff5a6d595693c3718477f168f20d5546316 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1454,7 +1454,7 @@ void native_disable_io_apic(void) ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } - if (cpu_has_apic || apic_from_smp_config()) + if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config()) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 28bde88b0085d2284947016aae114574958428ed..2a0f225afebd5925d66decbba2060b2781837b39 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -230,7 +230,7 @@ int safe_smp_processor_id(void) { int apicid, cpuid; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return 0; apicid = hard_smp_processor_id(); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index ef495511f019f0a899325d6a28ed5da98e1c939e..a5e400afc5632d225f4a197ec9121c8019f5a9a8 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -944,7 +944,7 @@ static int __init print_ICs(void) print_PIC(); /* don't print out if apic is not there */ - if (!cpu_has_apic && !apic_from_smp_config()) + if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config()) return 0; print_local_APICs(show_lapic); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d7ce96a7dacaedc630a0e9a5ef5b761edbd0ffc7..29003154fafd2277e4e014f5c1ac019f1a446c32 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -48,12 +48,35 @@ static u64 gru_start_paddr, gru_end_paddr; static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; static u64 gru_dist_lmask, gru_dist_umask; static union uvh_apicid uvh_apicid; + +/* info derived from CPUID */ +static struct { + unsigned int apicid_shift; + unsigned int apicid_mask; + unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */ + unsigned int pnode_mask; + unsigned int gpa_shift; +} uv_cpuid; + int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); unsigned int uv_apicid_hibits; EXPORT_SYMBOL_GPL(uv_apicid_hibits); static struct apic apic_x2apic_uv_x; +static struct uv_hub_info_s uv_hub_info_node0; + +/* Set this to use hardware error handler instead of kernel panic */ +static int disable_uv_undefined_panic = 1; +unsigned long uv_undefined(char *str) +{ + if (likely(!disable_uv_undefined_panic)) + panic("UV: error: undefined MMR: %s\n", str); + else + pr_crit("UV: error: undefined MMR: %s\n", str); + return ~0ul; /* cause a machine fault */ +} +EXPORT_SYMBOL(uv_undefined); static unsigned long __init uv_early_read_mmr(unsigned long addr) { @@ -108,21 +131,71 @@ static int __init early_get_pnodeid(void) case UV3_HUB_PART_NUMBER_X: uv_min_hub_revision_id += UV3_HUB_REVISION_BASE; break; + case UV4_HUB_PART_NUMBER: + uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1; + break; } uv_hub_info->hub_revision = uv_min_hub_revision_id; - pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); + uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1; + pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask; + uv_cpuid.gpa_shift = 46; /* default unless changed */ + + pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n", + node_id.s.revision, node_id.s.part_number, node_id.s.node_id, + m_n_config.s.n_skt, uv_cpuid.pnode_mask, pnode); return pnode; } -static void __init early_get_apic_pnode_shift(void) +/* [copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ +#define SMT_LEVEL 0 /* leaf 0xb SMT level */ +#define INVALID_TYPE 0 /* leaf 0xb sub-leaf types */ +#define SMT_TYPE 1 +#define CORE_TYPE 2 +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) + +static void set_x2apic_bits(void) +{ + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int sid_shift; + + cpuid(0, &eax, &ebx, &ecx, &edx); + if (eax < 0xb) { + pr_info("UV: CPU does not have CPUID.11\n"); + return; + } + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) { + pr_info("UV: CPUID.11 not implemented\n"); + return; + } + sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); + sub_index = 1; + do { + cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); + break; + } + sub_index++; + } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + uv_cpuid.apicid_shift = 0; + uv_cpuid.apicid_mask = (~(-1 << sid_shift)); + uv_cpuid.socketid_shift = sid_shift; +} + +static void __init early_get_apic_socketid_shift(void) { - uvh_apicid.v = uv_early_read_mmr(UVH_APICID); - if (!uvh_apicid.v) - /* - * Old bios, use default value - */ - uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; + if (is_uv2_hub() || is_uv3_hub()) + uvh_apicid.v = uv_early_read_mmr(UVH_APICID); + + set_x2apic_bits(); + + pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", + uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); + pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", + uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); } /* @@ -150,13 +223,18 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (strncmp(oem_id, "SGI", 3) != 0) return 0; + /* Setup early hub type field in uv_hub_info for Node 0 */ + uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; + /* * Determine UV arch type. * SGI: UV100/1000 * SGI2: UV2000/3000 * SGI3: UV300 (truncated to 4 chars because of different varieties) + * SGI4: UV400 (truncated to 4 chars because of different varieties) */ uv_hub_info->hub_revision = + !strncmp(oem_id, "SGI4", 4) ? UV4_HUB_REVISION_BASE : !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE : !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE : !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0; @@ -165,7 +243,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) goto badbios; pnodeid = early_get_pnodeid(); - early_get_apic_pnode_shift(); + early_get_apic_socketid_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; @@ -211,17 +289,11 @@ int is_uv_system(void) } EXPORT_SYMBOL_GPL(is_uv_system); -DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); -EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); - -struct uv_blade_info *uv_blade_info; -EXPORT_SYMBOL_GPL(uv_blade_info); - -short *uv_node_to_blade; -EXPORT_SYMBOL_GPL(uv_node_to_blade); +void **__uv_hub_info_list; +EXPORT_SYMBOL_GPL(__uv_hub_info_list); -short *uv_cpu_to_blade; -EXPORT_SYMBOL_GPL(uv_cpu_to_blade); +DEFINE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info); +EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_info); short uv_possible_blades; EXPORT_SYMBOL_GPL(uv_possible_blades); @@ -229,6 +301,115 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); +/* the following values are used for the per node hub info struct */ +static __initdata unsigned short *_node_to_pnode; +static __initdata unsigned short _min_socket, _max_socket; +static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; +static __initdata struct uv_gam_range_entry *uv_gre_table; +static __initdata struct uv_gam_parameters *uv_gp_table; +static __initdata unsigned short *_socket_to_node; +static __initdata unsigned short *_socket_to_pnode; +static __initdata unsigned short *_pnode_to_socket; +static __initdata struct uv_gam_range_s *_gr_table; +#define SOCK_EMPTY ((unsigned short)~0) + +extern int uv_hub_info_version(void) +{ + return UV_HUB_INFO_VERSION; +} +EXPORT_SYMBOL(uv_hub_info_version); + +/* Build GAM range lookup table */ +static __init void build_uv_gr_table(void) +{ + struct uv_gam_range_entry *gre = uv_gre_table; + struct uv_gam_range_s *grt; + unsigned long last_limit = 0, ram_limit = 0; + int bytes, i, sid, lsid = -1; + + if (!gre) + return; + + bytes = _gr_table_len * sizeof(struct uv_gam_range_s); + grt = kzalloc(bytes, GFP_KERNEL); + BUG_ON(!grt); + _gr_table = grt; + + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) { + if (!ram_limit) { /* mark hole between ram/non-ram */ + ram_limit = last_limit; + last_limit = gre->limit; + lsid++; + continue; + } + last_limit = gre->limit; + pr_info("UV: extra hole in GAM RE table @%d\n", + (int)(gre - uv_gre_table)); + continue; + } + if (_max_socket < gre->sockid) { + pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", + gre->sockid, _max_socket, + (int)(gre - uv_gre_table)); + continue; + } + sid = gre->sockid - _min_socket; + if (lsid < sid) { /* new range */ + grt = &_gr_table[sid]; + grt->base = lsid; + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + lsid = sid; + continue; + } + if (lsid == sid && !ram_limit) { /* update range */ + if (grt->limit == last_limit) { /* .. if contiguous */ + grt->limit = last_limit = gre->limit; + continue; + } + } + if (!ram_limit) { /* non-contiguous ram range */ + grt++; + grt->base = sid - 1; + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + continue; + } + grt++; /* non-contiguous/non-ram */ + grt->base = grt - _gr_table; /* base is this entry */ + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + lsid++; + } + + /* shorten table if possible */ + grt++; + i = grt - _gr_table; + if (i < _gr_table_len) { + void *ret; + + bytes = i * sizeof(struct uv_gam_range_s); + ret = krealloc(_gr_table, bytes, GFP_KERNEL); + if (ret) { + _gr_table = ret; + _gr_table_len = i; + } + } + + /* display resultant gam range table */ + for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) { + int gb = grt->base; + unsigned long start = gb < 0 ? 0 : + (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT; + unsigned long end = + (unsigned long)grt->limit << UV_GAM_RANGE_SHFT; + + pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", + i, grt->nasid, start, end, gb); + } +} + static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { unsigned long val; @@ -355,7 +536,6 @@ static unsigned long set_apic_id(unsigned int id) static unsigned int uv_read_apic_id(void) { - return x2apic_get_apic_id(apic_read(APIC_ID)); } @@ -430,58 +610,38 @@ static void set_x2apic_extra_bits(int pnode) __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift); } -/* - * Called on boot cpu. - */ -static __init int boot_pnode_to_blade(int pnode) -{ - int blade; - - for (blade = 0; blade < uv_num_possible_blades(); blade++) - if (pnode == uv_blade_info[blade].pnode) - return blade; - BUG(); -} - -struct redir_addr { - unsigned long redirect; - unsigned long alias; -}; - +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3 #define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT -static __initdata struct redir_addr redir_addrs[] = { - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, -}; - -static unsigned char get_n_lshift(int m_val) -{ - union uv3h_gr0_gam_gr_config_u m_gr_config; - - if (is_uv1_hub()) - return m_val; - - if (is_uv2_hub()) - return m_val == 40 ? 40 : 39; - - m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); - return m_gr_config.s3.m_skt; -} - static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; + unsigned long m_redirect; + unsigned long m_overlay; int i; - for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { - alias.v = uv_read_local_mmr(redir_addrs[i].alias); + for (i = 0; i < UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH; i++) { + switch (i) { + case 0: + m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR; + break; + case 1: + m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR; + break; + case 2: + m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR; + break; + } + alias.v = uv_read_local_mmr(m_overlay); if (alias.s.enable && alias.s.base == 0) { *size = (1UL << alias.s.m_alias); - redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); - *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; + redirect.v = uv_read_local_mmr(m_redirect); + *base = (unsigned long)redirect.s.dest_base + << DEST_SHIFT; return; } } @@ -544,6 +704,8 @@ static __init void map_gru_high(int max_pnode) { union uvh_rh_gam_gru_overlay_config_mmr_u gru; int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; + unsigned long mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK; + unsigned long base; gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (!gru.s.enable) { @@ -555,8 +717,9 @@ static __init void map_gru_high(int max_pnode) map_gru_distributed(gru.v); return; } - map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); - gru_start_paddr = ((u64)gru.s.base << shift); + base = (gru.v & mask) >> shift; + map_high("GRU", base, shift, shift, max_pnode, map_wb); + gru_start_paddr = ((u64)base << shift); gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); } @@ -595,6 +758,7 @@ static __initdata struct mmioh_config mmiohs[] = { }, }; +/* UV3 & UV4 have identical MMIOH overlay configs */ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) { union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay; @@ -674,7 +838,7 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode) unsigned long mmr, base; int shift, enable, m_io, n_io; - if (is_uv3_hub()) { + if (is_uv3_hub() || is_uv4_hub()) { /* Map both MMIOH Regions */ map_mmioh_high_uv3(0, min_pnode, max_pnode); map_mmioh_high_uv3(1, min_pnode, max_pnode); @@ -739,8 +903,8 @@ static __init void uv_rtc_init(void) */ static void uv_heartbeat(unsigned long ignored) { - struct timer_list *timer = &uv_hub_info->scir.timer; - unsigned char bits = uv_hub_info->scir.state; + struct timer_list *timer = &uv_scir_info->timer; + unsigned char bits = uv_scir_info->state; /* flip heartbeat bit */ bits ^= SCIR_CPU_HEARTBEAT; @@ -760,14 +924,14 @@ static void uv_heartbeat(unsigned long ignored) static void uv_heartbeat_enable(int cpu) { - while (!uv_cpu_hub_info(cpu)->scir.enabled) { - struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; + while (!uv_cpu_scir_info(cpu)->enabled) { + struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); setup_timer(timer, uv_heartbeat, cpu); timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); - uv_cpu_hub_info(cpu)->scir.enabled = 1; + uv_cpu_scir_info(cpu)->enabled = 1; /* also ensure that boot cpu is enabled */ cpu = 0; @@ -777,9 +941,9 @@ static void uv_heartbeat_enable(int cpu) #ifdef CONFIG_HOTPLUG_CPU static void uv_heartbeat_disable(int cpu) { - if (uv_cpu_hub_info(cpu)->scir.enabled) { - uv_cpu_hub_info(cpu)->scir.enabled = 0; - del_timer(&uv_cpu_hub_info(cpu)->scir.timer); + if (uv_cpu_scir_info(cpu)->enabled) { + uv_cpu_scir_info(cpu)->enabled = 0; + del_timer(&uv_cpu_scir_info(cpu)->timer); } uv_set_cpu_scir_bits(cpu, 0xff); } @@ -862,155 +1026,475 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode, void uv_cpu_init(void) { /* CPU 0 initialization will be done via uv_system_init. */ - if (!uv_blade_info) + if (smp_processor_id() == 0) return; - uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; + uv_hub_info->nr_online_cpus++; if (get_uv_system_type() == UV_NON_UNIQUE_APIC) set_x2apic_extra_bits(uv_hub_info->pnode); } -void __init uv_system_init(void) +struct mn { + unsigned char m_val; + unsigned char n_val; + unsigned char m_shift; + unsigned char n_lshift; +}; + +static void get_mn(struct mn *mnp) { - union uvh_rh_gam_config_mmr_u m_n_config; - union uvh_node_id_u node_id; - unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; - int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; - int gnode_extra, min_pnode = 999999, max_pnode = -1; - unsigned long mmr_base, present, paddr; - unsigned short pnode_mask; - unsigned char n_lshift; - char *hub = (is_uv1_hub() ? "UV100/1000" : - (is_uv2_hub() ? "UV2000/3000" : - (is_uv3_hub() ? "UV300" : NULL))); + union uvh_rh_gam_config_mmr_u m_n_config; + union uv3h_gr0_gam_gr_config_u m_gr_config; - if (!hub) { - pr_err("UV: Unknown/unsupported UV hub\n"); - return; + m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR); + mnp->n_val = m_n_config.s.n_skt; + if (is_uv4_hub()) { + mnp->m_val = 0; + mnp->n_lshift = 0; + } else if (is_uv3_hub()) { + mnp->m_val = m_n_config.s3.m_skt; + m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + mnp->n_lshift = m_gr_config.s3.m_skt; + } else if (is_uv2_hub()) { + mnp->m_val = m_n_config.s2.m_skt; + mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; + } else if (is_uv1_hub()) { + mnp->m_val = m_n_config.s1.m_skt; + mnp->n_lshift = mnp->m_val; } - pr_info("UV: Found %s hub\n", hub); + mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0; +} - map_low_mmrs(); +void __init uv_init_hub_info(struct uv_hub_info_s *hub_info) +{ + struct mn mn = {0}; /* avoid unitialized warnings */ + union uvh_node_id_u node_id; - m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); - m_val = m_n_config.s.m_skt; - n_val = m_n_config.s.n_skt; - pnode_mask = (1 << n_val) - 1; - n_lshift = get_n_lshift(m_val); - mmr_base = - uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & - ~UV_MMR_ENABLE; + get_mn(&mn); + hub_info->m_val = mn.m_val; + hub_info->n_val = mn.n_val; + hub_info->m_shift = mn.m_shift; + hub_info->n_lshift = mn.n_lshift ? mn.n_lshift : 0; + + hub_info->hub_revision = uv_hub_info->hub_revision; + hub_info->pnode_mask = uv_cpuid.pnode_mask; + hub_info->min_pnode = _min_pnode; + hub_info->min_socket = _min_socket; + hub_info->pnode_to_socket = _pnode_to_socket; + hub_info->socket_to_node = _socket_to_node; + hub_info->socket_to_pnode = _socket_to_pnode; + hub_info->gr_table_len = _gr_table_len; + hub_info->gr_table = _gr_table; + hub_info->gpa_mask = mn.m_val ? + (1UL << (mn.m_val + mn.n_val)) - 1 : + (1UL << uv_cpuid.gpa_shift) - 1; node_id.v = uv_read_local_mmr(UVH_NODE_ID); - gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; - gnode_upper = ((unsigned long)gnode_extra << m_val); - pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x n_lshift 0x%x\n", - n_val, m_val, pnode_mask, gnode_upper, gnode_extra, - n_lshift); + hub_info->gnode_extra = + (node_id.s.node_id & ~((1 << mn.n_val) - 1)) >> 1; + + hub_info->gnode_upper = + ((unsigned long)hub_info->gnode_extra << mn.m_val); + + if (uv_gp_table) { + hub_info->global_mmr_base = uv_gp_table->mmr_base; + hub_info->global_mmr_shift = uv_gp_table->mmr_shift; + hub_info->global_gru_base = uv_gp_table->gru_base; + hub_info->global_gru_shift = uv_gp_table->gru_shift; + hub_info->gpa_shift = uv_gp_table->gpa_shift; + hub_info->gpa_mask = (1UL << hub_info->gpa_shift) - 1; + } else { + hub_info->global_mmr_base = + uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & + ~UV_MMR_ENABLE; + hub_info->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; + } - pr_info("UV: global MMR base 0x%lx\n", mmr_base); + get_lowmem_redirect( + &hub_info->lowmem_remap_base, &hub_info->lowmem_remap_top); - for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) - uv_possible_blades += - hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); + hub_info->apic_pnode_shift = uv_cpuid.socketid_shift; - /* uv_num_possible_blades() is really the hub count */ - pr_info("UV: Found %d blades, %d hubs\n", - is_uv1_hub() ? uv_num_possible_blades() : - (uv_num_possible_blades() + 1) / 2, - uv_num_possible_blades()); + /* show system specific info */ + pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", + hub_info->n_val, hub_info->m_val, + hub_info->m_shift, hub_info->n_lshift); - bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); - uv_blade_info = kzalloc(bytes, GFP_KERNEL); - BUG_ON(!uv_blade_info); + pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", + hub_info->gpa_mask, hub_info->gpa_shift, + hub_info->pnode_mask, hub_info->apic_pnode_shift); - for (blade = 0; blade < uv_num_possible_blades(); blade++) - uv_blade_info[blade].memory_nid = -1; + pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", + hub_info->global_mmr_base, hub_info->global_mmr_shift, + hub_info->global_gru_base, hub_info->global_gru_shift); - get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); + pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", + hub_info->gnode_upper, hub_info->gnode_extra); +} + +static void __init decode_gam_params(unsigned long ptr) +{ + uv_gp_table = (struct uv_gam_parameters *)ptr; + + pr_info("UV: GAM Params...\n"); + pr_info("UV: mmr_base/shift:0x%llx/%d gru_base/shift:0x%llx/%d gpa_shift:%d\n", + uv_gp_table->mmr_base, uv_gp_table->mmr_shift, + uv_gp_table->gru_base, uv_gp_table->gru_shift, + uv_gp_table->gpa_shift); +} + +static void __init decode_gam_rng_tbl(unsigned long ptr) +{ + struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr; + unsigned long lgre = 0; + int index = 0; + int sock_min = 999999, pnode_min = 99999; + int sock_max = -1, pnode_max = -1; + + uv_gre_table = gre; + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + if (!index) { + pr_info("UV: GAM Range Table...\n"); + pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s %3s\n", + "Range", "", "Size", "Type", "NASID", + "SID", "PN", "PXM"); + } + pr_info( + "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x %3d\n", + index++, + (unsigned long)lgre << UV_GAM_RANGE_SHFT, + (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, + ((unsigned long)(gre->limit - lgre)) >> + (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */ + gre->type, gre->nasid, gre->sockid, + gre->pnode, gre->pxm); + + lgre = gre->limit; + if (sock_min > gre->sockid) + sock_min = gre->sockid; + if (sock_max < gre->sockid) + sock_max = gre->sockid; + if (pnode_min > gre->pnode) + pnode_min = gre->pnode; + if (pnode_max < gre->pnode) + pnode_max = gre->pnode; + } + _min_socket = sock_min; + _max_socket = sock_max; + _min_pnode = pnode_min; + _max_pnode = pnode_max; + _gr_table_len = index; + pr_info( + "UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", + index, _min_socket, _max_socket, _min_pnode, _max_pnode); +} + +static void __init decode_uv_systab(void) +{ + struct uv_systab *st; + int i; + + st = uv_systab; + if ((!st || st->revision < UV_SYSTAB_VERSION_UV4) && !is_uv4_hub()) + return; + if (st->revision != UV_SYSTAB_VERSION_UV4_LATEST) { + pr_crit( + "UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", + st->revision, UV_SYSTAB_VERSION_UV4_LATEST); + BUG(); + } + + for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { + unsigned long ptr = st->entry[i].offset; - bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); - uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); - BUG_ON(!uv_node_to_blade); - memset(uv_node_to_blade, 255, bytes); + if (!ptr) + continue; + + ptr = ptr + (unsigned long)st; + + switch (st->entry[i].type) { + case UV_SYSTAB_TYPE_GAM_PARAMS: + decode_gam_params(ptr); + break; + + case UV_SYSTAB_TYPE_GAM_RNG_TBL: + decode_gam_rng_tbl(ptr); + break; + } + } +} - bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); - uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); - BUG_ON(!uv_cpu_to_blade); - memset(uv_cpu_to_blade, 255, bytes); +/* + * Setup physical blade translations from UVH_NODE_PRESENT_TABLE + * .. NB: UVH_NODE_PRESENT_TABLE is going away, + * .. being replaced by GAM Range Table + */ +static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info) +{ + int i, uv_pb = 0; - blade = 0; + pr_info("UV: NODE_PRESENT_DEPTH = %d\n", UVH_NODE_PRESENT_TABLE_DEPTH); for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { - present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); - for (j = 0; j < 64; j++) { - if (!test_bit(j, &present)) - continue; - pnode = (i * 64 + j) & pnode_mask; - uv_blade_info[blade].pnode = pnode; - uv_blade_info[blade].nr_possible_cpus = 0; - uv_blade_info[blade].nr_online_cpus = 0; - spin_lock_init(&uv_blade_info[blade].nmi_lock); - min_pnode = min(pnode, min_pnode); - max_pnode = max(pnode, max_pnode); - blade++; + unsigned long np; + + np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); + if (np) + pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np); + + uv_pb += hweight64(np); + } + if (uv_possible_blades != uv_pb) + uv_possible_blades = uv_pb; +} + +static void __init build_socket_tables(void) +{ + struct uv_gam_range_entry *gre = uv_gre_table; + int num, nump; + int cpu, i, lnid; + int minsock = _min_socket; + int maxsock = _max_socket; + int minpnode = _min_pnode; + int maxpnode = _max_pnode; + size_t bytes; + + if (!gre) { + if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) { + pr_info("UV: No UVsystab socket table, ignoring\n"); + return; /* not required */ } + pr_crit( + "UV: Error: UVsystab address translations not available!\n"); + BUG(); + } + + /* build socket id -> node id, pnode */ + num = maxsock - minsock + 1; + bytes = num * sizeof(_socket_to_node[0]); + _socket_to_node = kmalloc(bytes, GFP_KERNEL); + _socket_to_pnode = kmalloc(bytes, GFP_KERNEL); + + nump = maxpnode - minpnode + 1; + bytes = nump * sizeof(_pnode_to_socket[0]); + _pnode_to_socket = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!_socket_to_node || !_socket_to_pnode || !_pnode_to_socket); + + for (i = 0; i < num; i++) + _socket_to_node[i] = _socket_to_pnode[i] = SOCK_EMPTY; + + for (i = 0; i < nump; i++) + _pnode_to_socket[i] = SOCK_EMPTY; + + /* fill in pnode/node/addr conversion list values */ + pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n"); + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) + continue; + i = gre->sockid - minsock; + if (_socket_to_pnode[i] != SOCK_EMPTY) + continue; /* duplicate */ + _socket_to_pnode[i] = gre->pnode; + _socket_to_node[i] = gre->pxm; + + i = gre->pnode - minpnode; + _pnode_to_socket[i] = gre->sockid; + + pr_info( + "UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n", + gre->sockid, gre->type, gre->nasid, + _socket_to_pnode[gre->sockid - minsock], + _socket_to_node[gre->sockid - minsock], + _pnode_to_socket[gre->pnode - minpnode]); } - uv_bios_init(); + /* check socket -> node values */ + lnid = -1; + for_each_present_cpu(cpu) { + int nid = cpu_to_node(cpu); + int apicid, sockid; + + if (lnid == nid) + continue; + lnid = nid; + apicid = per_cpu(x86_cpu_to_apicid, cpu); + sockid = apicid >> uv_cpuid.socketid_shift; + i = sockid - minsock; + + if (nid != _socket_to_node[i]) { + pr_warn( + "UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n", + i, sockid, gre->type, _socket_to_node[i], nid); + _socket_to_node[i] = nid; + } + } + + /* Setup physical blade to pnode translation from GAM Range Table */ + bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]); + _node_to_pnode = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!_node_to_pnode); + + for (lnid = 0; lnid < num_possible_nodes(); lnid++) { + unsigned short sockid; + + for (sockid = minsock; sockid <= maxsock; sockid++) { + if (lnid == _socket_to_node[sockid - minsock]) { + _node_to_pnode[lnid] = + _socket_to_pnode[sockid - minsock]; + break; + } + } + if (sockid > maxsock) { + pr_err("UV: socket for node %d not found!\n", lnid); + BUG(); + } + } + + /* + * If socket id == pnode or socket id == node for all nodes, + * system runs faster by removing corresponding conversion table. + */ + pr_info("UV: Checking socket->node/pnode for identity maps\n"); + if (minsock == 0) { + for (i = 0; i < num; i++) + if (_socket_to_node[i] == SOCK_EMPTY || + i != _socket_to_node[i]) + break; + if (i >= num) { + kfree(_socket_to_node); + _socket_to_node = NULL; + pr_info("UV: 1:1 socket_to_node table removed\n"); + } + } + if (minsock == minpnode) { + for (i = 0; i < num; i++) + if (_socket_to_pnode[i] != SOCK_EMPTY && + _socket_to_pnode[i] != i + minpnode) + break; + if (i >= num) { + kfree(_socket_to_pnode); + _socket_to_pnode = NULL; + pr_info("UV: 1:1 socket_to_pnode table removed\n"); + } + } +} + +void __init uv_system_init(void) +{ + struct uv_hub_info_s hub_info = {0}; + int bytes, cpu, nodeid; + unsigned short min_pnode = 9999, max_pnode = 0; + char *hub = is_uv4_hub() ? "UV400" : + is_uv3_hub() ? "UV300" : + is_uv2_hub() ? "UV2000/3000" : + is_uv1_hub() ? "UV100/1000" : NULL; + + if (!hub) { + pr_err("UV: Unknown/unsupported UV hub\n"); + return; + } + pr_info("UV: Found %s hub\n", hub); + + map_low_mmrs(); + + uv_bios_init(); /* get uv_systab for decoding */ + decode_uv_systab(); + build_socket_tables(); + build_uv_gr_table(); + uv_init_hub_info(&hub_info); + uv_possible_blades = num_possible_nodes(); + if (!_node_to_pnode) + boot_init_possible_blades(&hub_info); + + /* uv_num_possible_blades() is really the hub count */ + pr_info("UV: Found %d hubs, %d nodes, %d cpus\n", + uv_num_possible_blades(), + num_possible_nodes(), + num_possible_cpus()); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number); + hub_info.coherency_domain_number = sn_coherency_id; uv_rtc_init(); - for_each_present_cpu(cpu) { - int apicid = per_cpu(x86_cpu_to_apicid, cpu); + bytes = sizeof(void *) * uv_num_possible_blades(); + __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL); + BUG_ON(!__uv_hub_info_list); - nid = cpu_to_node(cpu); - /* - * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); - */ - uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; - uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; - uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision; + bytes = sizeof(struct uv_hub_info_s); + for_each_node(nodeid) { + struct uv_hub_info_s *new_hub; - uv_cpu_hub_info(cpu)->m_shift = 64 - m_val; - uv_cpu_hub_info(cpu)->n_lshift = n_lshift; + if (__uv_hub_info_list[nodeid]) { + pr_err("UV: Node %d UV HUB already initialized!?\n", + nodeid); + BUG(); + } + + /* Allocate new per hub info list */ + new_hub = (nodeid == 0) ? + &uv_hub_info_node0 : + kzalloc_node(bytes, GFP_KERNEL, nodeid); + BUG_ON(!new_hub); + __uv_hub_info_list[nodeid] = new_hub; + new_hub = uv_hub_info_list(nodeid); + BUG_ON(!new_hub); + *new_hub = hub_info; + + /* Use information from GAM table if available */ + if (_node_to_pnode) + new_hub->pnode = _node_to_pnode[nodeid]; + else /* Fill in during cpu loop */ + new_hub->pnode = 0xffff; + new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); + new_hub->memory_nid = -1; + new_hub->nr_possible_cpus = 0; + new_hub->nr_online_cpus = 0; + } + /* Initialize per cpu info */ + for_each_possible_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + int numa_node_id; + unsigned short pnode; + + nodeid = cpu_to_node(cpu); + numa_node_id = numa_cpu_node(cpu); pnode = uv_apicid_to_pnode(apicid); - blade = boot_pnode_to_blade(pnode); - lcpu = uv_blade_info[blade].nr_possible_cpus; - uv_blade_info[blade].nr_possible_cpus++; - - /* Any node on the blade, else will contain -1. */ - uv_blade_info[blade].memory_nid = nid; - - uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; - uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; - uv_cpu_hub_info(cpu)->m_val = m_val; - uv_cpu_hub_info(cpu)->n_val = n_val; - uv_cpu_hub_info(cpu)->numa_blade_id = blade; - uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; - uv_cpu_hub_info(cpu)->pnode = pnode; - uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; - uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; - uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; - uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; - uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); - uv_node_to_blade[nid] = blade; - uv_cpu_to_blade[cpu] = blade; + + uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); + uv_cpu_info_per(cpu)->blade_cpu_id = + uv_cpu_hub_info(cpu)->nr_possible_cpus++; + if (uv_cpu_hub_info(cpu)->memory_nid == -1) + uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); + if (nodeid != numa_node_id && /* init memoryless node */ + uv_hub_info_list(numa_node_id)->pnode == 0xffff) + uv_hub_info_list(numa_node_id)->pnode = pnode; + else if (uv_cpu_hub_info(cpu)->pnode == 0xffff) + uv_cpu_hub_info(cpu)->pnode = pnode; + uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid); } - /* Add blade/pnode info for nodes without cpus */ - for_each_online_node(nid) { - if (uv_node_to_blade[nid] >= 0) - continue; - paddr = node_start_pfn(nid) << PAGE_SHIFT; - pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); - blade = boot_pnode_to_blade(pnode); - uv_node_to_blade[nid] = blade; + for_each_node(nodeid) { + unsigned short pnode = uv_hub_info_list(nodeid)->pnode; + + /* Add pnode info for pre-GAM list nodes without cpus */ + if (pnode == 0xffff) { + unsigned long paddr; + + paddr = node_start_pfn(nodeid) << PAGE_SHIFT; + pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); + uv_hub_info_list(nodeid)->pnode = pnode; + } + min_pnode = min(pnode, min_pnode); + max_pnode = max(pnode, max_pnode); + pr_info("UV: UVHUB node:%2d pn:%02x nrcpus:%d\n", + nodeid, + uv_hub_info_list(nodeid)->pnode, + uv_hub_info_list(nodeid)->nr_possible_cpus); } + pr_info("UV: min_pnode:%02x max_pnode:%02x\n", min_pnode, max_pnode); map_gru_high(max_pnode); map_mmr_high(max_pnode); map_mmioh_high(min_pnode, max_pnode); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 9307f182fe3049f8e4f1c3d38bb9b1ebb3008d26..c7364bd633e1d8c1a346c69534ded295bc2ba48d 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -2267,7 +2267,7 @@ static int __init apm_init(void) dmi_check_system(apm_dmi_table); - if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { + if (apm_info.bios.version == 0 || machine_is_olpc()) { printk(KERN_INFO "apm: BIOS not found.\n"); return -ENODEV; } diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 5c042466f274d0f79dac84bba1c6835953565c84..674134e9f5e518c3662de5cc369d9a5092e921ac 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -80,6 +80,7 @@ void common(void) { OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); + OFFSET(BP_init_size, boot_params, hdr.init_size); OFFSET(BP_pref_address, boot_params, hdr.pref_address); OFFSET(BP_code32_start, boot_params, hdr.code32_start); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7b76eb67a9b3dcb84bb8e6cd6e40945924d32938..c343a54bed396d2f924d4ed6d8e5ec2188c636b0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -565,14 +565,17 @@ static void early_init_amd(struct cpuinfo_x86 *c) * can safely set X86_FEATURE_EXTD_APICID unconditionally for families * after 16h. */ - if (cpu_has_apic && c->x86 > 0x16) { - set_cpu_cap(c, X86_FEATURE_EXTD_APICID); - } else if (cpu_has_apic && c->x86 >= 0xf) { - /* check CPU config space for extended APIC ID */ - unsigned int val; - val = read_pci_config(0, 24, 0, 0x68); - if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) + if (boot_cpu_has(X86_FEATURE_APIC)) { + if (c->x86 > 0x16) set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + else if (c->x86 >= 0xf) { + /* check CPU config space for extended APIC ID */ + unsigned int val; + + val = read_pci_config(0, 24, 0, 0x68); + if ((val >> 17 & 0x3) == 0x3) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } } #endif @@ -628,6 +631,7 @@ static void init_amd_k8(struct cpuinfo_x86 *c) */ msr_set_bit(MSR_K7_HWCR, 6); #endif + set_cpu_bug(c, X86_BUG_SWAPGS_FENCE); } static void init_amd_gh(struct cpuinfo_x86 *c) @@ -746,7 +750,7 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); - if (cpu_has_xmm2) { + if (cpu_has(c, X86_FEATURE_XMM2)) { /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 62ff5255ae16fb1436a94d7113aa3bb2deae8acd..0fe6953f421c9b92b72c331600f435e623c3e264 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -437,7 +437,7 @@ void load_percpu_segment(int cpu) #ifdef CONFIG_X86_32 loadsegment(fs, __KERNEL_PERCPU); #else - loadsegment(gs, 0); + __loadsegment_simple(gs, 0); wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); #endif load_stack_canary_segment(); @@ -724,6 +724,13 @@ void get_cpu_cap(struct cpuinfo_x86 *c) } } + if (c->extended_cpuid_level >= 0x80000007) { + cpuid(0x80000007, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_8000_0007_EBX] = ebx; + c->x86_power = edx; + } + if (c->extended_cpuid_level >= 0x80000008) { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); @@ -736,9 +743,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_phys_bits = 36; #endif - if (c->extended_cpuid_level >= 0x80000007) - c->x86_power = cpuid_edx(0x80000007); - if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); @@ -869,30 +873,34 @@ static void detect_nopl(struct cpuinfo_x86 *c) #else set_cpu_cap(c, X86_FEATURE_NOPL); #endif +} +static void detect_null_seg_behavior(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 /* - * ESPFIX is a strange bug. All real CPUs have it. Paravirt - * systems that run Linux at CPL > 0 may or may not have the - * issue, but, even if they have the issue, there's absolutely - * nothing we can do about it because we can't use the real IRET - * instruction. + * Empirically, writing zero to a segment selector on AMD does + * not clear the base, whereas writing zero to a segment + * selector on Intel does clear the base. Intel's behavior + * allows slightly faster context switches in the common case + * where GS is unused by the prev and next threads. * - * NB: For the time being, only 32-bit kernels support - * X86_BUG_ESPFIX as such. 64-bit kernels directly choose - * whether to apply espfix using paravirt hooks. If any - * non-paravirt system ever shows up that does *not* have the - * ESPFIX issue, we can change this. + * Since neither vendor documents this anywhere that I can see, + * detect it directly instead of hardcoding the choice by + * vendor. + * + * I've designated AMD's behavior as the "bug" because it's + * counterintuitive and less friendly. */ -#ifdef CONFIG_X86_32 -#ifdef CONFIG_PARAVIRT - do { - extern void native_iret(void); - if (pv_cpu_ops.iret == native_iret) - set_cpu_bug(c, X86_BUG_ESPFIX); - } while (0); -#else - set_cpu_bug(c, X86_BUG_ESPFIX); -#endif + + unsigned long old_base, tmp; + rdmsrl(MSR_FS_BASE, old_base); + wrmsrl(MSR_FS_BASE, 1); + loadsegment(fs, 0); + rdmsrl(MSR_FS_BASE, tmp); + if (tmp != 0) + set_cpu_bug(c, X86_BUG_NULL_SEG); + wrmsrl(MSR_FS_BASE, old_base); #endif } @@ -928,6 +936,33 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ detect_nopl(c); + + detect_null_seg_behavior(c); + + /* + * ESPFIX is a strange bug. All real CPUs have it. Paravirt + * systems that run Linux at CPL > 0 may or may not have the + * issue, but, even if they have the issue, there's absolutely + * nothing we can do about it because we can't use the real IRET + * instruction. + * + * NB: For the time being, only 32-bit kernels support + * X86_BUG_ESPFIX as such. 64-bit kernels directly choose + * whether to apply espfix using paravirt hooks. If any + * non-paravirt system ever shows up that does *not* have the + * ESPFIX issue, we can change this. + */ +#ifdef CONFIG_X86_32 +# ifdef CONFIG_PARAVIRT + do { + extern void native_iret(void); + if (pv_cpu_ops.iret == native_iret) + set_cpu_bug(c, X86_BUG_ESPFIX); + } while (0); +# else + set_cpu_bug(c, X86_BUG_ESPFIX); +# endif +#endif } static void x86_init_cache_qos(struct cpuinfo_x86 *c) @@ -1083,12 +1118,12 @@ void enable_sep_cpu(void) struct tss_struct *tss; int cpu; + if (!boot_cpu_has(X86_FEATURE_SEP)) + return; + cpu = get_cpu(); tss = &per_cpu(cpu_tss, cpu); - if (!boot_cpu_has(X86_FEATURE_SEP)) - goto out; - /* * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- * see the big comment in struct x86_hw_tss's definition. @@ -1103,7 +1138,6 @@ void enable_sep_cpu(void) wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); -out: put_cpu(); } #endif @@ -1535,7 +1569,7 @@ void cpu_init(void) pr_info("Initializing CPU#%d\n", cpu); if (cpu_feature_enabled(X86_FEATURE_VME) || - cpu_has_tsc || + boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 6adef9cac23ee99c96924e2789abeb2d1ad123f3..bd9dcd6b712d0c09937facb3cdd41da866edae81 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -333,7 +333,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) switch (dir0_lsn) { case 0xd: /* either a 486SLC or DLC w/o DEVID */ dir0_msn = 0; - p = Cx486_name[(cpu_has_fpu ? 1 : 0)]; + p = Cx486_name[!!boot_cpu_has(X86_FEATURE_FPU)]; break; case 0xe: /* a 486S A step */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b47df99dc5d21396e142c5551beb7c6d87d218af..6e2ffbebbcdbd053a66ee5dbfa7e421c4281b322 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -187,9 +187,9 @@ static void early_init_intel(struct cpuinfo_x86 *c) * the TLB when any changes are made to any of the page table entries. * The operating system must reload CR3 to cause the TLB to be flushed" * - * As a result cpu_has_pge() in arch/x86/include/asm/tlbflush.h should - * be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE - * to be modified + * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h + * should be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE + * to be modified. */ if (c->x86 == 5 && c->x86_model == 9) { pr_info("Disabling PGE capability bit\n"); @@ -270,7 +270,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * The Quark is also family 5, but does not have the same bug. */ clear_cpu_bug(c, X86_BUG_F00F); - if (!paravirt_enabled() && c->x86 == 5 && c->x86_model < 9) { + if (c->x86 == 5 && c->x86_model < 9) { static int f00f_workaround_enabled; set_cpu_bug(c, X86_BUG_F00F); @@ -318,7 +318,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * integrated APIC (see 11AP erratum in "Pentium Processor * Specification Update"). */ - if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && + if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 && (c->x86_mask < 0x6 || c->x86_mask == 0xb)) set_cpu_bug(c, X86_BUG_11AP); @@ -493,7 +493,7 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } - if (cpu_has_xmm2) + if (cpu_has(c, X86_FEATURE_XMM2)) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (boot_cpu_has(X86_FEATURE_DS)) { @@ -505,7 +505,7 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_PEBS); } - if (c->x86 == 6 && cpu_has_clflush && + if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) && (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 2658e2af74ec4c3f433b9958b4498d7f7d603fed..93d824ec3120ebfab0110f53a907e71a09c999dc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -26,6 +26,52 @@ static struct gen_pool *mce_evt_pool; static LLIST_HEAD(mce_event_llist); static char gen_pool_buf[MCE_POOLSZ]; +/* + * Compare the record "t" with each of the records on list "l" to see if + * an equivalent one is present in the list. + */ +static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) +{ + struct mce_evt_llist *node; + struct mce *m1, *m2; + + m1 = &t->mce; + + llist_for_each_entry(node, &l->llnode, llnode) { + m2 = &node->mce; + + if (!mce_cmp(m1, m2)) + return true; + } + return false; +} + +/* + * The system has panicked - we'd like to peruse the list of MCE records + * that have been queued, but not seen by anyone yet. The list is in + * reverse time order, so we need to reverse it. While doing that we can + * also drop duplicate records (these were logged because some banks are + * shared between cores or by all threads on a socket). + */ +struct llist_node *mce_gen_pool_prepare_records(void) +{ + struct llist_node *head; + LLIST_HEAD(new_head); + struct mce_evt_llist *node, *t; + + head = llist_del_all(&mce_event_llist); + if (!head) + return NULL; + + /* squeeze out duplicates while reversing order */ + llist_for_each_entry_safe(node, t, head, llnode) { + if (!is_duplicate_mce_record(node, t)) + llist_add(&node->llnode, &new_head); + } + + return new_head.first; +} + void mce_gen_pool_process(void) { struct llist_node *head; diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 547720efd923f0f64c5ce30ef2dff9b6cccf7805..cd74a3f00aea8185c4e99eec72fb01bbaffb7306 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -35,6 +35,7 @@ void mce_gen_pool_process(void); bool mce_gen_pool_empty(void); int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); +struct llist_node *mce_gen_pool_prepare_records(void); extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); @@ -81,3 +82,17 @@ static inline int apei_clear_mce(u64 record_id) #endif void mce_inject_log(struct mce *m); + +/* + * We consider records to be equivalent if bank+status+addr+misc all match. + * This is only used when the system is going down because of a fatal error + * to avoid cluttering the console log with essentially repeated information. + * In normal processing all errors seen are logged. + */ +static inline bool mce_cmp(struct mce *m1, struct mce *m2) +{ + return m1->bank != m2->bank || + m1->status != m2->status || + m1->addr != m2->addr || + m1->misc != m2->misc; +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 5119766d988925a4c8eb9df23a3ff7b1b626d174..631356c8cca4a552c042629c9a2a6744a3512ec9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -204,6 +204,33 @@ static int error_context(struct mce *m) return IN_KERNEL; } +static int mce_severity_amd_smca(struct mce *m, int err_ctx) +{ + u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + u32 low, high; + + /* + * We need to look at the following bits: + * - "succor" bit (data poisoning support), and + * - TCC bit (Task Context Corrupt) + * in MCi_STATUS to determine error severity. + */ + if (!mce_flags.succor) + return MCE_PANIC_SEVERITY; + + if (rdmsr_safe(addr, &low, &high)) + return MCE_PANIC_SEVERITY; + + /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */ + if ((low & MCI_CONFIG_MCAX) && + (m->status & MCI_STATUS_TCC) && + (err_ctx == IN_KERNEL)) + return MCE_PANIC_SEVERITY; + + /* ...otherwise invoke hwpoison handler. */ + return MCE_AR_SEVERITY; +} + /* * See AMD Error Scope Hierarchy table in a newer BKDG. For example * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" @@ -225,6 +252,9 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc * to at least kill process to prolong system operation. */ if (mce_flags.overflow_recov) { + if (mce_flags.smca) + return mce_severity_amd_smca(m, ctx); + /* software can try to contain */ if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL)) return MCE_PANIC_SEVERITY; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f0c921b03e4245e1f7a4b16687c0d9e7600b4256..92e5e37d97bf4b808a7e1824ad4fd573ccf075e1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -161,7 +161,6 @@ void mce_log(struct mce *mce) if (!mce_gen_pool_add(mce)) irq_work_queue(&mce_irq_work); - mce->finished = 0; wmb(); for (;;) { entry = mce_log_get_idx_check(mcelog.next); @@ -194,7 +193,6 @@ void mce_log(struct mce *mce) mcelog.entry[entry].finished = 1; wmb(); - mce->finished = 1; set_bit(0, &mce_need_notify); } @@ -224,6 +222,53 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); +static inline u32 ctl_reg(int bank) +{ + return MSR_IA32_MCx_CTL(bank); +} + +static inline u32 status_reg(int bank) +{ + return MSR_IA32_MCx_STATUS(bank); +} + +static inline u32 addr_reg(int bank) +{ + return MSR_IA32_MCx_ADDR(bank); +} + +static inline u32 misc_reg(int bank) +{ + return MSR_IA32_MCx_MISC(bank); +} + +static inline u32 smca_ctl_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_CTL(bank); +} + +static inline u32 smca_status_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_STATUS(bank); +} + +static inline u32 smca_addr_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_ADDR(bank); +} + +static inline u32 smca_misc_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_MISC(bank); +} + +struct mca_msr_regs msr_ops = { + .ctl = ctl_reg, + .status = status_reg, + .addr = addr_reg, + .misc = misc_reg +}; + static void print_mce(struct mce *m) { int ret = 0; @@ -290,7 +335,9 @@ static void wait_for_panic(void) static void mce_panic(const char *msg, struct mce *final, char *exp) { - int i, apei_err = 0; + int apei_err = 0; + struct llist_node *pending; + struct mce_evt_llist *l; if (!fake_panic) { /* @@ -307,11 +354,10 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) if (atomic_inc_return(&mce_fake_panicked) > 1) return; } + pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ - for (i = 0; i < MCE_LOG_LEN; i++) { - struct mce *m = &mcelog.entry[i]; - if (!(m->status & MCI_STATUS_VAL)) - continue; + llist_for_each_entry(l, pending, llnode) { + struct mce *m = &l->mce; if (!(m->status & MCI_STATUS_UC)) { print_mce(m); if (!apei_err) @@ -319,13 +365,11 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) } } /* Now print uncorrected but with the final one last */ - for (i = 0; i < MCE_LOG_LEN; i++) { - struct mce *m = &mcelog.entry[i]; - if (!(m->status & MCI_STATUS_VAL)) - continue; + llist_for_each_entry(l, pending, llnode) { + struct mce *m = &l->mce; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || memcmp(m, final, sizeof(struct mce))) { + if (!final || mce_cmp(m, final)) { print_mce(m); if (!apei_err) apei_err = apei_write_mce(m); @@ -356,11 +400,11 @@ static int msr_to_offset(u32 msr) if (msr == mca_cfg.rip_msr) return offsetof(struct mce, ip); - if (msr == MSR_IA32_MCx_STATUS(bank)) + if (msr == msr_ops.status(bank)) return offsetof(struct mce, status); - if (msr == MSR_IA32_MCx_ADDR(bank)) + if (msr == msr_ops.addr(bank)) return offsetof(struct mce, addr); - if (msr == MSR_IA32_MCx_MISC(bank)) + if (msr == msr_ops.misc(bank)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); @@ -523,9 +567,9 @@ static struct notifier_block mce_srao_nb = { static void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); + m->misc = mce_rdmsrl(msr_ops.misc(i)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + m->addr = mce_rdmsrl(msr_ops.addr(i)); /* * Mask the reported address by the reported granularity. @@ -607,7 +651,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); + m.status = mce_rdmsrl(msr_ops.status(i)); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -654,7 +698,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. */ - mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); + mce_wrmsrl(msr_ops.status(i), 0); } /* @@ -679,7 +723,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, char *tmp; for (i = 0; i < mca_cfg.banks; i++) { - m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); + m->status = mce_rdmsrl(msr_ops.status(i)); if (m->status & MCI_STATUS_VAL) { __set_bit(i, validp); if (quirk_no_way_out) @@ -830,9 +874,9 @@ static int mce_start(int *no_way_out) atomic_add(*no_way_out, &global_nwo); /* - * global_nwo should be updated before mce_callin + * Rely on the implied barrier below, such that global_nwo + * is updated before mce_callin. */ - smp_wmb(); order = atomic_inc_return(&mce_callin); /* @@ -957,7 +1001,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < mca_cfg.banks; i++) { if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); + mce_wrmsrl(msr_ops.status(i), 0); } } @@ -994,11 +1038,12 @@ void do_machine_check(struct pt_regs *regs, long error_code) int i; int worst = 0; int severity; + /* * Establish sequential order between the CPUs entering the machine * check handler. */ - int order; + int order = -1; /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. @@ -1012,7 +1057,12 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(toclear, MAX_NR_BANKS); DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - int lmce = 0; + + /* + * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES + * on Intel. + */ + int lmce = 1; /* If this CPU is offline, just bail out. */ if (cpu_is_offline(smp_processor_id())) { @@ -1051,19 +1101,20 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; /* - * Check if this MCE is signaled to only this logical processor + * Check if this MCE is signaled to only this logical processor, + * on Intel only. */ - if (m.mcgstatus & MCG_STATUS_LMCES) - lmce = 1; - else { - /* - * Go through all the banks in exclusion of the other CPUs. - * This way we don't report duplicated events on shared banks - * because the first one to see it will clear it. - * If this is a Local MCE, then no need to perform rendezvous. - */ + if (m.cpuvendor == X86_VENDOR_INTEL) + lmce = m.mcgstatus & MCG_STATUS_LMCES; + + /* + * Go through all banks in exclusion of the other CPUs. This way we + * don't report duplicated events on shared banks because the first one + * to see it will clear it. If this is a Local MCE, then no need to + * perform rendezvous. + */ + if (!lmce) order = mce_start(&no_way_out); - } for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); @@ -1076,7 +1127,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) m.addr = 0; m.bank = i; - m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); + m.status = mce_rdmsrl(msr_ops.status(i)); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -1420,7 +1471,6 @@ static void __mcheck_cpu_init_generic(void) enum mcp_flags m_fl = 0; mce_banks_t all_banks; u64 cap; - int i; if (!mca_cfg.bootlog) m_fl = MCP_DONTLOG; @@ -1436,14 +1486,19 @@ static void __mcheck_cpu_init_generic(void) rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); +} + +static void __mcheck_cpu_init_clear_banks(void) +{ + int i; for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (!b->init) continue; - wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); - wrmsrl(MSR_IA32_MCx_STATUS(i), 0); + wrmsrl(msr_ops.ctl(i), b->ctl); + wrmsrl(msr_ops.status(i), 0); } } @@ -1495,7 +1550,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } - if (c->x86 <= 17 && cfg->bootlog < 0) { + if (c->x86 < 17 && cfg->bootlog < 0) { /* * Lots of broken BIOS around that don't clear them * by default and leave crap in there. Don't log: @@ -1628,11 +1683,19 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) break; case X86_VENDOR_AMD: { - u32 ebx = cpuid_ebx(0x80000007); + mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); + mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - mce_flags.overflow_recov = !!(ebx & BIT(0)); - mce_flags.succor = !!(ebx & BIT(1)); - mce_flags.smca = !!(ebx & BIT(3)); + /* + * Install proper ops for Scalable MCA enabled processors + */ + if (mce_flags.smca) { + msr_ops.ctl = smca_ctl_reg; + msr_ops.status = smca_status_reg; + msr_ops.addr = smca_addr_reg; + msr_ops.misc = smca_misc_reg; + } mce_amd_feature_init(c); break; @@ -1717,6 +1780,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_clear_banks(); __mcheck_cpu_init_timer(); } @@ -2082,7 +2146,7 @@ static void mce_disable_error_reporting(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MCx_CTL(i), 0); + wrmsrl(msr_ops.ctl(i), 0); } return; } @@ -2121,6 +2185,7 @@ static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); + __mcheck_cpu_init_clear_banks(); } static struct syscore_ops mce_syscore_ops = { @@ -2138,6 +2203,7 @@ static void mce_cpu_restart(void *data) if (!mce_available(raw_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); + __mcheck_cpu_init_clear_banks(); __mcheck_cpu_init_timer(); } @@ -2413,7 +2479,7 @@ static void mce_reenable_cpu(void *h) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + wrmsrl(msr_ops.ctl(i), b->ctl); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9d656fd436efd6c90ad148a3cfbbd673248000f5..10b0661651e0cea853164f0510e88bb0909c1de9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -54,14 +54,6 @@ /* Threshold LVT offset is at MSR0xC0000410[15:12] */ #define SMCA_THR_LVT_OFF 0xF000 -/* - * OS is required to set the MCAX bit to acknowledge that it is now using the - * new MSR ranges and new registers under each bank. It also means that the OS - * will configure deferred errors in the new MCx_CONFIG register. If the bit is - * not set, uncorrectable errors will cause a system panic. - */ -#define SMCA_MCAX_EN_OFF 0x1 - static const char * const th_names[] = { "load_store", "insn_fetch", @@ -333,7 +325,7 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, /* Fall back to method we used for older processors: */ switch (block) { case 0: - addr = MSR_IA32_MCx_MISC(bank); + addr = msr_ops.misc(bank); break; case 1: offset = ((low & MASK_BLKPTR_LO) >> 21); @@ -351,6 +343,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, int offset, u32 misc_high) { unsigned int cpu = smp_processor_id(); + u32 smca_low, smca_high, smca_addr; struct threshold_block b; int new; @@ -369,24 +362,49 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, b.interrupt_enable = 1; - if (mce_flags.smca) { - u32 smca_low, smca_high; - u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); + if (!mce_flags.smca) { + new = (misc_high & MASK_LVTOFF_HI) >> 20; + goto set_offset; + } - if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { - smca_high |= SMCA_MCAX_EN_OFF; - wrmsr(smca_addr, smca_low, smca_high); - } + smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); - /* Gather LVT offset for thresholding: */ - if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) - goto out; + if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { + /* + * OS is required to set the MCAX bit to acknowledge that it is + * now using the new MSR ranges and new registers under each + * bank. It also means that the OS will configure deferred + * errors in the new MCx_CONFIG register. If the bit is not set, + * uncorrectable errors will cause a system panic. + * + * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) + */ + smca_high |= BIT(0); - new = (smca_low & SMCA_THR_LVT_OFF) >> 12; - } else { - new = (misc_high & MASK_LVTOFF_HI) >> 20; + /* + * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR} + * registers with the option of additionally logging to + * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set. + * + * This bit is usually set by BIOS to retain the old behavior + * for OSes that don't use the new registers. Linux supports the + * new registers so let's disable that additional logging here. + * + * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high + * portion of the MSR). + */ + smca_high &= ~BIT(2); + + wrmsr(smca_addr, smca_low, smca_high); } + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + goto out; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + +set_offset: offset = setup_APIC_mce_threshold(offset, new); if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) @@ -430,12 +448,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } -static void __log_error(unsigned int bank, bool threshold_err, u64 misc) +static void +__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) { + u32 msr_status = msr_ops.status(bank); + u32 msr_addr = msr_ops.addr(bank); struct mce m; u64 status; - rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + WARN_ON_ONCE(deferred_err && threshold_err); + + if (deferred_err && mce_flags.smca) { + msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank); + msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank); + } + + rdmsrl(msr_status, status); + if (!(status & MCI_STATUS_VAL)) return; @@ -448,10 +477,11 @@ static void __log_error(unsigned int bank, bool threshold_err, u64 misc) m.misc = misc; if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr); + rdmsrl(msr_addr, m.addr); mce_log(&m); - wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); + + wrmsrl(msr_status, 0); } static inline void __smp_deferred_error_interrupt(void) @@ -479,17 +509,21 @@ asmlinkage __visible void smp_trace_deferred_error_interrupt(void) /* APIC interrupt handler for deferred errors */ static void amd_deferred_error_interrupt(void) { - u64 status; unsigned int bank; + u32 msr_status; + u64 status; for (bank = 0; bank < mca_cfg.banks; ++bank) { - rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank) + : msr_ops.status(bank); + + rdmsrl(msr_status, status); if (!(status & MCI_STATUS_VAL) || !(status & MCI_STATUS_DEFERRED)) continue; - __log_error(bank, false, 0); + __log_error(bank, true, false, 0); break; } } @@ -544,7 +578,7 @@ static void amd_threshold_interrupt(void) return; log: - __log_error(bank, true, ((u64)high << 32) | low); + __log_error(bank, false, true, ((u64)high << 32) | low); } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 1e8bb6c94f14c0e639339a518f2c8949cf7143ca..1defb8ea882c09033461728596f5e65aa73fba51 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -84,7 +84,7 @@ static int cmci_supported(int *banks) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (!cpu_has_apic || lapic_get_maxlvt() < 6) + if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) return 0; rdmsrl(MSR_IA32_MCG_CAP, cap); *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index ac780cad3b8601db3bcc2e174f69dc9ca3f00473..6b9dc4d18cccd1cd8fa6cf070b618bc878ad5a05 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -450,7 +450,7 @@ asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) /* Thermal monitoring depends on APIC, ACPI and clock modulation */ static int intel_thermal_supported(struct cpuinfo_x86 *c) { - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return 0; if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index f8c81ba0b4651c02cd31f9a1117e64192e64e1d0..b1086f79e57e44858105feb91d7f015caeaec18f 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -137,7 +137,7 @@ static void prepare_set(void) u32 cr0; /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { + if (boot_cpu_has(X86_FEATURE_PGE)) { cr4 = __read_cr4(); __write_cr4(cr4 & ~X86_CR4_PGE); } @@ -170,7 +170,7 @@ static void post_set(void) write_cr0(read_cr0() & ~X86_CR0_CD); /* Restore value of CR4 */ - if (cpu_has_pge) + if (boot_cpu_has(X86_FEATURE_PGE)) __write_cr4(cr4); } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 19f57360dfd2583b82743c8cb69ffaef7175139b..16e37a2581acd51dc00249c17918765bd2c97bd6 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -444,11 +444,24 @@ static void __init print_mtrr_state(void) pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } +/* PAT setup for BP. We need to go through sync steps here */ +void __init mtrr_bp_pat_init(void) +{ + unsigned long flags; + + local_irq_save(flags); + prepare_set(); + + pat_init(); + + post_set(); + local_irq_restore(flags); +} + /* Grab all of the MTRR state for this CPU into *state */ bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; - unsigned long flags; unsigned lo, dummy; unsigned int i; @@ -481,15 +494,6 @@ bool __init get_mtrr_state(void) mtrr_state_set = 1; - /* PAT setup for BP. We need to go through sync steps here */ - local_irq_save(flags); - prepare_set(); - - pat_init(); - - post_set(); - local_irq_restore(flags); - return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } @@ -741,7 +745,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) wbinvd(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { + if (boot_cpu_has(X86_FEATURE_PGE)) { cr4 = __read_cr4(); __write_cr4(cr4 & ~X86_CR4_PGE); } @@ -771,7 +775,7 @@ static void post_set(void) __releases(set_atomicity_lock) write_cr0(read_cr0() & ~X86_CR0_CD); /* Restore value of CR4 */ - if (cpu_has_pge) + if (boot_cpu_has(X86_FEATURE_PGE)) __write_cr4(cr4); raw_spin_unlock(&set_atomicity_lock); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 10f8d4796240709cea61c0e56522d75eeff8ffdd..7d393ecdeee692187b726c003532e1124faa02d4 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -752,6 +752,9 @@ void __init mtrr_bp_init(void) /* BIOS may override */ __mtrr_enabled = get_mtrr_state(); + if (mtrr_enabled()) + mtrr_bp_pat_init(); + if (mtrr_cleanup(phys_addr)) { changed_by_mtrr_cleanup = 1; mtrr_if->set_all(); @@ -759,8 +762,16 @@ void __init mtrr_bp_init(void) } } - if (!mtrr_enabled()) + if (!mtrr_enabled()) { pr_info("MTRR: Disabled\n"); + + /* + * PAT initialization relies on MTRR's rendezvous handler. + * Skip PAT init until the handler can initialize both + * features independently. + */ + pat_disable("MTRRs disabled, skipping PAT initialization too."); + } } void mtrr_ap_init(void) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 951884dcc43354573c2bd234aed3fd3adb067a84..6c7ced07d16d1181c6ef21f4f2252ef63019a77b 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -52,6 +52,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); bool get_mtrr_state(void); +void mtrr_bp_pat_init(void); extern void set_mtrr_ops(const struct mtrr_ops *ops); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 364e5834689753fc7da34c6dc8a34cca22ab92db..8cac429b6a1d53255899e84459ab9436d3df4100 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -94,7 +94,7 @@ static void __init vmware_platform_setup(void) */ static uint32_t __init vmware_platform(void) { - if (cpu_has_hypervisor) { + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { unsigned int eax; unsigned int hyper_vendor_id[3]; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 1f4acd68b98bccb7bf4032bc6ff2bde3f4efecdb..3fe45f84ced4463b147bcb907feedca94c07cab0 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -151,7 +151,7 @@ static void __init dtb_lapic_setup(void) return; /* Did the boot loader setup the local APIC ? */ - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { if (apic_force_enable(r.start)) return; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 8efa57a5f29ea58d119a0ab2ca130598d3144f0e..2bb25c3fe2e8e9c6933ab37c99a532f8a36e47fa 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -260,19 +260,12 @@ int __die(const char *str, struct pt_regs *regs, long err) unsigned long sp; #endif printk(KERN_DEFAULT - "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif - if (debug_pagealloc_enabled()) - printk("DEBUG_PAGEALLOC "); -#ifdef CONFIG_KASAN - printk("KASAN"); -#endif - printk("\n"); + "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, + IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/ebda.c similarity index 98% rename from arch/x86/kernel/head.c rename to arch/x86/kernel/ebda.c index 992f442ca15511d1d0c2c5773cd26e31b9d353c3..afe65dffee80b80bda7dc1f69072fc23d21472b3 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/ebda.c @@ -38,7 +38,7 @@ void __init reserve_ebda_region(void) * that the paravirt case can handle memory setup * correctly, without our help. */ - if (paravirt_enabled()) + if (!x86_platform.legacy.ebda_search) return; /* end of low (conventional) memory */ diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index dd9ca9b60ff3a497b5033c3fd714f93140ac689f..aad34aafc0e08ae6ed20d018874eb6b07073922a 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -21,11 +21,15 @@ static double __initdata y = 3145727.0; * We should really only care about bugs here * anyway. Not features. */ -static void __init check_fpu(void) +void __init fpu__init_check_bugs(void) { u32 cr0_saved; s32 fdiv_bug; + /* kernel_fpu_begin/end() relies on patched alternative instructions. */ + if (!boot_cpu_has(X86_FEATURE_FPU)) + return; + /* We might have CR0::TS set already, clear it: */ cr0_saved = read_cr0(); write_cr0(cr0_saved & ~X86_CR0_TS); @@ -59,13 +63,3 @@ static void __init check_fpu(void) pr_warn("Hmm, FPU with FDIV bug\n"); } } - -void __init fpu__init_check_bugs(void) -{ - /* - * kernel_fpu_begin/end() in check_fpu() relies on the patched - * alternative instructions. - */ - if (cpu_has_fpu) - check_fpu(); -} diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8e37cc8a539adc1c9d348d9a25b13815d4166aa6..97027545a72dcd4c34964aff481ac1b7a94c0df7 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -217,14 +217,14 @@ static inline void fpstate_init_fstate(struct fregs_state *fp) void fpstate_init(union fpregs_state *state) { - if (!cpu_has_fpu) { + if (!static_cpu_has(X86_FEATURE_FPU)) { fpstate_init_soft(&state->soft); return; } memset(state, 0, xstate_size); - if (cpu_has_fxsr) + if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); else fpstate_init_fstate(&state->fsave); @@ -237,7 +237,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; - if (!src_fpu->fpstate_active || !cpu_has_fpu) + if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != ¤t->thread.fpu); @@ -506,33 +506,6 @@ void fpu__clear(struct fpu *fpu) * x87 math exception handling: */ -static inline unsigned short get_fpu_cwd(struct fpu *fpu) -{ - if (cpu_has_fxsr) { - return fpu->state.fxsave.cwd; - } else { - return (unsigned short)fpu->state.fsave.cwd; - } -} - -static inline unsigned short get_fpu_swd(struct fpu *fpu) -{ - if (cpu_has_fxsr) { - return fpu->state.fxsave.swd; - } else { - return (unsigned short)fpu->state.fsave.swd; - } -} - -static inline unsigned short get_fpu_mxcsr(struct fpu *fpu) -{ - if (cpu_has_xmm) { - return fpu->state.fxsave.mxcsr; - } else { - return MXCSR_DEFAULT; - } -} - int fpu__exception_code(struct fpu *fpu, int trap_nr) { int err; @@ -547,10 +520,15 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * so if this combination doesn't produce any single exception, * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception + * fully reproduce the context of the exception. */ - cwd = get_fpu_cwd(fpu); - swd = get_fpu_swd(fpu); + if (boot_cpu_has(X86_FEATURE_FXSR)) { + cwd = fpu->state.fxsave.cwd; + swd = fpu->state.fxsave.swd; + } else { + cwd = (unsigned short)fpu->state.fsave.cwd; + swd = (unsigned short)fpu->state.fsave.swd; + } err = swd & ~cwd; } else { @@ -560,7 +538,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * unmasked exception was caught we must mask the exception mask bits * at 0x1f80, and then use these to mask the exception bits at 0x3f. */ - unsigned short mxcsr = get_fpu_mxcsr(fpu); + unsigned short mxcsr = MXCSR_DEFAULT; + + if (boot_cpu_has(X86_FEATURE_XMM)) + mxcsr = fpu->state.fxsave.mxcsr; + err = ~(mxcsr >> 7) & mxcsr; } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 54c86fffbf9f85281a77977a4aa8dbb67f112464..aacfd7a82cec57b9f2eb2f57e17d277a9cd74141 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -29,22 +29,22 @@ static void fpu__init_cpu_generic(void) unsigned long cr0; unsigned long cr4_mask = 0; - if (cpu_has_fxsr) + if (boot_cpu_has(X86_FEATURE_FXSR)) cr4_mask |= X86_CR4_OSFXSR; - if (cpu_has_xmm) + if (boot_cpu_has(X86_FEATURE_XMM)) cr4_mask |= X86_CR4_OSXMMEXCPT; if (cr4_mask) cr4_set_bits(cr4_mask); cr0 = read_cr0(); cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ - if (!cpu_has_fpu) + if (!boot_cpu_has(X86_FEATURE_FPU)) cr0 |= X86_CR0_EM; write_cr0(cr0); /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION - if (!cpu_has_fpu) + if (!boot_cpu_has(X86_FEATURE_FPU)) fpstate_init_soft(¤t->thread.fpu.state.soft); else #endif @@ -89,7 +89,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) } #ifndef CONFIG_MATH_EMULATION - if (!cpu_has_fpu) { + if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n"); for (;;) asm volatile("hlt"); @@ -106,7 +106,7 @@ static void __init fpu__init_system_mxcsr(void) { unsigned int mask = 0; - if (cpu_has_fxsr) { + if (boot_cpu_has(X86_FEATURE_FXSR)) { /* Static because GCC does not get 16-byte stack alignment right: */ static struct fxregs_state fxregs __initdata; @@ -212,7 +212,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) * fpu__init_system_xstate(). */ - if (!cpu_has_fpu) { + if (!boot_cpu_has(X86_FEATURE_FPU)) { /* * Disable xsave as we do not support it if i387 * emulation is enabled. @@ -221,7 +221,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); xstate_size = sizeof(struct swregs_state); } else { - if (cpu_has_fxsr) + if (boot_cpu_has(X86_FEATURE_FXSR)) xstate_size = sizeof(struct fxregs_state); else xstate_size = sizeof(struct fregs_state); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 8bd1c003942aa801ed026ec770bdf00bc4cae946..81422dfb152b7c8e012300637b1acfd40384f697 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -21,7 +21,10 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r { struct fpu *target_fpu = &target->thread.fpu; - return (cpu_has_fxsr && target_fpu->fpstate_active) ? regset->n : 0; + if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->fpstate_active) + return regset->n; + else + return 0; } int xfpregs_get(struct task_struct *target, const struct user_regset *regset, @@ -30,7 +33,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, { struct fpu *fpu = &target->thread.fpu; - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; fpu__activate_fpstate_read(fpu); @@ -47,7 +50,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; int ret; - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; fpu__activate_fpstate_write(fpu); @@ -65,7 +68,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, * update the header bits in the xsave header, indicating the * presence of FP and SSE state. */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; return ret; @@ -79,7 +82,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, struct xregs_state *xsave; int ret; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; fpu__activate_fpstate_read(fpu); @@ -108,7 +111,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, struct xregs_state *xsave; int ret; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; fpu__activate_fpstate_write(fpu); @@ -275,10 +278,10 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_read(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) + if (!boot_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &fpu->state.fsave, 0, -1); @@ -306,10 +309,10 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_write(fpu); fpstate_sanitize_xstate(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) + if (!boot_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpu->state.fsave, 0, -1); @@ -325,7 +328,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, * update the header bit in the xsave header, indicating the * presence of FP. */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; return ret; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b48ef35b28d4fbcab4b5f00d613b57e63a00b1d0..4ea2a59483c7b1b07c60178daa55b084337dcf07 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -190,7 +190,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ void fpu__init_cpu_xstate(void) { - if (!cpu_has_xsave || !xfeatures_mask) + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) return; cr4_set_bits(X86_CR4_OSXSAVE); @@ -280,7 +280,7 @@ static void __init setup_xstate_comp(void) xstate_comp_offsets[0] = 0; xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); - if (!cpu_has_xsaves) { + if (!boot_cpu_has(X86_FEATURE_XSAVES)) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (xfeature_enabled(i)) { xstate_comp_offsets[i] = xstate_offsets[i]; @@ -316,13 +316,13 @@ static void __init setup_init_fpu_buf(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; setup_xstate_features(); print_xstate_features(); - if (cpu_has_xsaves) { + if (boot_cpu_has(X86_FEATURE_XSAVES)) { init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; init_fpstate.xsave.header.xfeatures = xfeatures_mask; } @@ -417,7 +417,7 @@ static int xfeature_size(int xfeature_nr) */ static int using_compacted_format(void) { - return cpu_has_xsaves; + return boot_cpu_has(X86_FEATURE_XSAVES); } static void __xstate_dump_leaves(void) @@ -549,7 +549,7 @@ static unsigned int __init calculate_xstate_size(void) unsigned int eax, ebx, ecx, edx; unsigned int calculated_xstate_size; - if (!cpu_has_xsaves) { + if (!boot_cpu_has(X86_FEATURE_XSAVES)) { /* * - CPUID function 0DH, sub-function 0: * EBX enumerates the size (in bytes) required by @@ -630,7 +630,7 @@ void __init fpu__init_system_xstate(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - if (!cpu_has_xsave) { + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { pr_info("x86/fpu: Legacy x87 FPU detected.\n"); return; } @@ -667,7 +667,7 @@ void __init fpu__init_system_xstate(void) pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask, xstate_size, - cpu_has_xsaves ? "compacted" : "standard"); + boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); } /* @@ -678,7 +678,7 @@ void fpu__resume_cpu(void) /* * Restore XCR0 on xsave capable CPUs: */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 2911ef3a9f1c7bf11a2508a928cb216c5f48b6bc..d784bb547a9dd64f64ab0b9c429d7d0bff20df9d 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -34,6 +34,8 @@ asmlinkage __visible void __init i386_start_kernel(void) cr4_init_shadow(); sanitize_boot_params(&boot_params); + x86_early_init_platform_quirks(); + /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1f4422d5c8d013992d6b97a6a08734bde5bdb2b1..b72fb0b71dd1f84c7e9e8141281a82e758be9909 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -182,6 +182,7 @@ void __init x86_64_start_reservations(char *real_mode_data) if (!boot_params.hdr.version) copy_bootdata(__va(real_mode_data)); + x86_early_init_platform_quirks(); reserve_ebda_region(); switch (boot_params.hdr.hardware_subarch) { diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index af1112980dd411334ef59d7d0a7b818946f2137d..6f8902b0d1514bd9f5b2b3ea86f55f96fa6d8618 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -555,62 +555,53 @@ early_idt_handler_common: */ cld - cmpl $2,(%esp) # X86_TRAP_NMI - je .Lis_nmi # Ignore NMI - - cmpl $2,%ss:early_recursion_flag - je hlt_loop incl %ss:early_recursion_flag - push %eax # 16(%esp) - push %ecx # 12(%esp) - push %edx # 8(%esp) - push %ds # 4(%esp) - push %es # 0(%esp) - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es - - cmpl $(__KERNEL_CS),32(%esp) - jne 10f + /* The vector number is in pt_regs->gs */ - leal 28(%esp),%eax # Pointer to %eip - call early_fixup_exception - andl %eax,%eax - jnz ex_entry /* found an exception entry */ - -10: -#ifdef CONFIG_PRINTK - xorl %eax,%eax - movw %ax,2(%esp) /* clean up the segment values on some cpus */ - movw %ax,6(%esp) - movw %ax,34(%esp) - leal 40(%esp),%eax - pushl %eax /* %esp before the exception */ - pushl %ebx - pushl %ebp - pushl %esi - pushl %edi - movl %cr2,%eax - pushl %eax - pushl (20+6*4)(%esp) /* trapno */ - pushl $fault_msg - call printk -#endif - call dump_stack -hlt_loop: - hlt - jmp hlt_loop - -ex_entry: - pop %es - pop %ds - pop %edx - pop %ecx - pop %eax - decl %ss:early_recursion_flag -.Lis_nmi: - addl $8,%esp /* drop vector number and error code */ + cld + pushl %fs /* pt_regs->fs */ + movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %es /* pt_regs->es */ + movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %ds /* pt_regs->ds */ + movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %eax /* pt_regs->ax */ + pushl %ebp /* pt_regs->bp */ + pushl %edi /* pt_regs->di */ + pushl %esi /* pt_regs->si */ + pushl %edx /* pt_regs->dx */ + pushl %ecx /* pt_regs->cx */ + pushl %ebx /* pt_regs->bx */ + + /* Fix up DS and ES */ + movl $(__KERNEL_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + + /* Load the vector number into EDX */ + movl PT_GS(%esp), %edx + + /* Load GS into pt_regs->gs and clear high bits */ + movw %gs, PT_GS(%esp) + movw $0, PT_GS+2(%esp) + + movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */ + call early_fixup_exception + + popl %ebx /* pt_regs->bx */ + popl %ecx /* pt_regs->cx */ + popl %edx /* pt_regs->dx */ + popl %esi /* pt_regs->si */ + popl %edi /* pt_regs->di */ + popl %ebp /* pt_regs->bp */ + popl %eax /* pt_regs->ax */ + popl %ds /* pt_regs->ds */ + popl %es /* pt_regs->es */ + popl %fs /* pt_regs->fs */ + popl %gs /* pt_regs->gs */ + decl %ss:early_recursion_flag + addl $4, %esp /* pop pt_regs->orig_ax */ iret ENDPROC(early_idt_handler_common) @@ -647,10 +638,14 @@ ignore_int: popl %eax #endif iret + +hlt_loop: + hlt + jmp hlt_loop ENDPROC(ignore_int) __INITDATA .align 4 -early_recursion_flag: +GLOBAL(early_recursion_flag) .long 0 __REFDATA @@ -715,19 +710,6 @@ __INITRODATA int_msg: .asciz "Unknown interrupt or fault at: %p %p %p\n" -fault_msg: -/* fault info: */ - .ascii "BUG: Int %d: CR2 %p\n" -/* regs pushed in early_idt_handler: */ - .ascii " EDI %p ESI %p EBP %p EBX %p\n" - .ascii " ESP %p ES %p DS %p\n" - .ascii " EDX %p ECX %p EAX %p\n" -/* fault frame: */ - .ascii " vec %p err %p EIP %p CS %p flg %p\n" - .ascii "Stack: %p %p %p %p %p %p %p %p\n" - .ascii " %p %p %p %p %p %p %p %p\n" - .asciz " %p %p %p %p %p %p %p %p\n" - #include "../../x86/xen/xen-head.S" /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 22fbf9df61bb4eecbb5ffe530562b56c1def90b8..5df831ef1442f36c5ee0ac00447a2505216d1a26 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -20,6 +20,7 @@ #include #include #include +#include "../entry/calling.h" #ifdef CONFIG_PARAVIRT #include @@ -64,6 +65,14 @@ startup_64: * tables and then reload them. */ + /* + * Setup stack for verify_cpu(). "-8" because stack_start is defined + * this way, see below. Our best guess is a NULL ptr for stack + * termination heuristics and we don't want to break anything which + * might depend on it (kgdb, ...). + */ + leaq (__end_init_task - 8)(%rip), %rsp + /* Sanitize CPU configuration */ call verify_cpu @@ -350,90 +359,48 @@ early_idt_handler_common: */ cld - cmpl $2,(%rsp) # X86_TRAP_NMI - je .Lis_nmi # Ignore NMI - - cmpl $2,early_recursion_flag(%rip) - jz 1f incl early_recursion_flag(%rip) - pushq %rax # 64(%rsp) - pushq %rcx # 56(%rsp) - pushq %rdx # 48(%rsp) - pushq %rsi # 40(%rsp) - pushq %rdi # 32(%rsp) - pushq %r8 # 24(%rsp) - pushq %r9 # 16(%rsp) - pushq %r10 # 8(%rsp) - pushq %r11 # 0(%rsp) - - cmpl $__KERNEL_CS,96(%rsp) - jne 11f - - cmpl $14,72(%rsp) # Page fault? + /* The vector number is currently in the pt_regs->di slot. */ + pushq %rsi /* pt_regs->si */ + movq 8(%rsp), %rsi /* RSI = vector number */ + movq %rdi, 8(%rsp) /* pt_regs->di = RDI */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq %rax /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + pushq %rbx /* pt_regs->bx */ + pushq %rbp /* pt_regs->bp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ + + cmpq $14,%rsi /* Page fault? */ jnz 10f - GET_CR2_INTO(%rdi) # can clobber any volatile register if pv + GET_CR2_INTO(%rdi) /* Can clobber any volatile register if pv */ call early_make_pgtable andl %eax,%eax - jz 20f # All good + jz 20f /* All good */ 10: - leaq 88(%rsp),%rdi # Pointer to %rip + movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */ call early_fixup_exception - andl %eax,%eax - jnz 20f # Found an exception entry - -11: -#ifdef CONFIG_EARLY_PRINTK - GET_CR2_INTO(%r9) # can clobber any volatile register if pv - movl 80(%rsp),%r8d # error code - movl 72(%rsp),%esi # vector number - movl 96(%rsp),%edx # %cs - movq 88(%rsp),%rcx # %rip - xorl %eax,%eax - leaq early_idt_msg(%rip),%rdi - call early_printk - cmpl $2,early_recursion_flag(%rip) - jz 1f - call dump_stack -#ifdef CONFIG_KALLSYMS - leaq early_idt_ripmsg(%rip),%rdi - movq 40(%rsp),%rsi # %rip again - call __print_symbol -#endif -#endif /* EARLY_PRINTK */ -1: hlt - jmp 1b - -20: # Exception table entry found or page table generated - popq %r11 - popq %r10 - popq %r9 - popq %r8 - popq %rdi - popq %rsi - popq %rdx - popq %rcx - popq %rax + +20: decl early_recursion_flag(%rip) -.Lis_nmi: - addq $16,%rsp # drop vector number and error code - INTERRUPT_RETURN + jmp restore_regs_and_iret ENDPROC(early_idt_handler_common) __INITDATA .balign 4 -early_recursion_flag: +GLOBAL(early_recursion_flag) .long 0 -#ifdef CONFIG_EARLY_PRINTK -early_idt_msg: - .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" -early_idt_ripmsg: - .asciz "RIP %s\n" -#endif /* CONFIG_EARLY_PRINTK */ - #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ GLOBAL(name) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index a1f0e4a5c47e3239824ac9c61774127e378d4f03..f112af7aa62edb1b6167e1801040af07288fb2ac 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -54,7 +54,7 @@ struct hpet_dev { char name[10]; }; -inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) +static inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) { return container_of(evtdev, struct hpet_dev, evt); } @@ -773,7 +773,6 @@ static struct clocksource clocksource_hpet = { .mask = HPET_MASK, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, - .archdata = { .vclock_mode = VCLOCK_HPET }, }; static int hpet_clocksource_register(void) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index e565e0e4d21698c40297d01991a74ff659fca5c3..fc25f698d792faed00f461b0a378f30c35e3eb3d 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef HAVE_JUMP_LABEL diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 2da6ee9ae69b725a1d960c0467005c1e581d96c4..04cde527d72849be75ccb65e0d7ed650a1ef3a82 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -45,6 +45,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index ae703acb85c185e8c6f3735da83f9998b2f8e7a1..38cf7a7412503f513bb1aae1c1acc5b76e7d2e1d 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -51,6 +51,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7b3b9d15c47a63953d6932026cc57db795e3a507..4425f593f0ec8c6f9d0842054e5986ed28f99fc2 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 807950860fb7028e28fe1e98d8a2cddeccfa8063..eea2a6f72b31c089d1b100eaefff32d1c6be4a87 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -285,14 +285,6 @@ static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; - /* - * KVM isn't paravirt in the sense of paravirt_enabled. A KVM - * guest kernel works like a bare metal kernel with additional - * features, and paravirt_enabled is about features that are - * missing. - */ - pv_info.paravirt_enabled = 0; - if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; @@ -522,7 +514,7 @@ static noinline uint32_t __kvm_cpuid_base(void) if (boot_cpu_data.cpuid_level < 0) return 0; /* So we don't blow up on old processors */ - if (cpu_has_hypervisor) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); return 0; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 005c03e93fc54c7907e8e9e2bc1d771902b1c3ca..477ae806c2fa71f425ff56a8b75b1306dfcf7535 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f08ac28b8136da85e0866bf689b3b16faec1c157..7b3b3f24c3eac994f5c23c1ac8fa9f6420ef86fe 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -294,7 +294,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) struct pv_info pv_info = { .name = "bare hardware", - .paravirt_enabled = 0, .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ @@ -339,8 +338,10 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .write_cr8 = native_write_cr8, #endif .wbinvd = native_wbinvd, - .read_msr = native_read_msr_safe, - .write_msr = native_write_msr_safe, + .read_msr = native_read_msr, + .write_msr = native_write_msr, + .read_msr_safe = native_read_msr_safe, + .write_msr_safe = native_write_msr_safe, .read_pmc = native_read_pmc, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 35ccf75696eb8ab22af0e94e6246b5cb3f655b4f..f712dfdf1357f7bf1164d9c03ce2a6789eaf1522 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -72,7 +72,7 @@ void __init check_iommu_entries(struct iommu_table_entry *start, } } #else -inline void check_iommu_entries(struct iommu_table_entry *start, +void __init check_iommu_entries(struct iommu_table_entry *start, struct iommu_table_entry *finish) { } diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c new file mode 100644 index 0000000000000000000000000000000000000000..b2f8a33b36ff4e6fb3beef130af93717f10baf11 --- /dev/null +++ b/arch/x86/kernel/platform-quirks.c @@ -0,0 +1,35 @@ +#include +#include + +#include +#include + +void __init x86_early_init_platform_quirks(void) +{ + x86_platform.legacy.rtc = 1; + x86_platform.legacy.ebda_search = 0; + x86_platform.legacy.devices.pnpbios = 1; + + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_PC: + x86_platform.legacy.ebda_search = 1; + break; + case X86_SUBARCH_XEN: + case X86_SUBARCH_LGUEST: + case X86_SUBARCH_INTEL_MID: + case X86_SUBARCH_CE4100: + x86_platform.legacy.devices.pnpbios = 0; + x86_platform.legacy.rtc = 0; + break; + } + + if (x86_platform.set_legacy_features) + x86_platform.set_legacy_features(); +} + +#if defined(CONFIG_PNPBIOS) +bool __init arch_pnpbios_disabled(void) +{ + return x86_platform.legacy.devices.pnpbios == 0; +} +#endif diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6cbab31ac23a20fb3980f06f88becddff135411b..6b16c36f0939313dde91d03428cdc855ff3dba9e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -136,25 +136,6 @@ void release_thread(struct task_struct *dead_task) } } -static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) -{ - struct user_desc ud = { - .base_addr = addr, - .limit = 0xfffff, - .seg_32bit = 1, - .limit_in_pages = 1, - .useable = 1, - }; - struct desc_struct *desc = t->thread.tls_array; - desc += tls; - fill_ldt(desc, &ud); -} - -static inline u32 read_32bit_tls(struct task_struct *t, int tls) -{ - return get_desc_base(&t->thread.tls_array[tls]); -} - int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -169,9 +150,9 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); - p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs; + p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase; savesegment(fs, p->thread.fsindex); - p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs; + p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); @@ -210,7 +191,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, */ if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION - if (is_ia32_task()) + if (in_ia32_syscall()) err = do_set_thread_area(p, -1, (struct user_desc __user *)tls, 0); else @@ -282,7 +263,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned fsindex, gsindex; + unsigned prev_fsindex, prev_gsindex; fpu_switch_t fpu_switch; fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); @@ -292,8 +273,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * * (e.g. xen_load_tls()) */ - savesegment(fs, fsindex); - savesegment(gs, gsindex); + savesegment(fs, prev_fsindex); + savesegment(gs, prev_gsindex); /* * Load TLS before restoring any segments so that segment loads @@ -336,66 +317,104 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch FS and GS. * * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. Those bases - * only differ from the values in the GDT or LDT if the selector - * is 0. - * - * Loading the segment register resets the hidden base part of - * the register to 0 or the value from the GDT / LDT. If the - * next base address zero, writing 0 to the segment register is - * much faster than using wrmsr to explicitly zero the base. - * - * The thread_struct.fs and thread_struct.gs values are 0 - * if the fs and gs bases respectively are not overridden - * from the values implied by fsindex and gsindex. They - * are nonzero, and store the nonzero base addresses, if - * the bases are overridden. - * - * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should - * be impossible. - * - * Therefore we need to reload the segment registers if either - * the old or new selector is nonzero, and we need to override - * the base address if next thread expects it to be overridden. + * 64-bit bases are that controlled by arch_prctl. The bases + * don't necessarily match the selectors, as user code can do + * any number of things to cause them to be inconsistent. * - * This code is unnecessarily slow in the case where the old and - * new indexes are zero and the new base is nonzero -- it will - * unnecessarily write 0 to the selector before writing the new - * base address. + * We don't promise to preserve the bases if the selectors are + * nonzero. We also don't promise to preserve the base if the + * selector is zero and the base doesn't match whatever was + * most recently passed to ARCH_SET_FS/GS. (If/when the + * FSGSBASE instructions are enabled, we'll need to offer + * stronger guarantees.) * - * Note: This all depends on arch_prctl being the only way that - * user code can override the segment base. Once wrfsbase and - * wrgsbase are enabled, most of this code will need to change. + * As an invariant, + * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is + * impossible. */ - if (unlikely(fsindex | next->fsindex | prev->fs)) { + if (next->fsindex) { + /* Loading a nonzero value into FS sets the index and base. */ loadsegment(fs, next->fsindex); - - /* - * If user code wrote a nonzero value to FS, then it also - * cleared the overridden base address. - * - * XXX: if user code wrote 0 to FS and cleared the base - * address itself, we won't notice and we'll incorrectly - * restore the prior base address next time we reschdule - * the process. - */ - if (fsindex) - prev->fs = 0; + } else { + if (next->fsbase) { + /* Next index is zero but next base is nonzero. */ + if (prev_fsindex) + loadsegment(fs, 0); + wrmsrl(MSR_FS_BASE, next->fsbase); + } else { + /* Next base and index are both zero. */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + /* + * We don't know the previous base and can't + * find out without RDMSR. Forcibly clear it. + */ + loadsegment(fs, __USER_DS); + loadsegment(fs, 0); + } else { + /* + * If the previous index is zero and ARCH_SET_FS + * didn't change the base, then the base is + * also zero and we don't need to do anything. + */ + if (prev->fsbase || prev_fsindex) + loadsegment(fs, 0); + } + } } - if (next->fs) - wrmsrl(MSR_FS_BASE, next->fs); - prev->fsindex = fsindex; + /* + * Save the old state and preserve the invariant. + * NB: if prev_fsindex == 0, then we can't reliably learn the base + * without RDMSR because Intel user code can zero it without telling + * us and AMD user code can program any 32-bit value without telling + * us. + */ + if (prev_fsindex) + prev->fsbase = 0; + prev->fsindex = prev_fsindex; - if (unlikely(gsindex | next->gsindex | prev->gs)) { + if (next->gsindex) { + /* Loading a nonzero value into GS sets the index and base. */ load_gs_index(next->gsindex); - - /* This works (and fails) the same way as fsindex above. */ - if (gsindex) - prev->gs = 0; + } else { + if (next->gsbase) { + /* Next index is zero but next base is nonzero. */ + if (prev_gsindex) + load_gs_index(0); + wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); + } else { + /* Next base and index are both zero. */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + /* + * We don't know the previous base and can't + * find out without RDMSR. Forcibly clear it. + * + * This contains a pointless SWAPGS pair. + * Fixing it would involve an explicit check + * for Xen or a new pvop. + */ + load_gs_index(__USER_DS); + load_gs_index(0); + } else { + /* + * If the previous index is zero and ARCH_SET_GS + * didn't change the base, then the base is + * also zero and we don't need to do anything. + */ + if (prev->gsbase || prev_gsindex) + load_gs_index(0); + } + } } - if (next->gs) - wrmsrl(MSR_KERNEL_GS_BASE, next->gs); - prev->gsindex = gsindex; + /* + * Save the old state and preserve the invariant. + * NB: if prev_gsindex == 0, then we can't reliably learn the base + * without RDMSR because Intel user code can zero it without telling + * us and AMD user code can program any 32-bit value without telling + * us. + */ + if (prev_gsindex) + prev->gsbase = 0; + prev->gsindex = prev_gsindex; switch_fpu_finish(next_fpu, fpu_switch); @@ -516,23 +535,11 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) if (addr >= TASK_SIZE_OF(task)) return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to - switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, GS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); - load_gs_index(GS_TLS_SEL); - } - task->thread.gsindex = GS_TLS_SEL; - task->thread.gs = 0; - } else { - task->thread.gsindex = 0; - task->thread.gs = addr; - if (doit) { - load_gs_index(0); - ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); - } + task->thread.gsindex = 0; + task->thread.gsbase = addr; + if (doit) { + load_gs_index(0); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); } put_cpu(); break; @@ -542,52 +549,30 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) if (addr >= TASK_SIZE_OF(task)) return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to - switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, FS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); - loadsegment(fs, FS_TLS_SEL); - } - task->thread.fsindex = FS_TLS_SEL; - task->thread.fs = 0; - } else { - task->thread.fsindex = 0; - task->thread.fs = addr; - if (doit) { - /* set the selector to 0 to not confuse - __switch_to */ - loadsegment(fs, 0); - ret = wrmsrl_safe(MSR_FS_BASE, addr); - } + task->thread.fsindex = 0; + task->thread.fsbase = addr; + if (doit) { + /* set the selector to 0 to not confuse __switch_to */ + loadsegment(fs, 0); + ret = wrmsrl_safe(MSR_FS_BASE, addr); } put_cpu(); break; case ARCH_GET_FS: { unsigned long base; - if (task->thread.fsindex == FS_TLS_SEL) - base = read_32bit_tls(task, FS_TLS); - else if (doit) + if (doit) rdmsrl(MSR_FS_BASE, base); else - base = task->thread.fs; + base = task->thread.fsbase; ret = put_user(base, (unsigned long __user *)addr); break; } case ARCH_GET_GS: { unsigned long base; - unsigned gsindex; - if (task->thread.gsindex == GS_TLS_SEL) - base = read_32bit_tls(task, GS_TLS); - else if (doit) { - savesegment(gs, gsindex); - if (gsindex) - rdmsrl(MSR_KERNEL_GS_BASE, base); - else - base = task->thread.gs; - } else - base = task->thread.gs; + if (doit) + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = task->thread.gsbase; ret = put_user(base, (unsigned long __user *)addr); break; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 32e9d9cbb884ae10f0f5035c47bafb69f4517f80..e60ef918f53d52c765cd19821c5810adcda61820 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -303,29 +303,11 @@ static int set_segment_reg(struct task_struct *task, switch (offset) { case offsetof(struct user_regs_struct,fs): - /* - * If this is setting fs as for normal 64-bit use but - * setting fs_base has implicitly changed it, leave it. - */ - if ((value == FS_TLS_SEL && task->thread.fsindex == 0 && - task->thread.fs != 0) || - (value == 0 && task->thread.fsindex == FS_TLS_SEL && - task->thread.fs == 0)) - break; task->thread.fsindex = value; if (task == current) loadsegment(fs, task->thread.fsindex); break; case offsetof(struct user_regs_struct,gs): - /* - * If this is setting gs as for normal 64-bit use but - * setting gs_base has implicitly changed it, leave it. - */ - if ((value == GS_TLS_SEL && task->thread.gsindex == 0 && - task->thread.gs != 0) || - (value == 0 && task->thread.gsindex == GS_TLS_SEL && - task->thread.gs == 0)) - break; task->thread.gsindex = value; if (task == current) load_gs_index(task->thread.gsindex); @@ -417,7 +399,7 @@ static int putreg(struct task_struct *child, * to set either thread.fs or thread.fsindex and the * corresponding GDT slot. */ - if (child->thread.fs != value) + if (child->thread.fsbase != value) return do_arch_prctl(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): @@ -426,7 +408,7 @@ static int putreg(struct task_struct *child, */ if (value >= TASK_SIZE_OF(child)) return -EIO; - if (child->thread.gs != value) + if (child->thread.gsbase != value) return do_arch_prctl(child, ARCH_SET_GS, value); return 0; #endif @@ -453,31 +435,17 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct, fs_base): { /* - * do_arch_prctl may have used a GDT slot instead of - * the MSR. To userland, it appears the same either - * way, except the %fs segment selector might not be 0. + * XXX: This will not behave as expected if called on + * current or if fsindex != 0. */ - unsigned int seg = task->thread.fsindex; - if (task->thread.fs != 0) - return task->thread.fs; - if (task == current) - asm("movl %%fs,%0" : "=r" (seg)); - if (seg != FS_TLS_SEL) - return 0; - return get_desc_base(&task->thread.tls_array[FS_TLS]); + return task->thread.fsbase; } case offsetof(struct user_regs_struct, gs_base): { /* - * Exactly the same here as the %fs handling above. + * XXX: This will not behave as expected if called on + * current or if fsindex != 0. */ - unsigned int seg = task->thread.gsindex; - if (task->thread.gs != 0) - return task->thread.gs; - if (task == current) - asm("movl %%gs,%0" : "=r" (seg)); - if (seg != GS_TLS_SEL) - return 0; - return get_desc_base(&task->thread.tls_array[GS_TLS]); + return task->thread.gsbase; } #endif } @@ -1266,7 +1234,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { #ifdef CONFIG_X86_X32_ABI - if (!is_ia32_task()) + if (!in_ia32_syscall()) return x32_arch_ptrace(child, request, caddr, cdata); #endif #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index ab0adc0fa5db4da281de34e46c71e2cfbb707c07..a9b31eb815f23e93eae56960879448d7a63fba09 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -535,6 +535,15 @@ static void native_machine_emergency_restart(void) mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0; *((unsigned short *)__va(0x472)) = mode; + /* + * If an EFI capsule has been registered with the firmware then + * override the reboot= parameter. + */ + if (efi_capsule_pending(NULL)) { + pr_info("EFI capsule is pending, forcing EFI reboot.\n"); + reboot_type = BOOT_EFI; + } + for (;;) { /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 4af8d063fb362cd2bf92b97a48fa8c1d5d95fd3b..eceaa082ec3fcb1b0f98cb11a6d4723f5dbc3d73 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_X86_32 /* @@ -185,22 +186,7 @@ static __init int add_rtc_cmos(void) } } #endif - if (of_have_populated_dt()) - return 0; - - /* Intel MID platforms don't have ioport rtc */ - if (intel_mid_identify_cpu()) - return -ENODEV; - -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { - /* This warning can likely go away again in a year or two. */ - pr_info("ACPI: not registering RTC platform device\n"); - return -ENODEV; - } -#endif - - if (paravirt_enabled() && !paravirt_has(RTC)) + if (!x86_platform.legacy.rtc) return -ENODEV; platform_device_register(&rtc_device); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 548ddf7d6fd2085676328ffcde6375ce3e88b477..22cc2f9f8aec4b1ad35d2efe547d3ae1b87c2932 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -248,18 +248,17 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, if (config_enabled(CONFIG_X86_64)) sp -= 128; - if (!onsigstack) { - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (current->sas_ss_size) - sp = current->sas_ss_sp + current->sas_ss_size; - } else if (config_enabled(CONFIG_X86_32) && - (regs->ss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) { - /* This is the legacy signal stack switching. */ - sp = (unsigned long) ka->sa.sa_restorer; - } + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; + } else if (config_enabled(CONFIG_X86_32) && + !onsigstack && + (regs->ss & 0xffff) != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) { + /* This is the legacy signal stack switching. */ + sp = (unsigned long) ka->sa.sa_restorer; } if (fpu->fpstate_active) { @@ -391,7 +390,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, put_user_ex(&frame->uc, &frame->puc); /* Create the ucontext. */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); else put_user_ex(0, &frame->uc.uc_flags); @@ -442,7 +441,7 @@ static unsigned long frame_uc_flags(struct pt_regs *regs) { unsigned long flags; - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; else flags = UC_SIGCONTEXT_SS; @@ -762,7 +761,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { #ifdef CONFIG_X86_64 - if (is_ia32_task()) + if (in_ia32_syscall()) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0e4329ed91ef61da5a86e5d604d1ad96109efbfb..fafe8b923cac2d27da4189e6be9a2761bfcd6d01 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1236,7 +1236,7 @@ static int __init smp_sanity_check(unsigned max_cpus) * If we couldn't find a local APIC, then get out of here now! */ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && - !cpu_has_apic) { + !boot_cpu_has(X86_FEATURE_APIC)) { if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index 5da924bbf0a0f22aa676f5be5a3f97fe4811b087..623965e86b65eda431b8e5fdbc2204c47bcb8b33 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -68,6 +68,21 @@ struct efifb_dmi_info efifb_dmi_list[] = { [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE } }; +void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ + int i; + + for (i = 0; i < M_UNKNOWN; i++) { + if (efifb_dmi_list[i].base != 0 && + !strcmp(opt, efifb_dmi_list[i].optname)) { + si->lfb_base = efifb_dmi_list[i].base; + si->lfb_linelength = efifb_dmi_list[i].stride; + si->lfb_width = efifb_dmi_list[i].width; + si->lfb_height = efifb_dmi_list[i].height; + } + } +} + #define choose_value(dmivalue, fwvalue, field, flags) ({ \ typeof(fwvalue) _ret_ = fwvalue; \ if ((flags) & (field)) \ diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index e72a07f20b05cfb079f2dca79e063b1dd340f2e4..9b0185fbe3eb44029040c05beea605126baa8644 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -74,12 +74,6 @@ void __init tboot_probe(void) return; } - /* only a natively booted kernel should be using TXT */ - if (paravirt_enabled()) { - pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); - return; - } - /* Map and check for tboot UUID. */ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c index ab40954e113e952b088818e60b569cabc96488f5..f386bad0984ed70f89a432d89b6d90f28bb4eaee 100644 --- a/arch/x86/kernel/tce_64.c +++ b/arch/x86/kernel/tce_64.c @@ -40,7 +40,7 @@ static inline void flush_tce(void* tceaddr) { /* a single tce can't cross a cache line */ - if (cpu_has_clflush) + if (boot_cpu_has(X86_FEATURE_CLFLUSH)) clflush(tceaddr); else wbinvd(); diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 7fc5e843f247b358288b23e459eebfefcf6631f0..9692a5e9fdab2002f31c6dd2dce2c113647608b6 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -114,6 +114,7 @@ int do_set_thread_area(struct task_struct *p, int idx, int can_allocate) { struct user_desc info; + unsigned short __maybe_unused sel, modified_sel; if (copy_from_user(&info, u_info, sizeof(info))) return -EFAULT; @@ -141,6 +142,47 @@ int do_set_thread_area(struct task_struct *p, int idx, set_tls_desc(p, idx, &info, 1); + /* + * If DS, ES, FS, or GS points to the modified segment, forcibly + * refresh it. Only needed on x86_64 because x86_32 reloads them + * on return to user mode. + */ + modified_sel = (idx << 3) | 3; + + if (p == current) { +#ifdef CONFIG_X86_64 + savesegment(ds, sel); + if (sel == modified_sel) + loadsegment(ds, sel); + + savesegment(es, sel); + if (sel == modified_sel) + loadsegment(es, sel); + + savesegment(fs, sel); + if (sel == modified_sel) + loadsegment(fs, sel); + + savesegment(gs, sel); + if (sel == modified_sel) + load_gs_index(sel); +#endif + +#ifdef CONFIG_X86_32_LAZY_GS + savesegment(gs, sel); + if (sel == modified_sel) + loadsegment(gs, sel); +#endif + } else { +#ifdef CONFIG_X86_64 + if (p->thread.fsindex == modified_sel) + p->thread.fsbase = info.base_addr; + + if (p->thread.gsindex == modified_sel) + p->thread.gsbase = info.base_addr; +#endif + } + return 0; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 06cbe25861f1591a7829b5d9347e1a753f874456..d1590486204a1bb52974c27793dc66139f20c4ee 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c9c4c7ce3eb23c8fea3f6b05647c482edf81c939..38ba6de56edec93badec52707045b0db3a189e0a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -36,7 +36,7 @@ static int __read_mostly tsc_unstable; /* native_sched_clock() is called before tsc_init(), so we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !cpu_has_tsc processors */ + erroneous rdtsc usage on !boot_cpu_has(X86_FEATURE_TSC) processors */ static int __read_mostly tsc_disabled = -1; static DEFINE_STATIC_KEY_FALSE(__use_tsc); @@ -834,15 +834,15 @@ int recalibrate_cpu_khz(void) #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; - if (cpu_has_tsc) { - tsc_khz = x86_platform.calibrate_tsc(); - cpu_khz = tsc_khz; - cpu_data(0).loops_per_jiffy = - cpufreq_scale(cpu_data(0).loops_per_jiffy, - cpu_khz_old, cpu_khz); - return 0; - } else + if (!boot_cpu_has(X86_FEATURE_TSC)) return -ENODEV; + + tsc_khz = x86_platform.calibrate_tsc(); + cpu_khz = tsc_khz; + cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, + cpu_khz_old, cpu_khz); + + return 0; #else return -ENODEV; #endif @@ -922,9 +922,6 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, struct cpufreq_freqs *freq = data; unsigned long *lpj; - if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) - return 0; - lpj = &boot_cpu_data.loops_per_jiffy; #ifdef CONFIG_SMP if (!(freq->flags & CPUFREQ_CONST_LOOPS)) @@ -954,9 +951,9 @@ static struct notifier_block time_cpufreq_notifier_block = { .notifier_call = time_cpufreq_notifier }; -static int __init cpufreq_tsc(void) +static int __init cpufreq_register_tsc_scaling(void) { - if (!cpu_has_tsc) + if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; @@ -965,7 +962,7 @@ static int __init cpufreq_tsc(void) return 0; } -core_initcall(cpufreq_tsc); +core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ @@ -1081,7 +1078,7 @@ static void __init check_system_tsc_reliable(void) */ int unsynchronized_tsc(void) { - if (!cpu_has_tsc || tsc_unstable) + if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable) return 1; #ifdef CONFIG_SMP @@ -1205,7 +1202,7 @@ static void tsc_refine_calibration_work(struct work_struct *work) static int __init init_tsc_clocksource(void) { - if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz) + if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) return 0; if (tsc_clocksource_reliable) @@ -1242,7 +1239,7 @@ void __init tsc_init(void) u64 lpj; int cpu; - if (!cpu_has_tsc) { + if (!boot_cpu_has(X86_FEATURE_TSC)) { setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index bf4db6eaec8fda2aae48f2fe7c1cddb8d157b37f..6c1ff31d99ffeb0d0a28c5ee472bb1865ff23df3 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -516,7 +516,7 @@ struct uprobe_xol_ops { static inline int sizeof_long(void) { - return is_ia32_task() ? 4 : 8; + return in_ia32_syscall() ? 4 : 8; } static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) @@ -578,7 +578,7 @@ static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) riprel_post_xol(auprobe, regs); } -static struct uprobe_xol_ops default_xol_ops = { +static const struct uprobe_xol_ops default_xol_ops = { .pre_xol = default_pre_xol_op, .post_xol = default_post_xol_op, .abort = default_abort_op, @@ -695,7 +695,7 @@ static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) 0, insn->immediate.nbytes); } -static struct uprobe_xol_ops branch_xol_ops = { +static const struct uprobe_xol_ops branch_xol_ops = { .emulate = branch_emulate_op, .post_xol = branch_post_xol_op, }; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4c941f88d405fbe339d9194e845d9d748fd72604..9297a002d8e5ff3f06b8b8fee118b21f9042a994 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -334,7 +334,7 @@ SECTIONS __brk_limit = .; } - . = ALIGN(PAGE_SIZE); + . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ _end = .; STABS_DEBUG diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bbbaa802d13efc8b1e57f7defa351cacb2aaab78..769af907f82485edc91bf3e4bbfcebddbe148fcc 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -75,7 +75,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) return 0; /* Update OSXSAVE bit */ - if (cpu_has_xsave && best->function == 0x1) { + if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) { best->ecx &= ~F(OSXSAVE); if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) best->ecx |= F(OSXSAVE); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b6f50e8b0a393675009a5dcaad7f30af315bc91d..38c0c32926c96bc154c2ce6c7c6cb06a30b03ac2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3844,7 +3844,8 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, boot_cpu_data.x86_phys_bits, context->shadow_root_level, false, - cpu_has_gbpages, true, true); + boot_cpu_has(X86_FEATURE_GBPAGES), + true, true); else __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, boot_cpu_data.x86_phys_bits, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 31346a3f20a5c8b5384e6fda2a81029059cdf621..fafd720ce10a12cbe6e70da6c3dc1796af3bd447 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1254,7 +1254,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) kvm_load_ldt(svm->host.ldt); #ifdef CONFIG_X86_64 loadsegment(fs, svm->host.fs); - wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase); load_gs_index(svm->host.gs); #else #ifdef CONFIG_X86_32_LAZY_GS diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 2f1ea2f61e1fceef4a77b3955c4d3a9c4a936a72..b72743c5668d3d55387a55d06c0c886cf2b7b1b1 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -809,8 +809,7 @@ TRACE_EVENT(kvm_write_tsc_offset, #define host_clocks \ {VCLOCK_NONE, "none"}, \ - {VCLOCK_TSC, "tsc"}, \ - {VCLOCK_HPET, "hpet"} \ + {VCLOCK_TSC, "tsc"} \ TRACE_EVENT(kvm_update_master_clock, TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 133679d520afee3934bd5dc155d9675d69198a8d..cb47fe3da2926b3c1c17df41625bef9492353554 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3390,7 +3390,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) } } - if (cpu_has_xsaves) + if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b7798c7b210e75499644ed1ca35b643fe743208..12f33e6623826dfcd0af660a534e8240683bc1a2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2611,7 +2611,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_MCE_BANKS; break; case KVM_CAP_XCRS: - r = cpu_has_xsave; + r = boot_cpu_has(X86_FEATURE_XSAVE); break; case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; @@ -3094,7 +3094,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) /* Set XSTATE_BV and possibly XCOMP_BV. */ xsave->header.xfeatures = xstate_bv; - if (cpu_has_xsaves) + if (boot_cpu_has(X86_FEATURE_XSAVES)) xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* @@ -3121,7 +3121,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (cpu_has_xsave) { + if (boot_cpu_has(X86_FEATURE_XSAVE)) { memset(guest_xsave, 0, sizeof(struct kvm_xsave)); fill_xsave((u8 *) guest_xsave->region, vcpu); } else { @@ -3139,7 +3139,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, u64 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - if (cpu_has_xsave) { + if (boot_cpu_has(X86_FEATURE_XSAVE)) { /* * Here we allow setting states that are not present in * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility @@ -3160,7 +3160,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, struct kvm_xcrs *guest_xcrs) { - if (!cpu_has_xsave) { + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { guest_xcrs->nr_xcrs = 0; return; } @@ -3176,7 +3176,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, { int i, r = 0; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -EINVAL; if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) @@ -5865,7 +5865,7 @@ int kvm_arch_init(void *opaque) perf_register_guest_info_callbacks(&kvm_guest_cbs); - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_lapic_init(); @@ -7293,7 +7293,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) static void fx_init(struct kvm_vcpu *vcpu) { fpstate_init(&vcpu->arch.guest_fpu.state); - if (cpu_has_xsaves) + if (boot_cpu_has(X86_FEATURE_XSAVES)) vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index fd57d3ae7e16daf24f8cfdb4e708c5e4e9a9f3f9..3847e736702e1153cc7e654087ca716d6b55d7da 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1408,13 +1408,10 @@ __init void lguest_init(void) { /* We're under lguest. */ pv_info.name = "lguest"; - /* Paravirt is enabled. */ - pv_info.paravirt_enabled = 1; /* We're running at privilege level 1, not 0 as normal. */ pv_info.kernel_rpl = 1; /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; - pv_info.features = 0; /* * We set up all the lguest overrides for sensitive operations. These diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index be110efa0096687c1ea35ada3fc439a9cb3f6180..bf2c6074efd2fb97886d974dea2668f8da59c49c 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -29,8 +29,10 @@ * there is contention on the semaphore. * * %eax contains the semaphore pointer on entry. Save the C-clobbered - * registers (%eax, %edx and %ecx) except %eax whish is either a return - * value or just clobbered.. + * registers (%eax, %edx and %ecx) except %eax which is either a return + * value or just gets clobbered. Same is true for %edx so make sure GCC + * reloads it after the slow path, by making it hold a temporary, for + * example see ____down_write(). */ #define save_common_regs \ @@ -106,6 +108,16 @@ ENTRY(call_rwsem_down_write_failed) ret ENDPROC(call_rwsem_down_write_failed) +ENTRY(call_rwsem_down_write_failed_killable) + FRAME_BEGIN + save_common_regs + movq %rax,%rdi + call rwsem_down_write_failed_killable + restore_common_regs + FRAME_END + ret +ENDPROC(call_rwsem_down_write_failed_killable) + ENTRY(call_rwsem_wake) FRAME_BEGIN /* do nothing if still outstanding active readers */ diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 91d93b95bd8685228b395c10e77d30e3a4303355..b559d923878133aadb4480c61e642bbf6c799086 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -612,7 +612,7 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, { stac(); #ifdef CONFIG_X86_INTEL_USERCOPY - if (n > 64 && cpu_has_xmm2) + if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) n = __copy_user_zeroing_intel_nocache(to, from, n); else __copy_user_zeroing(to, from, n); @@ -629,7 +629,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr { stac(); #ifdef CONFIG_X86_INTEL_USERCOPY - if (n > 64 && cpu_has_xmm2) + if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) n = __copy_user_intel_nocache(to, from, n); else __copy_user(to, from, n); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index f98913258c639dac160fa8f01304486c48fb391d..62c0043a5fd545f09a584e2c1f991923c9b16afb 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -2,7 +2,7 @@ KCOV_INSTRUMENT_tlb.o := n obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o setup_nx.o + pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -12,7 +12,6 @@ CFLAGS_setup_nx.o := $(nostackp) CFLAGS_fault.o := -I$(src)/../include/asm/trace obj-$(CONFIG_X86_PAT) += pat_rbtree.o -obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 82447b3fba380d6547619958c07ed7c3d2d3010b..4bb53b89f3c55defd9098d81002eed1877b0fa07 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,5 +1,6 @@ #include #include +#include typedef bool (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int); @@ -42,6 +43,43 @@ bool ex_handler_ext(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_ext); +bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x\n", + (unsigned int)regs->cx); + + /* Pretend that the read succeeded and returned 0. */ + regs->ip = ex_fixup_addr(fixup); + regs->ax = 0; + regs->dx = 0; + return true; +} +EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); + +bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + WARN_ONCE(1, "unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x)\n", + (unsigned int)regs->cx, + (unsigned int)regs->dx, (unsigned int)regs->ax); + + /* Pretend that the write succeeded. */ + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); + +bool ex_handler_clear_fs(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + if (static_cpu_has(X86_BUG_NULL_SEG)) + asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); + asm volatile ("mov %0, %%fs" : : "rm" (0)); + return ex_handler_default(fixup, regs, trapnr); +} +EXPORT_SYMBOL(ex_handler_clear_fs); + bool ex_has_fault_handler(unsigned long ip) { const struct exception_table_entry *e; @@ -82,24 +120,46 @@ int fixup_exception(struct pt_regs *regs, int trapnr) return handler(e, regs, trapnr); } +extern unsigned int early_recursion_flag; + /* Restricted version used during very early boot */ -int __init early_fixup_exception(unsigned long *ip) +void __init early_fixup_exception(struct pt_regs *regs, int trapnr) { - const struct exception_table_entry *e; - unsigned long new_ip; - ex_handler_t handler; - - e = search_exception_tables(*ip); - if (!e) - return 0; - - new_ip = ex_fixup_addr(e); - handler = ex_fixup_handler(e); - - /* special handling not supported during early boot */ - if (handler != ex_handler_default) - return 0; - - *ip = new_ip; - return 1; + /* Ignore early NMIs. */ + if (trapnr == X86_TRAP_NMI) + return; + + if (early_recursion_flag > 2) + goto halt_loop; + + if (regs->cs != __KERNEL_CS) + goto fail; + + /* + * The full exception fixup machinery is available as soon as + * the early IDT is loaded. This means that it is the + * responsibility of extable users to either function correctly + * when handlers are invoked early or to simply avoid causing + * exceptions before they're ready to handle them. + * + * This is better than filtering which handlers can be used, + * because refusing to call a handler here is guaranteed to + * result in a hard-to-debug panic. + * + * Keep in mind that not all vectors actually get here. Early + * fage faults, for example, are special. + */ + if (fixup_exception(regs, trapnr)) + return; + +fail: + early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n", + (unsigned)trapnr, (unsigned long)regs->cs, regs->ip, + regs->orig_ax, read_cr2()); + + show_regs(regs); + +halt_loop: + while (true) + halt(); } diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 740d7ac03a552bc4937edfc8ff7e8b9d044a61b6..14a95054d4e058a85f6b8d80c162aa2617e7f848 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -162,7 +162,7 @@ static __init int setup_hugepagesz(char *opt) unsigned long ps = memparse(opt, &opt); if (ps == PMD_SIZE) { hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (ps == PUD_SIZE && cpu_has_gbpages) { + } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); } else { printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", @@ -177,7 +177,7 @@ __setup("hugepagesz=", setup_hugepagesz); static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ - if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT)) + if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << PUD_SHIFT)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; } diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c new file mode 100644 index 0000000000000000000000000000000000000000..ec21796ac5fd5a95f5e386af57a67d2e4ba51f61 --- /dev/null +++ b/arch/x86/mm/ident_map.c @@ -0,0 +1,79 @@ +/* + * Helper routines for building identity mapping page tables. This is + * included by both the compressed kernel and the regular kernel. + */ + +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | pmd_flag)); + } +} + +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info->pmd_flag, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info->pmd_flag, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pud_t *pud; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; + } + + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 9d56f271d519592a5fbf316237f73b57b0fc49c4..372aad2b32910d30eb3f062e67a7589340847d32 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -157,23 +157,23 @@ static void __init probe_page_size_mask(void) * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ - if (cpu_has_pse && !debug_pagealloc_enabled()) + if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; #endif /* Enable PSE if available */ - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ - if (cpu_has_pge) { + if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } else __supported_pte_mask &= ~_PAGE_GLOBAL; /* Enable 1 GB linear kernel mappings if available: */ - if (direct_gbpages && cpu_has_gbpages) { + if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { printk(KERN_INFO "Using GB pages for direct mapping\n"); page_size_mask |= 1 << PG_LEVEL_1G; } else { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bd7a9b9e2e14a595adfb1c86bbcfacb787a6579c..84df150ee77e7073366799c695cdda8aa04896f8 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -284,7 +284,7 @@ kernel_physical_mapping_init(unsigned long start, */ mapping_iter = 1; - if (!cpu_has_pse) + if (!boot_cpu_has(X86_FEATURE_PSE)) use_pse = 0; repeat: @@ -804,9 +804,6 @@ void __init mem_init(void) BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); #undef high_memory #undef __FIXADDR_TOP -#ifdef CONFIG_RANDOMIZE_BASE - BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE); -#endif #ifdef CONFIG_HIGHMEM BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 214afda979114f3cf1ad7e2d95173e0446a84ed3..bce2e5d9edd458cf28f44ac921f91cc790d11a52 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -58,79 +58,7 @@ #include "mm_internal.h" -static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, - unsigned long addr, unsigned long end) -{ - addr &= PMD_MASK; - for (; addr < end; addr += PMD_SIZE) { - pmd_t *pmd = pmd_page + pmd_index(addr); - - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | pmd_flag)); - } -} -static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, - unsigned long addr, unsigned long end) -{ - unsigned long next; - - for (; addr < end; addr = next) { - pud_t *pud = pud_page + pud_index(addr); - pmd_t *pmd; - - next = (addr & PUD_MASK) + PUD_SIZE; - if (next > end) - next = end; - - if (pud_present(*pud)) { - pmd = pmd_offset(pud, 0); - ident_pmd_init(info->pmd_flag, pmd, addr, next); - continue; - } - pmd = (pmd_t *)info->alloc_pgt_page(info->context); - if (!pmd) - return -ENOMEM; - ident_pmd_init(info->pmd_flag, pmd, addr, next); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - } - - return 0; -} - -int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, - unsigned long addr, unsigned long end) -{ - unsigned long next; - int result; - int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; - - for (; addr < end; addr = next) { - pgd_t *pgd = pgd_page + pgd_index(addr) + off; - pud_t *pud; - - next = (addr & PGDIR_MASK) + PGDIR_SIZE; - if (next > end) - next = end; - - if (pgd_present(*pgd)) { - pud = pud_offset(pgd, 0); - result = ident_pud_init(info, pud, addr, next); - if (result) - return result; - continue; - } - - pud = (pud_t *)info->alloc_pgt_page(info->context); - if (!pud) - return -ENOMEM; - result = ident_pud_init(info, pud, addr, next); - if (result) - return result; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); - } - - return 0; -} +#include "ident_map.c" /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the @@ -1295,7 +1223,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) struct vmem_altmap *altmap = to_vmem_altmap(start); int err; - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) err = vmemmap_populate_hugepages(start, end, node, altmap); else if (altmap) { pr_err_once("%s: no cpu support for altmap allocations\n", @@ -1338,7 +1266,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, } get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); - if (!cpu_has_pse) { + if (!boot_cpu_has(X86_FEATURE_PSE)) { next = (addr + PAGE_SIZE) & PAGE_MASK; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0d8d53d1f5cc29c2376e8d72204896b0be6b4878..f0894910bdd731c0e2adc3fd70d94c5f90f62f2a 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -378,7 +378,7 @@ EXPORT_SYMBOL(iounmap); int __init arch_ioremap_pud_supported(void) { #ifdef CONFIG_X86_64 - return cpu_has_gbpages; + return boot_cpu_has(X86_FEATURE_GBPAGES); #else return 0; #endif @@ -386,7 +386,7 @@ int __init arch_ioremap_pud_supported(void) int __init arch_ioremap_pmd_supported(void) { - return cpu_has_pse; + return boot_cpu_has(X86_FEATURE_PSE); } /* diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 01be9ec3bf792f65ac91cc57fe7d64e8e644998f..7a1f7bbf4105b6ec570c9c15497a00237c5185e9 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1055,7 +1055,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, /* * Map everything starting from the Gb boundary, possibly with 1G pages */ - while (cpu_has_gbpages && end - start >= PUD_SIZE) { + while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | massage_pgprot(pud_pgprot))); @@ -1125,8 +1125,14 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { - if (cpa->pgd) + if (cpa->pgd) { + /* + * Right now, we only execute this code path when mapping + * the EFI virtual memory map regions, no other users + * provide a ->pgd value. This may change in the future. + */ return populate_pgd(cpa, vaddr); + } /* * Ignore all non primary paths. @@ -1460,7 +1466,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, * error case we fall back to cpa_flush_all (which uses * WBINVD): */ - if (!ret && cpu_has_clflush) { + if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { cpa_flush_array(addr, numpages, cache, cpa.flags, pages); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index faec01e7a17d21fbd7abafc42fb9855dc7d530f4..fb0604f11eec268a2cc69b1d47e3885b073c6cf1 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -40,11 +40,22 @@ static bool boot_cpu_done; static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); +static void init_cache_modes(void); -static inline void pat_disable(const char *reason) +void pat_disable(const char *reason) { + if (!__pat_enabled) + return; + + if (boot_cpu_done) { + WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n"); + return; + } + __pat_enabled = 0; pr_info("x86/PAT: %s\n", reason); + + init_cache_modes(); } static int __init nopat(char *str) @@ -181,7 +192,7 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. */ -void pat_init_cache_modes(u64 pat) +static void __init_cache_modes(u64 pat) { enum page_cache_mode cache; char pat_msg[33]; @@ -202,14 +213,11 @@ static void pat_bsp_init(u64 pat) { u64 tmp_pat; - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { pat_disable("PAT not supported by CPU."); return; } - if (!pat_enabled()) - goto done; - rdmsrl(MSR_IA32_CR_PAT, tmp_pat); if (!tmp_pat) { pat_disable("PAT MSR is 0, disabled."); @@ -218,16 +226,12 @@ static void pat_bsp_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); -done: - pat_init_cache_modes(pat); + __init_cache_modes(pat); } static void pat_ap_init(u64 pat) { - if (!pat_enabled()) - return; - - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { /* * If this happens we are on a secondary CPU, but switched to * PAT on the boot CPU. We have no way to undo PAT. @@ -238,18 +242,32 @@ static void pat_ap_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); } -void pat_init(void) +static void init_cache_modes(void) { - u64 pat; - struct cpuinfo_x86 *c = &boot_cpu_data; + u64 pat = 0; + static int init_cm_done; - if (!pat_enabled()) { + if (init_cm_done) + return; + + if (boot_cpu_has(X86_FEATURE_PAT)) { + /* + * CPU supports PAT. Set PAT table to be consistent with + * PAT MSR. This case supports "nopat" boot option, and + * virtual machine environments which support PAT without + * MTRRs. In specific, Xen has unique setup to PAT MSR. + * + * If PAT MSR returns 0, it is considered invalid and emulates + * as No PAT. + */ + rdmsrl(MSR_IA32_CR_PAT, pat); + } + + if (!pat) { /* * No PAT. Emulate the PAT table that corresponds to the two - * cache bits, PWT (Write Through) and PCD (Cache Disable). This - * setup is the same as the BIOS default setup when the system - * has PAT but the "nopat" boot option has been specified. This - * emulated PAT table is used when MSR_IA32_CR_PAT returns 0. + * cache bits, PWT (Write Through) and PCD (Cache Disable). + * This setup is also the same as the BIOS default setup. * * PTE encoding: * @@ -266,10 +284,36 @@ void pat_init(void) */ pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); + } + + __init_cache_modes(pat); + + init_cm_done = 1; +} + +/** + * pat_init - Initialize PAT MSR and PAT table + * + * This function initializes PAT MSR and PAT table with an OS-defined value + * to enable additional cache attributes, WC and WT. + * + * This function must be called on all CPUs using the specific sequence of + * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this + * procedure for PAT. + */ +void pat_init(void) +{ + u64 pat; + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!pat_enabled()) { + init_cache_modes(); + return; + } - } else if ((c->x86_vendor == X86_VENDOR_INTEL) && - (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || - ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { + if ((c->x86_vendor == X86_VENDOR_INTEL) && + (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || + ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { /* * PAT support with the lower four entries. Intel Pentium 2, * 3, M, and 4 are affected by PAT errata, which makes the @@ -734,25 +778,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (file->f_flags & O_DSYNC) pcm = _PAGE_CACHE_MODE_UC_MINUS; -#ifdef CONFIG_X86_32 - /* - * On the PPro and successors, the MTRRs are used to set - * memory types for physical addresses outside main memory, - * so blindly setting UC or PWT on those pages is wrong. - * For Pentiums and earlier, the surround logic should disable - * caching for the high addresses through the KEN pin, but - * we maintain the tradition of paranoia in this code. - */ - if (!pat_enabled() && - !(boot_cpu_has(X86_FEATURE_MTRR) || - boot_cpu_has(X86_FEATURE_K6_MTRR) || - boot_cpu_has(X86_FEATURE_CYRIX_ARR) || - boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && - (pfn << PAGE_SHIFT) >= __pa(high_memory)) { - pcm = _PAGE_CACHE_MODE_UC; - } -#endif - *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | cachemode2protval(pcm)); return 1; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index fe9b9f77636168752f989b9d007634396864bf13..5643fd0b1a7d271da14dee848589d99437b25b49 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,8 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +#ifdef CONFIG_SMP + struct flush_tlb_info { struct mm_struct *flush_mm; unsigned long flush_start; @@ -57,6 +59,118 @@ void leave_mm(int cpu) } EXPORT_SYMBOL_GPL(leave_mm); +#endif /* CONFIG_SMP */ + +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} + +void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + + if (likely(prev != next)) { +#ifdef CONFIG_SMP + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + this_cpu_write(cpu_tlbstate.active_mm, next); +#endif + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. (If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. + * + */ + load_cr3(next->pgd); + + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + + /* Stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + + /* Load per-mm CR4 state */ + load_mm_cr4(next); + +#ifdef CONFIG_MODIFY_LDT_SYSCALL + /* + * Load the LDT, if the LDT is different. + * + * It's possible that prev->context.ldt doesn't match + * the LDT register. This can happen if leave_mm(prev) + * was called and then modify_ldt changed + * prev->context.ldt but suppressed an IPI to this CPU. + * In this case, prev->context.ldt != NULL, because we + * never set context.ldt to NULL while the mm still + * exists. That means that next->context.ldt != + * prev->context.ldt, because mms never share an LDT. + */ + if (unlikely(prev->context.ldt != next->context.ldt)) + load_mm_ldt(next); +#endif + } +#ifdef CONFIG_SMP + else { + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); + + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * On established mms, the mm_cpumask is only changed + * from irq context, from ptep_clear_flush() while in + * lazy tlb mode, and here. Irqs are blocked during + * schedule, protecting us from simultaneous changes. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. + * + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. + */ + load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + load_mm_cr4(next); + load_mm_ldt(next); + } + } +#endif +} + +#ifdef CONFIG_SMP + /* * The flush IPI assumes that a thread switch happens in this order: * [cpu0: the cpu that switches] @@ -353,3 +467,5 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 0e07e0968c3a0d5959d554c0c29ab358eb78b82b..28c04123b6ddaebce73e967644cf589a0b524ac2 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -636,7 +636,7 @@ static int __init ppro_init(char **cpu_type) __u8 cpu_model = boot_cpu_data.x86_model; struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ - if (force_cpu_type == arch_perfmon && cpu_has_arch_perfmon) + if (force_cpu_type == arch_perfmon && boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) return 0; /* @@ -700,7 +700,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) char *cpu_type = NULL; int ret = 0; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; if (force_cpu_type == timer) @@ -761,7 +761,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (cpu_type) break; - if (!cpu_has_arch_perfmon) + if (!boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) return -ENODEV; /* use arch perfmon as fallback */ diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index d90528ea541206b57f3048e191d4340ee070b40b..350f7096baac82893bc076fd6db4d04a685d7104 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -75,7 +75,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, u64 val; int i; - if (cpu_has_arch_perfmon) { + if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) { union cpuid10_eax eax; eax.full = cpuid_eax(0xa); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index beac4dfdade6c05c02591e924fd913d5db53abcd..4bd08b0fc8ea1b1c9badf0128920858ef8d41336 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -445,7 +445,7 @@ void __init xen_msi_init(void) uint32_t eax = cpuid_eax(xen_cpuid_base() + 4); if (((eax & XEN_HVM_CPUID_X2APIC_VIRT) && x2apic_mode) || - ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && cpu_has_apic)) + ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && boot_cpu_has(X86_FEATURE_APIC))) return; } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 994a7df84a7bc713aa4913f04fc7c85992840183..f93545e7dc54e7e2aa19bf494db55eb74080b739 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -54,10 +54,6 @@ #include #include -#define EFI_DEBUG - -struct efi_memory_map memmap; - static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; @@ -119,11 +115,10 @@ void efi_get_time(struct timespec *now) void __init efi_find_mirror(void) { - void *p; + efi_memory_desc_t *md; u64 mirror_size = 0, total_size = 0; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; @@ -146,10 +141,9 @@ void __init efi_find_mirror(void) static void __init do_add_efi_memmap(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; int e820_type; @@ -209,47 +203,47 @@ int __init efi_memblock_x86_reserve_range(void) #else pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); #endif - memmap.phys_map = pmap; - memmap.nr_map = e->efi_memmap_size / + efi.memmap.phys_map = pmap; + efi.memmap.nr_map = e->efi_memmap_size / e->efi_memdesc_size; - memmap.desc_size = e->efi_memdesc_size; - memmap.desc_version = e->efi_memdesc_version; + efi.memmap.desc_size = e->efi_memdesc_size; + efi.memmap.desc_version = e->efi_memdesc_version; - memblock_reserve(pmap, memmap.nr_map * memmap.desc_size); + WARN(efi.memmap.desc_version != 1, + "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", + efi.memmap.desc_version); - efi.memmap = &memmap; + memblock_reserve(pmap, efi.memmap.nr_map * efi.memmap.desc_size); return 0; } void __init efi_print_memmap(void) { -#ifdef EFI_DEBUG efi_memory_desc_t *md; - void *p; - int i; + int i = 0; - for (p = memmap.map, i = 0; - p < memmap.map_end; - p += memmap.desc_size, i++) { + for_each_efi_memory_desc(md) { char buf[64]; - md = p; pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n", - i, efi_md_typeattr_format(buf, sizeof(buf), md), + i++, efi_md_typeattr_format(buf, sizeof(buf), md), md->phys_addr, md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1, (md->num_pages >> (20 - EFI_PAGE_SHIFT))); } -#endif /* EFI_DEBUG */ } void __init efi_unmap_memmap(void) { + unsigned long size; + clear_bit(EFI_MEMMAP, &efi.flags); - if (memmap.map) { - early_memunmap(memmap.map, memmap.nr_map * memmap.desc_size); - memmap.map = NULL; + + size = efi.memmap.nr_map * efi.memmap.desc_size; + if (efi.memmap.map) { + early_memunmap(efi.memmap.map, size); + efi.memmap.map = NULL; } } @@ -352,8 +346,6 @@ static int __init efi_systab_init(void *phys) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff); - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - return 0; } @@ -440,17 +432,22 @@ static int __init efi_runtime_init(void) static int __init efi_memmap_init(void) { + unsigned long addr, size; + if (efi_enabled(EFI_PARAVIRT)) return 0; /* Map the EFI memory map */ - memmap.map = early_memremap((unsigned long)memmap.phys_map, - memmap.nr_map * memmap.desc_size); - if (memmap.map == NULL) { + size = efi.memmap.nr_map * efi.memmap.desc_size; + addr = (unsigned long)efi.memmap.phys_map; + + efi.memmap.map = early_memremap(addr, size); + if (efi.memmap.map == NULL) { pr_err("Could not map the memory map!\n"); return -ENOMEM; } - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + + efi.memmap.map_end = efi.memmap.map + size; if (add_efi_memmap) do_add_efi_memmap(); @@ -552,12 +549,9 @@ void __init efi_set_executable(efi_memory_desc_t *md, bool executable) void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; - void *p; /* Make EFI runtime service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - + for_each_efi_memory_desc(md) { if (md->type != EFI_RUNTIME_SERVICES_CODE) continue; @@ -604,12 +598,10 @@ void __init old_map_region(efi_memory_desc_t *md) /* Merge contiguous regions of the same type and attribute */ static void __init efi_merge_regions(void) { - void *p; efi_memory_desc_t *md, *prev_md = NULL; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + for_each_efi_memory_desc(md) { u64 prev_size; - md = p; if (!prev_md) { prev_md = md; @@ -651,30 +643,31 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md) static void __init save_runtime_map(void) { #ifdef CONFIG_KEXEC_CORE + unsigned long desc_size; efi_memory_desc_t *md; - void *tmp, *p, *q = NULL; + void *tmp, *q = NULL; int count = 0; if (efi_enabled(EFI_OLD_MEMMAP)) return; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + desc_size = efi.memmap.desc_size; + for_each_efi_memory_desc(md) { if (!(md->attribute & EFI_MEMORY_RUNTIME) || (md->type == EFI_BOOT_SERVICES_CODE) || (md->type == EFI_BOOT_SERVICES_DATA)) continue; - tmp = krealloc(q, (count + 1) * memmap.desc_size, GFP_KERNEL); + tmp = krealloc(q, (count + 1) * desc_size, GFP_KERNEL); if (!tmp) goto out; q = tmp; - memcpy(q + count * memmap.desc_size, md, memmap.desc_size); + memcpy(q + count * desc_size, md, desc_size); count++; } - efi_runtime_map_setup(q, count, memmap.desc_size); + efi_runtime_map_setup(q, count, desc_size); return; out: @@ -714,10 +707,10 @@ static inline void *efi_map_next_entry_reverse(void *entry) { /* Initial call */ if (!entry) - return memmap.map_end - memmap.desc_size; + return efi.memmap.map_end - efi.memmap.desc_size; - entry -= memmap.desc_size; - if (entry < memmap.map) + entry -= efi.memmap.desc_size; + if (entry < efi.memmap.map) return NULL; return entry; @@ -759,10 +752,10 @@ static void *efi_map_next_entry(void *entry) /* Initial call */ if (!entry) - return memmap.map; + return efi.memmap.map; - entry += memmap.desc_size; - if (entry >= memmap.map_end) + entry += efi.memmap.desc_size; + if (entry >= efi.memmap.map_end) return NULL; return entry; @@ -776,8 +769,11 @@ static void * __init efi_map_regions(int *count, int *pg_shift) { void *p, *new_memmap = NULL; unsigned long left = 0; + unsigned long desc_size; efi_memory_desc_t *md; + desc_size = efi.memmap.desc_size; + p = NULL; while ((p = efi_map_next_entry(p))) { md = p; @@ -792,7 +788,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift) efi_map_region(md); get_systab_virt_addr(md); - if (left < memmap.desc_size) { + if (left < desc_size) { new_memmap = realloc_pages(new_memmap, *pg_shift); if (!new_memmap) return NULL; @@ -801,10 +797,9 @@ static void * __init efi_map_regions(int *count, int *pg_shift) (*pg_shift)++; } - memcpy(new_memmap + (*count * memmap.desc_size), md, - memmap.desc_size); + memcpy(new_memmap + (*count * desc_size), md, desc_size); - left -= memmap.desc_size; + left -= desc_size; (*count)++; } @@ -816,7 +811,6 @@ static void __init kexec_enter_virtual_mode(void) #ifdef CONFIG_KEXEC_CORE efi_memory_desc_t *md; unsigned int num_pages; - void *p; efi.systab = NULL; @@ -840,8 +834,7 @@ static void __init kexec_enter_virtual_mode(void) * Map efi regions which were passed via setup_data. The virt_addr is a * fixed addr which was used in first kernel of a kexec boot. */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { efi_map_region_fixed(md); /* FIXME: add error handling */ get_systab_virt_addr(md); } @@ -850,10 +843,10 @@ static void __init kexec_enter_virtual_mode(void) BUG_ON(!efi.systab); - num_pages = ALIGN(memmap.nr_map * memmap.desc_size, PAGE_SIZE); + num_pages = ALIGN(efi.memmap.nr_map * efi.memmap.desc_size, PAGE_SIZE); num_pages >>= PAGE_SHIFT; - if (efi_setup_page_tables(memmap.phys_map, num_pages)) { + if (efi_setup_page_tables(efi.memmap.phys_map, num_pages)) { clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -937,16 +930,16 @@ static void __init __efi_enter_virtual_mode(void) if (efi_is_native()) { status = phys_efi_set_virtual_address_map( - memmap.desc_size * count, - memmap.desc_size, - memmap.desc_version, + efi.memmap.desc_size * count, + efi.memmap.desc_size, + efi.memmap.desc_version, (efi_memory_desc_t *)__pa(new_memmap)); } else { status = efi_thunk_set_virtual_address_map( efi_phys.set_virtual_address_map, - memmap.desc_size * count, - memmap.desc_size, - memmap.desc_version, + efi.memmap.desc_size * count, + efi.memmap.desc_size, + efi.memmap.desc_version, (efi_memory_desc_t *)__pa(new_memmap)); } @@ -1011,13 +1004,11 @@ void __init efi_enter_virtual_mode(void) u32 efi_mem_type(unsigned long phys_addr) { efi_memory_desc_t *md; - void *p; if (!efi_enabled(EFI_MEMMAP)) return 0; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)))) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 49e4dd4a1f58257630c8b341bf8b12922200cd2e..6e7242be1c8744b89af4a96587fb79d3a387e77a 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -55,14 +55,12 @@ struct efi_scratch efi_scratch; static void __init early_code_mapping_set_exec(int executable) { efi_memory_desc_t *md; - void *p; if (!(__supported_pte_mask & _PAGE_NX)) return; /* Make EFI service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { if (md->type == EFI_RUNTIME_SERVICES_CODE || md->type == EFI_BOOT_SERVICES_CODE) efi_set_executable(md, executable); @@ -253,7 +251,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * Map all of RAM so that we can access arguments in the 1:1 * mapping when making EFI runtime calls. */ - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { if (md->type != EFI_CONVENTIONAL_MEMORY && md->type != EFI_LOADER_DATA && md->type != EFI_LOADER_CODE) @@ -398,7 +396,6 @@ void __init efi_runtime_update_mappings(void) unsigned long pfn; pgd_t *pgd = efi_pgd; efi_memory_desc_t *md; - void *p; if (efi_enabled(EFI_OLD_MEMMAP)) { if (__supported_pte_mask & _PAGE_NX) @@ -409,9 +406,8 @@ void __init efi_runtime_update_mappings(void) if (!efi_enabled(EFI_NX_PE_DATA)) return; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + for_each_efi_memory_desc(md) { unsigned long pf = 0; - md = p; if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index ab50ada1d56e4f87f6fff3207995ea32312fb37b..097cb09d917b1f826c13d6fb498115a2dd690d31 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -195,10 +195,9 @@ static bool can_free_region(u64 start, u64 size) */ void __init efi_reserve_boot_services(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { u64 start = md->phys_addr; u64 size = md->num_pages << EFI_PAGE_SHIFT; bool already_reserved; @@ -250,10 +249,9 @@ void __init efi_reserve_boot_services(void) void __init efi_free_boot_services(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 1584cbed0dce25eab259cdaf00ad49724cd09bac..815fec6e05e2b8801f83c0aa34baef9aeeeb27ef 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -21,19 +21,20 @@ #include #include +#include #include #include #include #include -static struct uv_systab uv_systab; +struct uv_systab *uv_systab; s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { - struct uv_systab *tab = &uv_systab; + struct uv_systab *tab = uv_systab; s64 ret; - if (!tab->function) + if (!tab || !tab->function) /* * BIOS does not support UV systab */ @@ -183,34 +184,31 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) } EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); - #ifdef CONFIG_EFI void uv_bios_init(void) { - struct uv_systab *tab; - - if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || - (efi.uv_systab == (unsigned long)NULL)) { - printk(KERN_CRIT "No EFI UV System Table.\n"); - uv_systab.function = (unsigned long)NULL; + uv_systab = NULL; + if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab) { + pr_crit("UV: UVsystab: missing\n"); return; } - tab = (struct uv_systab *)ioremap(efi.uv_systab, - sizeof(struct uv_systab)); - if (strncmp(tab->signature, "UVST", 4) != 0) - printk(KERN_ERR "bad signature in UV system table!"); - - /* - * Copy table to permanent spot for later use. - */ - memcpy(&uv_systab, tab, sizeof(struct uv_systab)); - iounmap(tab); + uv_systab = ioremap(efi.uv_systab, sizeof(struct uv_systab)); + if (!uv_systab || strncmp(uv_systab->signature, UV_SYSTAB_SIG, 4)) { + pr_err("UV: UVsystab: bad signature!\n"); + iounmap(uv_systab); + return; + } - printk(KERN_INFO "EFI UV System Table Revision %d\n", - uv_systab.revision); + if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) { + iounmap(uv_systab); + uv_systab = ioremap(efi.uv_systab, uv_systab->size); + if (!uv_systab) { + pr_err("UV: UVsystab: ioremap(%d) failed!\n", + uv_systab->size); + return; + } + } + pr_info("UV: UVsystab: Revision:%x\n", uv_systab->revision); } -#else /* !CONFIG_EFI */ - -void uv_bios_init(void) { } #endif diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3b6ec42718e460717182c2762ffd7e6a005fff6f..fdb4d42b4ce50c57ac4933c6aa3604dfa12ab9cd 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -37,7 +37,7 @@ static int timeout_base_ns[] = { }; static int timeout_us; -static int nobau; +static bool nobau = true; static int nobau_perm; static cycles_t congested_cycles; @@ -106,13 +106,28 @@ static char *stat_description[] = { "enable: number times use of the BAU was re-enabled" }; -static int __init -setup_nobau(char *arg) +static int __init setup_bau(char *arg) { - nobau = 1; + int result; + + if (!arg) + return -EINVAL; + + result = strtobool(arg, &nobau); + if (result) + return result; + + /* we need to flip the logic here, so that bau=y sets nobau to false */ + nobau = !nobau; + + if (!nobau) + pr_info("UV BAU Enabled\n"); + else + pr_info("UV BAU Disabled\n"); + return 0; } -early_param("nobau", setup_nobau); +early_param("bau", setup_bau); /* base pnode in this partition */ static int uv_base_pnode __read_mostly; @@ -131,10 +146,10 @@ set_bau_on(void) pr_info("BAU not initialized; cannot be turned on\n"); return; } - nobau = 0; + nobau = false; for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); - bcp->nobau = 0; + bcp->nobau = false; } pr_info("BAU turned on\n"); return; @@ -146,10 +161,10 @@ set_bau_off(void) int cpu; struct bau_control *bcp; - nobau = 1; + nobau = true; for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); - bcp->nobau = 1; + bcp->nobau = true; } pr_info("BAU turned off\n"); return; @@ -1886,7 +1901,7 @@ static void __init init_per_cpu_tunables(void) bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; if (nobau) - bcp->nobau = 1; + bcp->nobau = true; bcp->statp = &per_cpu(ptcstats, cpu); /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = usec_2_cycles(2*timeout_us); @@ -2025,7 +2040,8 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, return 1; } bcp->uvhub_master = *hmasterp; - bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; + bcp->uvhub_cpu = uv_cpu_blade_processor_id(cpu); + if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { printk(KERN_EMERG "%d cpus per uvhub invalid\n", bcp->uvhub_cpu); diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c index 5d4ba301e776eb238e1d8e3736c7fb3fa6fb824c..e9da9ebd924a60676ee2cd60c32702e6ce0e6e50 100644 --- a/arch/x86/platform/uv/uv_sysfs.c +++ b/arch/x86/platform/uv/uv_sysfs.c @@ -34,7 +34,7 @@ static ssize_t partition_id_show(struct kobject *kobj, static ssize_t coherence_id_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id()); + return snprintf(buf, PAGE_SIZE, "%ld\n", uv_partition_coherence_id()); } static struct kobj_attribute partition_id_attr = diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 2b158a9fa1d796aa4ea688b04376b27a57724116..b333fc45f9ec8025e72b2ad29abfeffbc7285e4b 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -165,7 +165,7 @@ static __init int uv_rtc_allocate_timers(void) for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); int bid = uv_cpu_to_blade_id(cpu); - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + int bcpu = uv_cpu_blade_processor_id(cpu); struct uv_rtc_timer_head *head = blade_info[bid]; if (!head) { @@ -226,7 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) int pnode = uv_cpu_to_pnode(cpu); int bid = uv_cpu_to_blade_id(cpu); struct uv_rtc_timer_head *head = blade_info[bid]; - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + int bcpu = uv_cpu_blade_processor_id(cpu); u64 *t = &head->cpu[bcpu].expires; unsigned long flags; int next_cpu; @@ -262,7 +262,7 @@ static int uv_rtc_unset_timer(int cpu, int force) int pnode = uv_cpu_to_pnode(cpu); int bid = uv_cpu_to_blade_id(cpu); struct uv_rtc_timer_head *head = blade_info[bid]; - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + int bcpu = uv_cpu_blade_processor_id(cpu); u64 *t = &head->cpu[bcpu].expires; unsigned long flags; int rc = 0; diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 291226b952a997f55d3a0be723f15cb9b9b1ab92..9f14bd34581d663a22cb326d0ad3f98b7c2822d6 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -106,7 +106,7 @@ static int resume_physical_mapping_init(pgd_t *pgd_base) * normal page tables. * NOTE: We can mark everything as executable here */ - if (cpu_has_pse) { + if (boot_cpu_has(X86_FEATURE_PSE)) { set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); pfn += PTRS_PER_PTE; } else { diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index df280da348255de9178c0c98cdb1f66b31f13f49..d957d5f21a86563e5259cf6cca9228fb59646118 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,4 +1,4 @@ -config AMD_MCE_INJ +config MCE_AMD_INJ tristate "Simple MCE injection interface for AMD processors" depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB default n diff --git a/arch/x86/ras/Makefile b/arch/x86/ras/Makefile index dd2c98b84037ba20813048e9dc8c13ef89302556..5f94546db280ca672b9da41bf09036f9c53cdd52 100644 --- a/arch/x86/ras/Makefile +++ b/arch/x86/ras/Makefile @@ -1,2 +1,2 @@ -obj-$(CONFIG_AMD_MCE_INJ) += mce_amd_inj.o +obj-$(CONFIG_MCE_AMD_INJ) += mce_amd_inj.o diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 9e02dcaef68311ed376f8fcd0579d6c207e80103..e69f4701a076da1006c85532de3a585822c2578f 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -290,14 +290,33 @@ static void do_inject(void) wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS, (u32)mcg_status, (u32)(mcg_status >> 32)); - wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b), - (u32)i_mce.status, (u32)(i_mce.status >> 32)); + if (boot_cpu_has(X86_FEATURE_SMCA)) { + if (inj_type == DFR_INT_INJ) { + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DESTAT(b), + (u32)i_mce.status, (u32)(i_mce.status >> 32)); + + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DEADDR(b), + (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); + } else { + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_STATUS(b), + (u32)i_mce.status, (u32)(i_mce.status >> 32)); + + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_ADDR(b), + (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); + } + + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(b), + (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); + } else { + wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b), + (u32)i_mce.status, (u32)(i_mce.status >> 32)); - wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b), - (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); + wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b), + (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); - wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b), - (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); + wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b), + (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); + } toggle_hw_mce_inject(cpu, false); diff --git a/arch/x86/tools/calc_run_size.sh b/arch/x86/tools/calc_run_size.sh deleted file mode 100644 index 1a4c17bb39108264dc33e7e2fea61ad1708ff296..0000000000000000000000000000000000000000 --- a/arch/x86/tools/calc_run_size.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/sh -# -# Calculate the amount of space needed to run the kernel, including room for -# the .bss and .brk sections. -# -# Usage: -# objdump -h a.out | sh calc_run_size.sh - -NUM='\([0-9a-fA-F]*[ \t]*\)' -OUT=$(sed -n 's/^[ \t0-9]*.b[sr][sk][ \t]*'"$NUM$NUM$NUM$NUM"'.*/\1\4/p') -if [ -z "$OUT" ] ; then - echo "Never found .bss or .brk file offset" >&2 - exit 1 -fi - -OUT=$(echo ${OUT# }) -sizeA=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -offsetA=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -sizeB=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -offsetB=$(printf "%d" 0x${OUT%% *}) - -run_size=$(( $offsetA + $sizeA + $sizeB )) - -# BFD linker shows the same file offset in ELF. -if [ "$offsetA" -ne "$offsetB" ] ; then - # Gold linker shows them as consecutive. - endB=$(( $offsetB + $sizeB )) - if [ "$endB" != "$run_size" ] ; then - printf "sizeA: 0x%x\n" $sizeA >&2 - printf "offsetA: 0x%x\n" $offsetA >&2 - printf "sizeB: 0x%x\n" $sizeB >&2 - printf "offsetB: 0x%x\n" $offsetB >&2 - echo ".bss and .brk are non-contiguous" >&2 - exit 1 - fi -fi - -printf "%d\n" $run_size -exit 0 diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 880862c7d9ddba51e1b6964bc80dcf49d6a8b6ff..760789ae8562af21932adc93e56dca10717915ab 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -75,7 +75,6 @@ #include #include #include -#include #include #ifdef CONFIG_ACPI @@ -1093,6 +1092,26 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) return ret; } +static u64 xen_read_msr(unsigned int msr) +{ + /* + * This will silently swallow a #GP from RDMSR. It may be worth + * changing that. + */ + int err; + + return xen_read_msr_safe(msr, &err); +} + +static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) +{ + /* + * This will silently swallow a #GP from WRMSR. It may be worth + * changing that. + */ + xen_write_msr_safe(msr, low, high); +} + void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -1187,13 +1206,11 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, } static const struct pv_info xen_info __initconst = { - .paravirt_enabled = 1, .shared_kernel_pmd = 0, #ifdef CONFIG_X86_64 .extra_user_64bit_cs = FLAT_USER_CS64, #endif - .features = 0, .name = "Xen", }; @@ -1223,8 +1240,11 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .wbinvd = native_wbinvd, - .read_msr = xen_read_msr_safe, - .write_msr = xen_write_msr_safe, + .read_msr = xen_read_msr, + .write_msr = xen_write_msr, + + .read_msr_safe = xen_read_msr_safe, + .write_msr_safe = xen_write_msr_safe, .read_pmc = xen_read_pmc, @@ -1469,10 +1489,10 @@ static void xen_pvh_set_cr_flags(int cpu) * For BSP, PSE PGE are set in probe_page_size_mask(), for APs * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu(). */ - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) cr4_set_bits_and_update_boot(X86_CR4_PSE); - if (cpu_has_pge) + if (boot_cpu_has(X86_FEATURE_PGE)) cr4_set_bits_and_update_boot(X86_CR4_PGE); } @@ -1506,12 +1526,16 @@ static void __init xen_pvh_early_guest_init(void) } #endif /* CONFIG_XEN_PVH */ +static void __init xen_dom0_set_legacy_features(void) +{ + x86_platform.legacy.rtc = 1; +} + /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; unsigned long initrd_start = 0; - u64 pat; int rc; if (!xen_start_info) @@ -1527,8 +1551,6 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - if (xen_initial_domain()) - pv_info.features |= PV_SUPPORTED_RTC; pv_init_ops = xen_init_ops; if (!xen_pvh_domain()) { pv_cpu_ops = xen_cpu_ops; @@ -1618,13 +1640,6 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_start_info->nr_pages); xen_reserve_special_pages(); - /* - * Modify the cache mode translation tables to match Xen's PAT - * configuration. - */ - rdmsrl(MSR_IA32_CR_PAT, pat); - pat_init_cache_modes(pat); - /* keep using Xen gdt for now; no urgent need to change it */ #ifdef CONFIG_X86_32 @@ -1670,6 +1685,7 @@ asmlinkage __visible void __init xen_start_kernel(void) boot_params.hdr.ramdisk_image = initrd_start; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); + boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); @@ -1687,6 +1703,8 @@ asmlinkage __visible void __init xen_start_kernel(void) .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, }; + x86_platform.set_legacy_features = + xen_dom0_set_legacy_features; xen_init_vga(info, xen_start_info->console.dom0.info_size); xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index b56855a1382a374f8c52632b9aa243a1112a1a06..28cf4c5d65efade019dbefaf809e71d13624e37c 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -22,6 +22,7 @@ generic-y += mm-arch-hooks.h generic-y += percpu.h generic-y += preempt.h generic-y += resource.h +generic-y += rwsem.h generic-y += sections.h generic-y += siginfo.h generic-y += statfs.h diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h deleted file mode 100644 index 249619e7e7f2aa161060e7f9eb64686b9116e8b8..0000000000000000000000000000000000000000 --- a/arch/xtensa/include/asm/rwsem.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * include/asm-xtensa/rwsem.h - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Largely copied from include/asm-ppc/rwsem.h - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - -#ifndef _XTENSA_RWSEM_H -#define _XTENSA_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "Please don't include directly, use instead." -#endif - -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (atomic_add_return(1,(atomic_t *)(&sem->count)) > 0) - smp_wmb(); - else - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - int tmp; - - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - smp_wmb(); - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)); - if (tmp == RWSEM_ACTIVE_WRITE_BIAS) - smp_wmb(); - else - rwsem_down_write_failed(sem); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - int tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - smp_wmb(); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_sub_return(1,(atomic_t *)(&sem->count)); - if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - smp_wmb(); - if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)) < 0) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) -{ - atomic_add(delta, (atomic_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); - if (tmp < 0) - rwsem_downgrade_wake(sem); -} - -/* - * implement exchange and add functionality - */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) -{ - smp_mb(); - return atomic_add_return(delta, (atomic_t *)(&sem->count)); -} - -#endif /* _XTENSA_RWSEM_H */ diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c index 54f01188c29c1a4048ac166d9383d7f3a26905c8..a6b00b3af42993e937181a8412c8949f8ae65983 100644 --- a/arch/xtensa/kernel/perf_event.c +++ b/arch/xtensa/kernel/perf_event.c @@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data) void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { - xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH, + xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack, callchain_trace, NULL, entry); } void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { - xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH, + xtensa_backtrace_user(regs, sysctl_perf_event_max_stack, callchain_trace, entry); } diff --git a/block/blk-map.c b/block/blk-map.c index a54f0543b956e5ccf5f20206e7983fe66ce2698d..b9f88b7751fbd87742b1d1439a1d89c97818f9ce 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -9,24 +9,6 @@ #include "blk.h" -static bool iovec_gap_to_prv(struct request_queue *q, - struct iovec *prv, struct iovec *cur) -{ - unsigned long prev_end; - - if (!queue_virt_boundary(q)) - return false; - - if (prv->iov_base == NULL && prv->iov_len == 0) - /* prv is not set - don't check */ - return false; - - prev_end = (unsigned long)(prv->iov_base + prv->iov_len); - - return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) || - prev_end & queue_virt_boundary(q)); -} - int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -125,31 +107,18 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; - bool copy = (q->dma_pad_mask & iter->count) || map_data; + bool copy = false; + unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); struct bio *bio = NULL; struct iov_iter i; int ret; - if (!iter || !iter->count) - return -EINVAL; - - iov_for_each(iov, i, *iter) { - unsigned long uaddr = (unsigned long) iov.iov_base; - - if (!iov.iov_len) - return -EINVAL; - - /* - * Keep going so we check length of all segments - */ - if ((uaddr & queue_dma_alignment(q)) || - iovec_gap_to_prv(q, &prv, &iov)) - copy = true; - - prv.iov_base = iov.iov_base; - prv.iov_len = iov.iov_len; - } + if (map_data) + copy = true; + else if (iov_iter_alignment(iter) & align) + copy = true; + else if (queue_virt_boundary(q)) + copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); i = *iter; do { diff --git a/crypto/testmgr.c b/crypto/testmgr.c index b86883aedca11a617ad2e19bafe646bfecac583a..7d4acc4492338921dd1307c1cc4d4a252130b36a 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1776,6 +1776,7 @@ static int alg_test_drbg(const struct alg_test_desc *desc, const char *driver, static int do_test_rsa(struct crypto_akcipher *tfm, struct akcipher_testvec *vecs) { + char *xbuf[XBUFSIZE]; struct akcipher_request *req; void *outbuf_enc = NULL; void *outbuf_dec = NULL; @@ -1784,9 +1785,12 @@ static int do_test_rsa(struct crypto_akcipher *tfm, int err = -ENOMEM; struct scatterlist src, dst, src_tab[2]; + if (testmgr_alloc_buf(xbuf)) + return err; + req = akcipher_request_alloc(tfm, GFP_KERNEL); if (!req) - return err; + goto free_xbuf; init_completion(&result.completion); @@ -1804,9 +1808,14 @@ static int do_test_rsa(struct crypto_akcipher *tfm, if (!outbuf_enc) goto free_req; + if (WARN_ON(vecs->m_size > PAGE_SIZE)) + goto free_all; + + memcpy(xbuf[0], vecs->m, vecs->m_size); + sg_init_table(src_tab, 2); - sg_set_buf(&src_tab[0], vecs->m, 8); - sg_set_buf(&src_tab[1], vecs->m + 8, vecs->m_size - 8); + sg_set_buf(&src_tab[0], xbuf[0], 8); + sg_set_buf(&src_tab[1], xbuf[0] + 8, vecs->m_size - 8); sg_init_one(&dst, outbuf_enc, out_len_max); akcipher_request_set_crypt(req, src_tab, &dst, vecs->m_size, out_len_max); @@ -1825,7 +1834,7 @@ static int do_test_rsa(struct crypto_akcipher *tfm, goto free_all; } /* verify that encrypted message is equal to expected */ - if (memcmp(vecs->c, sg_virt(req->dst), vecs->c_size)) { + if (memcmp(vecs->c, outbuf_enc, vecs->c_size)) { pr_err("alg: rsa: encrypt test failed. Invalid output\n"); err = -EINVAL; goto free_all; @@ -1840,7 +1849,13 @@ static int do_test_rsa(struct crypto_akcipher *tfm, err = -ENOMEM; goto free_all; } - sg_init_one(&src, vecs->c, vecs->c_size); + + if (WARN_ON(vecs->c_size > PAGE_SIZE)) + goto free_all; + + memcpy(xbuf[0], vecs->c, vecs->c_size); + + sg_init_one(&src, xbuf[0], vecs->c_size); sg_init_one(&dst, outbuf_dec, out_len_max); init_completion(&result.completion); akcipher_request_set_crypt(req, &src, &dst, vecs->c_size, out_len_max); @@ -1867,6 +1882,8 @@ static int do_test_rsa(struct crypto_akcipher *tfm, kfree(outbuf_enc); free_req: akcipher_request_free(req); +free_xbuf: + testmgr_free_buf(xbuf); return err; } diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h index 5c79526245c2e3309a28766b781a9f2bccc850d5..a0380338946a1dbd6cb7e54ab70b001edc77eef7 100644 --- a/drivers/base/regmap/internal.h +++ b/drivers/base/regmap/internal.h @@ -13,6 +13,7 @@ #ifndef _REGMAP_INTERNAL_H #define _REGMAP_INTERNAL_H +#include #include #include #include diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c index 7526906ca080f81dcff1499b7e57c2ff0b79569e..5189fd6182f6c6126b21b1d6b3e25f760d4df290 100644 --- a/drivers/base/regmap/regmap-mmio.c +++ b/drivers/base/regmap/regmap-mmio.c @@ -23,6 +23,8 @@ #include #include +#include "internal.h" + struct regmap_mmio_context { void __iomem *regs; unsigned val_bytes; @@ -212,6 +214,7 @@ static const struct regmap_bus regmap_mmio = { .reg_write = regmap_mmio_write, .reg_read = regmap_mmio_read, .free_context = regmap_mmio_free_context, + .val_format_endian_default = REGMAP_ENDIAN_LITTLE, }; static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev, @@ -245,7 +248,7 @@ static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev, ctx->val_bytes = config->val_bits / 8; ctx->clk = ERR_PTR(-ENODEV); - switch (config->reg_format_endian) { + switch (regmap_get_val_endian(dev, ®map_mmio, config)) { case REGMAP_ENDIAN_DEFAULT: case REGMAP_ENDIAN_LITTLE: #ifdef __LITTLE_ENDIAN diff --git a/drivers/base/regmap/regmap-spmi.c b/drivers/base/regmap/regmap-spmi.c index 7e58f656039900e633729828f5eae5fb1700b93a..4a36e415e938560ce2e3ba927b81888444767a7d 100644 --- a/drivers/base/regmap/regmap-spmi.c +++ b/drivers/base/regmap/regmap-spmi.c @@ -142,7 +142,7 @@ static int regmap_spmi_ext_read(void *context, while (val_size) { len = min_t(size_t, val_size, 8); - err = spmi_ext_register_readl(context, addr, val, val_size); + err = spmi_ext_register_readl(context, addr, val, len); if (err) goto err_out; diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c index 0f6b229afcb9e621eb116f6ce805a619ddfe96b3..247bfa8eaddbf3659daec2949561129e4a1a4db1 100644 --- a/drivers/cpufreq/longhaul.c +++ b/drivers/cpufreq/longhaul.c @@ -945,7 +945,7 @@ static int __init longhaul_init(void) } #endif #ifdef CONFIG_X86_IO_APIC - if (cpu_has_apic) { + if (boot_cpu_has(X86_FEATURE_APIC)) { printk(KERN_ERR PFX "APIC detected. Longhaul is currently " "broken in this configuration.\n"); return -ENODEV; diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 49768c08ac0734345007a1faaae7eef70cd34f60..9b6800a79c7f3662bcf80f0a8c972837c4d4cbf5 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1052,7 +1052,6 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) struct mce *m = (struct mce *)data; struct cpuinfo_x86 *c = &cpu_data(m->extcpu); int ecc; - u32 ebx = cpuid_ebx(0x80000007); if (amd_filter_mce(m)) return NOTIFY_STOP; @@ -1075,7 +1074,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); - if (!!(ebx & BIT(3))) { + if (boot_cpu_has(X86_FEATURE_SMCA)) { u32 low, high; u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); @@ -1094,7 +1093,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (m->status & MCI_STATUS_ADDRV) pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr); - if (!!(ebx & BIT(3))) { + if (boot_cpu_has(X86_FEATURE_SMCA)) { decode_smca_errors(m); goto err_code; } @@ -1149,7 +1148,6 @@ static struct notifier_block amd_mce_dec_nb = { static int __init mce_amd_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; - u32 ebx; if (c->x86_vendor != X86_VENDOR_AMD) return -ENODEV; @@ -1205,9 +1203,8 @@ static int __init mce_amd_init(void) break; case 0x17: - ebx = cpuid_ebx(0x80000007); xec_mask = 0x3f; - if (!(ebx & BIT(3))) { + if (!boot_cpu_has(X86_FEATURE_SMCA)) { printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n"); goto err_out; } diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index e1670d533f9742e94b505ba0684d9373956c8bb0..6394152f648f4f0262a4c5be42a7ae0cb7aaa7a8 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -87,6 +87,31 @@ config EFI_RUNTIME_WRAPPERS config EFI_ARMSTUB bool +config EFI_BOOTLOADER_CONTROL + tristate "EFI Bootloader Control" + depends on EFI_VARS + default n + ---help--- + This module installs a reboot hook, such that if reboot() is + invoked with a string argument NNN, "NNN" is copied to the + "LoaderEntryOneShot" EFI variable, to be read by the + bootloader. If the string matches one of the boot labels + defined in its configuration, the bootloader will boot once + to that label. The "LoaderEntryRebootReason" EFI variable is + set with the reboot reason: "reboot" or "shutdown". The + bootloader reads this reboot reason and takes particular + action according to its policy. + +config EFI_CAPSULE_LOADER + tristate "EFI capsule loader" + depends on EFI + help + This option exposes a loader interface "/dev/efi_capsule_loader" for + users to load EFI capsules. This driver requires working runtime + capsule support in the firmware, which many OEMs do not provide. + + Most users should say N. + endmenu config UEFI_CPER diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index 62e654f255f4d1e4cb1ee1b955e8fd03ca689221..a219640f881f0306eb1888d924c4980277044e68 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -9,7 +9,8 @@ # KASAN_SANITIZE_runtime-wrappers.o := n -obj-$(CONFIG_EFI) += efi.o vars.o reboot.o +obj-$(CONFIG_EFI) += efi.o vars.o reboot.o memattr.o +obj-$(CONFIG_EFI) += capsule.o obj-$(CONFIG_EFI_VARS) += efivars.o obj-$(CONFIG_EFI_ESRT) += esrt.o obj-$(CONFIG_EFI_VARS_PSTORE) += efi-pstore.o @@ -18,7 +19,9 @@ obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o obj-$(CONFIG_EFI_STUB) += libstub/ obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o +obj-$(CONFIG_EFI_BOOTLOADER_CONTROL) += efibc.o arm-obj-$(CONFIG_EFI) := arm-init.o arm-runtime.o obj-$(CONFIG_ARM) += $(arm-obj-y) obj-$(CONFIG_ARM64) += $(arm-obj-y) +obj-$(CONFIG_EFI_CAPSULE_LOADER) += capsule-loader.o diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 8714f8c271babfff5566765605b5429f33bfa5d8..ef90f0c4b70a735def7eb9c56aec980f7813a8fe 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -11,17 +11,19 @@ * */ +#define pr_fmt(fmt) "efi: " fmt + #include #include #include #include #include #include +#include +#include #include -struct efi_memory_map memmap; - u64 efi_system_table; static int __init is_normal_ram(efi_memory_desc_t *md) @@ -40,7 +42,7 @@ static phys_addr_t efi_to_phys(unsigned long addr) { efi_memory_desc_t *md; - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; if (md->virt_addr == 0) @@ -53,6 +55,36 @@ static phys_addr_t efi_to_phys(unsigned long addr) return addr; } +static __initdata unsigned long screen_info_table = EFI_INVALID_TABLE_ADDR; + +static __initdata efi_config_table_type_t arch_tables[] = { + {LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID, NULL, &screen_info_table}, + {NULL_GUID, NULL, NULL} +}; + +static void __init init_screen_info(void) +{ + struct screen_info *si; + + if (screen_info_table != EFI_INVALID_TABLE_ADDR) { + si = early_memremap_ro(screen_info_table, sizeof(*si)); + if (!si) { + pr_err("Could not map screen_info config table\n"); + return; + } + screen_info = *si; + early_memunmap(si, sizeof(*si)); + + /* dummycon on ARM needs non-zero values for columns/lines */ + screen_info.orig_video_cols = 80; + screen_info.orig_video_lines = 25; + } + + if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI && + memblock_is_map_memory(screen_info.lfb_base)) + memblock_mark_nomap(screen_info.lfb_base, screen_info.lfb_size); +} + static int __init uefi_init(void) { efi_char16_t *c16; @@ -85,6 +117,8 @@ static int __init uefi_init(void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff); + efi.runtime_version = efi.systab->hdr.revision; + /* Show what we know for posterity */ c16 = early_memremap_ro(efi_to_phys(efi.systab->fw_vendor), sizeof(vendor) * sizeof(efi_char16_t)); @@ -108,7 +142,8 @@ static int __init uefi_init(void) goto out; } retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, - sizeof(efi_config_table_t), NULL); + sizeof(efi_config_table_t), + arch_tables); early_memunmap(config_tables, table_size); out: @@ -143,7 +178,7 @@ static __init void reserve_regions(void) if (efi_enabled(EFI_DBG)) pr_info("Processing EFI memory map:\n"); - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { paddr = md->phys_addr; npages = md->num_pages; @@ -184,9 +219,9 @@ void __init efi_init(void) efi_system_table = params.system_table; - memmap.phys_map = params.mmap; - memmap.map = early_memremap_ro(params.mmap, params.mmap_size); - if (memmap.map == NULL) { + efi.memmap.phys_map = params.mmap; + efi.memmap.map = early_memremap_ro(params.mmap, params.mmap_size); + if (efi.memmap.map == NULL) { /* * If we are booting via UEFI, the UEFI memory map is the only * description of memory we have, so there is little point in @@ -194,28 +229,37 @@ void __init efi_init(void) */ panic("Unable to map EFI memory map.\n"); } - memmap.map_end = memmap.map + params.mmap_size; - memmap.desc_size = params.desc_size; - memmap.desc_version = params.desc_ver; + efi.memmap.map_end = efi.memmap.map + params.mmap_size; + efi.memmap.desc_size = params.desc_size; + efi.memmap.desc_version = params.desc_ver; + + WARN(efi.memmap.desc_version != 1, + "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", + efi.memmap.desc_version); if (uefi_init() < 0) return; reserve_regions(); - early_memunmap(memmap.map, params.mmap_size); + efi_memattr_init(); + early_memunmap(efi.memmap.map, params.mmap_size); - if (IS_ENABLED(CONFIG_ARM)) { - /* - * ARM currently does not allow ioremap_cache() to be called on - * memory regions that are covered by struct page. So remove the - * UEFI memory map from the linear mapping. - */ - memblock_mark_nomap(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + - (params.mmap & ~PAGE_MASK))); - } else { - memblock_reserve(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + - (params.mmap & ~PAGE_MASK))); - } + memblock_reserve(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); + + init_screen_info(); +} + +static int __init register_gop_device(void) +{ + void *pd; + + if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI) + return 0; + + pd = platform_device_register_data(NULL, "efi-framebuffer", 0, + &screen_info, sizeof(screen_info)); + return PTR_ERR_OR_ZERO(pd); } +subsys_initcall(register_gop_device); diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 6ae21e41a429402d132d8d5be215d5adde64d08c..17ccf0a8787a2cf3f7ea1869e2e06801da2c46ef 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -42,11 +42,13 @@ static struct mm_struct efi_mm = { static bool __init efi_virtmap_init(void) { efi_memory_desc_t *md; + bool systab_found; efi_mm.pgd = pgd_alloc(&efi_mm); init_new_context(NULL, &efi_mm); - for_each_efi_memory_desc(&memmap, md) { + systab_found = false; + for_each_efi_memory_desc(md) { phys_addr_t phys = md->phys_addr; int ret; @@ -64,7 +66,25 @@ static bool __init efi_virtmap_init(void) &phys, ret); return false; } + /* + * If this entry covers the address of the UEFI system table, + * calculate and record its virtual address. + */ + if (efi_system_table >= phys && + efi_system_table < phys + (md->num_pages * EFI_PAGE_SIZE)) { + efi.systab = (void *)(unsigned long)(efi_system_table - + phys + md->virt_addr); + systab_found = true; + } + } + if (!systab_found) { + pr_err("No virtual mapping found for the UEFI System Table\n"); + return false; } + + if (efi_memattr_apply_permissions(&efi_mm, efi_set_mapping_permissions)) + return false; + return true; } @@ -89,26 +109,17 @@ static int __init arm_enable_runtime_services(void) pr_info("Remapping and enabling EFI services.\n"); - mapsize = memmap.map_end - memmap.map; - memmap.map = (__force void *)ioremap_cache(memmap.phys_map, - mapsize); - if (!memmap.map) { - pr_err("Failed to remap EFI memory map\n"); - return -ENOMEM; - } - memmap.map_end = memmap.map + mapsize; - efi.memmap = &memmap; + mapsize = efi.memmap.map_end - efi.memmap.map; - efi.systab = (__force void *)ioremap_cache(efi_system_table, - sizeof(efi_system_table_t)); - if (!efi.systab) { - pr_err("Failed to remap EFI System Table\n"); + efi.memmap.map = memremap(efi.memmap.phys_map, mapsize, MEMREMAP_WB); + if (!efi.memmap.map) { + pr_err("Failed to remap EFI memory map\n"); return -ENOMEM; } - set_bit(EFI_SYSTEM_TABLES, &efi.flags); + efi.memmap.map_end = efi.memmap.map + mapsize; if (!efi_virtmap_init()) { - pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); + pr_err("UEFI virtual mapping missing or invalid -- runtime services will not be available\n"); return -ENOMEM; } @@ -116,8 +127,6 @@ static int __init arm_enable_runtime_services(void) efi_native_runtime_setup(); set_bit(EFI_RUNTIME_SERVICES, &efi.flags); - efi.runtime_version = efi.systab->hdr.revision; - return 0; } early_initcall(arm_enable_runtime_services); diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c new file mode 100644 index 0000000000000000000000000000000000000000..c99c24bc79b02262298ea64b5ed3e6e625429f4d --- /dev/null +++ b/drivers/firmware/efi/capsule-loader.c @@ -0,0 +1,343 @@ +/* + * EFI capsule loader driver. + * + * Copyright 2015 Intel Corporation + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) "efi: " fmt + +#include +#include +#include +#include +#include +#include +#include + +#define NO_FURTHER_WRITE_ACTION -1 + +struct capsule_info { + bool header_obtained; + int reset_type; + long index; + size_t count; + size_t total_size; + struct page **pages; + size_t page_bytes_remain; +}; + +/** + * efi_free_all_buff_pages - free all previous allocated buffer pages + * @cap_info: pointer to current instance of capsule_info structure + * + * In addition to freeing buffer pages, it flags NO_FURTHER_WRITE_ACTION + * to cease processing data in subsequent write(2) calls until close(2) + * is called. + **/ +static void efi_free_all_buff_pages(struct capsule_info *cap_info) +{ + while (cap_info->index > 0) + __free_page(cap_info->pages[--cap_info->index]); + + cap_info->index = NO_FURTHER_WRITE_ACTION; +} + +/** + * efi_capsule_setup_info - obtain the efi capsule header in the binary and + * setup capsule_info structure + * @cap_info: pointer to current instance of capsule_info structure + * @kbuff: a mapped first page buffer pointer + * @hdr_bytes: the total received number of bytes for efi header + **/ +static ssize_t efi_capsule_setup_info(struct capsule_info *cap_info, + void *kbuff, size_t hdr_bytes) +{ + efi_capsule_header_t *cap_hdr; + size_t pages_needed; + int ret; + void *temp_page; + + /* Only process data block that is larger than efi header size */ + if (hdr_bytes < sizeof(efi_capsule_header_t)) + return 0; + + /* Reset back to the correct offset of header */ + cap_hdr = kbuff - cap_info->count; + pages_needed = ALIGN(cap_hdr->imagesize, PAGE_SIZE) >> PAGE_SHIFT; + + if (pages_needed == 0) { + pr_err("%s: pages count invalid\n", __func__); + return -EINVAL; + } + + /* Check if the capsule binary supported */ + ret = efi_capsule_supported(cap_hdr->guid, cap_hdr->flags, + cap_hdr->imagesize, + &cap_info->reset_type); + if (ret) { + pr_err("%s: efi_capsule_supported() failed\n", + __func__); + return ret; + } + + cap_info->total_size = cap_hdr->imagesize; + temp_page = krealloc(cap_info->pages, + pages_needed * sizeof(void *), + GFP_KERNEL | __GFP_ZERO); + if (!temp_page) { + pr_debug("%s: krealloc() failed\n", __func__); + return -ENOMEM; + } + + cap_info->pages = temp_page; + cap_info->header_obtained = true; + + return 0; +} + +/** + * efi_capsule_submit_update - invoke the efi_capsule_update API once binary + * upload done + * @cap_info: pointer to current instance of capsule_info structure + **/ +static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info) +{ + int ret; + void *cap_hdr_temp; + + cap_hdr_temp = kmap(cap_info->pages[0]); + if (!cap_hdr_temp) { + pr_debug("%s: kmap() failed\n", __func__); + return -EFAULT; + } + + ret = efi_capsule_update(cap_hdr_temp, cap_info->pages); + kunmap(cap_info->pages[0]); + if (ret) { + pr_err("%s: efi_capsule_update() failed\n", __func__); + return ret; + } + + /* Indicate capsule binary uploading is done */ + cap_info->index = NO_FURTHER_WRITE_ACTION; + pr_info("%s: Successfully upload capsule file with reboot type '%s'\n", + __func__, !cap_info->reset_type ? "RESET_COLD" : + cap_info->reset_type == 1 ? "RESET_WARM" : + "RESET_SHUTDOWN"); + return 0; +} + +/** + * efi_capsule_write - store the capsule binary and pass it to + * efi_capsule_update() API + * @file: file pointer + * @buff: buffer pointer + * @count: number of bytes in @buff + * @offp: not used + * + * Expectation: + * - A user space tool should start at the beginning of capsule binary and + * pass data in sequentially. + * - Users should close and re-open this file note in order to upload more + * capsules. + * - After an error returned, user should close the file and restart the + * operation for the next try otherwise -EIO will be returned until the + * file is closed. + * - An EFI capsule header must be located at the beginning of capsule + * binary file and passed in as first block data of write operation. + **/ +static ssize_t efi_capsule_write(struct file *file, const char __user *buff, + size_t count, loff_t *offp) +{ + int ret = 0; + struct capsule_info *cap_info = file->private_data; + struct page *page; + void *kbuff = NULL; + size_t write_byte; + + if (count == 0) + return 0; + + /* Return error while NO_FURTHER_WRITE_ACTION is flagged */ + if (cap_info->index < 0) + return -EIO; + + /* Only alloc a new page when previous page is full */ + if (!cap_info->page_bytes_remain) { + page = alloc_page(GFP_KERNEL); + if (!page) { + pr_debug("%s: alloc_page() failed\n", __func__); + ret = -ENOMEM; + goto failed; + } + + cap_info->pages[cap_info->index++] = page; + cap_info->page_bytes_remain = PAGE_SIZE; + } + + page = cap_info->pages[cap_info->index - 1]; + + kbuff = kmap(page); + if (!kbuff) { + pr_debug("%s: kmap() failed\n", __func__); + ret = -EFAULT; + goto failed; + } + kbuff += PAGE_SIZE - cap_info->page_bytes_remain; + + /* Copy capsule binary data from user space to kernel space buffer */ + write_byte = min_t(size_t, count, cap_info->page_bytes_remain); + if (copy_from_user(kbuff, buff, write_byte)) { + pr_debug("%s: copy_from_user() failed\n", __func__); + ret = -EFAULT; + goto fail_unmap; + } + cap_info->page_bytes_remain -= write_byte; + + /* Setup capsule binary info structure */ + if (!cap_info->header_obtained) { + ret = efi_capsule_setup_info(cap_info, kbuff, + cap_info->count + write_byte); + if (ret) + goto fail_unmap; + } + + cap_info->count += write_byte; + kunmap(page); + + /* Submit the full binary to efi_capsule_update() API */ + if (cap_info->header_obtained && + cap_info->count >= cap_info->total_size) { + if (cap_info->count > cap_info->total_size) { + pr_err("%s: upload size exceeded header defined size\n", + __func__); + ret = -EINVAL; + goto failed; + } + + ret = efi_capsule_submit_update(cap_info); + if (ret) + goto failed; + } + + return write_byte; + +fail_unmap: + kunmap(page); +failed: + efi_free_all_buff_pages(cap_info); + return ret; +} + +/** + * efi_capsule_flush - called by file close or file flush + * @file: file pointer + * @id: not used + * + * If a capsule is being partially uploaded then calling this function + * will be treated as upload termination and will free those completed + * buffer pages and -ECANCELED will be returned. + **/ +static int efi_capsule_flush(struct file *file, fl_owner_t id) +{ + int ret = 0; + struct capsule_info *cap_info = file->private_data; + + if (cap_info->index > 0) { + pr_err("%s: capsule upload not complete\n", __func__); + efi_free_all_buff_pages(cap_info); + ret = -ECANCELED; + } + + return ret; +} + +/** + * efi_capsule_release - called by file close + * @inode: not used + * @file: file pointer + * + * We will not free successfully submitted pages since efi update + * requires data to be maintained across system reboot. + **/ +static int efi_capsule_release(struct inode *inode, struct file *file) +{ + struct capsule_info *cap_info = file->private_data; + + kfree(cap_info->pages); + kfree(file->private_data); + file->private_data = NULL; + return 0; +} + +/** + * efi_capsule_open - called by file open + * @inode: not used + * @file: file pointer + * + * Will allocate each capsule_info memory for each file open call. + * This provided the capability to support multiple file open feature + * where user is not needed to wait for others to finish in order to + * upload their capsule binary. + **/ +static int efi_capsule_open(struct inode *inode, struct file *file) +{ + struct capsule_info *cap_info; + + cap_info = kzalloc(sizeof(*cap_info), GFP_KERNEL); + if (!cap_info) + return -ENOMEM; + + cap_info->pages = kzalloc(sizeof(void *), GFP_KERNEL); + if (!cap_info->pages) { + kfree(cap_info); + return -ENOMEM; + } + + file->private_data = cap_info; + + return 0; +} + +static const struct file_operations efi_capsule_fops = { + .owner = THIS_MODULE, + .open = efi_capsule_open, + .write = efi_capsule_write, + .flush = efi_capsule_flush, + .release = efi_capsule_release, + .llseek = no_llseek, +}; + +static struct miscdevice efi_capsule_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "efi_capsule_loader", + .fops = &efi_capsule_fops, +}; + +static int __init efi_capsule_loader_init(void) +{ + int ret; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return -ENODEV; + + ret = misc_register(&efi_capsule_misc); + if (ret) + pr_err("%s: Failed to register misc char file note\n", + __func__); + + return ret; +} +module_init(efi_capsule_loader_init); + +static void __exit efi_capsule_loader_exit(void) +{ + misc_deregister(&efi_capsule_misc); +} +module_exit(efi_capsule_loader_exit); + +MODULE_DESCRIPTION("EFI capsule firmware binary loader"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/firmware/efi/capsule.c b/drivers/firmware/efi/capsule.c new file mode 100644 index 0000000000000000000000000000000000000000..53b9fd2293ee8f5af6f7f6a38de61730c8c99ec9 --- /dev/null +++ b/drivers/firmware/efi/capsule.c @@ -0,0 +1,308 @@ +/* + * EFI capsule support. + * + * Copyright 2013 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) "efi: " fmt + +#include +#include +#include +#include +#include +#include + +typedef struct { + u64 length; + u64 data; +} efi_capsule_block_desc_t; + +static bool capsule_pending; +static bool stop_capsules; +static int efi_reset_type = -1; + +/* + * capsule_mutex serialises access to both capsule_pending and + * efi_reset_type and stop_capsules. + */ +static DEFINE_MUTEX(capsule_mutex); + +/** + * efi_capsule_pending - has a capsule been passed to the firmware? + * @reset_type: store the type of EFI reset if capsule is pending + * + * To ensure that the registered capsule is processed correctly by the + * firmware we need to perform a specific type of reset. If a capsule is + * pending return the reset type in @reset_type. + * + * This function will race with callers of efi_capsule_update(), for + * example, calling this function while somebody else is in + * efi_capsule_update() but hasn't reached efi_capsue_update_locked() + * will miss the updates to capsule_pending and efi_reset_type after + * efi_capsule_update_locked() completes. + * + * A non-racy use is from platform reboot code because we use + * system_state to ensure no capsules can be sent to the firmware once + * we're at SYSTEM_RESTART. See efi_capsule_update_locked(). + */ +bool efi_capsule_pending(int *reset_type) +{ + if (!capsule_pending) + return false; + + if (reset_type) + *reset_type = efi_reset_type; + + return true; +} + +/* + * Whitelist of EFI capsule flags that we support. + * + * We do not handle EFI_CAPSULE_INITIATE_RESET because that would + * require us to prepare the kernel for reboot. Refuse to load any + * capsules with that flag and any other flags that we do not know how + * to handle. + */ +#define EFI_CAPSULE_SUPPORTED_FLAG_MASK \ + (EFI_CAPSULE_PERSIST_ACROSS_RESET | EFI_CAPSULE_POPULATE_SYSTEM_TABLE) + +/** + * efi_capsule_supported - does the firmware support the capsule? + * @guid: vendor guid of capsule + * @flags: capsule flags + * @size: size of capsule data + * @reset: the reset type required for this capsule + * + * Check whether a capsule with @flags is supported by the firmware + * and that @size doesn't exceed the maximum size for a capsule. + * + * No attempt is made to check @reset against the reset type required + * by any pending capsules because of the races involved. + */ +int efi_capsule_supported(efi_guid_t guid, u32 flags, size_t size, int *reset) +{ + efi_capsule_header_t capsule; + efi_capsule_header_t *cap_list[] = { &capsule }; + efi_status_t status; + u64 max_size; + + if (flags & ~EFI_CAPSULE_SUPPORTED_FLAG_MASK) + return -EINVAL; + + capsule.headersize = capsule.imagesize = sizeof(capsule); + memcpy(&capsule.guid, &guid, sizeof(efi_guid_t)); + capsule.flags = flags; + + status = efi.query_capsule_caps(cap_list, 1, &max_size, reset); + if (status != EFI_SUCCESS) + return efi_status_to_err(status); + + if (size > max_size) + return -ENOSPC; + + return 0; +} +EXPORT_SYMBOL_GPL(efi_capsule_supported); + +/* + * Every scatter gather list (block descriptor) page must end with a + * continuation pointer. The last continuation pointer of the last + * page must be zero to mark the end of the chain. + */ +#define SGLIST_PER_PAGE ((PAGE_SIZE / sizeof(efi_capsule_block_desc_t)) - 1) + +/* + * How many scatter gather list (block descriptor) pages do we need + * to map @count pages? + */ +static inline unsigned int sg_pages_num(unsigned int count) +{ + return DIV_ROUND_UP(count, SGLIST_PER_PAGE); +} + +/** + * efi_capsule_update_locked - pass a single capsule to the firmware + * @capsule: capsule to send to the firmware + * @sg_pages: array of scatter gather (block descriptor) pages + * @reset: the reset type required for @capsule + * + * Since this function must be called under capsule_mutex check + * whether efi_reset_type will conflict with @reset, and atomically + * set it and capsule_pending if a capsule was successfully sent to + * the firmware. + * + * We also check to see if the system is about to restart, and if so, + * abort. This avoids races between efi_capsule_update() and + * efi_capsule_pending(). + */ +static int +efi_capsule_update_locked(efi_capsule_header_t *capsule, + struct page **sg_pages, int reset) +{ + efi_physical_addr_t sglist_phys; + efi_status_t status; + + lockdep_assert_held(&capsule_mutex); + + /* + * If someone has already registered a capsule that requires a + * different reset type, we're out of luck and must abort. + */ + if (efi_reset_type >= 0 && efi_reset_type != reset) { + pr_err("Conflicting capsule reset type %d (%d).\n", + reset, efi_reset_type); + return -EINVAL; + } + + /* + * If the system is getting ready to restart it may have + * called efi_capsule_pending() to make decisions (such as + * whether to force an EFI reboot), and we're racing against + * that call. Abort in that case. + */ + if (unlikely(stop_capsules)) { + pr_warn("Capsule update raced with reboot, aborting.\n"); + return -EINVAL; + } + + sglist_phys = page_to_phys(sg_pages[0]); + + status = efi.update_capsule(&capsule, 1, sglist_phys); + if (status == EFI_SUCCESS) { + capsule_pending = true; + efi_reset_type = reset; + } + + return efi_status_to_err(status); +} + +/** + * efi_capsule_update - send a capsule to the firmware + * @capsule: capsule to send to firmware + * @pages: an array of capsule data pages + * + * Build a scatter gather list with EFI capsule block descriptors to + * map the capsule described by @capsule with its data in @pages and + * send it to the firmware via the UpdateCapsule() runtime service. + * + * @capsule must be a virtual mapping of the first page in @pages + * (@pages[0]) in the kernel address space. That is, a + * capsule_header_t that describes the entire contents of the capsule + * must be at the start of the first data page. + * + * Even though this function will validate that the firmware supports + * the capsule guid, users will likely want to check that + * efi_capsule_supported() returns true before calling this function + * because it makes it easier to print helpful error messages. + * + * If the capsule is successfully submitted to the firmware, any + * subsequent calls to efi_capsule_pending() will return true. @pages + * must not be released or modified if this function returns + * successfully. + * + * Callers must be prepared for this function to fail, which can + * happen if we raced with system reboot or if there is already a + * pending capsule that has a reset type that conflicts with the one + * required by @capsule. Do NOT use efi_capsule_pending() to detect + * this conflict since that would be racy. Instead, submit the capsule + * to efi_capsule_update() and check the return value. + * + * Return 0 on success, a converted EFI status code on failure. + */ +int efi_capsule_update(efi_capsule_header_t *capsule, struct page **pages) +{ + u32 imagesize = capsule->imagesize; + efi_guid_t guid = capsule->guid; + unsigned int count, sg_count; + u32 flags = capsule->flags; + struct page **sg_pages; + int rv, reset_type; + int i, j; + + rv = efi_capsule_supported(guid, flags, imagesize, &reset_type); + if (rv) + return rv; + + count = DIV_ROUND_UP(imagesize, PAGE_SIZE); + sg_count = sg_pages_num(count); + + sg_pages = kzalloc(sg_count * sizeof(*sg_pages), GFP_KERNEL); + if (!sg_pages) + return -ENOMEM; + + for (i = 0; i < sg_count; i++) { + sg_pages[i] = alloc_page(GFP_KERNEL); + if (!sg_pages[i]) { + rv = -ENOMEM; + goto out; + } + } + + for (i = 0; i < sg_count; i++) { + efi_capsule_block_desc_t *sglist; + + sglist = kmap(sg_pages[i]); + if (!sglist) { + rv = -ENOMEM; + goto out; + } + + for (j = 0; j < SGLIST_PER_PAGE && count > 0; j++) { + u64 sz = min_t(u64, imagesize, PAGE_SIZE); + + sglist[j].length = sz; + sglist[j].data = page_to_phys(*pages++); + + imagesize -= sz; + count--; + } + + /* Continuation pointer */ + sglist[j].length = 0; + + if (i + 1 == sg_count) + sglist[j].data = 0; + else + sglist[j].data = page_to_phys(sg_pages[i + 1]); + + kunmap(sg_pages[i]); + } + + mutex_lock(&capsule_mutex); + rv = efi_capsule_update_locked(capsule, sg_pages, reset_type); + mutex_unlock(&capsule_mutex); + +out: + for (i = 0; rv && i < sg_count; i++) { + if (sg_pages[i]) + __free_page(sg_pages[i]); + } + + kfree(sg_pages); + return rv; +} +EXPORT_SYMBOL_GPL(efi_capsule_update); + +static int capsule_reboot_notify(struct notifier_block *nb, unsigned long event, void *cmd) +{ + mutex_lock(&capsule_mutex); + stop_capsules = true; + mutex_unlock(&capsule_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block capsule_reboot_nb = { + .notifier_call = capsule_reboot_notify, +}; + +static int __init capsule_reboot_register(void) +{ + return register_reboot_notifier(&capsule_reboot_nb); +} +core_initcall(capsule_reboot_register); diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 3a69ed5ecfcb5f1254eaf1b0c3fb53cbdbcc61fb..05509f3aaee8feab84d8cdfdf24d1bd4af4f3b77 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -43,6 +43,7 @@ struct efi __read_mostly efi = { .config_table = EFI_INVALID_TABLE_ADDR, .esrt = EFI_INVALID_TABLE_ADDR, .properties_table = EFI_INVALID_TABLE_ADDR, + .mem_attr_table = EFI_INVALID_TABLE_ADDR, }; EXPORT_SYMBOL(efi); @@ -256,7 +257,7 @@ subsys_initcall(efisubsys_init); */ int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md) { - struct efi_memory_map *map = efi.memmap; + struct efi_memory_map *map = &efi.memmap; phys_addr_t p, e; if (!efi_enabled(EFI_MEMMAP)) { @@ -338,6 +339,7 @@ static __initdata efi_config_table_type_t common_tables[] = { {UGA_IO_PROTOCOL_GUID, "UGA", &efi.uga}, {EFI_SYSTEM_RESOURCE_TABLE_GUID, "ESRT", &efi.esrt}, {EFI_PROPERTIES_TABLE_GUID, "PROP", &efi.properties_table}, + {EFI_MEMORY_ATTRIBUTES_TABLE_GUID, "MEMATTR", &efi.mem_attr_table}, {NULL_GUID, NULL, NULL}, }; @@ -351,8 +353,9 @@ static __init int match_config_table(efi_guid_t *guid, for (i = 0; efi_guidcmp(table_types[i].guid, NULL_GUID); i++) { if (!efi_guidcmp(*guid, table_types[i].guid)) { *(table_types[i].ptr) = table; - pr_cont(" %s=0x%lx ", - table_types[i].name, table); + if (table_types[i].name) + pr_cont(" %s=0x%lx ", + table_types[i].name, table); return 1; } } @@ -620,16 +623,12 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, */ u64 __weak efi_mem_attributes(unsigned long phys_addr) { - struct efi_memory_map *map; efi_memory_desc_t *md; - void *p; if (!efi_enabled(EFI_MEMMAP)) return 0; - map = efi.memmap; - for (p = map->map; p < map->map_end; p += map->desc_size) { - md = p; + for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)))) @@ -637,3 +636,36 @@ u64 __weak efi_mem_attributes(unsigned long phys_addr) } return 0; } + +int efi_status_to_err(efi_status_t status) +{ + int err; + + switch (status) { + case EFI_SUCCESS: + err = 0; + break; + case EFI_INVALID_PARAMETER: + err = -EINVAL; + break; + case EFI_OUT_OF_RESOURCES: + err = -ENOSPC; + break; + case EFI_DEVICE_ERROR: + err = -EIO; + break; + case EFI_WRITE_PROTECTED: + err = -EROFS; + break; + case EFI_SECURITY_VIOLATION: + err = -EACCES; + break; + case EFI_NOT_FOUND: + err = -ENOENT; + break; + default: + err = -EINVAL; + } + + return err; +} diff --git a/drivers/firmware/efi/efibc.c b/drivers/firmware/efi/efibc.c new file mode 100644 index 0000000000000000000000000000000000000000..8dd0c7085e59799dce9064361e6e1ec7df4f0ccd --- /dev/null +++ b/drivers/firmware/efi/efibc.c @@ -0,0 +1,113 @@ +/* + * efibc: control EFI bootloaders which obey LoaderEntryOneShot var + * Copyright (c) 2013-2016, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#define pr_fmt(fmt) "efibc: " fmt + +#include +#include +#include +#include + +static void efibc_str_to_str16(const char *str, efi_char16_t *str16) +{ + size_t i; + + for (i = 0; i < strlen(str); i++) + str16[i] = str[i]; + + str16[i] = '\0'; +} + +static int efibc_set_variable(const char *name, const char *value) +{ + int ret; + efi_guid_t guid = LINUX_EFI_LOADER_ENTRY_GUID; + struct efivar_entry *entry; + size_t size = (strlen(value) + 1) * sizeof(efi_char16_t); + + if (size > sizeof(entry->var.Data)) { + pr_err("value is too large"); + return -EINVAL; + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + pr_err("failed to allocate efivar entry"); + return -ENOMEM; + } + + efibc_str_to_str16(name, entry->var.VariableName); + efibc_str_to_str16(value, (efi_char16_t *)entry->var.Data); + memcpy(&entry->var.VendorGuid, &guid, sizeof(guid)); + + ret = efivar_entry_set(entry, + EFI_VARIABLE_NON_VOLATILE + | EFI_VARIABLE_BOOTSERVICE_ACCESS + | EFI_VARIABLE_RUNTIME_ACCESS, + size, entry->var.Data, NULL); + if (ret) + pr_err("failed to set %s EFI variable: 0x%x\n", + name, ret); + + kfree(entry); + return ret; +} + +static int efibc_reboot_notifier_call(struct notifier_block *notifier, + unsigned long event, void *data) +{ + const char *reason = "shutdown"; + int ret; + + if (event == SYS_RESTART) + reason = "reboot"; + + ret = efibc_set_variable("LoaderEntryRebootReason", reason); + if (ret || !data) + return NOTIFY_DONE; + + efibc_set_variable("LoaderEntryOneShot", (char *)data); + + return NOTIFY_DONE; +} + +static struct notifier_block efibc_reboot_notifier = { + .notifier_call = efibc_reboot_notifier_call, +}; + +static int __init efibc_init(void) +{ + int ret; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return -ENODEV; + + ret = register_reboot_notifier(&efibc_reboot_notifier); + if (ret) + pr_err("unable to register reboot notifier\n"); + + return ret; +} +module_init(efibc_init); + +static void __exit efibc_exit(void) +{ + unregister_reboot_notifier(&efibc_reboot_notifier); +} +module_exit(efibc_exit); + +MODULE_AUTHOR("Jeremy Compostella "); +MODULE_AUTHOR("Matt Gumbel phys_addr; end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1; @@ -95,25 +94,25 @@ void __init efi_fake_memmap(void) } /* allocate memory for new EFI memmap */ - new_memmap_phy = memblock_alloc(memmap.desc_size * new_nr_map, + new_memmap_phy = memblock_alloc(efi.memmap.desc_size * new_nr_map, PAGE_SIZE); if (!new_memmap_phy) return; /* create new EFI memmap */ new_memmap = early_memremap(new_memmap_phy, - memmap.desc_size * new_nr_map); + efi.memmap.desc_size * new_nr_map); if (!new_memmap) { - memblock_free(new_memmap_phy, memmap.desc_size * new_nr_map); + memblock_free(new_memmap_phy, efi.memmap.desc_size * new_nr_map); return; } - for (old = memmap.map, new = new_memmap; - old < memmap.map_end; - old += memmap.desc_size, new += memmap.desc_size) { + for (old = efi.memmap.map, new = new_memmap; + old < efi.memmap.map_end; + old += efi.memmap.desc_size, new += efi.memmap.desc_size) { /* copy original EFI memory descriptor */ - memcpy(new, old, memmap.desc_size); + memcpy(new, old, efi.memmap.desc_size); md = new; start = md->phys_addr; end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; @@ -134,8 +133,8 @@ void __init efi_fake_memmap(void) md->num_pages = (m_end - md->phys_addr + 1) >> EFI_PAGE_SHIFT; /* latter part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_end + 1; md->num_pages = (end - md->phys_addr + 1) >> @@ -147,16 +146,16 @@ void __init efi_fake_memmap(void) md->num_pages = (m_start - md->phys_addr) >> EFI_PAGE_SHIFT; /* middle part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->attribute |= m_attr; md->phys_addr = m_start; md->num_pages = (m_end - m_start + 1) >> EFI_PAGE_SHIFT; /* last part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_end + 1; md->num_pages = (end - m_end) >> @@ -169,8 +168,8 @@ void __init efi_fake_memmap(void) md->num_pages = (m_start - md->phys_addr) >> EFI_PAGE_SHIFT; /* latter part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_start; md->num_pages = (end - md->phys_addr + 1) >> @@ -182,10 +181,10 @@ void __init efi_fake_memmap(void) /* swap into new EFI memmap */ efi_unmap_memmap(); - memmap.map = new_memmap; - memmap.phys_map = new_memmap_phy; - memmap.nr_map = new_nr_map; - memmap.map_end = memmap.map + memmap.nr_map * memmap.desc_size; + efi.memmap.map = new_memmap; + efi.memmap.phys_map = new_memmap_phy; + efi.memmap.nr_map = new_nr_map; + efi.memmap.map_end = efi.memmap.map + efi.memmap.nr_map * efi.memmap.desc_size; set_bit(EFI_MEMMAP, &efi.flags); /* print new EFI memmap */ diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index da99bbb74aebde62fc62128bed5d4ba2ee801212..c06945160a4154a5f8d518192b7adb16c4ab325d 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -28,7 +28,7 @@ OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. KCOV_INSTRUMENT := n -lib-y := efi-stub-helper.o +lib-y := efi-stub-helper.o gop.o # include the stub's generic dependencies from lib/ when building for ARM/arm64 arm-deps := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c fdt_empty_tree.c fdt_sw.c sort.c diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 414deb85c2e5214d4dc3a4cb85f4f8a20eb2df05..993aa56755f60afb929f90521485fd5fab335583 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -20,27 +20,49 @@ bool __nokaslr; -static int efi_secureboot_enabled(efi_system_table_t *sys_table_arg) +static int efi_get_secureboot(efi_system_table_t *sys_table_arg) { - static efi_guid_t const var_guid = EFI_GLOBAL_VARIABLE_GUID; - static efi_char16_t const var_name[] = { + static efi_char16_t const sb_var_name[] = { 'S', 'e', 'c', 'u', 'r', 'e', 'B', 'o', 'o', 't', 0 }; + static efi_char16_t const sm_var_name[] = { + 'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0 }; + efi_guid_t var_guid = EFI_GLOBAL_VARIABLE_GUID; efi_get_variable_t *f_getvar = sys_table_arg->runtime->get_variable; - unsigned long size = sizeof(u8); - efi_status_t status; u8 val; + unsigned long size = sizeof(val); + efi_status_t status; - status = f_getvar((efi_char16_t *)var_name, (efi_guid_t *)&var_guid, + status = f_getvar((efi_char16_t *)sb_var_name, (efi_guid_t *)&var_guid, NULL, &size, &val); + if (status != EFI_SUCCESS) + goto out_efi_err; + + if (val == 0) + return 0; + + status = f_getvar((efi_char16_t *)sm_var_name, (efi_guid_t *)&var_guid, + NULL, &size, &val); + + if (status != EFI_SUCCESS) + goto out_efi_err; + + if (val == 1) + return 0; + + return 1; + +out_efi_err: switch (status) { - case EFI_SUCCESS: - return val; case EFI_NOT_FOUND: return 0; + case EFI_DEVICE_ERROR: + return -EIO; + case EFI_SECURITY_VIOLATION: + return -EACCES; default: - return 1; + return -EINVAL; } } @@ -147,6 +169,25 @@ void efi_char16_printk(efi_system_table_t *sys_table_arg, out->output_string(out, str); } +static struct screen_info *setup_graphics(efi_system_table_t *sys_table_arg) +{ + efi_guid_t gop_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID; + efi_status_t status; + unsigned long size; + void **gop_handle = NULL; + struct screen_info *si = NULL; + + size = 0; + status = efi_call_early(locate_handle, EFI_LOCATE_BY_PROTOCOL, + &gop_proto, NULL, &size, gop_handle); + if (status == EFI_BUFFER_TOO_SMALL) { + si = alloc_screen_info(sys_table_arg); + if (!si) + return NULL; + efi_setup_gop(sys_table_arg, si, &gop_proto, size); + } + return si; +} /* * This function handles the architcture specific differences between arm and @@ -185,6 +226,8 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, efi_guid_t loaded_image_proto = LOADED_IMAGE_PROTOCOL_GUID; unsigned long reserve_addr = 0; unsigned long reserve_size = 0; + int secure_boot = 0; + struct screen_info *si; /* Check if we were booted by the EFI firmware */ if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) @@ -237,6 +280,8 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, __nokaslr = true; } + si = setup_graphics(sys_table); + status = handle_kernel_image(sys_table, image_addr, &image_size, &reserve_addr, &reserve_size, @@ -250,12 +295,21 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, if (status != EFI_SUCCESS) pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n"); + secure_boot = efi_get_secureboot(sys_table); + if (secure_boot > 0) + pr_efi(sys_table, "UEFI Secure Boot is enabled.\n"); + + if (secure_boot < 0) { + pr_efi_err(sys_table, + "could not determine UEFI Secure Boot status.\n"); + } + /* * Unauthenticated device tree data is a security hazard, so * ignore 'dtb=' unless UEFI Secure Boot is disabled. */ - if (efi_secureboot_enabled(sys_table)) { - pr_efi(sys_table, "UEFI Secure Boot is enabled.\n"); + if (secure_boot != 0 && strstr(cmdline_ptr, "dtb=")) { + pr_efi(sys_table, "Ignoring DTB from command line.\n"); } else { status = handle_cmdline_files(sys_table, image, cmdline_ptr, "dtb=", @@ -309,6 +363,7 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, efi_free(sys_table, image_size, *image_addr); efi_free(sys_table, reserve_size, reserve_addr); fail_free_cmdline: + free_screen_info(sys_table, si); efi_free(sys_table, cmdline_size, (unsigned long)cmdline_ptr); fail: return EFI_ERROR; diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c index 6f42be4d0084f1befcceb329758513b5f2c50781..e1f0b28e1dcbbf9cde9a1e882ba6212fb165c55e 100644 --- a/drivers/firmware/efi/libstub/arm32-stub.c +++ b/drivers/firmware/efi/libstub/arm32-stub.c @@ -26,6 +26,43 @@ efi_status_t check_platform_features(efi_system_table_t *sys_table_arg) return EFI_SUCCESS; } +static efi_guid_t screen_info_guid = LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID; + +struct screen_info *alloc_screen_info(efi_system_table_t *sys_table_arg) +{ + struct screen_info *si; + efi_status_t status; + + /* + * Unlike on arm64, where we can directly fill out the screen_info + * structure from the stub, we need to allocate a buffer to hold + * its contents while we hand over to the kernel proper from the + * decompressor. + */ + status = efi_call_early(allocate_pool, EFI_RUNTIME_SERVICES_DATA, + sizeof(*si), (void **)&si); + + if (status != EFI_SUCCESS) + return NULL; + + status = efi_call_early(install_configuration_table, + &screen_info_guid, si); + if (status == EFI_SUCCESS) + return si; + + efi_call_early(free_pool, si); + return NULL; +} + +void free_screen_info(efi_system_table_t *sys_table_arg, struct screen_info *si) +{ + if (!si) + return; + + efi_call_early(install_configuration_table, &screen_info_guid, NULL); + efi_call_early(free_pool, si); +} + efi_status_t handle_kernel_image(efi_system_table_t *sys_table, unsigned long *image_addr, unsigned long *image_size, diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 29ed2f9b218ca9892bfcc72da2d91ba4750f4c97..3bd127f953151dff89d0563db13deb12046e4315 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -125,10 +125,12 @@ unsigned long get_dram_base(efi_system_table_t *sys_table_arg) map.map_end = map.map + map_size; - for_each_efi_memory_desc(&map, md) - if (md->attribute & EFI_MEMORY_WB) + for_each_efi_memory_desc_in_map(&map, md) { + if (md->attribute & EFI_MEMORY_WB) { if (membase > md->phys_addr) membase = md->phys_addr; + } + } efi_call_early(free_pool, map.map); diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c new file mode 100644 index 0000000000000000000000000000000000000000..932742e4cf23147e304d6df2a1c8f37c8f89d296 --- /dev/null +++ b/drivers/firmware/efi/libstub/gop.c @@ -0,0 +1,354 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2011 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +#include +#include +#include +#include + +static void find_bits(unsigned long mask, u8 *pos, u8 *size) +{ + u8 first, len; + + first = 0; + len = 0; + + if (mask) { + while (!(mask & 0x1)) { + mask = mask >> 1; + first++; + } + + while (mask & 0x1) { + mask = mask >> 1; + len++; + } + } + + *pos = first; + *size = len; +} + +static void +setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, + struct efi_pixel_bitmask pixel_info, int pixel_format) +{ + if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 0; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 16; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BIT_MASK) { + find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); + find_bits(pixel_info.green_mask, &si->green_pos, + &si->green_size); + find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); + find_bits(pixel_info.reserved_mask, &si->rsvd_pos, + &si->rsvd_size); + si->lfb_depth = si->red_size + si->green_size + + si->blue_size + si->rsvd_size; + si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; + } else { + si->lfb_depth = 4; + si->lfb_linelength = si->lfb_width / 2; + si->red_size = 0; + si->red_pos = 0; + si->green_size = 0; + si->green_pos = 0; + si->blue_size = 0; + si->blue_pos = 0; + si->rsvd_size = 0; + si->rsvd_pos = 0; + } +} + +static efi_status_t +__gop_query32(efi_system_table_t *sys_table_arg, + struct efi_graphics_output_protocol_32 *gop32, + struct efi_graphics_output_mode_info **info, + unsigned long *size, u64 *fb_base) +{ + struct efi_graphics_output_protocol_mode_32 *mode; + efi_graphics_output_protocol_query_mode query_mode; + efi_status_t status; + unsigned long m; + + m = gop32->mode; + mode = (struct efi_graphics_output_protocol_mode_32 *)m; + query_mode = (void *)(unsigned long)gop32->query_mode; + + status = __efi_call_early(query_mode, (void *)gop32, mode->mode, size, + info); + if (status != EFI_SUCCESS) + return status; + + *fb_base = mode->frame_buffer_base; + return status; +} + +static efi_status_t +setup_gop32(efi_system_table_t *sys_table_arg, struct screen_info *si, + efi_guid_t *proto, unsigned long size, void **gop_handle) +{ + struct efi_graphics_output_protocol_32 *gop32, *first_gop; + unsigned long nr_gops; + u16 width, height; + u32 pixels_per_scan_line; + u32 ext_lfb_base; + u64 fb_base; + struct efi_pixel_bitmask pixel_info; + int pixel_format; + efi_status_t status = EFI_NOT_FOUND; + u32 *handles = (u32 *)(unsigned long)gop_handle; + int i; + + first_gop = NULL; + gop32 = NULL; + + nr_gops = size / sizeof(u32); + for (i = 0; i < nr_gops; i++) { + struct efi_graphics_output_mode_info *info = NULL; + efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; + bool conout_found = false; + void *dummy = NULL; + efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; + u64 current_fb_base; + + status = efi_call_early(handle_protocol, h, + proto, (void **)&gop32); + if (status != EFI_SUCCESS) + continue; + + status = efi_call_early(handle_protocol, h, + &conout_proto, &dummy); + if (status == EFI_SUCCESS) + conout_found = true; + + status = __gop_query32(sys_table_arg, gop32, &info, &size, + ¤t_fb_base); + if (status == EFI_SUCCESS && (!first_gop || conout_found)) { + /* + * Systems that use the UEFI Console Splitter may + * provide multiple GOP devices, not all of which are + * backed by real hardware. The workaround is to search + * for a GOP implementing the ConOut protocol, and if + * one isn't found, to just fall back to the first GOP. + */ + width = info->horizontal_resolution; + height = info->vertical_resolution; + pixel_format = info->pixel_format; + pixel_info = info->pixel_information; + pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; + + /* + * Once we've found a GOP supporting ConOut, + * don't bother looking any further. + */ + first_gop = gop32; + if (conout_found) + break; + } + } + + /* Did we find any GOPs? */ + if (!first_gop) + goto out; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = width; + si->lfb_height = height; + si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + + si->pages = 1; + + setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); + + si->lfb_size = si->lfb_linelength * si->lfb_height; + + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; +out: + return status; +} + +static efi_status_t +__gop_query64(efi_system_table_t *sys_table_arg, + struct efi_graphics_output_protocol_64 *gop64, + struct efi_graphics_output_mode_info **info, + unsigned long *size, u64 *fb_base) +{ + struct efi_graphics_output_protocol_mode_64 *mode; + efi_graphics_output_protocol_query_mode query_mode; + efi_status_t status; + unsigned long m; + + m = gop64->mode; + mode = (struct efi_graphics_output_protocol_mode_64 *)m; + query_mode = (void *)(unsigned long)gop64->query_mode; + + status = __efi_call_early(query_mode, (void *)gop64, mode->mode, size, + info); + if (status != EFI_SUCCESS) + return status; + + *fb_base = mode->frame_buffer_base; + return status; +} + +static efi_status_t +setup_gop64(efi_system_table_t *sys_table_arg, struct screen_info *si, + efi_guid_t *proto, unsigned long size, void **gop_handle) +{ + struct efi_graphics_output_protocol_64 *gop64, *first_gop; + unsigned long nr_gops; + u16 width, height; + u32 pixels_per_scan_line; + u32 ext_lfb_base; + u64 fb_base; + struct efi_pixel_bitmask pixel_info; + int pixel_format; + efi_status_t status = EFI_NOT_FOUND; + u64 *handles = (u64 *)(unsigned long)gop_handle; + int i; + + first_gop = NULL; + gop64 = NULL; + + nr_gops = size / sizeof(u64); + for (i = 0; i < nr_gops; i++) { + struct efi_graphics_output_mode_info *info = NULL; + efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; + bool conout_found = false; + void *dummy = NULL; + efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; + u64 current_fb_base; + + status = efi_call_early(handle_protocol, h, + proto, (void **)&gop64); + if (status != EFI_SUCCESS) + continue; + + status = efi_call_early(handle_protocol, h, + &conout_proto, &dummy); + if (status == EFI_SUCCESS) + conout_found = true; + + status = __gop_query64(sys_table_arg, gop64, &info, &size, + ¤t_fb_base); + if (status == EFI_SUCCESS && (!first_gop || conout_found)) { + /* + * Systems that use the UEFI Console Splitter may + * provide multiple GOP devices, not all of which are + * backed by real hardware. The workaround is to search + * for a GOP implementing the ConOut protocol, and if + * one isn't found, to just fall back to the first GOP. + */ + width = info->horizontal_resolution; + height = info->vertical_resolution; + pixel_format = info->pixel_format; + pixel_info = info->pixel_information; + pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; + + /* + * Once we've found a GOP supporting ConOut, + * don't bother looking any further. + */ + first_gop = gop64; + if (conout_found) + break; + } + } + + /* Did we find any GOPs? */ + if (!first_gop) + goto out; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = width; + si->lfb_height = height; + si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + + si->pages = 1; + + setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); + + si->lfb_size = si->lfb_linelength * si->lfb_height; + + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; +out: + return status; +} + +/* + * See if we have Graphics Output Protocol + */ +efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, + struct screen_info *si, efi_guid_t *proto, + unsigned long size) +{ + efi_status_t status; + void **gop_handle = NULL; + + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, + size, (void **)&gop_handle); + if (status != EFI_SUCCESS) + return status; + + status = efi_call_early(locate_handle, + EFI_LOCATE_BY_PROTOCOL, + proto, NULL, &size, gop_handle); + if (status != EFI_SUCCESS) + goto free_handle; + + if (efi_is_64bit()) { + status = setup_gop64(sys_table_arg, si, proto, size, + gop_handle); + } else { + status = setup_gop32(sys_table_arg, si, proto, size, + gop_handle); + } + +free_handle: + efi_call_early(free_pool, gop_handle); + return status; +} diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c new file mode 100644 index 0000000000000000000000000000000000000000..236004b9a50d3224024d0024b8774b6347995065 --- /dev/null +++ b/drivers/firmware/efi/memattr.c @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2016 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) "efi: memattr: " fmt + +#include +#include +#include +#include + +#include + +static int __initdata tbl_size; + +/* + * Reserve the memory associated with the Memory Attributes configuration + * table, if it exists. + */ +int __init efi_memattr_init(void) +{ + efi_memory_attributes_table_t *tbl; + + if (efi.mem_attr_table == EFI_INVALID_TABLE_ADDR) + return 0; + + tbl = early_memremap(efi.mem_attr_table, sizeof(*tbl)); + if (!tbl) { + pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n", + efi.mem_attr_table); + return -ENOMEM; + } + + if (tbl->version > 1) { + pr_warn("Unexpected EFI Memory Attributes table version %d\n", + tbl->version); + goto unmap; + } + + tbl_size = sizeof(*tbl) + tbl->num_entries * tbl->desc_size; + memblock_reserve(efi.mem_attr_table, tbl_size); + +unmap: + early_memunmap(tbl, sizeof(*tbl)); + return 0; +} + +/* + * Returns a copy @out of the UEFI memory descriptor @in if it is covered + * entirely by a UEFI memory map entry with matching attributes. The virtual + * address of @out is set according to the matching entry that was found. + */ +static bool entry_is_valid(const efi_memory_desc_t *in, efi_memory_desc_t *out) +{ + u64 in_paddr = in->phys_addr; + u64 in_size = in->num_pages << EFI_PAGE_SHIFT; + efi_memory_desc_t *md; + + *out = *in; + + if (in->type != EFI_RUNTIME_SERVICES_CODE && + in->type != EFI_RUNTIME_SERVICES_DATA) { + pr_warn("Entry type should be RuntimeServiceCode/Data\n"); + return false; + } + + if (!(in->attribute & (EFI_MEMORY_RO | EFI_MEMORY_XP))) { + pr_warn("Entry attributes invalid: RO and XP bits both cleared\n"); + return false; + } + + if (PAGE_SIZE > EFI_PAGE_SIZE && + (!PAGE_ALIGNED(in->phys_addr) || + !PAGE_ALIGNED(in->num_pages << EFI_PAGE_SHIFT))) { + /* + * Since arm64 may execute with page sizes of up to 64 KB, the + * UEFI spec mandates that RuntimeServices memory regions must + * be 64 KB aligned. We need to validate this here since we will + * not be able to tighten permissions on such regions without + * affecting adjacent regions. + */ + pr_warn("Entry address region misaligned\n"); + return false; + } + + for_each_efi_memory_desc(md) { + u64 md_paddr = md->phys_addr; + u64 md_size = md->num_pages << EFI_PAGE_SHIFT; + + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + if (md->virt_addr == 0) { + /* no virtual mapping has been installed by the stub */ + break; + } + + if (md_paddr > in_paddr || (in_paddr - md_paddr) >= md_size) + continue; + + /* + * This entry covers the start of @in, check whether + * it covers the end as well. + */ + if (md_paddr + md_size < in_paddr + in_size) { + pr_warn("Entry covers multiple EFI memory map regions\n"); + return false; + } + + if (md->type != in->type) { + pr_warn("Entry type deviates from EFI memory map region type\n"); + return false; + } + + out->virt_addr = in_paddr + (md->virt_addr - md_paddr); + + return true; + } + + pr_warn("No matching entry found in the EFI memory map\n"); + return false; +} + +/* + * To be called after the EFI page tables have been populated. If a memory + * attributes table is available, its contents will be used to update the + * mappings with tightened permissions as described by the table. + * This requires the UEFI memory map to have already been populated with + * virtual addresses. + */ +int __init efi_memattr_apply_permissions(struct mm_struct *mm, + efi_memattr_perm_setter fn) +{ + efi_memory_attributes_table_t *tbl; + int i, ret; + + if (tbl_size <= sizeof(*tbl)) + return 0; + + /* + * We need the EFI memory map to be setup so we can use it to + * lookup the virtual addresses of all entries in the of EFI + * Memory Attributes table. If it isn't available, this + * function should not be called. + */ + if (WARN_ON(!efi_enabled(EFI_MEMMAP))) + return 0; + + tbl = memremap(efi.mem_attr_table, tbl_size, MEMREMAP_WB); + if (!tbl) { + pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n", + efi.mem_attr_table); + return -ENOMEM; + } + + if (efi_enabled(EFI_DBG)) + pr_info("Processing EFI Memory Attributes table:\n"); + + for (i = ret = 0; ret == 0 && i < tbl->num_entries; i++) { + efi_memory_desc_t md; + unsigned long size; + bool valid; + char buf[64]; + + valid = entry_is_valid((void *)tbl->entry + i * tbl->desc_size, + &md); + size = md.num_pages << EFI_PAGE_SHIFT; + if (efi_enabled(EFI_DBG) || !valid) + pr_info("%s 0x%012llx-0x%012llx %s\n", + valid ? "" : "!", md.phys_addr, + md.phys_addr + size - 1, + efi_md_typeattr_format(buf, sizeof(buf), &md)); + + if (valid) + ret = fn(mm, &md); + } + memunmap(tbl); + return ret; +} diff --git a/drivers/firmware/efi/reboot.c b/drivers/firmware/efi/reboot.c index 9c59d1c795d1f2d6787b9a3a248f384e4017e8f1..62ead9b9d871a606af7c0b575f382377f7f82aaa 100644 --- a/drivers/firmware/efi/reboot.c +++ b/drivers/firmware/efi/reboot.c @@ -9,7 +9,8 @@ int efi_reboot_quirk_mode = -1; void efi_reboot(enum reboot_mode reboot_mode, const char *__unused) { - int efi_mode; + const char *str[] = { "cold", "warm", "shutdown", "platform" }; + int efi_mode, cap_reset_mode; if (!efi_enabled(EFI_RUNTIME_SERVICES)) return; @@ -30,6 +31,15 @@ void efi_reboot(enum reboot_mode reboot_mode, const char *__unused) if (efi_reboot_quirk_mode != -1) efi_mode = efi_reboot_quirk_mode; + if (efi_capsule_pending(&cap_reset_mode)) { + if (efi_mode != cap_reset_mode) + printk(KERN_CRIT "efi: %s reset requested but pending " + "capsule update requires %s reset... Performing " + "%s reset.\n", str[efi_mode], str[cap_reset_mode], + str[cap_reset_mode]); + efi_mode = cap_reset_mode; + } + efi.reset_system(efi_mode, EFI_SUCCESS, 0, NULL); } diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index de6953039af65255b1afe36d059fb2893fdcd6f9..23bef6bb73ee58db932e7fd34100e4d850e73a4a 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -16,10 +16,70 @@ #include #include +#include #include #include +#include #include +static void efi_call_virt_check_flags(unsigned long flags, const char *call) +{ + unsigned long cur_flags, mismatch; + + local_save_flags(cur_flags); + + mismatch = flags ^ cur_flags; + if (!WARN_ON_ONCE(mismatch & ARCH_EFI_IRQ_FLAGS_MASK)) + return; + + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_NOW_UNRELIABLE); + pr_err_ratelimited(FW_BUG "IRQ flags corrupted (0x%08lx=>0x%08lx) by EFI %s\n", + flags, cur_flags, call); + local_irq_restore(flags); +} + +/* + * Arch code can implement the following three template macros, avoiding + * reptition for the void/non-void return cases of {__,}efi_call_virt: + * + * * arch_efi_call_virt_setup + * + * Sets up the environment for the call (e.g. switching page tables, + * allowing kernel-mode use of floating point, if required). + * + * * arch_efi_call_virt + * + * Performs the call. The last expression in the macro must be the call + * itself, allowing the logic to be shared by the void and non-void + * cases. + * + * * arch_efi_call_virt_teardown + * + * Restores the usual kernel environment once the call has returned. + */ + +#define efi_call_virt(f, args...) \ +({ \ + efi_status_t __s; \ + unsigned long flags; \ + arch_efi_call_virt_setup(); \ + local_save_flags(flags); \ + __s = arch_efi_call_virt(f, args); \ + efi_call_virt_check_flags(flags, __stringify(f)); \ + arch_efi_call_virt_teardown(); \ + __s; \ +}) + +#define __efi_call_virt(f, args...) \ +({ \ + unsigned long flags; \ + arch_efi_call_virt_setup(); \ + local_save_flags(flags); \ + arch_efi_call_virt(f, args); \ + efi_call_virt_check_flags(flags, __stringify(f)); \ + arch_efi_call_virt_teardown(); \ +}) + /* * According to section 7.1 of the UEFI spec, Runtime Services are not fully * reentrant, and there are particular combinations of calls that need to be diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index 34b741940494a24682e6cbf0e96d3246468a4fd6..d3b75138328665f0be09e2192199831473499502 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -329,39 +329,6 @@ check_var_size_nonblocking(u32 attributes, unsigned long size) return fops->query_variable_store(attributes, size, true); } -static int efi_status_to_err(efi_status_t status) -{ - int err; - - switch (status) { - case EFI_SUCCESS: - err = 0; - break; - case EFI_INVALID_PARAMETER: - err = -EINVAL; - break; - case EFI_OUT_OF_RESOURCES: - err = -ENOSPC; - break; - case EFI_DEVICE_ERROR: - err = -EIO; - break; - case EFI_WRITE_PROTECTED: - err = -EROFS; - break; - case EFI_SECURITY_VIOLATION: - err = -EACCES; - break; - case EFI_NOT_FOUND: - err = -ENOENT; - break; - default: - err = -EINVAL; - } - - return err; -} - static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor, struct list_head *head) { @@ -452,8 +419,7 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, * Returns 0 on success, or a kernel error code on failure. */ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), - void *data, bool atomic, bool duplicates, - struct list_head *head) + void *data, bool duplicates, struct list_head *head) { const struct efivar_operations *ops = __efivars->ops; unsigned long variable_name_size = 1024; @@ -483,7 +449,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), &vendor_guid); switch (status) { case EFI_SUCCESS: - if (!atomic) + if (duplicates) spin_unlock_irq(&__efivars->lock); variable_name_size = var_name_strnsize(variable_name, @@ -498,21 +464,19 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), * and may end up looping here forever. */ if (duplicates && - variable_is_present(variable_name, &vendor_guid, head)) { + variable_is_present(variable_name, &vendor_guid, + head)) { dup_variable_bug(variable_name, &vendor_guid, variable_name_size); - if (!atomic) - spin_lock_irq(&__efivars->lock); - status = EFI_NOT_FOUND; - break; + } else { + err = func(variable_name, vendor_guid, + variable_name_size, data); + if (err) + status = EFI_NOT_FOUND; } - err = func(variable_name, vendor_guid, variable_name_size, data); - if (err) - status = EFI_NOT_FOUND; - - if (!atomic) + if (duplicates) spin_lock_irq(&__efivars->lock); break; diff --git a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c index bf731e9f643e9ecddd367034179fae6a40d294ce..7f85c2c1d68156a4d91bd4b18332df6f1886fb7c 100644 --- a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c +++ b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c @@ -276,8 +276,8 @@ static int amdgpu_atombios_dp_get_dp_link_config(struct drm_connector *connector } } } else { - for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { - for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; if (max_pix_clock >= pix_clock) { *dp_lanes = lane_num; diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c index 6743ff7dccfa30b2997d2529d97747d5261d5ad9..059f7c39c582827c1ad923c3f2fb6a203a235b52 100644 --- a/drivers/gpu/drm/drm_cache.c +++ b/drivers/gpu/drm/drm_cache.c @@ -72,7 +72,7 @@ drm_clflush_pages(struct page *pages[], unsigned long num_pages) { #if defined(CONFIG_X86) - if (cpu_has_clflush) { + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { drm_cache_flush_clflush(pages, num_pages); return; } @@ -105,7 +105,7 @@ void drm_clflush_sg(struct sg_table *st) { #if defined(CONFIG_X86) - if (cpu_has_clflush) { + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { struct sg_page_iter sg_iter; mb(); @@ -129,7 +129,7 @@ void drm_clflush_virt_range(void *addr, unsigned long length) { #if defined(CONFIG_X86) - if (cpu_has_clflush) { + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { const int size = boot_cpu_data.x86_clflush_size; void *end = addr + length; addr = (void *)(((unsigned long)addr) & -size); diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index a0f1bd711b533910ce00a2bbe642591419c4d198..e3f4c725a1c6910f3431431d9a2dbc538153f025 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2872,20 +2872,6 @@ static void intel_dp_info(struct seq_file *m, intel_panel_info(m, &intel_connector->panel); } -static void intel_dp_mst_info(struct seq_file *m, - struct intel_connector *intel_connector) -{ - struct intel_encoder *intel_encoder = intel_connector->encoder; - struct intel_dp_mst_encoder *intel_mst = - enc_to_mst(&intel_encoder->base); - struct intel_digital_port *intel_dig_port = intel_mst->primary; - struct intel_dp *intel_dp = &intel_dig_port->dp; - bool has_audio = drm_dp_mst_port_has_audio(&intel_dp->mst_mgr, - intel_connector->port); - - seq_printf(m, "\taudio support: %s\n", yesno(has_audio)); -} - static void intel_hdmi_info(struct seq_file *m, struct intel_connector *intel_connector) { @@ -2929,8 +2915,6 @@ static void intel_connector_info(struct seq_file *m, intel_hdmi_info(m, intel_connector); else if (intel_encoder->type == INTEL_OUTPUT_LVDS) intel_lvds_info(m, intel_connector); - else if (intel_encoder->type == INTEL_OUTPUT_DP_MST) - intel_dp_mst_info(m, intel_connector); } seq_printf(m, "\tmodes:\n"); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index dabc08987b5e20389fa6b57872902bd4706d71cc..f2cb9a9539ee066ef1ee2a6a5ecb666830cec0d9 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1732,7 +1732,7 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data, if (args->flags & ~(I915_MMAP_WC)) return -EINVAL; - if (args->flags & I915_MMAP_WC && !cpu_has_pat) + if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT)) return -ENODEV; obj = drm_gem_object_lookup(dev, file, args->handle); diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 1328bc5021b4cf7287021c81a4252f2cf60cdc2a..b845f468dd74f3b2500d3f1fb059662b6432664d 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -488,7 +488,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj, ret = relocate_entry_cpu(obj, reloc, target_offset); else if (obj->map_and_fenceable) ret = relocate_entry_gtt(obj, reloc, target_offset); - else if (cpu_has_clflush) + else if (static_cpu_has(X86_FEATURE_CLFLUSH)) ret = relocate_entry_clflush(obj, reloc, target_offset); else { WARN_ONCE(1, "Impossible case in relocation handling\n"); diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index fffdac801d3b0da03abdd65b2c90e50fb1913e54..363bd79dea2ef476bdf0f14886248175310720f6 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -7444,6 +7444,8 @@ enum skl_disp_power_wells { #define TRANS_CLK_SEL_DISABLED (0x0<<29) #define TRANS_CLK_SEL_PORT(x) (((x)+1)<<29) +#define CDCLK_FREQ _MMIO(0x46200) + #define _TRANSA_MSA_MISC 0x60410 #define _TRANSB_MSA_MISC 0x61410 #define _TRANSC_MSA_MISC 0x62410 diff --git a/drivers/gpu/drm/i915/intel_audio.c b/drivers/gpu/drm/i915/intel_audio.c index 30f921421b0c944217832ba86856a6904f8fef11..7d281b40064a47f7e46071ef897c4720da38f9d8 100644 --- a/drivers/gpu/drm/i915/intel_audio.c +++ b/drivers/gpu/drm/i915/intel_audio.c @@ -262,8 +262,7 @@ static void hsw_audio_codec_disable(struct intel_encoder *encoder) tmp |= AUD_CONFIG_N_PROG_ENABLE; tmp &= ~AUD_CONFIG_UPPER_N_MASK; tmp &= ~AUD_CONFIG_LOWER_N_MASK; - if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT) || - intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DP_MST)) + if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT)) tmp |= AUD_CONFIG_N_VALUE_INDEX; I915_WRITE(HSW_AUD_CFG(pipe), tmp); @@ -476,8 +475,7 @@ static void ilk_audio_codec_enable(struct drm_connector *connector, tmp &= ~AUD_CONFIG_N_VALUE_INDEX; tmp &= ~AUD_CONFIG_N_PROG_ENABLE; tmp &= ~AUD_CONFIG_PIXEL_CLOCK_HDMI_MASK; - if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT) || - intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DP_MST)) + if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT)) tmp |= AUD_CONFIG_N_VALUE_INDEX; else tmp |= audio_config_hdmi_pixel_clock(adjusted_mode); @@ -515,8 +513,7 @@ void intel_audio_codec_enable(struct intel_encoder *intel_encoder) /* ELD Conn_Type */ connector->eld[5] &= ~(3 << 2); - if (intel_pipe_has_type(crtc, INTEL_OUTPUT_DISPLAYPORT) || - intel_pipe_has_type(crtc, INTEL_OUTPUT_DP_MST)) + if (intel_pipe_has_type(crtc, INTEL_OUTPUT_DISPLAYPORT)) connector->eld[5] |= (1 << 2); connector->eld[6] = drm_av_sync_delay(connector, adjusted_mode) / 2; diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c index 505fc5cf26f845217b5bd58c8fe0a865bde5170d..0364292367b1425a297cd35e6ae81da2cf18dbc3 100644 --- a/drivers/gpu/drm/i915/intel_crt.c +++ b/drivers/gpu/drm/i915/intel_crt.c @@ -257,8 +257,14 @@ static bool intel_crt_compute_config(struct intel_encoder *encoder, pipe_config->has_pch_encoder = true; /* LPT FDI RX only supports 8bpc. */ - if (HAS_PCH_LPT(dev)) + if (HAS_PCH_LPT(dev)) { + if (pipe_config->bw_constrained && pipe_config->pipe_bpp < 24) { + DRM_DEBUG_KMS("LPT only supports 24bpp\n"); + return false; + } + pipe_config->pipe_bpp = 24; + } /* FDI must always be 2.7 GHz */ if (HAS_DDI(dev)) { diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c index 3b57bf06abe8598c1c3b6fba0dc8e19b7f192619..96ffcc541e17697b69bad69fb24fe5bec3866aab 100644 --- a/drivers/gpu/drm/i915/intel_ddi.c +++ b/drivers/gpu/drm/i915/intel_ddi.c @@ -3106,23 +3106,6 @@ void intel_ddi_fdi_disable(struct drm_crtc *crtc) I915_WRITE(FDI_RX_CTL(PIPE_A), val); } -bool intel_ddi_is_audio_enabled(struct drm_i915_private *dev_priv, - struct intel_crtc *intel_crtc) -{ - u32 temp; - - if (intel_display_power_get_if_enabled(dev_priv, POWER_DOMAIN_AUDIO)) { - temp = I915_READ(HSW_AUD_PIN_ELD_CP_VLD); - - intel_display_power_put(dev_priv, POWER_DOMAIN_AUDIO); - - if (temp & AUDIO_OUTPUT_ENABLE(intel_crtc->pipe)) - return true; - } - - return false; -} - void intel_ddi_get_config(struct intel_encoder *encoder, struct intel_crtc_state *pipe_config) { @@ -3183,8 +3166,11 @@ void intel_ddi_get_config(struct intel_encoder *encoder, break; } - pipe_config->has_audio = - intel_ddi_is_audio_enabled(dev_priv, intel_crtc); + if (intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_AUDIO)) { + temp = I915_READ(HSW_AUD_PIN_ELD_CP_VLD); + if (temp & AUDIO_OUTPUT_ENABLE(intel_crtc->pipe)) + pipe_config->has_audio = true; + } if (encoder->type == INTEL_OUTPUT_EDP && dev_priv->vbt.edp_bpp && pipe_config->pipe_bpp > dev_priv->vbt.edp_bpp) { diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 182f84937345d5b3d778d9521ca17f4b04608037..0104a06d01fd6617f54bedf68353935668eee45b 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -7988,9 +7988,6 @@ static void i9xx_get_pfit_config(struct intel_crtc *crtc, pipe_config->gmch_pfit.control = tmp; pipe_config->gmch_pfit.pgm_ratios = I915_READ(PFIT_PGM_RATIOS); - if (INTEL_INFO(dev)->gen < 5) - pipe_config->gmch_pfit.lvds_border_bits = - I915_READ(LVDS) & LVDS_BORDER_ENABLE; } static void vlv_crtc_clock_get(struct intel_crtc *crtc, @@ -9752,6 +9749,8 @@ static void broadwell_set_cdclk(struct drm_device *dev, int cdclk) sandybridge_pcode_write(dev_priv, HSW_PCODE_DE_WRITE_FREQ_REQ, data); mutex_unlock(&dev_priv->rps.hw_lock); + I915_WRITE(CDCLK_FREQ, DIV_ROUND_CLOSEST(cdclk, 1000) - 1); + intel_update_cdclk(dev); WARN(cdclk != dev_priv->cdclk_freq, diff --git a/drivers/gpu/drm/i915/intel_dp_mst.c b/drivers/gpu/drm/i915/intel_dp_mst.c index 937e77228466eb22e66ac765d001078a418badfd..2c999725b3d4b3ac10d9599227e00a1b392ae4f2 100644 --- a/drivers/gpu/drm/i915/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/intel_dp_mst.c @@ -78,8 +78,6 @@ static bool intel_dp_mst_compute_config(struct intel_encoder *encoder, return false; } - if (drm_dp_mst_port_has_audio(&intel_dp->mst_mgr, found->port)) - pipe_config->has_audio = true; mst_pbn = drm_dp_calc_pbn_mode(adjusted_mode->crtc_clock, bpp); pipe_config->pbn = mst_pbn; @@ -104,11 +102,6 @@ static void intel_mst_disable_dp(struct intel_encoder *encoder) struct intel_dp_mst_encoder *intel_mst = enc_to_mst(&encoder->base); struct intel_digital_port *intel_dig_port = intel_mst->primary; struct intel_dp *intel_dp = &intel_dig_port->dp; - struct drm_device *dev = encoder->base.dev; - struct drm_i915_private *dev_priv = dev->dev_private; - struct drm_crtc *crtc = encoder->base.crtc; - struct intel_crtc *intel_crtc = to_intel_crtc(crtc); - int ret; DRM_DEBUG_KMS("%d\n", intel_dp->active_mst_links); @@ -119,10 +112,6 @@ static void intel_mst_disable_dp(struct intel_encoder *encoder) if (ret) { DRM_ERROR("failed to update payload %d\n", ret); } - if (intel_crtc->config->has_audio) { - intel_audio_codec_disable(encoder); - intel_display_power_put(dev_priv, POWER_DOMAIN_AUDIO); - } } static void intel_mst_post_disable_dp(struct intel_encoder *encoder) @@ -221,7 +210,6 @@ static void intel_mst_enable_dp(struct intel_encoder *encoder) struct intel_dp *intel_dp = &intel_dig_port->dp; struct drm_device *dev = intel_dig_port->base.base.dev; struct drm_i915_private *dev_priv = dev->dev_private; - struct intel_crtc *crtc = to_intel_crtc(encoder->base.crtc); enum port port = intel_dig_port->port; int ret; @@ -234,13 +222,6 @@ static void intel_mst_enable_dp(struct intel_encoder *encoder) ret = drm_dp_check_act_status(&intel_dp->mst_mgr); ret = drm_dp_update_payload_part2(&intel_dp->mst_mgr); - - if (crtc->config->has_audio) { - DRM_DEBUG_DRIVER("Enabling DP audio on pipe %c\n", - pipe_name(crtc->pipe)); - intel_display_power_get(dev_priv, POWER_DOMAIN_AUDIO); - intel_audio_codec_enable(encoder); - } } static bool intel_dp_mst_enc_get_hw_state(struct intel_encoder *encoder, @@ -266,9 +247,6 @@ static void intel_dp_mst_enc_get_config(struct intel_encoder *encoder, pipe_config->has_dp_encoder = true; - pipe_config->has_audio = - intel_ddi_is_audio_enabled(dev_priv, crtc); - temp = I915_READ(TRANS_DDI_FUNC_CTL(cpu_transcoder)); if (temp & TRANS_DDI_PHSYNC) flags |= DRM_MODE_FLAG_PHSYNC; diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 7d3af3a72abea7ac557f5f107aaf597bc38d5165..9d0770c23fdece738575cc70a372353b7ea6284e 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -1019,8 +1019,6 @@ void intel_ddi_set_pipe_settings(struct drm_crtc *crtc); void intel_ddi_prepare_link_retrain(struct intel_dp *intel_dp); bool intel_ddi_connector_get_hw_state(struct intel_connector *intel_connector); void intel_ddi_fdi_disable(struct drm_crtc *crtc); -bool intel_ddi_is_audio_enabled(struct drm_i915_private *dev_priv, - struct intel_crtc *intel_crtc); void intel_ddi_get_config(struct intel_encoder *encoder, struct intel_crtc_state *pipe_config); struct intel_encoder * diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index cd9fe609aefbc2487ce94ab01e0d495dd12d8167..10dc3517b63b32921437bcff3d2352829061c4ed 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -123,6 +123,10 @@ static void intel_lvds_get_config(struct intel_encoder *encoder, pipe_config->base.adjusted_mode.flags |= flags; + if (INTEL_INFO(dev)->gen < 5) + pipe_config->gmch_pfit.lvds_border_bits = + tmp & LVDS_BORDER_ENABLE; + /* gen2/3 store dither state in pfit control, needs to match */ if (INTEL_INFO(dev)->gen < 4) { tmp = I915_READ(PFIT_CONTROL); diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 8ed3cf34f82d31bbefa98847d1413f625e2b2f74..3425d8e737b344ec8d43bdeab38957e2667d00bf 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -6646,6 +6646,12 @@ static void broadwell_init_clock_gating(struct drm_device *dev) misccpctl = I915_READ(GEN7_MISCCPCTL); I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE); I915_WRITE(GEN8_L3SQCREG1, BDW_WA_L3SQCREG1_DEFAULT); + /* + * Wait at least 100 clocks before re-enabling clock gating. See + * the definition of L3SQCREG1 in BSpec. + */ + POSTING_READ(GEN8_L3SQCREG1); + udelay(1); I915_WRITE(GEN7_MISCCPCTL, misccpctl); /* diff --git a/drivers/gpu/drm/radeon/atombios_crtc.c b/drivers/gpu/drm/radeon/atombios_crtc.c index b80b08f71cb46e8d69d7bd94f6d951008267500e..532127c55de64197698336c92d602b0ff3043071 100644 --- a/drivers/gpu/drm/radeon/atombios_crtc.c +++ b/drivers/gpu/drm/radeon/atombios_crtc.c @@ -1742,6 +1742,7 @@ static u32 radeon_get_pll_use_mask(struct drm_crtc *crtc) static int radeon_get_shared_dp_ppll(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; + struct radeon_device *rdev = dev->dev_private; struct drm_crtc *test_crtc; struct radeon_crtc *test_radeon_crtc; @@ -1751,6 +1752,10 @@ static int radeon_get_shared_dp_ppll(struct drm_crtc *crtc) test_radeon_crtc = to_radeon_crtc(test_crtc); if (test_radeon_crtc->encoder && ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_radeon_crtc->encoder))) { + /* PPLL2 is exclusive to UNIPHYA on DCE61 */ + if (ASIC_IS_DCE61(rdev) && !ASIC_IS_DCE8(rdev) && + test_radeon_crtc->pll_id == ATOM_PPLL2) + continue; /* for DP use the same PLL for all */ if (test_radeon_crtc->pll_id != ATOM_PPLL_INVALID) return test_radeon_crtc->pll_id; @@ -1772,6 +1777,7 @@ static int radeon_get_shared_nondp_ppll(struct drm_crtc *crtc) { struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc); struct drm_device *dev = crtc->dev; + struct radeon_device *rdev = dev->dev_private; struct drm_crtc *test_crtc; struct radeon_crtc *test_radeon_crtc; u32 adjusted_clock, test_adjusted_clock; @@ -1787,6 +1793,10 @@ static int radeon_get_shared_nondp_ppll(struct drm_crtc *crtc) test_radeon_crtc = to_radeon_crtc(test_crtc); if (test_radeon_crtc->encoder && !ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_radeon_crtc->encoder))) { + /* PPLL2 is exclusive to UNIPHYA on DCE61 */ + if (ASIC_IS_DCE61(rdev) && !ASIC_IS_DCE8(rdev) && + test_radeon_crtc->pll_id == ATOM_PPLL2) + continue; /* check if we are already driving this connector with another crtc */ if (test_radeon_crtc->connector == radeon_crtc->connector) { /* if we are, return that pll */ diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c index afa9db1dc0e3dfcc14f51bf3e74b2f2d3c5702c5..cead089a9e7d2ea1bc78046aeed4d6cb7e8a02d6 100644 --- a/drivers/gpu/drm/radeon/atombios_dp.c +++ b/drivers/gpu/drm/radeon/atombios_dp.c @@ -326,8 +326,8 @@ int radeon_dp_get_dp_link_config(struct drm_connector *connector, } } } else { - for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { - for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; if (max_pix_clock >= pix_clock) { *dp_lanes = lane_num; diff --git a/drivers/gpu/drm/radeon/radeon_dp_auxch.c b/drivers/gpu/drm/radeon/radeon_dp_auxch.c index 3b0c229d7dcd23ffb7184ad79e2ebaa8f001cca9..db64e0062689b076842b9710c33e4660c96e9985 100644 --- a/drivers/gpu/drm/radeon/radeon_dp_auxch.c +++ b/drivers/gpu/drm/radeon/radeon_dp_auxch.c @@ -105,7 +105,7 @@ radeon_dp_aux_transfer_native(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg tmp &= AUX_HPD_SEL(0x7); tmp |= AUX_HPD_SEL(chan->rec.hpd); - tmp |= AUX_EN | AUX_LS_READ_EN; + tmp |= AUX_EN | AUX_LS_READ_EN | AUX_HPD_DISCON(0x1); WREG32(AUX_CONTROL + aux_offset[instance], tmp); diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index 6f8b084e13d0724e77c68c249a227fdb6377a02d..3d8ff09eba57696677b242d29e33fb1bac3f592d 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -143,9 +143,9 @@ struct analog_port { #include -#define GET_TIME(x) do { if (cpu_has_tsc) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0) -#define DELTA(x,y) (cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0))) -#define TIME_NAME (cpu_has_tsc?"TSC":"PIT") +#define GET_TIME(x) do { if (boot_cpu_has(X86_FEATURE_TSC)) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0) +#define DELTA(x,y) (boot_cpu_has(X86_FEATURE_TSC) ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0))) +#define TIME_NAME (boot_cpu_has(X86_FEATURE_TSC)?"TSC":"PIT") static unsigned int get_time_pit(void) { unsigned long flags; diff --git a/drivers/input/misc/max8997_haptic.c b/drivers/input/misc/max8997_haptic.c index a806ba3818f7267dd2036dd870c4f60ca28dc0d5..8d6326d7e7beaf1875bb95af385dff1285f10b0e 100644 --- a/drivers/input/misc/max8997_haptic.c +++ b/drivers/input/misc/max8997_haptic.c @@ -255,12 +255,14 @@ static int max8997_haptic_probe(struct platform_device *pdev) struct max8997_dev *iodev = dev_get_drvdata(pdev->dev.parent); const struct max8997_platform_data *pdata = dev_get_platdata(iodev->dev); - const struct max8997_haptic_platform_data *haptic_pdata = - pdata->haptic_pdata; + const struct max8997_haptic_platform_data *haptic_pdata = NULL; struct max8997_haptic *chip; struct input_dev *input_dev; int error; + if (pdata) + haptic_pdata = pdata->haptic_pdata; + if (!haptic_pdata) { dev_err(&pdev->dev, "no haptic platform data\n"); return -EINVAL; diff --git a/drivers/input/misc/twl6040-vibra.c b/drivers/input/misc/twl6040-vibra.c index df3581f606282a2890b8ce78cf13d9e4882c3c0a..42de34b9299633f5104f8127e3c8bb50ec83f294 100644 --- a/drivers/input/misc/twl6040-vibra.c +++ b/drivers/input/misc/twl6040-vibra.c @@ -257,6 +257,7 @@ static int twl6040_vibra_probe(struct platform_device *pdev) int vddvibr_uV = 0; int error; + of_node_get(twl6040_core_dev->of_node); twl6040_core_node = of_find_node_by_name(twl6040_core_dev->of_node, "vibra"); if (!twl6040_core_node) { diff --git a/drivers/input/mouse/byd.c b/drivers/input/mouse/byd.c index fdc243ca93ed7c50c89c18de6477e6b3c6d44a6f..e583f8b504549c6415f697f5d746a38e440dc556 100644 --- a/drivers/input/mouse/byd.c +++ b/drivers/input/mouse/byd.c @@ -2,6 +2,10 @@ * BYD TouchPad PS/2 mouse driver * * Copyright (C) 2015 Chris Diamand + * Copyright (C) 2015 Richard Pospesel + * Copyright (C) 2015 Tai Chi Minh Ralph Eastwood + * Copyright (C) 2015 Martin Wimpress + * Copyright (C) 2015 Jay Kuri * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published by diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 8adaaeae32681d863c568eddbfd9af5e12bdc205..49721b4e1975c3c1b038665aa9f18749d95e06e6 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -36,7 +36,7 @@ static void irq_remapping_disable_io_apic(void) * As this gets called during crash dump, keep this simple for * now. */ - if (cpu_has_apic || apic_from_smp_config()) + if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config()) disconnect_bsp_APIC(0); } diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index adc162c7040d7ef0a2f8f738e7a21bab1d57ba09..6e9042e3d2a944db17c37bf9f3fda675c1f0bdd6 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -603,7 +603,7 @@ void __init lguest_arch_host_init(void) * doing this. */ get_online_cpus(); - if (cpu_has_pge) { /* We have a broader idea of "global". */ + if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). */ cpu_had_pge = 1; /* diff --git a/drivers/media/v4l2-core/videobuf2-v4l2.c b/drivers/media/v4l2-core/videobuf2-v4l2.c index 7f366f1b0377a3557201a1bf64470472a29d5658..0b1b8c7b6ce51e69cd2c7f9e8c3eec7f6b6744eb 100644 --- a/drivers/media/v4l2-core/videobuf2-v4l2.c +++ b/drivers/media/v4l2-core/videobuf2-v4l2.c @@ -74,11 +74,6 @@ static int __verify_planes_array(struct vb2_buffer *vb, const struct v4l2_buffer return 0; } -static int __verify_planes_array_core(struct vb2_buffer *vb, const void *pb) -{ - return __verify_planes_array(vb, pb); -} - /** * __verify_length() - Verify that the bytesused value for each plane fits in * the plane length and that the data offset doesn't exceed the bytesused value. @@ -442,7 +437,6 @@ static int __fill_vb2_buffer(struct vb2_buffer *vb, } static const struct vb2_buf_ops v4l2_buf_ops = { - .verify_planes_array = __verify_planes_array_core, .fill_user_buffer = __fill_v4l2_buffer, .fill_vb2_buffer = __fill_vb2_buffer, .copy_timestamp = __copy_timestamp, diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c index 967b9dd24fe93d3f898b23e63c411bedf3d2c907..030769018461b5ddca59e5dc62ab6fd3ab35d97f 100644 --- a/drivers/misc/sgi-gru/grukservices.c +++ b/drivers/misc/sgi-gru/grukservices.c @@ -718,8 +718,8 @@ static int send_message_queue_full(void *cb, struct gru_message_queue_desc *mqd, static int send_message_put_nacked(void *cb, struct gru_message_queue_desc *mqd, void *mesg, int lines) { - unsigned long m, *val = mesg, gpa, save; - int ret; + unsigned long m; + int ret, loops = 200; /* experimentally determined */ m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6); if (lines == 2) { @@ -735,22 +735,28 @@ static int send_message_put_nacked(void *cb, struct gru_message_queue_desc *mqd, return MQE_OK; /* - * Send a cross-partition interrupt to the SSI that contains the target - * message queue. Normally, the interrupt is automatically delivered by - * hardware but some error conditions require explicit delivery. - * Use the GRU to deliver the interrupt. Otherwise partition failures + * Send a noop message in order to deliver a cross-partition interrupt + * to the SSI that contains the target message queue. Normally, the + * interrupt is automatically delivered by hardware following mesq + * operations, but some error conditions require explicit delivery. + * The noop message will trigger delivery. Otherwise partition failures * could cause unrecovered errors. */ - gpa = uv_global_gru_mmr_address(mqd->interrupt_pnode, UVH_IPI_INT); - save = *val; - *val = uv_hub_ipi_value(mqd->interrupt_apicid, mqd->interrupt_vector, - dest_Fixed); - gru_vstore_phys(cb, gpa, gru_get_tri(mesg), IAA_REGISTER, IMA); - ret = gru_wait(cb); - *val = save; - if (ret != CBS_IDLE) - return MQE_UNEXPECTED_CB_ERR; - return MQE_OK; + do { + ret = send_noop_message(cb, mqd, mesg); + } while ((ret == MQIE_AGAIN || ret == MQE_CONGESTION) && (loops-- > 0)); + + if (ret == MQIE_AGAIN || ret == MQE_CONGESTION) { + /* + * Don't indicate to the app to resend the message, as it's + * already been successfully sent. We simply send an OK + * (rather than fail the send with MQE_UNEXPECTED_CB_ERR), + * assuming that the other side is receiving enough + * interrupts to get this message processed anyway. + */ + ret = MQE_OK; + } + return ret; } /* diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c index b212488606da4877117e7e9f617a169f2d62f282..11be8044e0d7b9e62f0ac2494925e83d21c3aeab 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c @@ -43,6 +43,7 @@ static void xgene_cle_idt_to_hw(u32 dstqid, u32 fpsel, static void xgene_cle_dbptr_to_hw(struct xgene_enet_pdata *pdata, struct xgene_cle_dbptr *dbptr, u32 *buf) { + buf[0] = SET_VAL(CLE_DROP, dbptr->drop); buf[4] = SET_VAL(CLE_FPSEL, dbptr->fpsel) | SET_VAL(CLE_DSTQIDL, dbptr->dstqid); @@ -412,7 +413,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .branch = { { /* IPV4 */ - .valid = 0, + .valid = 1, .next_packet_pointer = 22, .jump_bw = JMP_FW, .jump_rel = JMP_ABS, @@ -420,7 +421,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .next_node = PKT_PROT_NODE, .next_branch = 0, .data = 0x8, - .mask = 0xffff + .mask = 0x0 }, { .valid = 0, @@ -456,7 +457,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .next_node = RSS_IPV4_TCP_NODE, .next_branch = 0, .data = 0x0600, - .mask = 0xffff + .mask = 0x00ff }, { /* UDP */ @@ -468,7 +469,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .next_node = RSS_IPV4_UDP_NODE, .next_branch = 0, .data = 0x1100, - .mask = 0xffff + .mask = 0x00ff }, { .valid = 0, @@ -642,7 +643,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) { /* TCP DST Port */ .valid = 0, - .next_packet_pointer = 256, + .next_packet_pointer = 258, .jump_bw = JMP_FW, .jump_rel = JMP_ABS, .operation = EQT, diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h index 29a17abdd828174cb3f5e4fad10458c0df26efca..3bf90683240e1b3d759e67d13d7071aa4e08c020 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h @@ -83,6 +83,8 @@ #define CLE_TYPE_POS 0 #define CLE_TYPE_LEN 2 +#define CLE_DROP_POS 28 +#define CLE_DROP_LEN 1 #define CLE_DSTQIDL_POS 25 #define CLE_DSTQIDL_LEN 7 #define CLE_DSTQIDH_POS 0 diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c index 39e081a70f5b4f5bb9b674d7fe9c470f6d807b80..513d2a62ee6dae1da339c030f4e59453334d39d1 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c @@ -219,27 +219,30 @@ void xgene_enet_parse_error(struct xgene_enet_desc_ring *ring, struct xgene_enet_pdata *pdata, enum xgene_enet_err_code status) { - struct rtnl_link_stats64 *stats = &pdata->stats; - switch (status) { case INGRESS_CRC: - stats->rx_crc_errors++; + ring->rx_crc_errors++; + ring->rx_dropped++; break; case INGRESS_CHECKSUM: case INGRESS_CHECKSUM_COMPUTE: - stats->rx_errors++; + ring->rx_errors++; + ring->rx_dropped++; break; case INGRESS_TRUNC_FRAME: - stats->rx_frame_errors++; + ring->rx_frame_errors++; + ring->rx_dropped++; break; case INGRESS_PKT_LEN: - stats->rx_length_errors++; + ring->rx_length_errors++; + ring->rx_dropped++; break; case INGRESS_PKT_UNDER: - stats->rx_frame_errors++; + ring->rx_frame_errors++; + ring->rx_dropped++; break; case INGRESS_FIFO_OVERRUN: - stats->rx_fifo_errors++; + ring->rx_fifo_errors++; break; default: break; diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h index ba7da98af2efb48c2a5811244ea4776185ce2534..45220be3122f99ecf50c96e5f83fb6c8399108ac 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h @@ -86,7 +86,7 @@ enum xgene_enet_rm { #define RINGADDRL_POS 5 #define RINGADDRL_LEN 27 #define RINGADDRH_POS 0 -#define RINGADDRH_LEN 6 +#define RINGADDRH_LEN 7 #define RINGSIZE_POS 23 #define RINGSIZE_LEN 3 #define RINGTYPE_POS 19 @@ -94,9 +94,9 @@ enum xgene_enet_rm { #define RINGMODE_POS 20 #define RINGMODE_LEN 3 #define RECOMTIMEOUTL_POS 28 -#define RECOMTIMEOUTL_LEN 3 +#define RECOMTIMEOUTL_LEN 4 #define RECOMTIMEOUTH_POS 0 -#define RECOMTIMEOUTH_LEN 2 +#define RECOMTIMEOUTH_LEN 3 #define NUMMSGSINQ_POS 1 #define NUMMSGSINQ_LEN 16 #define ACCEPTLERR BIT(19) @@ -201,6 +201,8 @@ enum xgene_enet_rm { #define USERINFO_LEN 32 #define FPQNUM_POS 32 #define FPQNUM_LEN 12 +#define ELERR_POS 46 +#define ELERR_LEN 2 #define NV_POS 50 #define NV_LEN 1 #define LL_POS 51 diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index 99d7e580e16625bb0232d507ac4090489fdfab8f..fd200883d228eebb5063dc085858906ee23a372a 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -443,8 +443,8 @@ static netdev_tx_t xgene_enet_start_xmit(struct sk_buff *skb, skb_tx_timestamp(skb); - pdata->stats.tx_packets++; - pdata->stats.tx_bytes += skb->len; + tx_ring->tx_packets++; + tx_ring->tx_bytes += skb->len; pdata->ring_ops->wr_cmd(tx_ring, count); return NETDEV_TX_OK; @@ -483,12 +483,12 @@ static int xgene_enet_rx_frame(struct xgene_enet_desc_ring *rx_ring, skb = buf_pool->rx_skb[skb_index]; /* checking for error */ - status = GET_VAL(LERR, le64_to_cpu(raw_desc->m0)); + status = (GET_VAL(ELERR, le64_to_cpu(raw_desc->m0)) << LERR_LEN) || + GET_VAL(LERR, le64_to_cpu(raw_desc->m0)); if (unlikely(status > 2)) { dev_kfree_skb_any(skb); xgene_enet_parse_error(rx_ring, netdev_priv(rx_ring->ndev), status); - pdata->stats.rx_dropped++; ret = -EIO; goto out; } @@ -506,8 +506,8 @@ static int xgene_enet_rx_frame(struct xgene_enet_desc_ring *rx_ring, xgene_enet_skip_csum(skb); } - pdata->stats.rx_packets++; - pdata->stats.rx_bytes += datalen; + rx_ring->rx_packets++; + rx_ring->rx_bytes += datalen; napi_gro_receive(&rx_ring->napi, skb); out: if (--rx_ring->nbufpool == 0) { @@ -630,7 +630,7 @@ static int xgene_enet_register_irq(struct net_device *ndev) ring = pdata->rx_ring[i]; irq_set_status_flags(ring->irq, IRQ_DISABLE_UNLAZY); ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq, - IRQF_SHARED, ring->irq_name, ring); + 0, ring->irq_name, ring); if (ret) { netdev_err(ndev, "Failed to request irq %s\n", ring->irq_name); @@ -641,7 +641,7 @@ static int xgene_enet_register_irq(struct net_device *ndev) ring = pdata->tx_ring[i]->cp_ring; irq_set_status_flags(ring->irq, IRQ_DISABLE_UNLAZY); ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq, - IRQF_SHARED, ring->irq_name, ring); + 0, ring->irq_name, ring); if (ret) { netdev_err(ndev, "Failed to request irq %s\n", ring->irq_name); @@ -1114,12 +1114,31 @@ static struct rtnl_link_stats64 *xgene_enet_get_stats64( { struct xgene_enet_pdata *pdata = netdev_priv(ndev); struct rtnl_link_stats64 *stats = &pdata->stats; + struct xgene_enet_desc_ring *ring; + int i; - stats->rx_errors += stats->rx_length_errors + - stats->rx_crc_errors + - stats->rx_frame_errors + - stats->rx_fifo_errors; - memcpy(storage, &pdata->stats, sizeof(struct rtnl_link_stats64)); + memset(stats, 0, sizeof(struct rtnl_link_stats64)); + for (i = 0; i < pdata->txq_cnt; i++) { + ring = pdata->tx_ring[i]; + if (ring) { + stats->tx_packets += ring->tx_packets; + stats->tx_bytes += ring->tx_bytes; + } + } + + for (i = 0; i < pdata->rxq_cnt; i++) { + ring = pdata->rx_ring[i]; + if (ring) { + stats->rx_packets += ring->rx_packets; + stats->rx_bytes += ring->rx_bytes; + stats->rx_errors += ring->rx_length_errors + + ring->rx_crc_errors + + ring->rx_frame_errors + + ring->rx_fifo_errors; + stats->rx_dropped += ring->rx_dropped; + } + } + memcpy(storage, stats, sizeof(struct rtnl_link_stats64)); return storage; } @@ -1234,6 +1253,13 @@ static int xgene_enet_get_irqs(struct xgene_enet_pdata *pdata) for (i = 0; i < max_irqs; i++) { ret = platform_get_irq(pdev, i); if (ret <= 0) { + if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) { + max_irqs = i; + pdata->rxq_cnt = max_irqs / 2; + pdata->txq_cnt = max_irqs / 2; + pdata->cq_cnt = max_irqs / 2; + break; + } dev_err(dev, "Unable to get ENET IRQ\n"); ret = ret ? : -ENXIO; return ret; @@ -1437,19 +1463,28 @@ static void xgene_enet_setup_ops(struct xgene_enet_pdata *pdata) pdata->port_ops = &xgene_xgport_ops; pdata->cle_ops = &xgene_cle3in_ops; pdata->rm = RM0; - pdata->rxq_cnt = XGENE_NUM_RX_RING; - pdata->txq_cnt = XGENE_NUM_TX_RING; - pdata->cq_cnt = XGENE_NUM_TXC_RING; + if (!pdata->rxq_cnt) { + pdata->rxq_cnt = XGENE_NUM_RX_RING; + pdata->txq_cnt = XGENE_NUM_TX_RING; + pdata->cq_cnt = XGENE_NUM_TXC_RING; + } break; } if (pdata->enet_id == XGENE_ENET1) { switch (pdata->port_id) { case 0: - pdata->cpu_bufnum = START_CPU_BUFNUM_0; - pdata->eth_bufnum = START_ETH_BUFNUM_0; - pdata->bp_bufnum = START_BP_BUFNUM_0; - pdata->ring_num = START_RING_NUM_0; + if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) { + pdata->cpu_bufnum = X2_START_CPU_BUFNUM_0; + pdata->eth_bufnum = X2_START_ETH_BUFNUM_0; + pdata->bp_bufnum = X2_START_BP_BUFNUM_0; + pdata->ring_num = START_RING_NUM_0; + } else { + pdata->cpu_bufnum = START_CPU_BUFNUM_0; + pdata->eth_bufnum = START_ETH_BUFNUM_0; + pdata->bp_bufnum = START_BP_BUFNUM_0; + pdata->ring_num = START_RING_NUM_0; + } break; case 1: if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) { diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h index 175d18890c7a7da01895d7970ef653ce0170c01a..9d9cf445148c165d22376958d3552881d474bc24 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h @@ -49,10 +49,10 @@ #define XGENE_ENET_MSS 1448 #define XGENE_MIN_ENET_FRAME_SIZE 60 -#define XGENE_MAX_ENET_IRQ 8 -#define XGENE_NUM_RX_RING 4 -#define XGENE_NUM_TX_RING 4 -#define XGENE_NUM_TXC_RING 4 +#define XGENE_MAX_ENET_IRQ 16 +#define XGENE_NUM_RX_RING 8 +#define XGENE_NUM_TX_RING 8 +#define XGENE_NUM_TXC_RING 8 #define START_CPU_BUFNUM_0 0 #define START_ETH_BUFNUM_0 2 @@ -121,6 +121,16 @@ struct xgene_enet_desc_ring { struct xgene_enet_raw_desc16 *raw_desc16; }; __le64 *exp_bufs; + u64 tx_packets; + u64 tx_bytes; + u64 rx_packets; + u64 rx_bytes; + u64 rx_dropped; + u64 rx_errors; + u64 rx_length_errors; + u64 rx_crc_errors; + u64 rx_frame_errors; + u64 rx_fifo_errors; }; struct xgene_mac_ops { diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h index 29a71b4dcc44f361ef61c90bee1575984a5f47ea..002df5a6756e06e2e9f6100dba0a13d9584227d6 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h @@ -33,7 +33,7 @@ #define LINK_STATUS BIT(2) #define LINK_UP BIT(15) #define MPA_IDLE_WITH_QMI_EMPTY BIT(12) -#define SG_RX_DV_GATE_REG_0_ADDR 0x0dfc +#define SG_RX_DV_GATE_REG_0_ADDR 0x05fc extern const struct xgene_mac_ops xgene_sgmac_ops; extern const struct xgene_port_ops xgene_sgport_ops; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 9d4e8e113fe18f564060d48a6e686f5a4c79dadd..c39a7f5c6a0170abbddb0f962a5d7b3cb4cf4fb1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -813,6 +813,46 @@ static inline struct sk_buff *bnxt_copy_skb(struct bnxt_napi *bnapi, u8 *data, return skb; } +static int bnxt_discard_rx(struct bnxt *bp, struct bnxt_napi *bnapi, + u32 *raw_cons, void *cmp) +{ + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct rx_cmp *rxcmp = cmp; + u32 tmp_raw_cons = *raw_cons; + u8 cmp_type, agg_bufs = 0; + + cmp_type = RX_CMP_TYPE(rxcmp); + + if (cmp_type == CMP_TYPE_RX_L2_CMP) { + agg_bufs = (le32_to_cpu(rxcmp->rx_cmp_misc_v1) & + RX_CMP_AGG_BUFS) >> + RX_CMP_AGG_BUFS_SHIFT; + } else if (cmp_type == CMP_TYPE_RX_L2_TPA_END_CMP) { + struct rx_tpa_end_cmp *tpa_end = cmp; + + agg_bufs = (le32_to_cpu(tpa_end->rx_tpa_end_cmp_misc_v1) & + RX_TPA_END_CMP_AGG_BUFS) >> + RX_TPA_END_CMP_AGG_BUFS_SHIFT; + } + + if (agg_bufs) { + if (!bnxt_agg_bufs_valid(bp, cpr, agg_bufs, &tmp_raw_cons)) + return -EBUSY; + } + *raw_cons = tmp_raw_cons; + return 0; +} + +static void bnxt_sched_reset(struct bnxt *bp, struct bnxt_rx_ring_info *rxr) +{ + if (!rxr->bnapi->in_reset) { + rxr->bnapi->in_reset = true; + set_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event); + schedule_work(&bp->sp_task); + } + rxr->rx_next_cons = 0xffff; +} + static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, struct rx_tpa_start_cmp *tpa_start, struct rx_tpa_start_cmp_ext *tpa_start1) @@ -830,6 +870,11 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, prod_rx_buf = &rxr->rx_buf_ring[prod]; tpa_info = &rxr->rx_tpa[agg_id]; + if (unlikely(cons != rxr->rx_next_cons)) { + bnxt_sched_reset(bp, rxr); + return; + } + prod_rx_buf->data = tpa_info->data; mapping = tpa_info->mapping; @@ -867,6 +912,7 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, rxr->rx_prod = NEXT_RX(prod); cons = NEXT_RX(cons); + rxr->rx_next_cons = NEXT_RX(cons); cons_rx_buf = &rxr->rx_buf_ring[cons]; bnxt_reuse_rx_data(rxr, cons, cons_rx_buf->data); @@ -980,6 +1026,14 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp, dma_addr_t mapping; struct sk_buff *skb; + if (unlikely(bnapi->in_reset)) { + int rc = bnxt_discard_rx(bp, bnapi, raw_cons, tpa_end); + + if (rc < 0) + return ERR_PTR(-EBUSY); + return NULL; + } + tpa_info = &rxr->rx_tpa[agg_id]; data = tpa_info->data; prefetch(data); @@ -1146,6 +1200,12 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons, cons = rxcmp->rx_cmp_opaque; rx_buf = &rxr->rx_buf_ring[cons]; data = rx_buf->data; + if (unlikely(cons != rxr->rx_next_cons)) { + int rc1 = bnxt_discard_rx(bp, bnapi, raw_cons, rxcmp); + + bnxt_sched_reset(bp, rxr); + return rc1; + } prefetch(data); agg_bufs = (le32_to_cpu(rxcmp->rx_cmp_misc_v1) & RX_CMP_AGG_BUFS) >> @@ -1245,6 +1305,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons, next_rx: rxr->rx_prod = NEXT_RX(prod); + rxr->rx_next_cons = NEXT_RX(cons); next_rx_no_prod: *raw_cons = tmp_raw_cons; @@ -2486,6 +2547,7 @@ static void bnxt_clear_ring_indices(struct bnxt *bp) rxr->rx_prod = 0; rxr->rx_agg_prod = 0; rxr->rx_sw_agg_prod = 0; + rxr->rx_next_cons = 0; } } } @@ -4462,6 +4524,7 @@ static void bnxt_enable_napi(struct bnxt *bp) int i; for (i = 0; i < bp->cp_nr_rings; i++) { + bp->bnapi[i]->in_reset = false; bnxt_enable_poll(bp->bnapi[i]); napi_enable(&bp->bnapi[i]->napi); } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 8b823ff558ffe2ab81422c0ce6bdfa61d3bc2b20..de9d53eee3ddad5742898383c34fafab89348fb7 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -584,6 +584,7 @@ struct bnxt_rx_ring_info { u16 rx_prod; u16 rx_agg_prod; u16 rx_sw_agg_prod; + u16 rx_next_cons; void __iomem *rx_doorbell; void __iomem *rx_agg_doorbell; @@ -636,6 +637,7 @@ struct bnxt_napi { #ifdef CONFIG_NET_RX_BUSY_POLL atomic_t poll_state; #endif + bool in_reset; }; #ifdef CONFIG_NET_RX_BUSY_POLL diff --git a/drivers/net/ethernet/marvell/Kconfig b/drivers/net/ethernet/marvell/Kconfig index b5c6d42daa1208dab6771d0b7578f7cdb24bde02..2664827ddecd969b3e61886c16e7059c74dbf46f 100644 --- a/drivers/net/ethernet/marvell/Kconfig +++ b/drivers/net/ethernet/marvell/Kconfig @@ -68,7 +68,7 @@ config MVNETA config MVNETA_BM tristate - default y if MVNETA=y && MVNETA_BM_ENABLE + default y if MVNETA=y && MVNETA_BM_ENABLE!=n default MVNETA_BM_ENABLE select HWBM help diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c index cda9e604a95f68d61227808779a93abf1c400342..0844b7c7576709c8a271142fd85f41b8007fc108 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c @@ -1417,6 +1417,7 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter) struct qlcnic_fw_dump *fw_dump = &ahw->fw_dump; struct pci_dev *pdev = adapter->pdev; bool extended = false; + int ret; prev_version = adapter->fw_version; current_version = qlcnic_83xx_get_fw_version(adapter); @@ -1427,8 +1428,11 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter) if (qlcnic_83xx_md_check_extended_dump_capability(adapter)) extended = !qlcnic_83xx_extend_md_capab(adapter); - if (!qlcnic_fw_cmd_get_minidump_temp(adapter)) - dev_info(&pdev->dev, "Supports FW dump capability\n"); + ret = qlcnic_fw_cmd_get_minidump_temp(adapter); + if (ret) + return; + + dev_info(&pdev->dev, "Supports FW dump capability\n"); /* Once we have minidump template with extended iSCSI dump * capability, update the minidump capture mask to 0x1f as diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 72c9f1f352b4ec686a073b48f8df52d6245f771f..7c7830722ea2cca599485754f60470be17920820 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -635,10 +635,10 @@ static int receive(struct net_device *dev, int cnt) #ifdef __i386__ #include -#define GETTICK(x) \ -({ \ - if (cpu_has_tsc) \ - x = (unsigned int)rdtsc(); \ +#define GETTICK(x) \ +({ \ + if (boot_cpu_has(X86_FEATURE_TSC)) \ + x = (unsigned int)rdtsc(); \ }) #else /* __i386__ */ #define GETTICK(x) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index b42f26029225fd69d9c57fe3088b31fdb5ab39bb..4412a57ec862287b1acad6ed486b7d7b2bfe1c1c 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -711,6 +711,7 @@ static void xenvif_tx_err(struct xenvif_queue *queue, if (cons == end) break; RING_COPY_REQUEST(&queue->tx, cons++, txp); + extra_count = 0; /* only the first frag can have extras */ } while (1); queue->tx.req_cons = cons; } diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index f70090897fdf19c9777c332401dd01de7b1efc52..f2d01d4d93645a0b029561ba2febf7fed56ec276 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -847,6 +847,14 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu) if (!platform_get_irq(cpu_pmu->plat_device, 0)) cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + /* + * This is a CPU PMU potentially in a heterogeneous configuration (e.g. + * big.LITTLE). This is not an uncore PMU, and we have taken ctx + * sharing into account (e.g. with our pmu::filter_match callback and + * pmu::event_init group validation). + */ + cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS; + return 0; out_unregister: diff --git a/drivers/pinctrl/pinctrl-at91-pio4.c b/drivers/pinctrl/pinctrl-at91-pio4.c index 4429312e848dba2af05ddb3d9da6cb3b2b176248..2c447130b954fa15421858a03dfe7cdf887de80c 100644 --- a/drivers/pinctrl/pinctrl-at91-pio4.c +++ b/drivers/pinctrl/pinctrl-at91-pio4.c @@ -722,9 +722,11 @@ static int atmel_conf_pin_config_group_set(struct pinctrl_dev *pctldev, break; case PIN_CONFIG_BIAS_PULL_UP: conf |= ATMEL_PIO_PUEN_MASK; + conf &= (~ATMEL_PIO_PDEN_MASK); break; case PIN_CONFIG_BIAS_PULL_DOWN: conf |= ATMEL_PIO_PDEN_MASK; + conf &= (~ATMEL_PIO_PUEN_MASK); break; case PIN_CONFIG_DRIVE_OPEN_DRAIN: if (arg == 0) diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index facd43b8516cf7af1178772f9e62fc9054fcd82a..81603d99082be8251b3cb85dfa37d6d3209effc2 100644 --- a/drivers/pnp/pnpbios/core.c +++ b/drivers/pnp/pnpbios/core.c @@ -521,10 +521,11 @@ static int __init pnpbios_init(void) int ret; if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table) || - paravirt_enabled()) { + arch_pnpbios_disabled()) { printk(KERN_INFO "PnPBIOS: Disabled\n"); return -ENODEV; } + #ifdef CONFIG_PNPACPI if (!acpi_disabled && !pnpacpi_disabled) { pnpbios_disabled = 1; diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 8fad0a7044d3d332b8733eaff65d2fb33dd46e61..f2201d42a9cdd8a5acde7000127aa29e2cc83f59 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -34,6 +34,9 @@ #include #include +/* Local defines */ +#define MSR_PLATFORM_POWER_LIMIT 0x0000065C + /* bitmasks for RAPL MSRs, used by primitive access functions */ #define ENERGY_STATUS_MASK 0xffffffff @@ -86,6 +89,7 @@ enum rapl_domain_type { RAPL_DOMAIN_PP0, /* core power plane */ RAPL_DOMAIN_PP1, /* graphics uncore */ RAPL_DOMAIN_DRAM,/* DRAM control_type */ + RAPL_DOMAIN_PLATFORM, /* PSys control_type */ RAPL_DOMAIN_MAX, }; @@ -251,9 +255,11 @@ static const char * const rapl_domain_names[] = { "core", "uncore", "dram", + "psys", }; static struct powercap_control_type *control_type; /* PowerCap Controller */ +static struct rapl_domain *platform_rapl_domain; /* Platform (PSys) domain */ /* caller to ensure CPU hotplug lock is held */ static struct rapl_package *find_package_by_id(int id) @@ -409,6 +415,14 @@ static const struct powercap_zone_ops zone_ops[] = { .set_enable = set_domain_enable, .get_enable = get_domain_enable, }, + /* RAPL_DOMAIN_PLATFORM */ + { + .get_energy_uj = get_energy_counter, + .get_max_energy_range_uj = get_max_energy_counter, + .release = release_zone, + .set_enable = set_domain_enable, + .get_enable = get_domain_enable, + }, }; static int set_power_limit(struct powercap_zone *power_zone, int id, @@ -1160,6 +1174,13 @@ static int rapl_unregister_powercap(void) powercap_unregister_zone(control_type, &rd_package->power_zone); } + + if (platform_rapl_domain) { + powercap_unregister_zone(control_type, + &platform_rapl_domain->power_zone); + kfree(platform_rapl_domain); + } + powercap_unregister_control_type(control_type); return 0; @@ -1239,6 +1260,47 @@ static int rapl_package_register_powercap(struct rapl_package *rp) return ret; } +static int rapl_register_psys(void) +{ + struct rapl_domain *rd; + struct powercap_zone *power_zone; + u64 val; + + if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_ENERGY_STATUS, &val) || !val) + return -ENODEV; + + if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_POWER_LIMIT, &val) || !val) + return -ENODEV; + + rd = kzalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return -ENOMEM; + + rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM]; + rd->id = RAPL_DOMAIN_PLATFORM; + rd->msrs[0] = MSR_PLATFORM_POWER_LIMIT; + rd->msrs[1] = MSR_PLATFORM_ENERGY_STATUS; + rd->rpl[0].prim_id = PL1_ENABLE; + rd->rpl[0].name = pl1_name; + rd->rpl[1].prim_id = PL2_ENABLE; + rd->rpl[1].name = pl2_name; + rd->rp = find_package_by_id(0); + + power_zone = powercap_register_zone(&rd->power_zone, control_type, + "psys", NULL, + &zone_ops[RAPL_DOMAIN_PLATFORM], + 2, &constraint_ops); + + if (IS_ERR(power_zone)) { + kfree(rd); + return PTR_ERR(power_zone); + } + + platform_rapl_domain = rd; + + return 0; +} + static int rapl_register_powercap(void) { struct rapl_domain *rd; @@ -1255,6 +1317,10 @@ static int rapl_register_powercap(void) list_for_each_entry(rp, &rapl_packages, plist) if (rapl_package_register_powercap(rp)) goto err_cleanup_package; + + /* Don't bail out if PSys is not supported */ + rapl_register_psys(); + return ret; err_cleanup_package: @@ -1289,6 +1355,9 @@ static int rapl_check_domain(int cpu, int domain) case RAPL_DOMAIN_DRAM: msr = MSR_DRAM_ENERGY_STATUS; break; + case RAPL_DOMAIN_PLATFORM: + /* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */ + return -EINVAL; default: pr_err("invalid domain id %d\n", domain); return -EINVAL; diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index 40cd894e4df5e3d11f26442692dd9c0076d932af..514a5e8fdbab3e757b470daa6e579ebe489fa0f7 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -157,7 +157,9 @@ static struct regulator_ops axp20x_ops_sw = { static const struct regulator_linear_range axp20x_ldo4_ranges[] = { REGULATOR_LINEAR_RANGE(1250000, 0x0, 0x0, 0), REGULATOR_LINEAR_RANGE(1300000, 0x1, 0x8, 100000), - REGULATOR_LINEAR_RANGE(2500000, 0x9, 0xf, 100000), + REGULATOR_LINEAR_RANGE(2500000, 0x9, 0x9, 0), + REGULATOR_LINEAR_RANGE(2700000, 0xa, 0xb, 100000), + REGULATOR_LINEAR_RANGE(3000000, 0xc, 0xf, 100000), }; static const struct regulator_desc axp20x_regulators[] = { @@ -215,10 +217,14 @@ static const struct regulator_desc axp22x_regulators[] = { AXP22X_ELDO2_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(1)), AXP_DESC(AXP22X, ELDO3, "eldo3", "eldoin", 700, 3300, 100, AXP22X_ELDO3_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(2)), - AXP_DESC_IO(AXP22X, LDO_IO0, "ldo_io0", "ips", 1800, 3300, 100, + /* Note the datasheet only guarantees reliable operation up to + * 3.3V, this needs to be enforced via dts provided constraints */ + AXP_DESC_IO(AXP22X, LDO_IO0, "ldo_io0", "ips", 700, 3800, 100, AXP22X_LDO_IO0_V_OUT, 0x1f, AXP20X_GPIO0_CTRL, 0x07, AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), - AXP_DESC_IO(AXP22X, LDO_IO1, "ldo_io1", "ips", 1800, 3300, 100, + /* Note the datasheet only guarantees reliable operation up to + * 3.3V, this needs to be enforced via dts provided constraints */ + AXP_DESC_IO(AXP22X, LDO_IO1, "ldo_io1", "ips", 700, 3800, 100, AXP22X_LDO_IO1_V_OUT, 0x1f, AXP20X_GPIO1_CTRL, 0x07, AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), AXP_DESC_FIXED(AXP22X, RTC_LDO, "rtc_ldo", "ips", 3000), diff --git a/drivers/regulator/da9063-regulator.c b/drivers/regulator/da9063-regulator.c index ed9e7e96f8777a291341e1dd45fccc413bee0f43..c6af343f54eac5c59b3e06923e3b4141413b833e 100644 --- a/drivers/regulator/da9063-regulator.c +++ b/drivers/regulator/da9063-regulator.c @@ -900,4 +900,4 @@ module_exit(da9063_regulator_cleanup); MODULE_AUTHOR("Krystian Garbaciak "); MODULE_DESCRIPTION("DA9063 regulators driver"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("paltform:" DA9063_DRVNAME_REGULATORS); +MODULE_ALIAS("platform:" DA9063_DRVNAME_REGULATORS); diff --git a/drivers/regulator/gpio-regulator.c b/drivers/regulator/gpio-regulator.c index a8718e98674a273939f20d06627be943aab6aee7..83e89e5d47526c17be6d728fcc7a651f45aec4b0 100644 --- a/drivers/regulator/gpio-regulator.c +++ b/drivers/regulator/gpio-regulator.c @@ -162,6 +162,8 @@ of_get_gpio_regulator_config(struct device *dev, struct device_node *np, of_property_read_u32(np, "startup-delay-us", &config->startup_delay); config->enable_gpio = of_get_named_gpio(np, "enable-gpio", 0); + if (config->enable_gpio == -EPROBE_DEFER) + return ERR_PTR(-EPROBE_DEFER); /* Fetch GPIOs. - optional property*/ ret = of_gpio_count(np); diff --git a/drivers/regulator/s2mps11.c b/drivers/regulator/s2mps11.c index d24e2c783dc5c9a093220c4e98980813ad9e309f..6dfa3502e1f1a5ed7532d15157856c30e8d3bb7c 100644 --- a/drivers/regulator/s2mps11.c +++ b/drivers/regulator/s2mps11.c @@ -308,7 +308,7 @@ static struct regulator_ops s2mps11_buck_ops = { .enable_mask = S2MPS11_ENABLE_MASK \ } -#define regulator_desc_s2mps11_buck6_10(num, min, step) { \ +#define regulator_desc_s2mps11_buck67810(num, min, step) { \ .name = "BUCK"#num, \ .id = S2MPS11_BUCK##num, \ .ops = &s2mps11_buck_ops, \ @@ -324,6 +324,22 @@ static struct regulator_ops s2mps11_buck_ops = { .enable_mask = S2MPS11_ENABLE_MASK \ } +#define regulator_desc_s2mps11_buck9 { \ + .name = "BUCK9", \ + .id = S2MPS11_BUCK9, \ + .ops = &s2mps11_buck_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = MIN_3000_MV, \ + .uV_step = STEP_25_MV, \ + .n_voltages = S2MPS11_BUCK9_N_VOLTAGES, \ + .ramp_delay = S2MPS11_RAMP_DELAY, \ + .vsel_reg = S2MPS11_REG_B9CTRL2, \ + .vsel_mask = S2MPS11_BUCK9_VSEL_MASK, \ + .enable_reg = S2MPS11_REG_B9CTRL1, \ + .enable_mask = S2MPS11_ENABLE_MASK \ +} + static const struct regulator_desc s2mps11_regulators[] = { regulator_desc_s2mps11_ldo(1, STEP_25_MV), regulator_desc_s2mps11_ldo(2, STEP_50_MV), @@ -368,11 +384,11 @@ static const struct regulator_desc s2mps11_regulators[] = { regulator_desc_s2mps11_buck1_4(3), regulator_desc_s2mps11_buck1_4(4), regulator_desc_s2mps11_buck5, - regulator_desc_s2mps11_buck6_10(6, MIN_600_MV, STEP_6_25_MV), - regulator_desc_s2mps11_buck6_10(7, MIN_600_MV, STEP_6_25_MV), - regulator_desc_s2mps11_buck6_10(8, MIN_600_MV, STEP_6_25_MV), - regulator_desc_s2mps11_buck6_10(9, MIN_3000_MV, STEP_25_MV), - regulator_desc_s2mps11_buck6_10(10, MIN_750_MV, STEP_12_5_MV), + regulator_desc_s2mps11_buck67810(6, MIN_600_MV, STEP_6_25_MV), + regulator_desc_s2mps11_buck67810(7, MIN_600_MV, STEP_6_25_MV), + regulator_desc_s2mps11_buck67810(8, MIN_600_MV, STEP_6_25_MV), + regulator_desc_s2mps11_buck9, + regulator_desc_s2mps11_buck67810(10, MIN_750_MV, STEP_12_5_MV), }; static struct regulator_ops s2mps14_reg_ops; diff --git a/drivers/staging/unisys/visorbus/visorchipset.c b/drivers/staging/unisys/visorbus/visorchipset.c index 5fbda7b218c7a9ed06b2050ca276443a17b319d9..9cf4f8463c4e6b141e69632cd67f2b24965c9757 100644 --- a/drivers/staging/unisys/visorbus/visorchipset.c +++ b/drivers/staging/unisys/visorbus/visorchipset.c @@ -2425,7 +2425,7 @@ static __init uint32_t visorutil_spar_detect(void) { unsigned int eax, ebx, ecx, edx; - if (cpu_has_hypervisor) { + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { /* check the ID */ cpuid(UNISYS_SPAR_LEAF_ID, &eax, &ebx, &ecx, &edx); return (ebx == UNISYS_SPAR_ID_EBX) && diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 983280e8d93f3df4030c737978402ada4deaed10..e5a391aecde1b4c94a71181dacf665f99579bab1 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -761,7 +761,7 @@ config FB_VESA config FB_EFI bool "EFI-based Framebuffer Support" - depends on (FB = y) && X86 && EFI + depends on (FB = y) && !IA64 && EFI select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c index 95d293b7445a83473bc2131cf657f5de14ef52b3..f4c045c0051cc65fbcdb1df6207f4e3292b8c327 100644 --- a/drivers/video/fbdev/efifb.c +++ b/drivers/video/fbdev/efifb.c @@ -6,16 +6,14 @@ * */ -#include #include +#include #include #include #include #include -#include -#include #include