Commit c8c655c3 authored by Linus Torvalds
Browse files
Pull kvm updates from Paolo Bonzini:
 "s390:

   - More phys_to_virt conversions

   - Improvement of AP management for VSIE (nested virtualization)

  ARM64:

   - Numerous fixes for the pathological lock inversion issue that
     plagued KVM/arm64 since... forever.

   - New framework allowing SMCCC-compliant hypercalls to be forwarded
     to userspace, hopefully paving the way for some more features being
     moved to VMMs rather than be implemented in the kernel.

   - Large rework of the timer code to allow a VM-wide offset to be
     applied to both virtual and physical counters as well as a
     per-timer, per-vcpu offset that complements the global one. This
     last part allows the NV timer code to be implemented on top.

   - A small set of fixes to make sure that we don't change anything
     affecting the EL1&0 translation regime just after having
     taken an exception to EL2 until we have executed a DSB. This
     ensures that speculative walks started in EL1&0 have completed.

   - The usual selftest fixes and improvements.

  x86:

   - Optimize CR0.WP toggling by avoiding an MMU reload when TDP is
     enabled, and by giving the guest control of CR0.WP when EPT is
     enabled on VMX (VMX-only because SVM doesn't support per-bit
     controls)

   - Add CR0/CR4 helpers to query single bits, and clean up related code
     where KVM was interpreting kvm_read_cr4_bits()'s "unsigned long"
     return as a bool

   - Move AMD_PSFD to cpufeatures.h and purge KVM's definition

   - Avoid unnecessary writes+flushes when the guest is only adding new
     PTEs

   - Overhaul .sync_page() and .invlpg() to utilize .sync_page()'s
     optimizations when emulating invalidations

   - Clean up the range-based flushing APIs

   - Revamp the TDP MMU's reaping of Accessed/Dirty bits to clear a
     single A/D bit using a LOCK AND instead of XCHG, and skip all of
     the "handle changed SPTE" overhead associated with writing the
     entire entry

   - Track the number of "tail" entries in a pte_list_desc to avoid
     having to walk (potentially) all descriptors during insertion and
     deletion, which gets quite expensive if the guest is spamming
     fork()

   - Disallow virtualizing legacy LBRs if architectural LBRs are
     available, as the two are mutually exclusive in hardware

   - Disallow writes to immutable feature MSRs (notably
     PERF_CAPABILITIES) after KVM_RUN, similar to CPUID features

   - Overhaul the vmx_pmu_caps selftest to better validate
     PERF_CAPABILITIES

   - Apply PMU filters to emulated events and add test coverage to the
     pmu_event_filter selftest

   - AMD SVM:
       - Add support for virtual NMIs
       - Fixes for edge cases related to virtual interrupts

   - Intel AMX:
       - Don't advertise XTILE_CFG in KVM_GET_SUPPORTED_CPUID if
         XTILE_DATA is not being reported due to userspace not opting in
         via prctl()
       - Fix a bug in emulation of ENCLS in compatibility mode
       - Allow emulation of NOP and PAUSE for L2
       - AMX selftests improvements
       - Misc cleanups

  MIPS:

   - Constify MIPS's internal callbacks (a leftover from the hardware
     enabling rework that landed in 6.3)

  Generic:

   - Drop unnecessary casts from "void *" throughout kvm_main.c

   - Tweak the layout of "struct kvm_mmu_memory_cache" to shrink the
     struct size by 8 bytes on 64-bit kernels by utilizing a padding
     hole

  Documentation:

   - Fix goof introduced by the conversion to rST"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (211 commits)
  KVM: s390: pci: fix virtual-physical confusion on module unload/load
  KVM: s390: vsie: clarifications on setting the APCB
  KVM: s390: interrupt: fix virtual-physical confusion for next alert GISA
  KVM: arm64: Have kvm_psci_vcpu_on() use WRITE_ONCE() to update mp_state
  KVM: arm64: Acquire mp_state_lock in kvm_arch_vcpu_ioctl_vcpu_init()
  KVM: selftests: Test the PMU event "Instructions retired"
  KVM: selftests: Copy full counter values from guest in PMU event filter test
  KVM: selftests: Use error codes to signal errors in PMU event filter test
  KVM: selftests: Print detailed info in PMU event filter asserts
  KVM: selftests: Add helpers for PMC asserts in PMU event filter test
  KVM: selftests: Add a common helper for the PMU event filter guest code
  KVM: selftests: Fix spelling mistake "perrmited" -> "permitted"
  KVM: arm64: vhe: Drop extra isb() on guest exit
  KVM: arm64: vhe: Synchronise with page table walker on MMU update
  KVM: arm64: pkvm: Document the side effects of kvm_flush_dcache_to_poc()
  KVM: arm64: nvhe: Synchronise with page table walker on TLBI
  KVM: arm64: Handle 32bit CNTPCTSS traps
  KVM: arm64: nvhe: Synchronise with page table walker on vcpu run
  KVM: arm64: vgic: Don't acquire its_lock before config_lock
  KVM: selftests: Add test to verify KVM's supported XCR0
  ...
parents d75439d6 b3c98052
Loading
Loading
Loading
Loading
+70 −5
Original line number Original line Diff line number Diff line
@@ -5645,7 +5645,8 @@ with the KVM_XEN_VCPU_GET_ATTR ioctl.
  };
  };


Copies Memory Tagging Extension (MTE) tags to/from guest tag memory. The
Copies Memory Tagging Extension (MTE) tags to/from guest tag memory. The
``guest_ipa`` and ``length`` fields must be ``PAGE_SIZE`` aligned. The ``addr``
``guest_ipa`` and ``length`` fields must be ``PAGE_SIZE`` aligned.
``length`` must not be bigger than 2^31 - PAGE_SIZE bytes. The ``addr``
field must point to a buffer which the tags will be copied to or from.
field must point to a buffer which the tags will be copied to or from.


``flags`` specifies the direction of copy, either ``KVM_ARM_TAGS_TO_GUEST`` or
``flags`` specifies the direction of copy, either ``KVM_ARM_TAGS_TO_GUEST`` or
@@ -6029,6 +6030,44 @@ delivery must be provided via the "reg_aen" struct.
The "pad" and "reserved" fields may be used for future extensions and should be
The "pad" and "reserved" fields may be used for future extensions and should be
set to 0s by userspace.
set to 0s by userspace.


4.138 KVM_ARM_SET_COUNTER_OFFSET
--------------------------------

:Capability: KVM_CAP_COUNTER_OFFSET
:Architectures: arm64
:Type: vm ioctl
:Parameters: struct kvm_arm_counter_offset (in)
:Returns: 0 on success, < 0 on error

This capability indicates that userspace is able to apply a single VM-wide
offset to both the virtual and physical counters as viewed by the guest
using the KVM_ARM_SET_COUNTER_OFFSET ioctl and the following data structure:

::

	struct kvm_arm_counter_offset {
		__u64 counter_offset;
		__u64 reserved;
	};

The offset describes a number of counter cycles that are subtracted from
both virtual and physical counter views (similar to the effects of the
CNTVOFF_EL2 and CNTPOFF_EL2 system registers, but only global). The offset
always applies to all vcpus (already created or created after this ioctl)
for this VM.

It is userspace's responsibility to compute the offset based, for example,
on previous values of the guest counters.

Any value other than 0 for the "reserved" field may result in an error
(-EINVAL) being returned. This ioctl can also return -EBUSY if any vcpu
ioctl is issued concurrently.

Note that using this ioctl results in KVM ignoring subsequent userspace
writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using the SET_ONE_REG
interface. No error will be returned, but the resulting offset will not be
applied.

5. The kvm_run structure
5. The kvm_run structure
========================
========================


@@ -6218,15 +6257,40 @@ to the byte array.
			__u64 nr;
			__u64 nr;
			__u64 args[6];
			__u64 args[6];
			__u64 ret;
			__u64 ret;
			__u32 longmode;
			__u64 flags;
			__u32 pad;
		} hypercall;
		} hypercall;


Unused.  This was once used for 'hypercall to userspace'.  To implement

such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390).
It is strongly recommended that userspace use ``KVM_EXIT_IO`` (x86) or
``KVM_EXIT_MMIO`` (all except s390) to implement functionality that
requires a guest to interact with host userspace.


.. note:: KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO.
.. note:: KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO.


For arm64:
----------

SMCCC exits can be enabled depending on the configuration of the SMCCC
filter. See the ``KVM_ARM_VM_SMCCC_FILTER`` attribute in
Documentation/virt/kvm/devices/vm.rst for more details.

``nr`` contains the function ID of the guest's SMCCC call. Userspace is
expected to use the ``KVM_GET_ONE_REG`` ioctl to retrieve the call
parameters from the vCPU's GPRs.

Definition of ``flags``:
 - ``KVM_HYPERCALL_EXIT_SMC``: Indicates that the guest used the SMC
   conduit to initiate the SMCCC call. If this bit is 0 then the guest
   used the HVC conduit for the SMCCC call.

 - ``KVM_HYPERCALL_EXIT_16BIT``: Indicates that the guest used a 16bit
   instruction to initiate the SMCCC call. If this bit is 0 then the
   guest used a 32bit instruction. An AArch64 guest always has this
   bit set to 0.

At the point of exit, PC points to the instruction immediately following
the trapping instruction.

::
::


		/* KVM_EXIT_TPR_ACCESS */
		/* KVM_EXIT_TPR_ACCESS */
@@ -7266,6 +7330,7 @@ and injected exceptions.
       will clear DR6.RTM.
       will clear DR6.RTM.


7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2
7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2
--------------------------------------


:Architectures: x86, arm64, mips
:Architectures: x86, arm64, mips
:Parameters: args[0] whether feature should be enabled or not
:Parameters: args[0] whether feature should be enabled or not
+79 −0
Original line number Original line Diff line number Diff line
@@ -321,3 +321,82 @@ Allows userspace to query the status of migration mode.
	     if it is enabled
	     if it is enabled
:Returns:   -EFAULT if the given address is not accessible from kernel space;
:Returns:   -EFAULT if the given address is not accessible from kernel space;
	    0 in case of success.
	    0 in case of success.

6. GROUP: KVM_ARM_VM_SMCCC_CTRL
===============================

:Architectures: arm64

6.1. ATTRIBUTE: KVM_ARM_VM_SMCCC_FILTER (w/o)
---------------------------------------------

:Parameters: Pointer to a ``struct kvm_smccc_filter``

:Returns:

        ======  ===========================================
        EEXIST  Range intersects with a previously inserted
                or reserved range
        EBUSY   A vCPU in the VM has already run
        EINVAL  Invalid filter configuration
        ENOMEM  Failed to allocate memory for the in-kernel
                representation of the SMCCC filter
        ======  ===========================================

Requests the installation of an SMCCC call filter described as follows::

    enum kvm_smccc_filter_action {
            KVM_SMCCC_FILTER_HANDLE = 0,
            KVM_SMCCC_FILTER_DENY,
            KVM_SMCCC_FILTER_FWD_TO_USER,
    };

    struct kvm_smccc_filter {
            __u32 base;
            __u32 nr_functions;
            __u8 action;
            __u8 pad[15];
    };

The filter is defined as a set of non-overlapping ranges. Each
range defines an action to be applied to SMCCC calls within the range.
Userspace can insert multiple ranges into the filter by using
successive calls to this attribute.

The default configuration of KVM is such that all implemented SMCCC
calls are allowed. Thus, the SMCCC filter can be defined sparsely
by userspace, only describing ranges that modify the default behavior.

The range expressed by ``struct kvm_smccc_filter`` is
[``base``, ``base + nr_functions``). The range is not allowed to wrap,
i.e. userspace cannot rely on ``base + nr_functions`` overflowing.

The SMCCC filter applies to both SMC and HVC calls initiated by the
guest. The SMCCC filter gates the in-kernel emulation of SMCCC calls
and as such takes effect before other interfaces that interact with
SMCCC calls (e.g. hypercall bitmap registers).

Actions:

 - ``KVM_SMCCC_FILTER_HANDLE``: Allows the guest SMCCC call to be
   handled in-kernel. It is strongly recommended that userspace *not*
   explicitly describe the allowed SMCCC call ranges.

 - ``KVM_SMCCC_FILTER_DENY``: Rejects the guest SMCCC call in-kernel
   and returns to the guest.

 - ``KVM_SMCCC_FILTER_FWD_TO_USER``: The guest SMCCC call is forwarded
   to userspace with an exit reason of ``KVM_EXIT_HYPERCALL``.

The ``pad`` field is reserved for future use and must be zero. KVM may
return ``-EINVAL`` if the field is nonzero.

KVM reserves the 'Arm Architecture Calls' range of function IDs and
will reject attempts to define a filter for any portion of these ranges:

        =========== ===============
        Start       End (inclusive)
        =========== ===============
        0x8000_0000 0x8000_FFFF
        0xC000_0000 0xC000_FFFF
        =========== ===============
+1 −1
Original line number Original line Diff line number Diff line
@@ -21,7 +21,7 @@ The acquisition orders for mutexes are as follows:
- kvm->mn_active_invalidate_count ensures that pairs of
- kvm->mn_active_invalidate_count ensures that pairs of
  invalidate_range_start() and invalidate_range_end() callbacks
  invalidate_range_start() and invalidate_range_end() callbacks
  use the same memslots array.  kvm->slots_lock and kvm->slots_arch_lock
  use the same memslots array.  kvm->slots_lock and kvm->slots_arch_lock
  are taken on the waiting side in install_new_memslots, so MMU notifiers
  are taken on the waiting side when modifying memslots, so MMU notifiers
  must not take either kvm->slots_lock or kvm->slots_arch_lock.
  must not take either kvm->slots_lock or kvm->slots_arch_lock.


For SRCU:
For SRCU:
+26 −3
Original line number Original line Diff line number Diff line
@@ -16,6 +16,7 @@
#include <linux/types.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/jump_label.h>
#include <linux/kvm_types.h>
#include <linux/kvm_types.h>
#include <linux/maple_tree.h>
#include <linux/percpu.h>
#include <linux/percpu.h>
#include <linux/psci.h>
#include <linux/psci.h>
#include <asm/arch_gicv3.h>
#include <asm/arch_gicv3.h>
@@ -199,6 +200,9 @@ struct kvm_arch {
	/* Mandated version of PSCI */
	/* Mandated version of PSCI */
	u32 psci_version;
	u32 psci_version;


	/* Protects VM-scoped configuration data */
	struct mutex config_lock;

	/*
	/*
	 * If we encounter a data abort without valid instruction syndrome
	 * If we encounter a data abort without valid instruction syndrome
	 * information, report this to user space.  User space can (and
	 * information, report this to user space.  User space can (and
@@ -221,7 +225,12 @@ struct kvm_arch {
#define KVM_ARCH_FLAG_EL1_32BIT				4
#define KVM_ARCH_FLAG_EL1_32BIT				4
	/* PSCI SYSTEM_SUSPEND enabled for the guest */
	/* PSCI SYSTEM_SUSPEND enabled for the guest */
#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED		5
#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED		5

	/* VM counter offset */
#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET			6
	/* Timer PPIs made immutable */
#define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE		7
	/* SMCCC filter initialized for the VM */
#define KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED		8
	unsigned long flags;
	unsigned long flags;


	/*
	/*
@@ -242,6 +251,7 @@ struct kvm_arch {


	/* Hypercall features firmware registers' descriptor */
	/* Hypercall features firmware registers' descriptor */
	struct kvm_smccc_features smccc_feat;
	struct kvm_smccc_features smccc_feat;
	struct maple_tree smccc_filter;


	/*
	/*
	 * For an untrusted host VM, 'pkvm.handle' is used to lookup
	 * For an untrusted host VM, 'pkvm.handle' is used to lookup
@@ -365,6 +375,10 @@ enum vcpu_sysreg {
	TPIDR_EL2,	/* EL2 Software Thread ID Register */
	TPIDR_EL2,	/* EL2 Software Thread ID Register */
	CNTHCTL_EL2,	/* Counter-timer Hypervisor Control register */
	CNTHCTL_EL2,	/* Counter-timer Hypervisor Control register */
	SP_EL2,		/* EL2 Stack Pointer */
	SP_EL2,		/* EL2 Stack Pointer */
	CNTHP_CTL_EL2,
	CNTHP_CVAL_EL2,
	CNTHV_CTL_EL2,
	CNTHV_CVAL_EL2,


	NR_SYS_REGS	/* Nothing after this line! */
	NR_SYS_REGS	/* Nothing after this line! */
};
};
@@ -522,6 +536,7 @@ struct kvm_vcpu_arch {


	/* vcpu power state */
	/* vcpu power state */
	struct kvm_mp_state mp_state;
	struct kvm_mp_state mp_state;
	spinlock_t mp_state_lock;


	/* Cache some mmu pages needed inside spinlock regions */
	/* Cache some mmu pages needed inside spinlock regions */
	struct kvm_mmu_memory_cache mmu_page_cache;
	struct kvm_mmu_memory_cache mmu_page_cache;
@@ -939,6 +954,9 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu);


int __init kvm_sys_reg_table_init(void);
int __init kvm_sys_reg_table_init(void);


bool lock_all_vcpus(struct kvm *kvm);
void unlock_all_vcpus(struct kvm *kvm);

/* MMIO helpers */
/* MMIO helpers */
void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
@@ -1022,8 +1040,10 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
			       struct kvm_device_attr *attr);
			       struct kvm_device_attr *attr);


long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
			       struct kvm_arm_copy_mte_tags *copy_tags);
			       struct kvm_arm_copy_mte_tags *copy_tags);
int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm,
				    struct kvm_arm_counter_offset *offset);


/* Guest/host FPSIMD coordination helpers */
/* Guest/host FPSIMD coordination helpers */
int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
@@ -1078,6 +1098,9 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
	(system_supports_32bit_el0() &&				\
	(system_supports_32bit_el0() &&				\
	 !static_branch_unlikely(&arm64_mismatched_32bit_el0))
	 !static_branch_unlikely(&arm64_mismatched_32bit_el0))


#define kvm_vm_has_ran_once(kvm)					\
	(test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &(kvm)->arch.flags))

int kvm_trng_call(struct kvm_vcpu *vcpu);
int kvm_trng_call(struct kvm_vcpu *vcpu);
#ifdef CONFIG_KVM
#ifdef CONFIG_KVM
extern phys_addr_t hyp_mem_base;
extern phys_addr_t hyp_mem_base;
+4 −0
Original line number Original line Diff line number Diff line
@@ -63,6 +63,7 @@
 * specific registers encoded in the instructions).
 * specific registers encoded in the instructions).
 */
 */
.macro kern_hyp_va	reg
.macro kern_hyp_va	reg
#ifndef __KVM_VHE_HYPERVISOR__
alternative_cb ARM64_ALWAYS_SYSTEM, kvm_update_va_mask
alternative_cb ARM64_ALWAYS_SYSTEM, kvm_update_va_mask
	and     \reg, \reg, #1		/* mask with va_mask */
	and     \reg, \reg, #1		/* mask with va_mask */
	ror	\reg, \reg, #1		/* rotate to the first tag bit */
	ror	\reg, \reg, #1		/* rotate to the first tag bit */
@@ -70,6 +71,7 @@ alternative_cb ARM64_ALWAYS_SYSTEM, kvm_update_va_mask
	add	\reg, \reg, #0, lsl 12	/* insert the top 12 bits of the tag */
	add	\reg, \reg, #0, lsl 12	/* insert the top 12 bits of the tag */
	ror	\reg, \reg, #63		/* rotate back */
	ror	\reg, \reg, #63		/* rotate back */
alternative_cb_end
alternative_cb_end
#endif
.endm
.endm


/*
/*
@@ -127,6 +129,7 @@ void kvm_apply_hyp_relocations(void);


static __always_inline unsigned long __kern_hyp_va(unsigned long v)
static __always_inline unsigned long __kern_hyp_va(unsigned long v)
{
{
#ifndef __KVM_VHE_HYPERVISOR__
	asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"
	asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"
				    "ror %0, %0, #1\n"
				    "ror %0, %0, #1\n"
				    "add %0, %0, #0\n"
				    "add %0, %0, #0\n"
@@ -135,6 +138,7 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
				    ARM64_ALWAYS_SYSTEM,
				    ARM64_ALWAYS_SYSTEM,
				    kvm_update_va_mask)
				    kvm_update_va_mask)
		     : "+r" (v));
		     : "+r" (v));
#endif
	return v;
	return v;
}
}


Loading