Commit 192ad3c2 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull KVM updates from Paolo Bonzini:
 "ARM:
   - Page ownership tracking between host EL1 and EL2
   - Rely on userspace page tables to create large stage-2 mappings
   - Fix incompatibility between pKVM and kmemleak
   - Fix the PMU reset state, and improve the performance of the virtual
     PMU
   - Move over to the generic KVM entry code
   - Address PSCI reset issues w.r.t. save/restore
   - Preliminary rework for the upcoming pKVM fixed feature
   - A bunch of MM cleanups
   - a vGIC fix for timer spurious interrupts
   - Various cleanups

  s390:
   - enable interpretation of specification exceptions
   - fix a vcpu_idx vs vcpu_id mixup

  x86:
   - fast (lockless) page fault support for the new MMU
   - new MMU now the default
   - increased maximum allowed VCPU count
   - allow inhibit IRQs on KVM_RUN while debugging guests
   - let Hyper-V-enabled guests run with virtualized LAPIC as long as
     they do not enable the Hyper-V "AutoEOI" feature
   - fixes and optimizations for the toggling of AMD AVIC (virtualized
     LAPIC)
   - tuning for the case when two-dimensional paging (EPT/NPT) is
     disabled
   - bugfixes and cleanups, especially with respect to vCPU reset and
     choosing a paging mode based on CR0/CR4/EFER
   - support for 5-level page table on AMD processors

  Generic:
   - MMU notifier invalidation callbacks do not take mmu_lock unless
     necessary
   - improved caching of LRU kvm_memory_slot
   - support for histogram statistics
   - add statistics for halt polling and remote TLB flush requests"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (210 commits)
  KVM: Drop unused kvm_dirty_gfn_invalid()
  KVM: x86: Update vCPU's hv_clock before back to guest when tsc_offset is adjusted
  KVM: MMU: mark role_regs and role accessors as maybe unused
  KVM: MIPS: Remove a "set but not used" variable
  x86/kvm: Don't enable IRQ when IRQ enabled in kvm_wait
  KVM: stats: Add VM stat for remote tlb flush requests
  KVM: Remove unnecessary export of kvm_{inc,dec}_notifier_count()
  KVM: x86/mmu: Move lpage_disallowed_link further "down" in kvm_mmu_page
  KVM: x86/mmu: Relocate kvm_mmu_page.tdp_mmu_page for better cache locality
  Revert "KVM: x86: mmu: Add guest physical address check in translate_gpa()"
  KVM: x86/mmu: Remove unused field mmio_cached in struct kvm_mmu_page
  kvm: x86: Increase KVM_SOFT_MAX_VCPUS to 710
  kvm: x86: Increase MAX_VCPUS to 1024
  kvm: x86: Set KVM_MAX_VCPU_ID to 4*KVM_MAX_VCPUS
  KVM: VMX: avoid running vmx_handle_exit_irqoff in case of emulation
  KVM: x86/mmu: Don't freak out if pml5_root is NULL on 4-level host
  KVM: s390: index kvm->arch.idle_mask by vcpu_idx
  KVM: s390: Enable specification exception interpretation
  KVM: arm64: Trim guest debug exception handling
  KVM: SVM: Add 5-level page table support for SVM
  ...
parents a2b28235 109bbba5
Loading
Loading
Loading
Loading
+28 −8
Original line number Diff line number Diff line
@@ -3357,6 +3357,7 @@ flags which can include the following:
  - KVM_GUESTDBG_INJECT_DB:     inject DB type exception [x86]
  - KVM_GUESTDBG_INJECT_BP:     inject BP type exception [x86]
  - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
  - KVM_GUESTDBG_BLOCKIRQ:      avoid injecting interrupts/NMI/SMI [x86]

For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints
are enabled in memory so we need to ensure breakpoint exceptions are
@@ -5208,6 +5209,9 @@ by a string of size ``name_size``.
	#define KVM_STATS_TYPE_CUMULATIVE	(0x0 << KVM_STATS_TYPE_SHIFT)
	#define KVM_STATS_TYPE_INSTANT		(0x1 << KVM_STATS_TYPE_SHIFT)
	#define KVM_STATS_TYPE_PEAK		(0x2 << KVM_STATS_TYPE_SHIFT)
	#define KVM_STATS_TYPE_LINEAR_HIST	(0x3 << KVM_STATS_TYPE_SHIFT)
	#define KVM_STATS_TYPE_LOG_HIST		(0x4 << KVM_STATS_TYPE_SHIFT)
	#define KVM_STATS_TYPE_MAX		KVM_STATS_TYPE_LOG_HIST

	#define KVM_STATS_UNIT_SHIFT		4
	#define KVM_STATS_UNIT_MASK		(0xF << KVM_STATS_UNIT_SHIFT)
@@ -5215,18 +5219,20 @@ by a string of size ``name_size``.
	#define KVM_STATS_UNIT_BYTES		(0x1 << KVM_STATS_UNIT_SHIFT)
	#define KVM_STATS_UNIT_SECONDS		(0x2 << KVM_STATS_UNIT_SHIFT)
	#define KVM_STATS_UNIT_CYCLES		(0x3 << KVM_STATS_UNIT_SHIFT)
	#define KVM_STATS_UNIT_MAX		KVM_STATS_UNIT_CYCLES

	#define KVM_STATS_BASE_SHIFT		8
	#define KVM_STATS_BASE_MASK		(0xF << KVM_STATS_BASE_SHIFT)
	#define KVM_STATS_BASE_POW10		(0x0 << KVM_STATS_BASE_SHIFT)
	#define KVM_STATS_BASE_POW2		(0x1 << KVM_STATS_BASE_SHIFT)
	#define KVM_STATS_BASE_MAX		KVM_STATS_BASE_POW2

	struct kvm_stats_desc {
		__u32 flags;
		__s16 exponent;
		__u16 size;
		__u32 offset;
		__u32 unused;
		__u32 bucket_size;
		char name[];
	};

@@ -5237,21 +5243,35 @@ The following flags are supported:
Bits 0-3 of ``flags`` encode the type:

  * ``KVM_STATS_TYPE_CUMULATIVE``
    The statistics data is cumulative. The value of data can only be increased.
    The statistics reports a cumulative count. The value of data can only be increased.
    Most of the counters used in KVM are of this type.
    The corresponding ``size`` field for this type is always 1.
    All cumulative statistics data are read/write.
  * ``KVM_STATS_TYPE_INSTANT``
    The statistics data is instantaneous. Its value can be increased or
    The statistics reports an instantaneous value. Its value can be increased or
    decreased. This type is usually used as a measurement of some resources,
    like the number of dirty pages, the number of large pages, etc.
    All instant statistics are read only.
    The corresponding ``size`` field for this type is always 1.
  * ``KVM_STATS_TYPE_PEAK``
    The statistics data is peak. The value of data can only be increased, and
    represents a peak value for a measurement, for example the maximum number
    The statistics data reports a peak value, for example the maximum number
    of items in a hash table bucket, the longest time waited and so on.
    The value of data can only be increased.
    The corresponding ``size`` field for this type is always 1.
  * ``KVM_STATS_TYPE_LINEAR_HIST``
    The statistic is reported as a linear histogram. The number of
    buckets is specified by the ``size`` field. The size of buckets is specified
    by the ``hist_param`` field. The range of the Nth bucket (1 <= N < ``size``)
    is [``hist_param``*(N-1), ``hist_param``*N), while the range of the last
    bucket is [``hist_param``*(``size``-1), +INF). (+INF means positive infinity
    value.) The bucket value indicates how many samples fell in the bucket's range.
  * ``KVM_STATS_TYPE_LOG_HIST``
    The statistic is reported as a logarithmic histogram. The number of
    buckets is specified by the ``size`` field. The range of the first bucket is
    [0, 1), while the range of the last bucket is [pow(2, ``size``-2), +INF).
    Otherwise, The Nth bucket (1 < N < ``size``) covers
    [pow(2, N-2), pow(2, N-1)). The bucket value indicates how many samples fell
    in the bucket's range.

Bits 4-7 of ``flags`` encode the unit:

@@ -5286,9 +5306,9 @@ unsigned 64bit data.
The ``offset`` field is the offset from the start of Data Block to the start of
the corresponding statistics data.

The ``unused`` field is reserved for future support for other types of
statistics data, like log/linear histogram. Its value is always 0 for the types
defined above.
The ``bucket_size`` field is used as a parameter for histogram statistics data.
It is only used by linear histogram statistics data, specifying the size of a
bucket.

The ``name`` field is the name string of the statistics data. The name string
starts at the end of ``struct kvm_stats_desc``.  The maximum length including
+6 −0
Original line number Diff line number Diff line
@@ -21,6 +21,12 @@ The acquisition orders for mutexes are as follows:
  can be taken inside a kvm->srcu read-side critical section,
  while kvm->slots_lock cannot.

- kvm->mn_active_invalidate_count ensures that pairs of
  invalidate_range_start() and invalidate_range_end() callbacks
  use the same memslots array.  kvm->slots_lock and kvm->slots_arch_lock
  are taken on the waiting side in install_new_memslots, so MMU notifiers
  must not take either kvm->slots_lock or kvm->slots_arch_lock.

On x86:

- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
+9 −9
Original line number Diff line number Diff line
@@ -602,14 +602,14 @@ static inline bool id_aa64pfr0_32bit_el1(u64 pfr0)
{
	u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT);

	return val == ID_AA64PFR0_EL1_32BIT_64BIT;
	return val == ID_AA64PFR0_ELx_32BIT_64BIT;
}

static inline bool id_aa64pfr0_32bit_el0(u64 pfr0)
{
	u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT);

	return val == ID_AA64PFR0_EL0_32BIT_64BIT;
	return val == ID_AA64PFR0_ELx_32BIT_64BIT;
}

static inline bool id_aa64pfr0_sve(u64 pfr0)
@@ -784,13 +784,13 @@ extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
{
	switch (parange) {
	case 0: return 32;
	case 1: return 36;
	case 2: return 40;
	case 3: return 42;
	case 4: return 44;
	case 5: return 48;
	case 6: return 52;
	case ID_AA64MMFR0_PARANGE_32: return 32;
	case ID_AA64MMFR0_PARANGE_36: return 36;
	case ID_AA64MMFR0_PARANGE_40: return 40;
	case ID_AA64MMFR0_PARANGE_42: return 42;
	case ID_AA64MMFR0_PARANGE_44: return 44;
	case ID_AA64MMFR0_PARANGE_48: return 48;
	case ID_AA64MMFR0_PARANGE_52: return 52;
	/*
	 * A future PE could use a value unknown to the kernel.
	 * However, by the "D10.1.4 Principles of the ID scheme
+38 −16
Original line number Diff line number Diff line
@@ -12,8 +12,13 @@
#include <asm/types.h>

/* Hyp Configuration Register (HCR) bits */

#define HCR_TID5	(UL(1) << 58)
#define HCR_DCT		(UL(1) << 57)
#define HCR_ATA_SHIFT	56
#define HCR_ATA		(UL(1) << HCR_ATA_SHIFT)
#define HCR_AMVOFFEN	(UL(1) << 51)
#define HCR_FIEN	(UL(1) << 47)
#define HCR_FWB		(UL(1) << 46)
#define HCR_API		(UL(1) << 41)
#define HCR_APK		(UL(1) << 40)
@@ -32,9 +37,9 @@
#define HCR_TVM		(UL(1) << 26)
#define HCR_TTLB	(UL(1) << 25)
#define HCR_TPU		(UL(1) << 24)
#define HCR_TPC		(UL(1) << 23)
#define HCR_TPC		(UL(1) << 23) /* HCR_TPCP if FEAT_DPB */
#define HCR_TSW		(UL(1) << 22)
#define HCR_TAC		(UL(1) << 21)
#define HCR_TACR	(UL(1) << 21)
#define HCR_TIDCP	(UL(1) << 20)
#define HCR_TSC		(UL(1) << 19)
#define HCR_TID3	(UL(1) << 18)
@@ -56,12 +61,13 @@
#define HCR_PTW		(UL(1) << 2)
#define HCR_SWIO	(UL(1) << 1)
#define HCR_VM		(UL(1) << 0)
#define HCR_RES0	((UL(1) << 48) | (UL(1) << 39))

/*
 * The bits we set in HCR:
 * TLOR:	Trap LORegion register accesses
 * RW:		64bit by default, can be overridden for 32bit VMs
 * TAC:		Trap ACTLR
 * TACR:	Trap ACTLR
 * TSC:		Trap SMC
 * TSW:		Trap cache operations by set/way
 * TWE:		Trap WFE
@@ -76,7 +82,7 @@
 * PTW:		Take a stage2 fault if a stage1 walk steps in device memory
 */
#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
			 HCR_BSU_IS | HCR_FB | HCR_TAC | \
			 HCR_BSU_IS | HCR_FB | HCR_TACR | \
			 HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
			 HCR_FMO | HCR_IMO | HCR_PTW )
#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
@@ -275,24 +281,40 @@
#define CPTR_EL2_TTA	(1 << 20)
#define CPTR_EL2_TFP	(1 << CPTR_EL2_TFP_SHIFT)
#define CPTR_EL2_TZ	(1 << 8)
#define CPTR_EL2_RES1	0x000032ff /* known RES1 bits in CPTR_EL2 */
#define CPTR_EL2_DEFAULT	CPTR_EL2_RES1
#define CPTR_NVHE_EL2_RES1	0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */
#define CPTR_EL2_DEFAULT	CPTR_NVHE_EL2_RES1
#define CPTR_NVHE_EL2_RES0	(GENMASK(63, 32) |	\
				 GENMASK(29, 21) |	\
				 GENMASK(19, 14) |	\
				 BIT(11))

/* Hyp Debug Configuration Register bits */
#define MDCR_EL2_E2TB_MASK	(UL(0x3))
#define MDCR_EL2_E2TB_SHIFT	(UL(24))
#define MDCR_EL2_TTRF		(1 << 19)
#define MDCR_EL2_TPMS		(1 << 14)
#define MDCR_EL2_HPMFZS		(UL(1) << 36)
#define MDCR_EL2_HPMFZO		(UL(1) << 29)
#define MDCR_EL2_MTPME		(UL(1) << 28)
#define MDCR_EL2_TDCC		(UL(1) << 27)
#define MDCR_EL2_HCCD		(UL(1) << 23)
#define MDCR_EL2_TTRF		(UL(1) << 19)
#define MDCR_EL2_HPMD		(UL(1) << 17)
#define MDCR_EL2_TPMS		(UL(1) << 14)
#define MDCR_EL2_E2PB_MASK	(UL(0x3))
#define MDCR_EL2_E2PB_SHIFT	(UL(12))
#define MDCR_EL2_TDRA		(1 << 11)
#define MDCR_EL2_TDOSA		(1 << 10)
#define MDCR_EL2_TDA		(1 << 9)
#define MDCR_EL2_TDE		(1 << 8)
#define MDCR_EL2_HPME		(1 << 7)
#define MDCR_EL2_TPM		(1 << 6)
#define MDCR_EL2_TPMCR		(1 << 5)
#define MDCR_EL2_HPMN_MASK	(0x1F)
#define MDCR_EL2_TDRA		(UL(1) << 11)
#define MDCR_EL2_TDOSA		(UL(1) << 10)
#define MDCR_EL2_TDA		(UL(1) << 9)
#define MDCR_EL2_TDE		(UL(1) << 8)
#define MDCR_EL2_HPME		(UL(1) << 7)
#define MDCR_EL2_TPM		(UL(1) << 6)
#define MDCR_EL2_TPMCR		(UL(1) << 5)
#define MDCR_EL2_HPMN_MASK	(UL(0x1F))
#define MDCR_EL2_RES0		(GENMASK(63, 37) |	\
				 GENMASK(35, 30) |	\
				 GENMASK(25, 24) |	\
				 GENMASK(22, 20) |	\
				 BIT(18) |		\
				 GENMASK(16, 15))

/* For compatibility with fault code shared with 32-bit */
#define FSC_FAULT	ESR_ELx_FSC_FAULT
+3 −4
Original line number Diff line number Diff line
@@ -59,12 +59,11 @@
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs		13
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs		14
#define __KVM_HOST_SMCCC_FUNC___pkvm_init			15
#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings		16
#define __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp		16
#define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping	17
#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector		18
#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize		19
#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp			20
#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc			21
#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc			20

#ifndef __ASSEMBLY__

@@ -210,7 +209,7 @@ extern u64 __vgic_v3_read_vmcr(void);
extern void __vgic_v3_write_vmcr(u32 vmcr);
extern void __vgic_v3_init_lrs(void);

extern u32 __kvm_get_mdcr_el2(void);
extern u64 __kvm_get_mdcr_el2(void);

#define __KVM_EXTABLE(from, to)						\
	"	.pushsection	__kvm_ex_table, \"a\"\n"		\
Loading