Merge branch 'kvm-fixes-for-5.18-rc5' into HEAD (71d7c575) · Commits · EulixOS / Software / Kernel

Documentation/virt/kvm/api.rst

+15 −12

Original line number	Diff line number	Diff line
		@@ -6089,21 +6089,18 @@ should put the acknowledged interrupt vector into the 'epr' field.
		#define KVM_SYSTEM_EVENT_RESET 2
		#define KVM_SYSTEM_EVENT_CRASH 3
		#define KVM_SYSTEM_EVENT_SEV_TERM 4
		#define KVM_SYSTEM_EVENT_NDATA_VALID (1u << 31)
		__u32 type;
		__u32 ndata;
		__u64 flags;
		__u64 data[16];
		} system_event;

		If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered
		a system-level event using some architecture specific mechanism (hypercall
		or some special instruction). In case of ARM64, this is triggered using
		HVC instruction based PSCI call from the vcpu. The 'type' field describes
		the system-level event type. The 'flags' field describes architecture
		specific flags for the system-level event.
		HVC instruction based PSCI call from the vcpu.

		Valid values for bits 30:0 of 'type' are:
		The 'type' field describes the system-level event type.
		Valid values for 'type' are:

		- KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the
		VM. Userspace is not obliged to honour this, and if it does honour
		@@ -6119,14 +6116,20 @@ Valid values for bits 30:0 of 'type' are:
		- KVM_SYSTEM_EVENT_SEV_TERM -- an AMD SEV guest requested termination.
		The guest physical address of the guest's GHCB is stored in `data[0]`.

		Valid flags are:
		If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain
		architecture specific information for the system-level event. Only
		the first `ndata` items (possibly zero) of the data array are valid.

		- for arm64, data[0] is set to KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 if
		the guest issued a SYSTEM_RESET2 call according to v1.1 of the PSCI
		specification.

		- KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (arm64 only) -- the guest issued
		a SYSTEM_RESET2 call according to v1.1 of the PSCI specification.
		- for RISC-V, data[0] is set to the value of the second argument of the
		``sbi_system_reset`` call.

		Extra data for this event is stored in the `data[]` array, up to index
		`ndata-1` included, if bit 31 is set in `type`. The data depends on the
		`type` field. There is no extra data if bit 31 is clear or `ndata` is zero.
		Previous versions of Linux defined a `flags` member in this struct. The
		field is now aliased to `data[0]`. Userspace can assume that it is only
		written if ndata is greater than 0.

		::

arch/arm64/kvm/psci.c

+2 −1

Original line number	Diff line number	Diff line
		@@ -181,7 +181,8 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags)

		memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
		vcpu->run->system_event.type = type;
		vcpu->run->system_event.flags = flags;
		vcpu->run->system_event.ndata = 1;
		vcpu->run->system_event.data[0] = flags;
		vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
		}

arch/riscv/kvm/vcpu_sbi.c

+3 −2

Original line number	Diff line number	Diff line
		@@ -83,7 +83,7 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run)

		void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
		struct kvm_run *run,
		u32 type, u64 flags)
		u32 type, u64 reason)
		{
		unsigned long i;
		struct kvm_vcpu *tmp;
		@@ -94,7 +94,8 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,

		memset(&run->system_event, 0, sizeof(run->system_event));
		run->system_event.type = type;
		run->system_event.flags = flags;
		run->system_event.ndata = 1;
		run->system_event.data[0] = reason;
		run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
		}

arch/x86/kvm/mmu.h

+24 −0

Original line number	Diff line number	Diff line
		@@ -65,6 +65,30 @@ static __always_inline u64 rsvd_bits(int s, int e)
		return ((2ULL << (e - s)) - 1) << s;
		}

		/*
		* The number of non-reserved physical address bits irrespective of features
		* that repurpose legal bits, e.g. MKTME.
		*/
		extern u8 __read_mostly shadow_phys_bits;

		/*
		 * Return the highest guest frame number KVM's MMU will consider mappable.
		 *
		 * The limit is derived from the host MAXPHYADDR, not the guest's.  EPT/NPT
		 * cannot support GPAs that would exceed host.MAXPHYADDR.  Assuming KVM is
		 * running on bare metal, guest accesses beyond host.MAXPHYADDR hit a
		 * #PF(RSVD) and never cause a vmexit (neither EPT Violation/Misconfig nor
		 * #NPF), so KVM never installs a SPTE for such addresses.  If KVM is
		 * itself running as a VM, it might see a MAXPHYADDR that is smaller than
		 * the hardware's real MAXPHYADDR; using the host MAXPHYADDR disallows
		 * such SPTEs entirely and simplifies the TDP MMU.
		 *
		 * When TDP is disabled, a fixed width of 52 bits is used instead
		 * (NOTE(review): presumably the architectural maximum physical address
		 * width on x86 -- confirm).
		 */
		static inline gfn_t kvm_mmu_max_gfn(void)
		{
			int gpa_width = 52;

			if (likely(tdp_enabled))
				gpa_width = shadow_phys_bits;

			/* All-ones mask over the frame-number bits of a gpa_width-bit GPA. */
			return (1ULL << (gpa_width - PAGE_SHIFT)) - 1;
		}

		void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
		void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);

arch/x86/kvm/mmu/mmu.c

+50 −7

Original line number	Diff line number	Diff line
		@@ -2804,8 +2804,12 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
		const struct kvm_memory_slot *slot)
		{
		unsigned long hva;
		pte_t *pte;
		int level;
		unsigned long flags;
		int level = PG_LEVEL_4K;
		pgd_t pgd;
		p4d_t p4d;
		pud_t pud;
		pmd_t pmd;

		if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
		return PG_LEVEL_4K;
		@@ -2820,10 +2824,43 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
		*/
		hva = __gfn_to_hva_memslot(slot, gfn);

		pte = lookup_address_in_mm(kvm->mm, hva, &level);
		if (unlikely(!pte))
		return PG_LEVEL_4K;
		/*
		* Lookup the mapping level in the current mm. The information
		* may become stale soon, but it is safe to use as long as
		* 1) mmu_notifier_retry was checked after taking mmu_lock, and
		* 2) mmu_lock is taken now.
		*
		* We still need to disable IRQs to prevent concurrent tear down
		* of page tables.
		*/
		local_irq_save(flags);

		pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
		if (pgd_none(pgd))
		goto out;

		p4d = READ_ONCE(*p4d_offset(&pgd, hva));
		if (p4d_none(p4d) \|\| !p4d_present(p4d))
		goto out;

		pud = READ_ONCE(*pud_offset(&p4d, hva));
		if (pud_none(pud) \|\| !pud_present(pud))
		goto out;

		if (pud_large(pud)) {
		level = PG_LEVEL_1G;
		goto out;
		}

		pmd = READ_ONCE(*pmd_offset(&pud, hva));
		if (pmd_none(pmd) \|\| !pmd_present(pmd))
		goto out;

		if (pmd_large(pmd))
		level = PG_LEVEL_2M;

		out:
		local_irq_restore(flags);
		return level;
		}

		@@ -2992,9 +3029,15 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa
		/*
		* If MMIO caching is disabled, emulate immediately without
		* touching the shadow page tables as attempting to install an
		* MMIO SPTE will just be an expensive nop.
		*/
		if (unlikely(!shadow_mmio_value)) {
		* MMIO SPTE will just be an expensive nop. Do not cache MMIO
		* whose gfn is greater than host.MAXPHYADDR, any guest that
		* generates such gfns is running nested and is being tricked
		* by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
		* and only if L1's MAXPHYADDR is inaccurate with respect to
		* the hardware's).
		*/
		if (unlikely(!shadow_mmio_value) \|\|
		unlikely(fault->gfn > kvm_mmu_max_gfn())) {
		*ret_val = RET_PF_EMULATE;
		return true;
		}