Commit 5a9624af authored by Paolo Bonzini

KVM: mmu: extract spte.h and spte.c



The SPTE format will be common to both the shadow and the TDP MMU.

Extract code that implements the format to a separate module, as a
first step towards adding the TDP MMU and putting mmu.c on a diet.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent cb3eedab
arch/x86/kvm/Makefile +2 −1
@@ -15,7 +15,8 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o

kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
-			   hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
+			   hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
+			   mmu/spte.o

kvm-intel-y		+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
			   vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
arch/x86/kvm/mmu/mmu.c +5 −546
@@ -23,6 +23,7 @@
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "cpuid.h"
#include "spte.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -45,7 +46,6 @@
#include <asm/page.h>
#include <asm/memtype.h>
#include <asm/cmpxchg.h>
-#include <asm/e820/api.h>
#include <asm/io.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
@@ -104,45 +104,13 @@ enum {
	AUDIT_POST_SYNC
};

#undef MMU_DEBUG

#ifdef MMU_DEBUG
-static bool dbg = 0;
+bool dbg = 0;
module_param(dbg, bool, 0644);

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#define MMU_WARN_ON(x) do { } while (0)
#endif

#define PTE_PREFETCH_NUM		8

#define PT_FIRST_AVAIL_BITS_SHIFT 10
#define PT64_SECOND_AVAIL_BITS_SHIFT 54

/*
 * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
 * Access Tracking SPTEs.
 */
#define SPTE_SPECIAL_MASK (3ULL << 52)
#define SPTE_AD_ENABLED_MASK (0ULL << 52)
#define SPTE_AD_DISABLED_MASK (1ULL << 52)
#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
#define SPTE_MMIO_MASK (3ULL << 52)
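
A standalone userspace sketch (not part of this commit; the defines mirror the ones above) of how the 2-bit "special" field at bits 53:52 is decoded:

#include <stdio.h>
#include <stdint.h>

#define SPTE_SPECIAL_MASK        (3ULL << 52)
#define SPTE_AD_ENABLED_MASK     (0ULL << 52)
#define SPTE_AD_DISABLED_MASK    (1ULL << 52)
#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
#define SPTE_MMIO_MASK           (3ULL << 52)

static const char *spte_kind(uint64_t spte)
{
	switch (spte & SPTE_SPECIAL_MASK) {
	case SPTE_AD_ENABLED_MASK:	return "A/D bits in use";
	case SPTE_AD_DISABLED_MASK:	return "access-tracked, A/D disabled";
	case SPTE_AD_WRPROT_ONLY_MASK:	return "A/D disabled, write-protect for dirty logging";
	case SPTE_MMIO_MASK:		return "MMIO";
	}
	return "unreachable";	/* all four 2-bit values handled above */
}

int main(void)
{
	printf("%s\n", spte_kind(1ULL << 52));	/* access-tracked */
	return 0;
}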

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
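
For a concrete feel for the index math, a standalone sketch (userspace, assuming 4K pages so PAGE_SHIFT == 12) that prints the PML4/PDPT/PD/PT indices for one address:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) (PAGE_SHIFT + ((level) - 1) * PT64_LEVEL_BITS)
#define PT64_INDEX(address, level) \
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))

int main(void)
{
	uint64_t addr = 0x7f1234567000ULL;
	int level;

	for (level = 4; level >= 1; level--)	/* PML4 down to PT */
		printf("level %d: shift %d, index %llu\n", level,
		       PT64_LEVEL_SHIFT(level),
		       (unsigned long long)PT64_INDEX(addr, level));
	return 0;
}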


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
@@ -156,18 +124,6 @@ module_param(dbg, bool, 0644);
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
#else
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
@@ -175,25 +131,8 @@ module_param(dbg, bool, 0644);
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)

#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

/* The mask for the R/X bits in EPT PTEs */
#define PT64_EPT_READABLE_MASK			0x1ull
#define PT64_EPT_EXECUTABLE_MASK		0x4ull

#include <trace/events/kvm.h>

#define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3

@@ -248,62 +187,7 @@ static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_value;
static u64 __read_mostly shadow_mmio_access_mask;
static u64 __read_mostly shadow_present_mask;
static u64 __read_mostly shadow_me_mask;

/*
 * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
 * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
 * pages.
 */
static u64 __read_mostly shadow_acc_track_mask;

/*
 * The mask/shift to use for saving the original R/X bits when marking the PTE
 * as not-present for access tracking purposes. We do not save the W bit as the
 * PTEs being access tracked also need to be dirty tracked, so the W bit will be
 * restored only when a write is attempted to the page.
 */
static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
						    PT64_EPT_EXECUTABLE_MASK;
static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;

/*
 * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
 * to guard against L1TF attacks.
 */
static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;

/*
 * The number of high-order 1 bits to use in the mask above.
 */
static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;

/*
 * In some cases, we need to preserve the GFN of a non-present or reserved
 * SPTE when we usurp the upper five bits of the physical address space to
 * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
 * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
 * left into the reserved bits, i.e. the GFN in the SPTE will be split into
 * high and low parts.  This mask covers the lower bits of the GFN.
 */
static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;

/*
 * The number of non-reserved physical address bits irrespective of features
 * that repurpose legal bits, e.g. MKTME.
 */
static u8 __read_mostly shadow_phys_bits;

static void mmu_spte_set(u64 *sptep, u64 spte);
static bool is_executable_pte(u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);

@@ -339,134 +223,11 @@ static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
	kvm_flush_remote_tlbs_with_range(kvm, &range);
}

void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
{
	BUG_ON((u64)(unsigned)access_mask != access_mask);
	WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
	WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
	shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
	shadow_mmio_access_mask = access_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);

static bool is_mmio_spte(u64 spte)
{
	return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
}

static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
{
	return sp->role.ad_disabled;
}

static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
{
	/*
	 * When using the EPT page-modification log, the GPAs in the log
	 * would come from L2 rather than L1.  Therefore, we need to rely
	 * on write protection to record dirty pages.  This also bypasses
	 * PML, since writes now result in a vmexit.
	 */
	return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
}

static inline bool spte_ad_enabled(u64 spte)
{
	MMU_WARN_ON(is_mmio_spte(spte));
	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
}

static inline bool spte_ad_need_write_protect(u64 spte)
{
	MMU_WARN_ON(is_mmio_spte(spte));
	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
}

-static bool is_nx_huge_page_enabled(void)
+bool is_nx_huge_page_enabled(void)
{
	return READ_ONCE(nx_huge_pages);
}

static inline u64 spte_shadow_accessed_mask(u64 spte)
{
	MMU_WARN_ON(is_mmio_spte(spte));
	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}

static inline u64 spte_shadow_dirty_mask(u64 spte)
{
	MMU_WARN_ON(is_mmio_spte(spte));
	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}

static inline bool is_access_track_spte(u64 spte)
{
	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}

/*
 * Due to limited space in PTEs, the MMIO generation is an 18 bit subset of
 * the memslots generation and is derived as follows:
 *
 * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
 * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
 *
 * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
 * the MMIO generation number, as doing so would require stealing a bit from
 * the "real" generation number and thus effectively halve the maximum number
 * of MMIO generations that can be handled before encountering a wrap (which
 * requires a full MMU zap).  The flag is instead explicitly queried when
 * checking for MMIO spte cache hits.
 */
#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(17, 0)

#define MMIO_SPTE_GEN_LOW_START		3
#define MMIO_SPTE_GEN_LOW_END		11
#define MMIO_SPTE_GEN_LOW_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
						    MMIO_SPTE_GEN_LOW_START)

#define MMIO_SPTE_GEN_HIGH_START	PT64_SECOND_AVAIL_BITS_SHIFT
#define MMIO_SPTE_GEN_HIGH_END		62
#define MMIO_SPTE_GEN_HIGH_MASK		GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
						    MMIO_SPTE_GEN_HIGH_START)

static u64 generation_mmio_spte_mask(u64 gen)
{
	u64 mask;

	WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
	BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);

	mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
	mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
	return mask;
}

static u64 get_mmio_spte_generation(u64 spte)
{
	u64 gen;

	gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
	gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
	return gen;
}
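
The pair can be exercised with a standalone round-trip sketch (userspace, not KVM code; GENMASK_ULL open-coded). Note that with these shifts only generations below 1 << 9 survive the round trip intact: the high field is filled from gen's low bits, because gen is shifted by MMIO_SPTE_GEN_HIGH_START itself rather than by HIGH_START minus the 9 bits already stored in the low field. Later kernels define MMIO_SPTE_GEN_HIGH_SHIFT so the high bits land correctly.

#include <stdio.h>
#include <stdint.h>

#define GENMASK_ULL(h, l)	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

#define GEN_LOW_START	3
#define GEN_LOW_MASK	GENMASK_ULL(11, 3)
#define GEN_HIGH_START	54
#define GEN_HIGH_MASK	GENMASK_ULL(62, 54)

static uint64_t pack(uint64_t gen)
{
	uint64_t mask = (gen << GEN_LOW_START) & GEN_LOW_MASK;

	mask |= (gen << GEN_HIGH_START) & GEN_HIGH_MASK;
	return mask;
}

static uint64_t unpack(uint64_t spte)
{
	uint64_t gen = (spte & GEN_LOW_MASK) >> GEN_LOW_START;

	gen |= (spte & GEN_HIGH_MASK) >> GEN_HIGH_START;
	return gen;
}

int main(void)
{
	/* 0x1ff round-trips; 0x3ffff comes back truncated to 0x1ff. */
	printf("0x1ff   -> %#llx\n", (unsigned long long)unpack(pack(0x1ff)));
	printf("0x3ffff -> %#llx\n", (unsigned long long)unpack(pack(0x3ffff)));
	return 0;
}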

static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
{
	u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
	u64 mask = generation_mmio_spte_mask(gen);
	u64 gpa = gfn << PAGE_SHIFT;

	access &= shadow_mmio_access_mask;
	mask |= shadow_mmio_value | access;
	mask |= gpa | shadow_nonpresent_or_rsvd_mask;
	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
		<< shadow_nonpresent_or_rsvd_mask_len;

	return mask;
}
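
The last two ORs implement the GFN split described earlier: GPA bits that collide with shadow_nonpresent_or_rsvd_mask are mirrored shadow_nonpresent_or_rsvd_mask_len bits higher, and the inverse (get_mmio_spte_gfn(), not visible in this preview) recovers them. A standalone sketch with made-up constants — a 46-bit cache PA width puts the L1TF mask at bits 45:41:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l)	((~0ULL << (l)) & (~0ULL >> (63 - (h))))
#define PAGE_SHIFT	12

#define RSVD_LEN	5			/* shadow_nonpresent_or_rsvd_mask_len */
#define RSVD_MASK	GENMASK_ULL(45, 41)	/* shadow_nonpresent_or_rsvd_mask */
#define LOW_GFN_MASK	GENMASK_ULL(40, PAGE_SHIFT)

static uint64_t split_gpa(uint64_t gpa)
{
	uint64_t spte = gpa | RSVD_MASK;		/* guard bits always set */

	return spte | ((gpa & RSVD_MASK) << RSVD_LEN);	/* mirror clobbered bits */
}

static uint64_t recover_gpa(uint64_t spte)
{
	uint64_t gpa = spte & LOW_GFN_MASK;

	return gpa | ((spte >> RSVD_LEN) & RSVD_MASK);
}

int main(void)
{
	uint64_t gpa = 0x32ULL << 40;	/* GPA bits 41, 44, 45 overlap the mask */

	assert(recover_gpa(split_gpa(gpa)) == gpa);
	printf("gpa %#llx survives the split\n", (unsigned long long)gpa);
	return 0;
}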

static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
			   unsigned int access)
{
@@ -532,90 +293,6 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
        return gpa;
}

/*
 * Sets the shadow PTE masks used by the MMU.
 *
 * Assumptions:
 *  - Setting either @accessed_mask or @dirty_mask requires setting both
 *  - At least one of @accessed_mask or @acc_track_mask must be set
 */
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
		u64 acc_track_mask, u64 me_mask)
{
	BUG_ON(!dirty_mask != !accessed_mask);
	BUG_ON(!accessed_mask && !acc_track_mask);
	BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);

	shadow_user_mask = user_mask;
	shadow_accessed_mask = accessed_mask;
	shadow_dirty_mask = dirty_mask;
	shadow_nx_mask = nx_mask;
	shadow_x_mask = x_mask;
	shadow_present_mask = p_mask;
	shadow_acc_track_mask = acc_track_mask;
	shadow_me_mask = me_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

static u8 kvm_get_shadow_phys_bits(void)
{
	/*
	 * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
	 * in CPU detection code, but the processor treats those reduced bits as
	 * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
	 * the physical address bits reported by CPUID.
	 */
	if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
		return cpuid_eax(0x80000008) & 0xff;

	/*
	 * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
	 * custom CPUID.  Proceed with whatever the kernel found since these features
	 * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
	 */
	return boot_cpu_data.x86_phys_bits;
}
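
The same probe is easy to reproduce from userspace with the compiler's cpuid.h — a sketch, not KVM code:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid_max(0x80000000, NULL) < 0x80000008) {
		puts("leaf 0x80000008 not supported");
		return 1;
	}

	__cpuid(0x80000008, eax, ebx, ecx, edx);
	printf("MAXPHYADDR: %u bits\n", eax & 0xff);	/* EAX bits 7:0 */
	return 0;
}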

static void kvm_mmu_reset_all_pte_masks(void)
{
	u8 low_phys_bits;

	shadow_user_mask = 0;
	shadow_accessed_mask = 0;
	shadow_dirty_mask = 0;
	shadow_nx_mask = 0;
	shadow_x_mask = 0;
	shadow_present_mask = 0;
	shadow_acc_track_mask = 0;

	shadow_phys_bits = kvm_get_shadow_phys_bits();

	/*
	 * If the CPU has 46 or less physical address bits, then set an
	 * appropriate mask to guard against L1TF attacks. Otherwise, it is
	 * assumed that the CPU is not vulnerable to L1TF.
	 *
	 * Some Intel CPUs address the L1 cache using more PA bits than are
	 * reported by CPUID. Use the PA width of the L1 cache when possible
	 * to achieve more effective mitigation, e.g. if system RAM overlaps
	 * the most significant bits of legal physical address space.
	 */
	shadow_nonpresent_or_rsvd_mask = 0;
	low_phys_bits = boot_cpu_data.x86_phys_bits;
	if (boot_cpu_has_bug(X86_BUG_L1TF) &&
	    !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
			  52 - shadow_nonpresent_or_rsvd_mask_len)) {
		low_phys_bits = boot_cpu_data.x86_cache_bits
			- shadow_nonpresent_or_rsvd_mask_len;
		shadow_nonpresent_or_rsvd_mask =
			rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
	}

	shadow_nonpresent_or_rsvd_lower_gfn_mask =
		GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
}
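
A worked example of the computation above, for a hypothetical L1TF-affected CPU with x86_cache_bits == 46 (the same made-up width as the MMIO sketch earlier): low_phys_bits becomes 46 - 5 = 41, the reserved mask covers PA bits 45:41, and the lower GFN mask covers bits 40:12.

#include <stdio.h>
#include <stdint.h>

#define GENMASK_ULL(h, l)	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	unsigned int cache_bits = 46, mask_len = 5;	/* made-up CPU */
	unsigned int low_phys_bits = cache_bits - mask_len;
	uint64_t rsvd = GENMASK_ULL(cache_bits - 1, low_phys_bits);
	uint64_t low_gfn = GENMASK_ULL(low_phys_bits - 1, 12);

	printf("low_phys_bits %u\n", low_phys_bits);			/* 41 */
	printf("nonpresent_or_rsvd %#llx\n", (unsigned long long)rsvd);	/* bits 45:41 */
	printf("lower_gfn %#llx\n", (unsigned long long)low_gfn);	/* bits 40:12 */
	return 0;
}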

static int is_cpuid_PSE36(void)
{
	return 1;
@@ -626,35 +303,6 @@ static int is_nx(struct kvm_vcpu *vcpu)
	return vcpu->arch.efer & EFER_NX;
}

static int is_shadow_present_pte(u64 pte)
{
	return (pte != 0) && !is_mmio_spte(pte);
}

static int is_large_pte(u64 pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}

static int is_last_spte(u64 pte, int level)
{
	if (level == PG_LEVEL_4K)
		return 1;
	if (is_large_pte(pte))
		return 1;
	return 0;
}

static bool is_executable_pte(u64 spte)
{
	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
}

static kvm_pfn_t spte_to_pfn(u64 pte)
{
	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
@@ -799,12 +447,6 @@ static u64 __get_spte_lockless(u64 *sptep)
}
#endif

static bool spte_can_locklessly_be_made_writable(u64 spte)
{
	return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
		(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
}

static bool spte_has_volatile_bits(u64 spte)
{
	if (!is_shadow_present_pte(spte))
@@ -829,21 +471,6 @@ static bool spte_has_volatile_bits(u64 spte)
	return false;
}

static bool is_accessed_spte(u64 spte)
{
	u64 accessed_mask = spte_shadow_accessed_mask(spte);

	return accessed_mask ? spte & accessed_mask
			     : !is_access_track_spte(spte);
}

static bool is_dirty_spte(u64 spte)
{
	u64 dirty_mask = spte_shadow_dirty_mask(spte);

	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}

/* Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
@@ -979,34 +606,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
	return __get_spte_lockless(sptep);
}

static u64 mark_spte_for_access_track(u64 spte)
{
	if (spte_ad_enabled(spte))
		return spte & ~shadow_accessed_mask;

	if (is_access_track_spte(spte))
		return spte;

	/*
	 * Making an Access Tracking PTE will result in removal of write access
	 * from the PTE. So, verify that we will be able to restore the write
	 * access in the fast page fault path later on.
	 */
	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
		  !spte_can_locklessly_be_made_writable(spte),
		  "kvm: Writable SPTE is not locklessly dirty-trackable\n");

	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
			  shadow_acc_track_saved_bits_shift),
		  "kvm: Access Tracking saved bit locations are not zero\n");

	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
		shadow_acc_track_saved_bits_shift;
	spte &= ~shadow_acc_track_mask;

	return spte;
}
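
A standalone round-trip sketch of this save/restore scheme, assuming EPT-style bit positions (R at bit 0, X at bit 2, saved copies starting at bit 54); in the kernel the inverse is restore_acc_track_spte(), truncated just below:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EPT_R		0x1ULL
#define EPT_W		0x2ULL
#define EPT_X		0x4ULL
#define SAVED_MASK	(EPT_R | EPT_X)		/* W is never saved */
#define SAVED_SHIFT	54			/* PT64_SECOND_AVAIL_BITS_SHIFT */
#define ACC_TRACK_MASK	(EPT_R | EPT_W | EPT_X)

static uint64_t mark_for_access_track(uint64_t spte)
{
	spte |= (spte & SAVED_MASK) << SAVED_SHIFT;	/* stash R/X high up */
	return spte & ~ACC_TRACK_MASK;			/* make it non-present */
}

static uint64_t restore_access_track(uint64_t spte)
{
	uint64_t saved = (spte >> SAVED_SHIFT) & SAVED_MASK;

	spte &= ~(SAVED_MASK << SAVED_SHIFT);
	return spte | saved;
}

int main(void)
{
	uint64_t spte = (0xabcdULL << 12) | EPT_R | EPT_X;

	assert(restore_access_track(mark_for_access_track(spte)) == spte);
	printf("R/X survive the access-track round trip\n");
	return 0;
}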

/* Restore an acc-track PTE back to a regular PTE */
static u64 restore_acc_track_spte(u64 spte)
{
@@ -1747,21 +1346,6 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
	return kvm_zap_rmapp(kvm, rmap_head);
}

static u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
{
	u64 new_spte;

	new_spte = old_spte & ~PT64_BASE_ADDR_MASK;
	new_spte |= (u64)new_pfn << PAGE_SHIFT;

	new_spte &= ~PT_WRITABLE_MASK;
	new_spte &= ~SPTE_HOST_WRITEABLE;

	new_spte = mark_spte_for_access_track(new_spte);

	return new_spte;
}

static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
			     unsigned long data)
@@ -2583,21 +2167,6 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
	__shadow_walk_next(iterator, *iterator->sptep);
}

static u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
{
	u64 spte;

	spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
	       shadow_user_mask | shadow_x_mask | shadow_me_mask;

	if (ad_disabled)
		spte |= SPTE_AD_DISABLED_MASK;
	else
		spte |= shadow_accessed_mask;

	return spte;
}

static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
			     struct kvm_mmu_page *sp)
{
@@ -2919,7 +2488,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
	kvm_mmu_mark_parents_unsync(sp);
}

-static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
			    bool can_unsync)
{
	struct kvm_mmu_page *sp;
@@ -2980,116 +2549,6 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
	return false;
}

static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
{
	if (pfn_valid(pfn))
		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
			/*
			 * Some reserved pages, such as those from NVDIMM
			 * DAX devices, are not for MMIO, and can be mapped
			 * with cached memory type for better performance.
			 * However, the above check misconceives those pages
			 * as MMIO, and results in KVM mapping them with UC
			 * memory type, which would hurt the performance.
			 * Therefore, we check the host memory type in addition
			 * and only treat UC/UC-/WC pages as MMIO.
			 */
			(!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));

	return !e820__mapped_raw_any(pfn_to_hpa(pfn),
				     pfn_to_hpa(pfn + 1) - 1,
				     E820_TYPE_RAM);
}

/* Bits which may be returned by set_spte() */
#define SET_SPTE_WRITE_PROTECTED_PT	BIT(0)
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH	BIT(1)
#define SET_SPTE_SPURIOUS		BIT(2)

static int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
		     gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative,
		     bool can_unsync, bool host_writable, bool ad_disabled,
		     u64 *new_spte)
{
	u64 spte = 0;
	int ret = 0;

	if (ad_disabled)
		spte |= SPTE_AD_DISABLED_MASK;
	else if (kvm_vcpu_ad_need_write_protect(vcpu))
		spte |= SPTE_AD_WRPROT_ONLY_MASK;

	/*
	 * For the EPT case, shadow_present_mask is 0 if hardware
	 * supports exec-only page table entries.  In that case,
	 * ACC_USER_MASK and shadow_user_mask are used to represent
	 * read access.  See FNAME(gpte_access) in paging_tmpl.h.
	 */
	spte |= shadow_present_mask;
	if (!speculative)
		spte |= spte_shadow_accessed_mask(spte);

	if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
	    is_nx_huge_page_enabled()) {
		pte_access &= ~ACC_EXEC_MASK;
	}

	if (pte_access & ACC_EXEC_MASK)
		spte |= shadow_x_mask;
	else
		spte |= shadow_nx_mask;

	if (pte_access & ACC_USER_MASK)
		spte |= shadow_user_mask;

	if (level > PG_LEVEL_4K)
		spte |= PT_PAGE_SIZE_MASK;
	if (tdp_enabled)
		spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
			kvm_is_mmio_pfn(pfn));

	if (host_writable)
		spte |= SPTE_HOST_WRITEABLE;
	else
		pte_access &= ~ACC_WRITE_MASK;

	if (!kvm_is_mmio_pfn(pfn))
		spte |= shadow_me_mask;

	spte |= (u64)pfn << PAGE_SHIFT;

	if (pte_access & ACC_WRITE_MASK) {
		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;

		/*
		 * Optimization: for pte sync, if spte was writable the hash
		 * lookup is unnecessary (and expensive). Write protection
		 * is responsibility of mmu_get_page / kvm_sync_page.
		 * Same reasoning can be applied to dirty page accounting.
		 */
		if (!can_unsync && is_writable_pte(old_spte))
			goto out;

		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
			pgprintk("%s: found shadow page for %llx, marking ro\n",
				 __func__, gfn);
			ret |= SET_SPTE_WRITE_PROTECTED_PT;
			pte_access &= ~ACC_WRITE_MASK;
			spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
		}
	}

	if (pte_access & ACC_WRITE_MASK)
		spte |= spte_shadow_dirty_mask(spte);

	if (speculative)
		spte = mark_spte_for_access_track(spte);

out:
	*new_spte = spte;
	return ret;
}

static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
		    unsigned int pte_access, int level,
		    gfn_t gfn, kvm_pfn_t pfn, bool speculative,
arch/x86/kvm/mmu/mmu_internal.h +30 −1
@@ -3,9 +3,23 @@
#define __KVM_X86_MMU_INTERNAL_H

#include <linux/types.h>

#include <linux/kvm_host.h>
#include <asm/kvm_host.h>

#undef MMU_DEBUG

#ifdef MMU_DEBUG
extern bool dbg;

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#define MMU_WARN_ON(x) do { } while (0)
#endif
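
A standalone mock of this compile-time toggle (printf standing in for printk; build with and without -DMMU_DEBUG to see the difference):

#include <stdio.h>

#ifdef MMU_DEBUG
static int dbg = 1;	/* stands in for the kvm.dbg module parameter */
#define pgprintk(x...) do { if (dbg) printf(x); } while (0)
#else
#define pgprintk(x...) do { } while (0)
#endif

int main(void)
{
	pgprintk("spte %#llx installed\n", 0x8000000000000ULL);
	return 0;
}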

struct kvm_mmu_page {
	struct list_head link;
	struct hlist_node hash_link;
@@ -55,6 +69,21 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
	return to_shadow_page(__pa(sptep));
}

static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
{
	/*
	 * When using the EPT page-modification log, the GPAs in the log
	 * would come from L2 rather than L1.  Therefore, we need to rely
	 * on write protection to record dirty pages.  This also bypasses
	 * PML, since writes now result in a vmexit.
	 */
	return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
}

bool is_nx_huge_page_enabled(void);
bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
			    bool can_unsync);

void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
arch/x86/kvm/mmu/spte.c +318 −0

New file; contents collapsed in the preview.

arch/x86/kvm/mmu/spte.h +252 −0

New file; contents collapsed in the preview.