Commit a3fe5dbd authored by David Matlack, committed by Paolo Bonzini
Browse files

KVM: x86/mmu: Split huge pages mapped by the TDP MMU when dirty logging is enabled



When dirty logging is enabled without initially-all-set, try to split
all huge pages in the memslot down to 4KB pages so that vCPUs do not
have to take expensive write-protection faults to split huge pages.

Eager page splitting is best-effort only. This commit only adds the
support for the TDP MMU, and even there splitting may fail due to out
of memory conditions. Failure to split a huge page is fine from a
correctness standpoint because KVM will always follow up splitting by
write-protecting any remaining huge pages.

Eager page splitting moves the cost of splitting huge pages off of the
vCPU threads and onto the thread enabling dirty logging on the memslot.
This is useful because:

 1. Splitting on the vCPU thread interrupts vCPU execution and is
    disruptive to customers whereas splitting on VM ioctl threads can
    run in parallel with vCPU execution.

 2. Splitting all huge pages at once is more efficient because it does
    not require performing VM-exit handling or walking the page table for
    every 4KiB page in the memslot, and greatly reduces the amount of
    contention on the mmu_lock.

For example, when running dirty_log_perf_test with 96 virtual CPUs, 1GiB
per vCPU, and 1GiB HugeTLB memory, the time it takes vCPUs to write to
all of their memory after dirty logging is enabled decreased by 95% from
2.94s to 0.14s.

Eager Page Splitting is over 100x more efficient than the current
implementation of splitting on fault under the read lock. For example,
taking the same workload as above, Eager Page Splitting reduced the CPU
required to split all huge pages from ~270 CPU-seconds ((2.94s - 0.14s)
* 96 vCPU threads) to only 1.55 CPU-seconds.

Eager page splitting does increase the amount of time it takes to enable
dirty logging since it has to split all huge pages. For example, the time
it took to enable dirty logging in the 96GiB region of the
aforementioned test increased from 0.001s to 1.55s.

Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: David Matlack <dmatlack@google.com>
Message-Id: <20220119230739.2234394-16-dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent a82070b6
Loading
Loading
Loading
Loading
+24 −0
Original line number Diff line number Diff line
@@ -2339,6 +2339,30 @@
	kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
			Default is 0 (don't ignore, but inject #GP)

	kvm.eager_page_split=
			[KVM,X86] Controls whether or not KVM will try to
			proactively split all huge pages during dirty logging.
			Eager page splitting reduces interruptions to vCPU
			execution by eliminating the write-protection faults
			and MMU lock contention that would otherwise be
			required to split huge pages lazily.

			VM workloads that rarely perform writes or that write
			only to a small region of VM memory may benefit from
			disabling eager page splitting to allow huge pages to
			still be used for reads.

			The behavior of eager page splitting depends on whether
			KVM_DIRTY_LOG_INITIALLY_SET is enabled or disabled. If
			disabled, all huge pages in a memslot will be eagerly
			split when dirty logging is enabled on that memslot. If
			enabled, huge pages will not be eagerly split.

			Eager page splitting currently only supports splitting
			huge pages mapped by the TDP MMU.

			Default is Y (on).

	kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
				   Default is false (don't support).

+3 −0
Original line number Diff line number Diff line
@@ -1587,6 +1587,9 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
				      const struct kvm_memory_slot *memslot,
				      int start_level);
void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
				       const struct kvm_memory_slot *memslot,
				       int target_level);
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
				   const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+24 −0
Original line number Diff line number Diff line
@@ -5830,6 +5830,30 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
}

/*
 * Try to split the huge pages that map @memslot down to @target_level.
 *
 * The GFN range handed to the TDP MMU starts at the memslot's base GFN and
 * spans memslot->npages pages. Nothing is done unless the TDP MMU is enabled
 * for @kvm. The function returns no status; callers cannot tell from here
 * whether any given huge page was actually split.
 */
void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
				       const struct kvm_memory_slot *memslot,
				       int target_level)
{
	u64 first_gfn = memslot->base_gfn;
	u64 end_gfn = first_gfn + memslot->npages;

	/* Splitting is only implemented for the TDP MMU. */
	if (!is_tdp_mmu_enabled(kvm))
		return;

	/* The split runs with mmu_lock held for read. */
	read_lock(&kvm->mmu_lock);
	kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, first_gfn, end_gfn,
					 target_level);
	read_unlock(&kvm->mmu_lock);

	/*
	 * Deliberately no TLB flush here. KVM flushes TLBs after it
	 * write-protects and/or clears dirty on the newly split SPTEs, which
	 * guarantees guest writes show up in the dirty log before the ioctl
	 * that enables dirty logging on this memslot completes. The split
	 * SPTEs keep the write and dirty bits of the huge SPTE they came
	 * from, so KVM can safely decide whether a flush is needed by looking
	 * at the split SPTEs alone.
	 */
}

static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
					 struct kvm_rmap_head *rmap_head,
					 const struct kvm_memory_slot *slot)
+59 −0
Original line number Diff line number Diff line
@@ -192,6 +192,65 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
	return wrprot;
}

/*
 * Return a copy of @spte with the shadow_nx_mask bits cleared and the
 * shadow_x_mask bits set.
 *
 * An access-tracked SPTE is first restored, then modified, then marked for
 * access tracking again, so the result is access-tracked exactly when the
 * input was.
 * NOTE(review): presumably the restore/re-mark round trip is needed because
 * access tracking relocates permission bits within the SPTE -- confirm
 * against mark_spte_for_access_track().
 */
static u64 make_spte_executable(u64 spte)
{
	const bool acc_tracked = is_access_track_spte(spte);
	u64 exec_spte = acc_tracked ? restore_acc_track_spte(spte) : spte;

	exec_spte = (exec_spte & ~shadow_nx_mask) | shadow_x_mask;

	return acc_tracked ? mark_spte_for_access_track(exec_spte) : exec_spte;
}

/*
 * Build the SPTE for one sub-page of a huge page that is being split.
 *
 * @huge_spte:  the present, large SPTE being split.
 * @huge_level: the level at which @huge_spte maps; the returned SPTE maps at
 *              @huge_level - 1.
 * @index:      which sub-page of the huge page the returned SPTE maps.
 *
 * Huge page splitting calls this once per entry of the new page table.
 *
 * Returns the child SPTE, or 0 (after a one-time warning) if @huge_spte is
 * not shadow-present or is not a large SPTE.
 */
u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
{
	int child_level;
	u64 child_spte;

	if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
		return 0;

	if (WARN_ON_ONCE(!is_large_pte(huge_spte)))
		return 0;

	child_level = huge_level - 1;

	/*
	 * The huge SPTE already carries the base address of the region being
	 * split, so the child only needs the offset of sub-page @index (in
	 * units of child-level pages) OR-ed on top of it.
	 */
	child_spte = huge_spte |
		     ((index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT);

	/* A child that is itself still huge is used as-is. */
	if (child_level != PG_LEVEL_4K)
		return child_spte;

	/* 4K mappings must not have the page-size bit set. */
	child_spte &= ~PT_PAGE_SIZE_MASK;

	/*
	 * The NX huge page mitigation does not apply to 4K pages, so make
	 * the 4K child executable.
	 */
	if (is_nx_huge_page_enabled())
		child_spte = make_spte_executable(child_spte);

	return child_spte;
}


u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
{
	u64 spte = SPTE_MMU_PRESENT_MASK;
+1 −0
Original line number Diff line number Diff line
@@ -415,6 +415,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
	       unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
	       u64 old_spte, bool prefetch, bool can_unsync,
	       bool host_writable, u64 *new_spte);
u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);
Loading