Commit 6e949ddb authored by Paolo Bonzini

Merge branch 'kvm-tdpmmu-fixes' into kvm-master

Merge topic branch with fixes for both 5.14-rc6 and 5.15.
parents c5e2bf0b ce25681d
Documentation/virt/kvm/locking.rst  +4 −4
@@ -25,10 +25,10 @@ On x86:

- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock

- kvm->arch.mmu_lock is an rwlock.  kvm->arch.tdp_mmu_pages_lock is
  taken inside kvm->arch.mmu_lock, and cannot be taken without already
  holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise
  there's no need to take kvm->arch.tdp_mmu_pages_lock at all).
- kvm->arch.mmu_lock is an rwlock.  kvm->arch.tdp_mmu_pages_lock and
  kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and
  cannot be taken without already holding kvm->arch.mmu_lock (typically with
  ``read_lock`` for the TDP MMU, thus the need for additional spinlocks).

Everything else is a leaf: no other lock is taken inside the critical
sections.
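
The nesting described above can be summed up in a short sketch (not part of the patch; the function name is made up, and it assumes the rwlock at kvm->mmu_lock plus the two x86 spinlocks named in the text):

	#include <linux/kvm_host.h>

	/* Illustrative nesting only: both spinlocks are leaves under mmu_lock. */
	static void lock_nesting_sketch(struct kvm *kvm)
	{
		read_lock(&kvm->mmu_lock);			/* TDP MMU paths take the rwlock for read */

		spin_lock(&kvm->arch.tdp_mmu_pages_lock);	/* only legal while mmu_lock is held */
		/* ... update kvm->arch.tdp_mmu_pages ... */
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

		spin_lock(&kvm->arch.mmu_unsync_pages_lock);	/* same rule for the new unsync lock */
		/* ... mark shadow pages unsync ... */
		spin_unlock(&kvm->arch.mmu_unsync_pages_lock);

		read_unlock(&kvm->mmu_lock);
	}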
arch/x86/include/asm/kvm_host.h  +7 −0
@@ -1038,6 +1038,13 @@ struct kvm_arch {
	struct list_head lpage_disallowed_mmu_pages;
	struct kvm_page_track_notifier_node mmu_sp_tracker;
	struct kvm_page_track_notifier_head track_notifier_head;
	/*
	 * Protects marking pages unsync during page faults, as TDP MMU page
	 * faults only take mmu_lock for read.  For simplicity, the unsync
	 * pages lock is always taken when marking pages unsync regardless of
	 * whether mmu_lock is held for read or write.
	 */
	spinlock_t mmu_unsync_pages_lock;

	struct list_head assigned_dev_head;
	struct iommu_domain *iommu_domain;
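
For context (not from the patch itself): the comment matters because mmu_try_to_unsync_pages() can be reached with mmu_lock in either mode. A shadow-MMU page fault gets there with mmu_lock held for write, while a TDP MMU fault holds it only for read, so two vCPUs can race to mark the same shadow page unsync. A rough sketch of the two entry points, with hypothetical wrapper names:

	/* Hypothetical wrappers, purely to illustrate the two lock modes. */
	static void shadow_mmu_fault_path(struct kvm_vcpu *vcpu)
	{
		write_lock(&vcpu->kvm->mmu_lock);
		/*
		 * ... mmu_try_to_unsync_pages() still takes mmu_unsync_pages_lock
		 * here, even though the write lock already excludes other vCPUs.
		 */
		write_unlock(&vcpu->kvm->mmu_lock);
	}

	static void tdp_mmu_fault_path(struct kvm_vcpu *vcpu)
	{
		read_lock(&vcpu->kvm->mmu_lock);
		/*
		 * ... here mmu_unsync_pages_lock is what actually serializes
		 * concurrent attempts to mark the same shadow page unsync.
		 */
		read_unlock(&vcpu->kvm->mmu_lock);
	}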
arch/x86/kvm/mmu/mmu.c  +28 −0
@@ -2535,6 +2535,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
{
	struct kvm_mmu_page *sp;
	bool locked = false;

	/*
	 * Force write-protection if the page is being tracked.  Note, the page
@@ -2557,9 +2558,34 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
		if (sp->unsync)
			continue;

		/*
		 * TDP MMU page faults require an additional spinlock as they
		 * run with mmu_lock held for read, not write, and the unsync
		 * logic is not thread safe.  Take the spinlock regardless of
		 * the MMU type to avoid extra conditionals/parameters; there's
		 * no meaningful penalty if mmu_lock is held for write.
		 */
		if (!locked) {
			locked = true;
			spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock);

			/*
			 * Recheck after taking the spinlock, a different vCPU
			 * may have since marked the page unsync.  A false
			 * positive on the unprotected check above is not
			 * possible as clearing sp->unsync _must_ hold mmu_lock
			 * for write, i.e. unsync cannot transition from 0->1
			 * while this CPU holds mmu_lock for read (or write).
			 */
			if (READ_ONCE(sp->unsync))
				continue;
		}

		WARN_ON(sp->role.level != PG_LEVEL_4K);
		kvm_unsync_page(vcpu, sp);
	}
	if (locked)
		spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock);

	/*
	 * We need to ensure that the marking of unsync pages is visible
@@ -5537,6 +5563,8 @@ void kvm_mmu_init_vm(struct kvm *kvm)
{
	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;

	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);

	if (!kvm_mmu_init_tdp_mmu(kvm))
		/*
		 * No smp_load/store wrappers needed here as we are in
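
The tdp_mmu.c hunk below reworks zap_gfn_range() so that callers simply pass -1ull as the end gfn and the function clamps the walk itself. A small worked sketch of that arithmetic (illustrative values only; shadow_phys_bits is the host MAXPHYADDR, assumed here to be 46, with the usual 4 KiB PAGE_SHIFT):

	/* Illustrative only: mirrors the clamp and "zap all" detection added below. */
	gfn_t start = 0, end = -1ull;				/* a "zap everything" request */
	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
								/* 1ULL << (46 - 12) = 2^34 gfns */
	bool zap_all = (start == 0 && end >= max_gfn_host);	/* true for end = -1ull */

	end = min(end, max_gfn_host);				/* never walk past host MAXPHYADDR */

When zap_all is true, the iterator starts at root->role.level rather than PG_LEVEL_4K, since zapping a top-level non-leaf SPTE already covers everything beneath it.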
arch/x86/kvm/mmu/tdp_mmu.c  +24 −11
@@ -43,6 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
@@ -81,8 +82,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
@@ -94,7 +93,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
	zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
@@ -724,13 +723,29 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared)
{
	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool zap_all = (start == 0 && end >= max_gfn_host);
	struct tdp_iter iter;

	/*
	 * No need to try to step down in the iterator when zapping all SPTEs,
	 * zapping the top-level non-leaf SPTEs will recurse on their children.
	 */
	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;

	/*
	 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
	 * and so KVM will never install a SPTE for such addresses.
	 */
	end = min(end, max_gfn_host);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
retry:
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
@@ -744,9 +759,10 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 * lower level, except when zapping all SPTEs.
		 */
		if ((iter.gfn < start ||
		if (!zap_all &&
		    (iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;
@@ -794,12 +810,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush = false;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
						  flush, false);

	if (flush)
@@ -838,7 +853,6 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	struct kvm_mmu_page *next_root;
	struct kvm_mmu_page *root;
	bool flush = false;
@@ -854,8 +868,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)

		rcu_read_unlock();

		flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
				      true);
		flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);

		/*
		 * Put the reference acquired in