Commit 531810ca authored by Ben Gardon, committed by Paolo Bonzini
Browse files

KVM: x86/mmu: Use an rwlock for the x86 MMU



Add a read / write lock to be used in place of the MMU spinlock on x86.
The rwlock will enable the TDP MMU to handle page faults and other
operations in parallel in future commits.

Reviewed-by: Peter Feiner <pfeiner@google.com>
Signed-off-by: Ben Gardon <bgardon@google.com>

Message-Id: <20210202185734.1680553-19-bgardon@google.com>
[Introduce virt/kvm/mmu_lock.h - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent f3d4b4b1
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -348,6 +348,8 @@ struct kvm_mmu_root_info {

#define KVM_MMU_NUM_PREV_ROOTS 3

#define KVM_HAVE_MMU_RWLOCK

struct kvm_mmu_page;

/*
+45 −45
Original line number Diff line number Diff line
@@ -2010,9 +2010,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
			flush |= kvm_sync_page(vcpu, sp, &invalid_list);
			mmu_pages_clear_parents(&parents);
		}
		if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
			kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
			cond_resched_lock(&vcpu->kvm->mmu_lock);
			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
			flush = false;
		}
	}
@@ -2464,7 +2464,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
 */
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);

	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
		kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
@@ -2475,7 +2475,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)

	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;

	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);
}

int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -2486,7 +2486,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)

	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
	r = 0;
	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);
	for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
			 sp->role.word);
@@ -2494,7 +2494,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
	}
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);

	return r;
}
@@ -3186,7 +3186,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			return;
	}

	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
@@ -3209,7 +3209,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
	}

	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);

@@ -3230,16 +3230,16 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
{
	struct kvm_mmu_page *sp;

	spin_lock(&vcpu->kvm->mmu_lock);
	write_lock(&vcpu->kvm->mmu_lock);

	if (make_mmu_pages_available(vcpu)) {
		spin_unlock(&vcpu->kvm->mmu_lock);
		write_unlock(&vcpu->kvm->mmu_lock);
		return INVALID_PAGE;
	}
	sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
	++sp->root_count;

	spin_unlock(&vcpu->kvm->mmu_lock);
	write_unlock(&vcpu->kvm->mmu_lock);
	return __pa(sp->spt);
}

@@ -3410,17 +3410,17 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
		    !smp_load_acquire(&sp->unsync_children))
			return;

		spin_lock(&vcpu->kvm->mmu_lock);
		write_lock(&vcpu->kvm->mmu_lock);
		kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);

		mmu_sync_children(vcpu, sp);

		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
		spin_unlock(&vcpu->kvm->mmu_lock);
		write_unlock(&vcpu->kvm->mmu_lock);
		return;
	}

	spin_lock(&vcpu->kvm->mmu_lock);
	write_lock(&vcpu->kvm->mmu_lock);
	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);

	for (i = 0; i < 4; ++i) {
@@ -3434,7 +3434,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
	}

	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
	spin_unlock(&vcpu->kvm->mmu_lock);
	write_unlock(&vcpu->kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);

@@ -3718,7 +3718,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		return r;

	r = RET_PF_RETRY;
	spin_lock(&vcpu->kvm->mmu_lock);
	write_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
		goto out_unlock;
	r = make_mmu_pages_available(vcpu);
@@ -3733,7 +3733,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
				 prefault, is_tdp);

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	write_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return r;
}
@@ -4959,7 +4959,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
	 */
	mmu_topup_memory_caches(vcpu, true);

	spin_lock(&vcpu->kvm->mmu_lock);
	write_lock(&vcpu->kvm->mmu_lock);

	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);

@@ -4991,7 +4991,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
	}
	kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
	spin_unlock(&vcpu->kvm->mmu_lock);
	write_unlock(&vcpu->kvm->mmu_lock);
}

int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -5189,14 +5189,14 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
		if (iterator.rmap)
			flush |= fn(kvm, iterator.rmap);

		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
			if (flush && lock_flush_tlb) {
				kvm_flush_remote_tlbs_with_address(kvm,
						start_gfn,
						iterator.gfn - start_gfn + 1);
				flush = false;
			}
			cond_resched_lock(&kvm->mmu_lock);
			cond_resched_rwlock_write(&kvm->mmu_lock);
		}
	}

@@ -5346,7 +5346,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
		 * be in active use by the guest.
		 */
		if (batch >= BATCH_ZAP_PAGES &&
		    cond_resched_lock(&kvm->mmu_lock)) {
		    cond_resched_rwlock_write(&kvm->mmu_lock)) {
			batch = 0;
			goto restart;
		}
@@ -5379,7 +5379,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
{
	lockdep_assert_held(&kvm->slots_lock);

	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);
	trace_kvm_mmu_zap_all_fast(kvm);

	/*
@@ -5406,7 +5406,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
	if (kvm->arch.tdp_mmu_enabled)
		kvm_tdp_mmu_zap_all(kvm);

	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);
}

static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5448,7 +5448,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
	int i;
	bool flush;

	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot(memslot, slots) {
@@ -5472,7 +5472,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
			kvm_flush_remote_tlbs(kvm);
	}

	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);
}

static bool slot_rmap_write_protect(struct kvm *kvm,
@@ -5487,12 +5487,12 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
{
	bool flush;

	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);
	flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
				start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
	if (kvm->arch.tdp_mmu_enabled)
		flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);

	/*
	 * We can flush all the TLBs out of the mmu lock without TLB
@@ -5552,13 +5552,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
				   const struct kvm_memory_slot *memslot)
{
	/* FIXME: const-ify all uses of struct kvm_memory_slot.  */
	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);
	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
			 kvm_mmu_zap_collapsible_spte, true);

	if (kvm->arch.tdp_mmu_enabled)
		kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);
}

void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
@@ -5581,11 +5581,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
{
	bool flush;

	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);
	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
	if (kvm->arch.tdp_mmu_enabled)
		flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);

	/*
	 * It's also safe to flush TLBs out of mmu lock here as currently this
@@ -5603,12 +5603,12 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
{
	bool flush;

	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);
	flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
					false);
	if (kvm->arch.tdp_mmu_enabled)
		flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M);
	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);

	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5620,11 +5620,11 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
{
	bool flush;

	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);
	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
	if (kvm->arch.tdp_mmu_enabled)
		flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot);
	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);

	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5637,14 +5637,14 @@ void kvm_mmu_zap_all(struct kvm *kvm)
	LIST_HEAD(invalid_list);
	int ign;

	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);
restart:
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
		if (WARN_ON(sp->role.invalid))
			continue;
		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
			goto restart;
		if (cond_resched_lock(&kvm->mmu_lock))
		if (cond_resched_rwlock_write(&kvm->mmu_lock))
			goto restart;
	}

@@ -5653,7 +5653,7 @@ void kvm_mmu_zap_all(struct kvm *kvm)
	if (kvm->arch.tdp_mmu_enabled)
		kvm_tdp_mmu_zap_all(kvm);

	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);
}

void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
@@ -5713,7 +5713,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
			continue;

		idx = srcu_read_lock(&kvm->srcu);
		spin_lock(&kvm->mmu_lock);
		write_lock(&kvm->mmu_lock);

		if (kvm_has_zapped_obsolete_pages(kvm)) {
			kvm_mmu_commit_zap_page(kvm,
@@ -5724,7 +5724,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);

unlock:
		spin_unlock(&kvm->mmu_lock);
		write_unlock(&kvm->mmu_lock);
		srcu_read_unlock(&kvm->srcu, idx);

		/*
@@ -5944,7 +5944,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
	ulong to_zap;

	rcu_idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);

	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
	to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
@@ -5969,14 +5969,14 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
			WARN_ON_ONCE(sp->lpage_disallowed);
		}

		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
			kvm_mmu_commit_zap_page(kvm, &invalid_list);
			cond_resched_lock(&kvm->mmu_lock);
			cond_resched_rwlock_write(&kvm->mmu_lock);
		}
	}
	kvm_mmu_commit_zap_page(kvm, &invalid_list);

	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, rcu_idx);
}

+4 −4
Original line number Diff line number Diff line
@@ -184,9 +184,9 @@ kvm_page_track_register_notifier(struct kvm *kvm,

	head = &kvm->arch.track_notifier_head;

	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);
	hlist_add_head_rcu(&n->node, &head->track_notifier_list);
	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);

@@ -202,9 +202,9 @@ kvm_page_track_unregister_notifier(struct kvm *kvm,

	head = &kvm->arch.track_notifier_head;

	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);
	hlist_del_rcu(&n->node);
	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);
	synchronize_srcu(&head->track_srcu);
}
EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
+4 −4
Original line number Diff line number Diff line
@@ -868,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
	}

	r = RET_PF_RETRY;
	spin_lock(&vcpu->kvm->mmu_lock);
	write_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
		goto out_unlock;

@@ -881,7 +881,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	write_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return r;
}
@@ -919,7 +919,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
		return;
	}

	spin_lock(&vcpu->kvm->mmu_lock);
	write_lock(&vcpu->kvm->mmu_lock);
	for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
		level = iterator.level;
		sptep = iterator.sptep;
@@ -954,7 +954,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
			break;
	}
	spin_unlock(&vcpu->kvm->mmu_lock);
	write_unlock(&vcpu->kvm->mmu_lock);
}

/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
+10 −10
Original line number Diff line number Diff line
@@ -59,7 +59,7 @@ static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
					   struct kvm_mmu_page *root)
{
	lockdep_assert_held(&kvm->mmu_lock);
	lockdep_assert_held_write(&kvm->mmu_lock);

	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
		return false;
@@ -117,7 +117,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	lockdep_assert_held(&kvm->mmu_lock);
	lockdep_assert_held_write(&kvm->mmu_lock);

	WARN_ON(root->root_count);
	WARN_ON(!root->tdp_mmu_page);
@@ -170,13 +170,13 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	spin_lock(&kvm->mmu_lock);
	write_lock(&kvm->mmu_lock);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			kvm_mmu_get_root(kvm, root);
			spin_unlock(&kvm->mmu_lock);
			write_unlock(&kvm->mmu_lock);
			return root;
		}
	}
@@ -186,7 +186,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

	spin_unlock(&kvm->mmu_lock);
	write_unlock(&kvm->mmu_lock);

	return root;
}
@@ -421,7 +421,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	lockdep_assert_held(&kvm->mmu_lock);
	lockdep_assert_held_write(&kvm->mmu_lock);

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

@@ -492,13 +492,13 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		cond_resched_lock(&kvm->mmu_lock);
		cond_resched_rwlock_write(&kvm->mmu_lock);
		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);
@@ -1103,7 +1103,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
	struct kvm_mmu_page *root;
	int root_as_id;

	lockdep_assert_held(&kvm->mmu_lock);
	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
@@ -1268,7 +1268,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
	int root_as_id;
	bool spte_set = false;

	lockdep_assert_held(&kvm->mmu_lock);
	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
Loading