Commit 6e949ddb authored by Paolo Bonzini

Merge branch 'kvm-tdpmmu-fixes' into kvm-master

Merge topic branch with fixes for both 5.14-rc6 and 5.15.
parents c5e2bf0b ce25681d
Documentation/virt/kvm/locking.rst  +4 −4
@@ -25,10 +25,10 @@ On x86:

- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock

- kvm->arch.mmu_lock is an rwlock.  kvm->arch.tdp_mmu_pages_lock is
  taken inside kvm->arch.mmu_lock, and cannot be taken without already
  holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise
  there's no need to take kvm->arch.tdp_mmu_pages_lock at all).
- kvm->arch.mmu_lock is an rwlock.  kvm->arch.tdp_mmu_pages_lock and
  kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and
  cannot be taken without already holding kvm->arch.mmu_lock (typically with
  ``read_lock`` for the TDP MMU, thus the need for additional spinlocks).

Everything else is a leaf: no other lock is taken inside the critical
sections.
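
The nesting described above can be summed up in a short sketch (not part of the patch; the function name is made up, and it assumes the rwlock at kvm->mmu_lock plus the two x86 spinlocks named in the text):

	#include <linux/kvm_host.h>

	/* Illustrative nesting only: both spinlocks are leaves under mmu_lock. */
	static void lock_nesting_sketch(struct kvm *kvm)
	{
		read_lock(&kvm->mmu_lock);			/* TDP MMU paths take the rwlock for read */

		spin_lock(&kvm->arch.tdp_mmu_pages_lock);	/* only legal while mmu_lock is held */
		/* ... update kvm->arch.tdp_mmu_pages ... */
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

		spin_lock(&kvm->arch.mmu_unsync_pages_lock);	/* same rule for the new unsync lock */
		/* ... mark shadow pages unsync ... */
		spin_unlock(&kvm->arch.mmu_unsync_pages_lock);

		read_unlock(&kvm->mmu_lock);
	}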
arch/x86/include/asm/kvm_host.h  +7 −0
@@ -1038,6 +1038,13 @@ struct kvm_arch {
	struct list_head lpage_disallowed_mmu_pages;
	struct kvm_page_track_notifier_node mmu_sp_tracker;
	struct kvm_page_track_notifier_head track_notifier_head;
	/*
	 * Protects marking pages unsync during page faults, as TDP MMU page
	 * faults only take mmu_lock for read.  For simplicity, the unsync
	 * pages lock is always taken when marking pages unsync regardless of
	 * whether mmu_lock is held for read or write.
	 */
	spinlock_t mmu_unsync_pages_lock;

	struct list_head assigned_dev_head;
	struct iommu_domain *iommu_domain;
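
For context (not from the patch itself): the comment matters because mmu_try_to_unsync_pages() can be reached with mmu_lock in either mode. A shadow-MMU page fault gets there with mmu_lock held for write, while a TDP MMU fault holds it only for read, so two vCPUs can race to mark the same shadow page unsync. A rough sketch of the two entry points, with hypothetical wrapper names:

	/* Hypothetical wrappers, purely to illustrate the two lock modes. */
	static void shadow_mmu_fault_path(struct kvm_vcpu *vcpu)
	{
		write_lock(&vcpu->kvm->mmu_lock);
		/*
		 * ... mmu_try_to_unsync_pages() still takes mmu_unsync_pages_lock
		 * here, even though the write lock already excludes other vCPUs.
		 */
		write_unlock(&vcpu->kvm->mmu_lock);
	}

	static void tdp_mmu_fault_path(struct kvm_vcpu *vcpu)
	{
		read_lock(&vcpu->kvm->mmu_lock);
		/*
		 * ... here mmu_unsync_pages_lock is what actually serializes
		 * concurrent attempts to mark the same shadow page unsync.
		 */
		read_unlock(&vcpu->kvm->mmu_lock);
	}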
arch/x86/kvm/mmu/mmu.c  +28 −0
@@ -2535,6 +2535,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
{
	struct kvm_mmu_page *sp;
	bool locked = false;

	/*
	 * Force write-protection if the page is being tracked.  Note, the page
@@ -2557,9 +2558,34 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
		if (sp->unsync)
			continue;

		/*
		 * TDP MMU page faults require an additional spinlock as they
		 * run with mmu_lock held for read, not write, and the unsync
		 * logic is not thread safe.  Take the spinlock regardless of
		 * the MMU type to avoid extra conditionals/parameters; there's
		 * no meaningful penalty if mmu_lock is held for write.
		 */
		if (!locked) {
			locked = true;
			spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock);

			/*
			 * Recheck after taking the spinlock, a different vCPU
			 * may have since marked the page unsync.  A false
			 * positive on the unprotected check above is not
			 * possible as clearing sp->unsync _must_ hold mmu_lock
			 * for write, i.e. unsync cannot transition from 0->1
			 * while this CPU holds mmu_lock for read (or write).
			 */
			if (READ_ONCE(sp->unsync))
				continue;
		}

		WARN_ON(sp->role.level != PG_LEVEL_4K);
		kvm_unsync_page(vcpu, sp);
	}
	if (locked)
		spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock);

	/*
	 * We need to ensure that the marking of unsync pages is visible
@@ -5537,6 +5563,8 @@ void kvm_mmu_init_vm(struct kvm *kvm)
{
	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;

	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);

	if (!kvm_mmu_init_tdp_mmu(kvm))
		/*
		 * No smp_load/store wrappers needed here as we are in
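
The tdp_mmu.c hunk below reworks zap_gfn_range() so that callers simply pass -1ull as the end gfn and the function clamps the walk itself. A small worked sketch of that arithmetic (illustrative values only; shadow_phys_bits is the host MAXPHYADDR, assumed here to be 46, with the usual 4 KiB PAGE_SHIFT):

	/* Illustrative only: mirrors the clamp and "zap all" detection added below. */
	gfn_t start = 0, end = -1ull;				/* a "zap everything" request */
	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
								/* 1ULL << (46 - 12) = 2^34 gfns */
	bool zap_all = (start == 0 && end >= max_gfn_host);	/* true for end = -1ull */

	end = min(end, max_gfn_host);				/* never walk past host MAXPHYADDR */

When zap_all is true, the iterator starts at root->role.level rather than PG_LEVEL_4K, since zapping a top-level non-leaf SPTE already covers everything beneath it.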
arch/x86/kvm/mmu/tdp_mmu.c  +24 −11
@@ -43,6 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
@@ -81,8 +82,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
@@ -94,7 +93,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
	zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
@@ -724,13 +723,29 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared)
{
	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool zap_all = (start == 0 && end >= max_gfn_host);
	struct tdp_iter iter;

	/*
	 * No need to try to step down in the iterator when zapping all SPTEs,
	 * zapping the top-level non-leaf SPTEs will recurse on their children.
	 */
	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;

	/*
	 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
	 * and so KVM will never install a SPTE for such addresses.
	 */
	end = min(end, max_gfn_host);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
retry:
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
@@ -744,9 +759,10 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 * lower level, except when zapping all SPTEs.
		 */
		if ((iter.gfn < start ||
		if (!zap_all &&
		    (iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;
@@ -794,12 +810,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush = false;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
						  flush, false);

	if (flush)
@@ -838,7 +853,6 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	struct kvm_mmu_page *next_root;
	struct kvm_mmu_page *root;
	bool flush = false;
@@ -854,8 +868,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)

		rcu_read_unlock();

		flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
				      true);
		flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);

		/*
		 * Put the reference acquired in