Commit 65855ed8 authored by Lai Jiangshan, committed by Paolo Bonzini

KVM: X86: Synchronize the shadow pagetable before link it



If a gpte is changed from non-present to present, the guest doesn't need
to flush the TLB per the SDM.  So the host must synchronize the sp
before linking it.  Otherwise the guest might use a wrong mapping.

For example: the guest first changes a level-1 pagetable, and then
links it from a parent entry whose original gpte was non-present.
The guest can then access the remapped area without flushing the
TLB.  This behavior is allowed per the SDM, but the host kvm mmu
gets it wrong.
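
To make the race concrete, a minimal guest-side sketch of that sequence
follows (gpa_of() and touch() are hypothetical helpers and the 4K-paging
layout is only illustrative; this is not kernel code):

#include <stdint.h>

#define PTE_PRESENT	(1ULL << 0)
#define PTE_RW		(1ULL << 1)

/* Hypothetical guest helpers: guest-physical address of a page-aligned
 * table, and a plain data access through a guest-virtual address. */
extern uint64_t gpa_of(void *table);
extern void touch(uint64_t gva);

/* pt is a level-1 pagetable whose shadow page is currently unsync in
 * KVM, so the first write below is not trapped by the host. */
static void remap_without_flush(uint64_t *pt, uint64_t *pmd,
				int pde_idx, int pte_idx, uint64_t new_pfn)
{
	/* 1. Change a gpte in the level-1 table; its shadow goes stale. */
	pt[pte_idx] = (new_pfn << 12) | PTE_PRESENT | PTE_RW;

	/*
	 * 2. Link the table into a previously non-present PDE.  Per the
	 * SDM, a non-present -> present change requires no TLB flush.
	 */
	pmd[pde_idx] = gpa_of(pt) | PTE_PRESENT | PTE_RW;

	/*
	 * 3. Touch the remapped area at once.  Before this fix, KVM could
	 * link the stale shadow page here, and the access would hit the
	 * old translation instead of new_pfn.
	 */
	touch(((uint64_t)pde_idx << 21) | ((uint64_t)pte_idx << 12));
}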

Fixes: 4731d4c7 ("KVM: MMU: out of sync shadow core")
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210918005636.3675-3-jiangshanlai@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
arch/x86/kvm/mmu/mmu.c  +10 −7
@@ -2027,8 +2027,8 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
 	} while (!sp->unsync_children);
 }
 
-static void mmu_sync_children(struct kvm_vcpu *vcpu,
-			      struct kvm_mmu_page *parent)
+static int mmu_sync_children(struct kvm_vcpu *vcpu,
+			     struct kvm_mmu_page *parent, bool can_yield)
 {
 	int i;
 	struct kvm_mmu_page *sp;
@@ -2055,12 +2055,18 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 		}
 		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
 			kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+			if (!can_yield) {
+				kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+				return -EINTR;
+			}
+
 			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
 			flush = false;
 		}
 	}
 
 	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+	return 0;
 }
 
 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
@@ -2146,9 +2152,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 		}
 
-		if (sp->unsync_children)
-			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
-
 		__clear_sp_write_flooding_count(sp);
 
 trace_get_page:
@@ -3684,7 +3687,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 		write_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 
-		mmu_sync_children(vcpu, sp);
+		mmu_sync_children(vcpu, sp, true);
 
 		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 		write_unlock(&vcpu->kvm->mmu_lock);
@@ -3700,7 +3703,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 		if (IS_VALID_PAE_ROOT(root)) {
 			root &= PT64_BASE_ADDR_MASK;
 			sp = to_shadow_page(root);
-			mmu_sync_children(vcpu, sp);
+			mmu_sync_children(vcpu, sp, true);
 		}
 	}
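
Taken together, the new contract for mmu_sync_children() is: return 0
once every unsync child has been synchronized, and return -EINTR only
when can_yield is false and yielding mmu_lock would otherwise be
required.  A sketch of the two caller patterns, paraphrased from the
hunks above (not additional kernel source):

	/* Root-sync paths (kvm_mmu_sync_roots): free to yield and resched. */
	mmu_sync_children(vcpu, sp, true);	/* cannot fail */

	/* Page-fault fetch path: must not drop mmu_lock mid shadow walk,
	 * so it bails out and lets the fault be retried instead. */
	if (sp->unsync_children && mmu_sync_children(vcpu, sp, false))
		return RET_PF_RETRY;	/* got -EINTR; next #PF finishes */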

arch/x86/kvm/mmu/paging_tmpl.h  +21 −2
@@ -707,8 +707,27 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
 		if (!is_shadow_present_pte(*it.sptep)) {
 			table_gfn = gw->table_gfn[it.level - 2];
 			access = gw->pt_access[it.level - 2];
-			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
-					      false, access);
+			sp = kvm_mmu_get_page(vcpu, table_gfn, addr,
+					      it.level-1, false, access);
+			/*
+			 * We must synchronize the pagetable before linking it
+			 * because the guest doesn't need to flush tlb when
+			 * the gpte is changed from non-present to present.
+			 * Otherwise, the guest may use the wrong mapping.
+			 *
+			 * For PG_LEVEL_4K, kvm_mmu_get_page() has already
+			 * synchronized it transiently via kvm_sync_page().
+			 *
+			 * For higher level pagetable, we synchronize it via
+			 * the slower mmu_sync_children().  If it needs to
+			 * break, some progress has been made; return
+			 * RET_PF_RETRY and retry on the next #PF.
+			 * KVM_REQ_MMU_SYNC is not necessary but it
+			 * expedites the process.
+			 */
+			if (sp->unsync_children &&
+			    mmu_sync_children(vcpu, sp, false))
+				return RET_PF_RETRY;
 		}
 
 		/*
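
For context on the KVM_REQ_MMU_SYNC remark in the new comment: the
request raised when mmu_sync_children() bails out is serviced before
the next guest entry, so the interrupted sync still completes promptly.
Roughly, on the vcpu-run path (paraphrasing existing request handling
in vcpu_enter_guest(), not part of this diff):

	if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
		kvm_mmu_sync_roots(vcpu);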