Commit 6103bc07 authored by Ben Gardon, committed by Paolo Bonzini

KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock



To reduce lock contention and interference with page fault handlers,
allow the TDP MMU function to zap a GFN range to operate under the MMU
read lock.

Signed-off-by: Ben Gardon <bgardon@google.com>
Message-Id: <20210401233736.638171-10-bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent c0e64238
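
For orientation before the diff: the patch moves the bulk zap in kvm_zap_gfn_range() out from under the exclusive MMU lock and runs it under the read (shared) lock, clearing each SPTE with an atomic operation so concurrent read-lock holders stay safe. Below is a minimal, runnable userspace sketch of that pattern, assuming a plain rwlock and an array of entries; the names mmu_lock, sptes, zap_entry and zap_range are hypothetical stand-ins, not kernel code.

/*
 * Userspace model only: the lock, the entry array, and all names here are
 * hypothetical; this is not the kernel's data structures or API.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_ENTRIES 8

static pthread_rwlock_t mmu_lock = PTHREAD_RWLOCK_INITIALIZER;
static _Atomic unsigned long sptes[NR_ENTRIES];

/* Clear one entry atomically; retry if another thread changed it first. */
static bool zap_entry(int i)
{
	unsigned long old = atomic_load(&sptes[i]);

	while (old && !atomic_compare_exchange_weak(&sptes[i], &old, 0))
		;	/* 'old' now holds the current value; try again */

	return old != 0;	/* true if this thread cleared a live entry */
}

/* Loosely mirrors the new tail of kvm_zap_gfn_range(): zap under the read lock. */
static void zap_range(int start, int end)
{
	bool flush = false;

	pthread_rwlock_rdlock(&mmu_lock);	/* shared, not exclusive */
	for (int i = start; i < end; i++)
		flush |= zap_entry(i);
	if (flush)
		printf("would flush TLBs for [%d, %d)\n", start, end);
	pthread_rwlock_unlock(&mmu_lock);
}

int main(void)
{
	for (int i = 0; i < NR_ENTRIES; i++)
		atomic_store(&sptes[i], 0x1000 + i);

	zap_range(0, NR_ENTRIES);
	return 0;
}

In the kernel, the same shape appears in the diff below as read_lock(&kvm->mmu_lock) around kvm_tdp_mmu_zap_gfn_range(..., true), with tdp_mmu_zap_spte_atomic() playing the role of the compare-exchange.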
arch/x86/kvm/mmu/mmu.c  +15 −7
@@ -3121,7 +3121,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);

	if (is_tdp_mmu_page(sp))
-		kvm_tdp_mmu_put_root(kvm, sp);
+		kvm_tdp_mmu_put_root(kvm, sp, false);
	else if (!--sp->root_count && sp->role.invalid)
		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);

@@ -5496,16 +5496,24 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
		}
	}

+	if (flush)
+		kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+
+	write_unlock(&kvm->mmu_lock);
+
 	if (is_tdp_mmu_enabled(kvm)) {
 		flush = false;
 
+		read_lock(&kvm->mmu_lock);
 		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
 			flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
-							  gfn_end, flush);
-	}
-
-	if (flush)
-		kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
-
-	write_unlock(&kvm->mmu_lock);
+							  gfn_end, flush, true);
+		if (flush)
+			kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+							   gfn_end);
+
+		read_unlock(&kvm->mmu_lock);
+	}
}

static bool slot_rmap_write_protect(struct kvm *kvm,
arch/x86/kvm/mmu/tdp_mmu.c  +77 −33
@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

+static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+							     bool shared)
+{
+	if (shared)
+		lockdep_assert_held_read(&kvm->mmu_lock);
+	else
+		lockdep_assert_held_write(&kvm->mmu_lock);
+}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
@@ -42,7 +51,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-			  gfn_t start, gfn_t end, bool can_yield, bool flush);
+			  gfn_t start, gfn_t end, bool can_yield, bool flush,
+			  bool shared);

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
@@ -66,11 +76,12 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
	tdp_mmu_free_sp(sp);
}

-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+			  bool shared)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

-	lockdep_assert_held_write(&kvm->mmu_lock);
+	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;
@@ -81,7 +92,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

-	zap_gfn_range(kvm, root, 0, max_gfn, false, false);
+	zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
@@ -94,12 +105,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
 * function will return NULL.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
-					      struct kvm_mmu_page *prev_root)
+					      struct kvm_mmu_page *prev_root,
+					      bool shared)
{
	struct kvm_mmu_page *next_root;

-	lockdep_assert_held_write(&kvm->mmu_lock);
-
	rcu_read_lock();

	if (prev_root)
@@ -117,7 +127,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root);
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}
@@ -127,11 +137,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode. In the unlikely event that this thread must free a root, the lock
+ * will be temporarily dropped and reacquired in write mode.
 */
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)	\
-	for (_root = tdp_mmu_next_root(_kvm, NULL);		\
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
+	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
 	     _root;							\
-	     _root = tdp_mmu_next_root(_kvm, _root))		\
+	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

@@ -636,7 +650,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
-					     struct tdp_iter *iter, bool flush)
+					     struct tdp_iter *iter, bool flush,
+					     bool shared)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
@@ -648,7 +663,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
		if (flush)
			kvm_flush_remote_tlbs(kvm);

-		cond_resched_rwlock_write(&kvm->mmu_lock);
+		if (shared)
+			cond_resched_rwlock_read(&kvm->mmu_lock);
+		else
+			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);
@@ -666,24 +685,32 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
- * operation can cause a soft lockup.  Note, in some use cases a flush may be
- * required by prior actions.  Ensure the pending flush is performed prior to
- * yielding.
+ * operation can cause a soft lockup.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-			  gfn_t start, gfn_t end, bool can_yield, bool flush)
+			  gfn_t start, gfn_t end, bool can_yield, bool flush,
+			  bool shared)
{
	struct tdp_iter iter;

+	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
+retry:
		if (can_yield &&
-		    tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
			flush = false;
			continue;
		}
@@ -701,8 +728,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

-		tdp_mmu_set_spte(kvm, &iter, 0);
-		flush = true;
+		if (!shared) {
+			tdp_mmu_set_spte(kvm, &iter, 0);
+			flush = true;
+		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+			/*
+			 * The iter must explicitly re-read the SPTE because
+			 * the atomic cmpxchg failed.
+			 */
+			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+			goto retry;
+		}
	}

	rcu_read_unlock();
@@ -714,14 +750,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU in write mode.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
-				 gfn_t end, bool can_yield, bool flush)
+				 gfn_t end, bool can_yield, bool flush,
+				 bool shared)
{
	struct kvm_mmu_page *root;

-	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
-		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
+	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
+		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
+				      shared);

	return flush;
}
@@ -733,7 +776,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
-		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, flush);
+		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
+						  flush, false);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
@@ -892,7 +936,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,

	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
		flush |= zap_gfn_range(kvm, root, range->start, range->end,
-				       range->may_block, flush);
+				       range->may_block, flush, false);

	return flush;
}
@@ -1038,7 +1082,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
-		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
@@ -1067,7 +1111,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
	struct kvm_mmu_page *root;
	bool spte_set = false;

-	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

@@ -1091,7 +1135,7 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
-		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
@@ -1126,7 +1170,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
	struct kvm_mmu_page *root;
	bool spte_set = false;

-	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);

@@ -1213,7 +1257,7 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
-		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}
@@ -1248,7 +1292,7 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
{
	struct kvm_mmu_page *root;

-	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false)
		flush = zap_collapsible_spte_range(kvm, root, slot, flush);

	return flush;
arch/x86/kvm/mmu/tdp_mmu.h  +9 −5
@@ -13,14 +13,18 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
	return refcount_inc_not_zero(&root->tdp_mmu_root_count);
}

-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+			  bool shared);

bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
-				 gfn_t end, bool can_yield, bool flush);
+				 gfn_t end, bool can_yield, bool flush,
+				 bool shared);
static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
-					     gfn_t start, gfn_t end, bool flush)
+					     gfn_t start, gfn_t end, bool flush,
+					     bool shared)
{
-	return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush);
+	return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush,
+					   shared);
}
static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
@@ -37,7 +41,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
	 */
	lockdep_assert_held_write(&kvm->mmu_lock);
	return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
-					   sp->gfn, end, false, false);
+					   sp->gfn, end, false, false, false);
}
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
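
As a closing illustration of the "shared" plumbing this patch threads through the TDP MMU helpers, here is a small runnable userspace analogue: one flag selects read versus write acquisition (in the spirit of kvm_lockdep_assert_mmu_lock_held()) and the yield path re-takes the lock in the same mode it was held (in the spirit of the new cond_resched_rwlock_read()/cond_resched_rwlock_write() branch in tdp_mmu_iter_cond_resched()). All names here are hypothetical; this is a sketch built on pthreads, not the kernel API.

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t mmu_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Take the lock in whichever mode the caller asked for. */
static void mmu_lock_acquire(bool shared)
{
	if (shared)
		pthread_rwlock_rdlock(&mmu_lock);
	else
		pthread_rwlock_wrlock(&mmu_lock);
}

/*
 * Drop the lock, let other threads run, then re-take it in the same
 * mode it was held; a userspace stand-in for conditional rescheduling.
 */
static void cond_resched_mmu_lock(bool shared)
{
	pthread_rwlock_unlock(&mmu_lock);
	sched_yield();
	mmu_lock_acquire(shared);
}

static void walk_range(int start, int end, bool shared)
{
	mmu_lock_acquire(shared);
	for (int i = start; i < end; i++) {
		if (i && i % 4 == 0)	/* pretend the scheduler needs us */
			cond_resched_mmu_lock(shared);
		/* ... process entry i ... */
	}
	pthread_rwlock_unlock(&mmu_lock);

	printf("walked [%d, %d) %s\n", start, end,
	       shared ? "under the read lock" : "under the write lock");
}

int main(void)
{
	walk_range(0, 16, true);	/* shared: like the new zap path */
	walk_range(0, 16, false);	/* exclusive: like the legacy callers */
	return 0;
}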