Commit 6103bc07 authored by Ben Gardon, committed by Paolo Bonzini

KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock



To reduce lock contention and interference with page fault handlers,
allow the TDP MMU function to zap a GFN range to operate under the MMU
read lock.

Signed-off-by: Ben Gardon <bgardon@google.com>
Message-Id: <20210401233736.638171-10-bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent c0e64238
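
For orientation before the diff: the patch moves the bulk zap in kvm_zap_gfn_range() out from under the exclusive MMU lock and runs it under the read (shared) lock, clearing each SPTE with an atomic operation so concurrent read-lock holders stay safe. Below is a minimal, runnable userspace sketch of that pattern, assuming a plain rwlock and an array of entries; the names mmu_lock, sptes, zap_entry and zap_range are hypothetical stand-ins, not kernel code.

/*
 * Userspace model only: the lock, the entry array, and all names here are
 * hypothetical; this is not the kernel's data structures or API.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_ENTRIES 8

static pthread_rwlock_t mmu_lock = PTHREAD_RWLOCK_INITIALIZER;
static _Atomic unsigned long sptes[NR_ENTRIES];

/* Clear one entry atomically; retry if another thread changed it first. */
static bool zap_entry(int i)
{
	unsigned long old = atomic_load(&sptes[i]);

	while (old && !atomic_compare_exchange_weak(&sptes[i], &old, 0))
		;	/* 'old' now holds the current value; try again */

	return old != 0;	/* true if this thread cleared a live entry */
}

/* Loosely mirrors the new tail of kvm_zap_gfn_range(): zap under the read lock. */
static void zap_range(int start, int end)
{
	bool flush = false;

	pthread_rwlock_rdlock(&mmu_lock);	/* shared, not exclusive */
	for (int i = start; i < end; i++)
		flush |= zap_entry(i);
	if (flush)
		printf("would flush TLBs for [%d, %d)\n", start, end);
	pthread_rwlock_unlock(&mmu_lock);
}

int main(void)
{
	for (int i = 0; i < NR_ENTRIES; i++)
		atomic_store(&sptes[i], 0x1000 + i);

	zap_range(0, NR_ENTRIES);
	return 0;
}

In the kernel, the same shape appears in the diff below as read_lock(&kvm->mmu_lock) around kvm_tdp_mmu_zap_gfn_range(..., true), with tdp_mmu_zap_spte_atomic() playing the role of the compare-exchange.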
arch/x86/kvm/mmu/mmu.c  +15 −7
@@ -3121,7 +3121,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);

	if (is_tdp_mmu_page(sp))
-		kvm_tdp_mmu_put_root(kvm, sp);
+		kvm_tdp_mmu_put_root(kvm, sp, false);
	else if (!--sp->root_count && sp->role.invalid)
		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);

@@ -5496,16 +5496,24 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
		}
	}

+	if (flush)
+		kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+
+	write_unlock(&kvm->mmu_lock);
+
 	if (is_tdp_mmu_enabled(kvm)) {
 		flush = false;
 
+		read_lock(&kvm->mmu_lock);
 		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
 			flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
-							  gfn_end, flush);
-	}
-
-	if (flush)
-		kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
-
-	write_unlock(&kvm->mmu_lock);
+							  gfn_end, flush, true);
+		if (flush)
+			kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+							   gfn_end);
+
+		read_unlock(&kvm->mmu_lock);
+	}
}

static bool slot_rmap_write_protect(struct kvm *kvm,
arch/x86/kvm/mmu/tdp_mmu.c  +77 −33
@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

+static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+							     bool shared)
+{
+	if (shared)
+		lockdep_assert_held_read(&kvm->mmu_lock);
+	else
+		lockdep_assert_held_write(&kvm->mmu_lock);
+}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
@@ -42,7 +51,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-			  gfn_t start, gfn_t end, bool can_yield, bool flush);
+			  gfn_t start, gfn_t end, bool can_yield, bool flush,
+			  bool shared);

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
@@ -66,11 +76,12 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
	tdp_mmu_free_sp(sp);
}

-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+			  bool shared)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

-	lockdep_assert_held_write(&kvm->mmu_lock);
+	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;
@@ -81,7 +92,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

-	zap_gfn_range(kvm, root, 0, max_gfn, false, false);
+	zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
@@ -94,12 +105,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
 * function will return NULL.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
-					      struct kvm_mmu_page *prev_root)
+					      struct kvm_mmu_page *prev_root,
+					      bool shared)
{
	struct kvm_mmu_page *next_root;

-	lockdep_assert_held_write(&kvm->mmu_lock);
-
	rcu_read_lock();

	if (prev_root)
@@ -117,7 +127,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root);
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}
@@ -127,11 +137,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode. In the unlikely event that this thread must free a root, the lock
+ * will be temporarily dropped and reacquired in write mode.
 */
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)	\
-	for (_root = tdp_mmu_next_root(_kvm, NULL);		\
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
+	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
 	     _root;							\
-	     _root = tdp_mmu_next_root(_kvm, _root))		\
+	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

@@ -636,7 +650,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
-					     struct tdp_iter *iter, bool flush)
+					     struct tdp_iter *iter, bool flush,
+					     bool shared)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
@@ -648,7 +663,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
		if (flush)
			kvm_flush_remote_tlbs(kvm);

-		cond_resched_rwlock_write(&kvm->mmu_lock);
+		if (shared)
+			cond_resched_rwlock_read(&kvm->mmu_lock);
+		else
+			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);
@@ -666,24 +685,32 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
- * operation can cause a soft lockup.  Note, in some use cases a flush may be
- * required by prior actions.  Ensure the pending flush is performed prior to
- * yielding.
+ * operation can cause a soft lockup.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-			  gfn_t start, gfn_t end, bool can_yield, bool flush)
+			  gfn_t start, gfn_t end, bool can_yield, bool flush,
+			  bool shared)
{
	struct tdp_iter iter;

+	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
+retry:
		if (can_yield &&
-		    tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
			flush = false;
			continue;
		}
@@ -701,8 +728,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

-		tdp_mmu_set_spte(kvm, &iter, 0);
-		flush = true;
+		if (!shared) {
+			tdp_mmu_set_spte(kvm, &iter, 0);
+			flush = true;
+		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+			/*
+			 * The iter must explicitly re-read the SPTE because
+			 * the atomic cmpxchg failed.
+			 */
+			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+			goto retry;
+		}
	}

	rcu_read_unlock();
@@ -714,14 +750,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU in write mode.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
-				 gfn_t end, bool can_yield, bool flush)
+				 gfn_t end, bool can_yield, bool flush,
+				 bool shared)
{
	struct kvm_mmu_page *root;

-	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
-		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
+	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
+		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
+				      shared);

	return flush;
}
@@ -733,7 +776,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
-		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, flush);
+		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
+						  flush, false);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
@@ -892,7 +936,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,

	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
		flush |= zap_gfn_range(kvm, root, range->start, range->end,
-				       range->may_block, flush);
+				       range->may_block, flush, false);

	return flush;
}
@@ -1038,7 +1082,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
-		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
@@ -1067,7 +1111,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
	struct kvm_mmu_page *root;
	bool spte_set = false;

-	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

@@ -1091,7 +1135,7 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
-		if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
@@ -1126,7 +1170,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
	struct kvm_mmu_page *root;
	bool spte_set = false;

-	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				slot->base_gfn + slot->npages);

@@ -1213,7 +1257,7 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
-		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}
@@ -1248,7 +1292,7 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
{
	struct kvm_mmu_page *root;

-	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false)
		flush = zap_collapsible_spte_range(kvm, root, slot, flush);

	return flush;
arch/x86/kvm/mmu/tdp_mmu.h  +9 −5
@@ -13,14 +13,18 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
	return refcount_inc_not_zero(&root->tdp_mmu_root_count);
}

-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+			  bool shared);

bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
-				 gfn_t end, bool can_yield, bool flush);
+				 gfn_t end, bool can_yield, bool flush,
+				 bool shared);
static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
-					     gfn_t start, gfn_t end, bool flush)
+					     gfn_t start, gfn_t end, bool flush,
+					     bool shared)
{
-	return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush);
+	return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush,
+					   shared);
}
static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
@@ -37,7 +41,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
	 */
	lockdep_assert_held_write(&kvm->mmu_lock);
	return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
-					   sp->gfn, end, false, false);
+					   sp->gfn, end, false, false, false);
}
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
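
As a closing illustration of the "shared" plumbing this patch threads through the TDP MMU helpers, here is a small runnable userspace analogue: one flag selects read versus write acquisition (in the spirit of kvm_lockdep_assert_mmu_lock_held()) and the yield path re-takes the lock in the same mode it was held (in the spirit of the new cond_resched_rwlock_read()/cond_resched_rwlock_write() branch in tdp_mmu_iter_cond_resched()). All names here are hypothetical; this is a sketch built on pthreads, not the kernel API.

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t mmu_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Take the lock in whichever mode the caller asked for. */
static void mmu_lock_acquire(bool shared)
{
	if (shared)
		pthread_rwlock_rdlock(&mmu_lock);
	else
		pthread_rwlock_wrlock(&mmu_lock);
}

/*
 * Drop the lock, let other threads run, then re-take it in the same
 * mode it was held; a userspace stand-in for conditional rescheduling.
 */
static void cond_resched_mmu_lock(bool shared)
{
	pthread_rwlock_unlock(&mmu_lock);
	sched_yield();
	mmu_lock_acquire(shared);
}

static void walk_range(int start, int end, bool shared)
{
	mmu_lock_acquire(shared);
	for (int i = start; i < end; i++) {
		if (i && i % 4 == 0)	/* pretend the scheduler needs us */
			cond_resched_mmu_lock(shared);
		/* ... process entry i ... */
	}
	pthread_rwlock_unlock(&mmu_lock);

	printf("walked [%d, %d) %s\n", start, end,
	       shared ? "under the read lock" : "under the write lock");
}

int main(void)
{
	walk_range(0, 16, true);	/* shared: like the new zap path */
	walk_range(0, 16, false);	/* exclusive: like the legacy callers */
	return 0;
}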