KVM: Block memslot updates across range_start() and range_end() (52ac8b35) · Commits · EulixOS / Software / Kernel

Documentation/virt/kvm/locking.rst

+6 −0

Original line number	Diff line number	Diff line
		@@ -21,6 +21,12 @@ The acquisition orders for mutexes are as follows:
		can be taken inside a kvm->srcu read-side critical section,
		while kvm->slots_lock cannot.

		- kvm->mn_active_invalidate_count ensures that pairs of
		invalidate_range_start() and invalidate_range_end() callbacks
		use the same memslots array. kvm->slots_lock and kvm->slots_arch_lock
		are taken on the waiting side in install_new_memslots, so MMU notifiers
		must not take either kvm->slots_lock or kvm->slots_arch_lock.

		On x86:

		- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock

include/linux/kvm_host.h

+5 −0

Original line number	Diff line number	Diff line
		@@ -548,6 +548,11 @@ struct kvm {
		struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
		struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];

		/* Used to wait for completion of MMU notifiers. */
		spinlock_t mn_invalidate_lock;
		unsigned long mn_active_invalidate_count;
		struct rcuwait mn_memslots_update_rcuwait;

		/*
		* created_vcpus is protected by kvm->lock, and is incremented
		* at the beginning of KVM_CREATE_VCPU. online_vcpus is only

virt/kvm/kvm_main.c

+54 −4

Original line number	Diff line number	Diff line
		@@ -604,11 +604,9 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
		trace_kvm_set_spte_hva(address);

		/*
		* .change_pte() must be surrounded by .invalidate_range_{start,end}(),
		* and so always runs with an elevated notifier count. This obviates
		* the need to bump the sequence count.
		* .change_pte() must be surrounded by .invalidate_range_{start,end}().
		*/
		WARN_ON_ONCE(!kvm->mmu_notifier_count);
		WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));

		kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
		}
		@@ -658,6 +656,18 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,

		trace_kvm_unmap_hva_range(range->start, range->end);

		/*
		* Prevent memslot modification between range_start() and range_end()
		* so that conditionally locking provides the same result in both
		* functions. Without that guarantee, the mmu_notifier_count
		* adjustments will be imbalanced.
		*
		* Pairs with the decrement in range_end().
		*/
		spin_lock(&kvm->mn_invalidate_lock);
		kvm->mn_active_invalidate_count++;
		spin_unlock(&kvm->mn_invalidate_lock);

		__kvm_handle_hva_range(kvm, &hva_range);

		return 0;
		@@ -694,9 +704,22 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
		.flush_on_ret = false,
		.may_block = mmu_notifier_range_blockable(range),
		};
		bool wake;

		__kvm_handle_hva_range(kvm, &hva_range);

		/* Pairs with the increment in range_start(). */
		spin_lock(&kvm->mn_invalidate_lock);
		wake = (--kvm->mn_active_invalidate_count == 0);
		spin_unlock(&kvm->mn_invalidate_lock);

		/*
		* There can only be one waiter, since the wait happens under
		* slots_lock.
		*/
		if (wake)
		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);

		BUG_ON(kvm->mmu_notifier_count < 0);
		}

		@@ -977,6 +1000,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
		mutex_init(&kvm->irq_lock);
		mutex_init(&kvm->slots_lock);
		mutex_init(&kvm->slots_arch_lock);
		spin_lock_init(&kvm->mn_invalidate_lock);
		rcuwait_init(&kvm->mn_memslots_update_rcuwait);

		INIT_LIST_HEAD(&kvm->devices);

		BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
		@@ -1099,6 +1125,16 @@ static void kvm_destroy_vm(struct kvm *kvm)
		kvm_coalesced_mmio_free(kvm);
		#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
		mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
		/*
		* At this point, pending calls to invalidate_range_start()
		* have completed but no more MMU notifiers will run, so
		* mn_active_invalidate_count may remain unbalanced.
		* No threads can be waiting in install_new_memslots as the
		* last reference on KVM has been dropped, but freeing
		* memslots would deadlock without this manual intervention.
		*/
		WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
		kvm->mn_active_invalidate_count = 0;
		#else
		kvm_arch_flush_shadow_all(kvm);
		#endif
		@@ -1360,7 +1396,21 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
		WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
		slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

		/*
		* Do not store the new memslots while there are invalidations in
		* progress (preparatory change for the next commit).
		*/
		spin_lock(&kvm->mn_invalidate_lock);
		prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
		while (kvm->mn_active_invalidate_count) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		spin_unlock(&kvm->mn_invalidate_lock);
		schedule();
		spin_lock(&kvm->mn_invalidate_lock);
		}
		finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
		rcu_assign_pointer(kvm->memslots[as_id], slots);
		spin_unlock(&kvm->mn_invalidate_lock);

		/*
		* Acquired in kvm_set_memslot. Must be released before synchronize