Commit fe8e3f44 authored by Marc Zyngier

Merge branch kvm-arm64/parallel-faults into kvmarm-master/next



* kvm-arm64/parallel-faults:
  : .
  : Parallel stage-2 fault handling, courtesy of Oliver Upton.
  : From the cover letter:
  :
  : "Presently KVM only takes a read lock for stage 2 faults if it believes
  : the fault can be fixed by relaxing permissions on a PTE (write unprotect
  : for dirty logging). Otherwise, stage 2 faults grab the write lock, which
  : predictably can pile up all the vCPUs in a sufficiently large VM.
  :
  : Like the TDP MMU for x86, this series loosens the locking around
  : manipulations of the stage 2 page tables to allow parallel faults. RCU
  : and atomics are exploited to safely build/destroy the stage 2 page
  : tables in light of multiple software observers."
  : .
  KVM: arm64: Reject shared table walks in the hyp code
  KVM: arm64: Don't acquire RCU read lock for exclusive table walks
  KVM: arm64: Take a pointer to walker data in kvm_dereference_pteref()
  KVM: arm64: Handle stage-2 faults in parallel
  KVM: arm64: Make table->block changes parallel-aware
  KVM: arm64: Make leaf->leaf PTE changes parallel-aware
  KVM: arm64: Make block->table PTE changes parallel-aware
  KVM: arm64: Split init and set for table PTE
  KVM: arm64: Atomically update stage 2 leaf attributes in parallel walks
  KVM: arm64: Protect stage-2 traversal with RCU
  KVM: arm64: Tear down unlinked stage-2 subtree after break-before-make
  KVM: arm64: Use an opaque type for pteps
  KVM: arm64: Add a helper to tear down unlinked stage-2 subtrees
  KVM: arm64: Don't pass kvm_pgtable through kvm_pgtable_walk_data
  KVM: arm64: Pass mm_ops through the visitor context
  KVM: arm64: Stash observed pte value in visitor context
  KVM: arm64: Combine visitor arguments into a context structure

Signed-off-by: Marc Zyngier <maz@kernel.org>
parents a937f37d 5e806c58
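
[Editor's note] The cover letter's "RCU and atomics" recipe reduces to two rules: readers traverse the tables under rcu_read_lock(), and writers only publish or retire an entry with an atomic compare-and-swap, replaying the fault if they lose. As a toy illustration of the second rule, here is a minimal userspace model (plain C11 atomics and pthreads, not kernel code; all names are invented for the demo) of two vCPUs racing to handle a fault on the same stage-2 PTE:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PTE_VALID (1ULL << 0)

static _Atomic uint64_t pte;	/* one simulated stage-2 PTE, initially invalid */

static void *vcpu_fault(void *arg)
{
	long id = (long)arg;
	uint64_t expected = 0;	/* only install over an empty entry */
	uint64_t new = ((uint64_t)id << 12) | PTE_VALID;

	/* cmpxchg: exactly one faulting "vCPU" wins the install; the loser
	 * sees the winner's value in expected and would simply replay the
	 * fault, much like the -EAGAIN dance in the real fault path. */
	if (atomic_compare_exchange_strong(&pte, &expected, new))
		printf("vcpu %ld installed %#llx\n", id, (unsigned long long)new);
	else
		printf("vcpu %ld lost to %#llx, would retry\n",
		       id, (unsigned long long)expected);
	return NULL;
}

int main(void)
{
	pthread_t t[2];

	for (long i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, vcpu_fault, (void *)(i + 1));
	for (int i = 0; i < 2; i++)
		pthread_join(t[i], NULL);
	return 0;
}

Either thread may win, but the PTE is written exactly once; no write lock is needed to keep the loser from clobbering it.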
arch/arm64/include/asm/kvm_pgtable.h (+125 −28)
@@ -85,6 +85,8 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
  *				allocation is physically contiguous.
  * @free_pages_exact:		Free an exact number of memory pages previously
  *				allocated by zalloc_pages_exact.
+ * @free_removed_table:	Free a removed paging structure by unlinking and
+ *				dropping references.
  * @get_page:			Increment the refcount on a page.
  * @put_page:			Decrement the refcount on a page. When the
  *				refcount reaches 0 the page is automatically
@@ -103,6 +105,7 @@ struct kvm_pgtable_mm_ops {
 	void*		(*zalloc_page)(void *arg);
 	void*		(*zalloc_pages_exact)(size_t size);
 	void		(*free_pages_exact)(void *addr, size_t size);
+	void		(*free_removed_table)(void *addr, u32 level);
 	void		(*get_page)(void *addr);
 	void		(*put_page)(void *addr);
 	int		(*page_count)(void *addr);
@@ -161,29 +164,6 @@ enum kvm_pgtable_prot {
 typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
 					   enum kvm_pgtable_prot prot);
 
-/**
- * struct kvm_pgtable - KVM page-table.
- * @ia_bits:		Maximum input address size, in bits.
- * @start_level:	Level at which the page-table walk starts.
- * @pgd:		Pointer to the first top-level entry of the page-table.
- * @mm_ops:		Memory management callbacks.
- * @mmu:		Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
- * @flags:		Stage-2 page-table flags.
- * @force_pte_cb:	Function that returns true if page level mappings must
- *			be used instead of block mappings.
- */
-struct kvm_pgtable {
-	u32					ia_bits;
-	u32					start_level;
-	kvm_pte_t				*pgd;
-	struct kvm_pgtable_mm_ops		*mm_ops;
-
-	/* Stage-2 only */
-	struct kvm_s2_mmu			*mmu;
-	enum kvm_pgtable_stage2_flags		flags;
-	kvm_pgtable_force_pte_cb_t		force_pte_cb;
-};
-
 /**
  * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
  * @KVM_PGTABLE_WALK_LEAF:		Visit leaf entries, including invalid
@@ -192,17 +172,34 @@ struct kvm_pgtable {
  *					children.
  * @KVM_PGTABLE_WALK_TABLE_POST:	Visit table entries after their
  *					children.
+ * @KVM_PGTABLE_WALK_SHARED:		Indicates the page-tables may be shared
+ *					with other software walkers.
  */
 enum kvm_pgtable_walk_flags {
 	KVM_PGTABLE_WALK_LEAF			= BIT(0),
 	KVM_PGTABLE_WALK_TABLE_PRE		= BIT(1),
 	KVM_PGTABLE_WALK_TABLE_POST		= BIT(2),
+	KVM_PGTABLE_WALK_SHARED			= BIT(3),
 };
 
-typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level,
-					kvm_pte_t *ptep,
-					enum kvm_pgtable_walk_flags flag,
-					void * const arg);
+struct kvm_pgtable_visit_ctx {
+	kvm_pte_t				*ptep;
+	kvm_pte_t				old;
+	void					*arg;
+	struct kvm_pgtable_mm_ops		*mm_ops;
+	u64					addr;
+	u64					end;
+	u32					level;
+	enum kvm_pgtable_walk_flags		flags;
+};
+
+typedef int (*kvm_pgtable_visitor_fn_t)(const struct kvm_pgtable_visit_ctx *ctx,
+					enum kvm_pgtable_walk_flags visit);
+
+static inline bool kvm_pgtable_walk_shared(const struct kvm_pgtable_visit_ctx *ctx)
+{
+	return ctx->flags & KVM_PGTABLE_WALK_SHARED;
+}
 
 /**
  * struct kvm_pgtable_walker - Hook into a page-table walk.
@@ -217,6 +214,94 @@ struct kvm_pgtable_walker {
 	const enum kvm_pgtable_walk_flags	flags;
 };
 
+/*
+ * RCU cannot be used in a non-kernel context such as the hyp. As such, page
+ * table walkers used in hyp do not call into RCU and instead use other
+ * synchronization mechanisms (such as a spinlock).
+ */
+#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__)
+
+typedef kvm_pte_t *kvm_pteref_t;
+
+static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker,
+						kvm_pteref_t pteref)
+{
+	return pteref;
+}
+
+static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
+{
+	/*
+	 * Due to the lack of RCU (or a similar protection scheme), only
+	 * non-shared table walkers are allowed in the hypervisor.
+	 */
+	if (walker->flags & KVM_PGTABLE_WALK_SHARED)
+		return -EPERM;
+
+	return 0;
+}
+
+static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) {}
+
+static inline bool kvm_pgtable_walk_lock_held(void)
+{
+	return true;
+}
+
+#else
+
+typedef kvm_pte_t __rcu *kvm_pteref_t;
+
+static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker,
+						kvm_pteref_t pteref)
+{
+	return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED));
+}
+
+static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
+{
+	if (walker->flags & KVM_PGTABLE_WALK_SHARED)
+		rcu_read_lock();
+
+	return 0;
+}
+
+static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker)
+{
+	if (walker->flags & KVM_PGTABLE_WALK_SHARED)
+		rcu_read_unlock();
+}
+
+static inline bool kvm_pgtable_walk_lock_held(void)
+{
+	return rcu_read_lock_held();
+}
+
+#endif
+
+/**
+ * struct kvm_pgtable - KVM page-table.
+ * @ia_bits:		Maximum input address size, in bits.
+ * @start_level:	Level at which the page-table walk starts.
+ * @pgd:		Pointer to the first top-level entry of the page-table.
+ * @mm_ops:		Memory management callbacks.
+ * @mmu:		Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
+ * @flags:		Stage-2 page-table flags.
+ * @force_pte_cb:	Function that returns true if page level mappings must
+ *			be used instead of block mappings.
+ */
+struct kvm_pgtable {
+	u32					ia_bits;
+	u32					start_level;
+	kvm_pteref_t				pgd;
+	struct kvm_pgtable_mm_ops		*mm_ops;
+
+	/* Stage-2 only */
+	struct kvm_s2_mmu			*mmu;
+	enum kvm_pgtable_stage2_flags		flags;
+	kvm_pgtable_force_pte_cb_t		force_pte_cb;
+};
+
 /**
  * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
  * @pgt:	Uninitialised page-table structure to initialise.
@@ -324,6 +409,17 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
  */
 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
 
+/**
+ * kvm_pgtable_stage2_free_removed() - Free a removed stage-2 paging structure.
+ * @mm_ops:	Memory management callbacks.
+ * @pgtable:	Unlinked stage-2 paging structure to be freed.
+ * @level:	Level of the stage-2 paging structure to be freed.
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior to
+ * freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level);
+
 /**
  * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
  * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init*().
@@ -333,6 +429,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
  * @prot:	Permissions and attributes for the mapping.
  * @mc:		Cache of pre-allocated and zeroed memory from which to allocate
  *		page-table pages.
+ * @flags:	Flags to control the page-table walk (ex. a shared walk)
  *
  * The offset of @addr within a page is ignored, @size is rounded-up to
  * the next page boundary and @phys is rounded-down to the previous page
@@ -354,7 +451,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
  */
 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
 			   u64 phys, enum kvm_pgtable_prot prot,
-			   void *mc);
+			   void *mc, enum kvm_pgtable_walk_flags flags);
 
 /**
  * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to
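
[Editor's note] To see how the declarations above fit together, here is a sketch (not from the patch: count_valid_leaf() and count_valid_mappings() are invented names written against the new API, while kvm_pgtable_walk() and kvm_pte_valid() are existing interfaces from this header) of a shared walk that counts valid leaf mappings:

static int count_valid_leaf(const struct kvm_pgtable_visit_ctx *ctx,
			    enum kvm_pgtable_walk_flags visit)
{
	u64 *nr = ctx->arg;

	/* ctx->old is the PTE value snapshotted by the walker; under a
	 * shared walk, *ctx->ptep may already have changed behind it. */
	if (kvm_pte_valid(ctx->old))
		(*nr)++;

	return 0;
}

static u64 count_valid_mappings(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	u64 nr = 0;
	struct kvm_pgtable_walker walker = {
		.cb	= count_valid_leaf,
		.arg	= &nr,
		/* KVM_PGTABLE_WALK_SHARED makes kvm_pgtable_walk_begin()
		 * take rcu_read_lock(); at hyp it instead fails the walk
		 * with -EPERM, since there is no RCU there. */
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_SHARED,
	};

	kvm_pgtable_walk(pgt, addr, size, &walker);	/* error ignored for brevity */
	return nr;
}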
arch/arm64/kvm/hyp/nvhe/mem_protect.c (+12 −9)
@@ -79,6 +79,11 @@ static void host_s2_put_page(void *addr)
 	hyp_put_page(&host_s2_pool, addr);
 }
 
+static void host_s2_free_removed_table(void *addr, u32 level)
+{
+	kvm_pgtable_stage2_free_removed(&host_kvm.mm_ops, addr, level);
+}
+
 static int prepare_s2_pool(void *pgt_pool_base)
 {
 	unsigned long nr_pages, pfn;
@@ -93,6 +98,7 @@ static int prepare_s2_pool(void *pgt_pool_base)
 	host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) {
 		.zalloc_pages_exact = host_s2_zalloc_pages_exact,
 		.zalloc_page = host_s2_zalloc_page,
+		.free_removed_table = host_s2_free_removed_table,
 		.phys_to_virt = hyp_phys_to_virt,
 		.virt_to_phys = hyp_virt_to_phys,
 		.page_count = hyp_page_count,
@@ -251,7 +257,7 @@ static inline int __host_stage2_idmap(u64 start, u64 end,
 				      enum kvm_pgtable_prot prot)
 {
 	return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start,
-				      prot, &host_s2_pool);
+				      prot, &host_s2_pool, 0);
 }
 
 /*
@@ -417,18 +423,15 @@ struct check_walk_data {
 	enum pkvm_page_state	(*get_page_state)(kvm_pte_t pte);
 };
 
-static int __check_page_state_visitor(u64 addr, u64 end, u32 level,
-				      kvm_pte_t *ptep,
-				      enum kvm_pgtable_walk_flags flag,
-				      void * const arg)
+static int __check_page_state_visitor(const struct kvm_pgtable_visit_ctx *ctx,
+				      enum kvm_pgtable_walk_flags visit)
 {
-	struct check_walk_data *d = arg;
-	kvm_pte_t pte = *ptep;
+	struct check_walk_data *d = ctx->arg;
 
-	if (kvm_pte_valid(pte) && !addr_is_memory(kvm_pte_to_phys(pte)))
+	if (kvm_pte_valid(ctx->old) && !addr_is_memory(kvm_pte_to_phys(ctx->old)))
 		return -EINVAL;
 
-	return d->get_page_state(pte) == d->desired ? 0 : -EPERM;
+	return d->get_page_state(ctx->old) == d->desired ? 0 : -EPERM;
 }
 
 static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size,
arch/arm64/kvm/hyp/nvhe/setup.c (+9 −13)
@@ -186,18 +186,15 @@ static void hpool_put_page(void *addr)
 	hyp_put_page(&hpool, addr);
 }
 
-static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
-					 kvm_pte_t *ptep,
-					 enum kvm_pgtable_walk_flags flag,
-					 void * const arg)
+static int finalize_host_mappings_walker(const struct kvm_pgtable_visit_ctx *ctx,
+					 enum kvm_pgtable_walk_flags visit)
 {
-	struct kvm_pgtable_mm_ops *mm_ops = arg;
+	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
 	enum kvm_pgtable_prot prot;
 	enum pkvm_page_state state;
-	kvm_pte_t pte = *ptep;
 	phys_addr_t phys;
 
-	if (!kvm_pte_valid(pte))
+	if (!kvm_pte_valid(ctx->old))
 		return 0;
 
 	/*
@@ -205,14 +202,14 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
 	 * was unable to access the hyp_vmemmap and so the buddy allocator has
 	 * initialised the refcount to '1'.
 	 */
-	mm_ops->get_page(ptep);
-	if (flag != KVM_PGTABLE_WALK_LEAF)
+	mm_ops->get_page(ctx->ptep);
+	if (visit != KVM_PGTABLE_WALK_LEAF)
 		return 0;
 
-	if (level != (KVM_PGTABLE_MAX_LEVELS - 1))
+	if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1))
 		return -EINVAL;
 
-	phys = kvm_pte_to_phys(pte);
+	phys = kvm_pte_to_phys(ctx->old);
 	if (!addr_is_memory(phys))
 		return -EINVAL;
 
@@ -220,7 +217,7 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
 	 * Adjust the host stage-2 mappings to match the ownership attributes
 	 * configured in the hypervisor stage-1.
 	 */
-	state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte));
+	state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old));
 	switch (state) {
 	case PKVM_PAGE_OWNED:
 		return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id);
@@ -242,7 +239,6 @@ static int finalize_host_mappings(void)
 	struct kvm_pgtable_walker walker = {
 		.cb	= finalize_host_mappings_walker,
 		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
-		.arg	= pkvm_pgtable.mm_ops,
 	};
 	int i, ret;
 
arch/arm64/kvm/hyp/pgtable.c (+334 −297)

[diff collapsed by the viewer: preview size limit exceeded]
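
[Editor's note] Since the pgtable.c diff is collapsed above, the break-before-make sequence it implements (per the "Make block->table PTE changes parallel-aware" and "Tear down unlinked stage-2 subtree after break-before-make" patches) can only be gestured at here. Below is a toy userspace model of the idea in C11 atomics, with invented names; in the kernel, TLB invalidation and the RCU-deferred free of the old subtree sit between the break and the make:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PTE_VALID	(1ULL << 0)
#define PTE_LOCKED	(~0ULL)	/* stand-in for an invalid "locked" marker */

static _Atomic uint64_t pte = 0x1000ULL | PTE_VALID;

/* "Break": atomically replace the live entry with the locked marker. Only
 * the cmpxchg winner may rebuild the entry; concurrent shared walkers see
 * either the old entry or the invalid marker, never a half-built one. */
static bool try_break_pte(uint64_t *old)
{
	*old = atomic_load(&pte);
	return atomic_compare_exchange_strong(&pte, old, PTE_LOCKED);
}

/* "Make": publish the replacement once the old entry is unreachable. */
static void make_pte(uint64_t new)
{
	atomic_store(&pte, new);
}

int main(void)
{
	uint64_t old;

	if (try_break_pte(&old)) {
		/* kernel: TLBI + deferred free of the old subtree here */
		make_pte(0x2000ULL | PTE_VALID);
		printf("replaced %#llx with %#llx\n", (unsigned long long)old,
		       (unsigned long long)atomic_load(&pte));
	}
	return 0;
}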

arch/arm64/kvm/mmu.c (+28 −25)
@@ -128,6 +128,25 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size)
 	free_pages_exact(virt, size);
 }
 
+static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
+
+static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
+{
+	struct page *page = container_of(head, struct page, rcu_head);
+	void *pgtable = page_to_virt(page);
+	u32 level = page_private(page);
+
+	kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level);
+}
+
+static void stage2_free_removed_table(void *addr, u32 level)
+{
+	struct page *page = virt_to_page(addr);
+
+	set_page_private(page, (unsigned long)level);
+	call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
+}
+
 static void kvm_host_get_page(void *addr)
 {
 	get_page(virt_to_page(addr));
@@ -640,7 +659,7 @@ static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
 static int get_user_mapping_size(struct kvm *kvm, u64 addr)
 {
 	struct kvm_pgtable pgt = {
-		.pgd		= (kvm_pte_t *)kvm->mm->pgd,
+		.pgd		= (kvm_pteref_t)kvm->mm->pgd,
 		.ia_bits	= vabits_actual,
 		.start_level	= (KVM_PGTABLE_MAX_LEVELS -
 				   CONFIG_PGTABLE_LEVELS),
@@ -662,6 +681,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
 	.zalloc_page		= stage2_memcache_zalloc_page,
 	.zalloc_pages_exact	= kvm_s2_zalloc_pages_exact,
 	.free_pages_exact	= kvm_s2_free_pages_exact,
+	.free_removed_table	= stage2_free_removed_table,
 	.get_page		= kvm_host_get_page,
 	.put_page		= kvm_s2_put_page,
 	.page_count		= kvm_host_page_count,
@@ -841,7 +861,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,

 		write_lock(&kvm->mmu_lock);
 		ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
-					     &cache);
+					     &cache, 0);
 		write_unlock(&kvm->mmu_lock);
 		if (ret)
 			break;
@@ -1136,7 +1156,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	gfn_t gfn;
 	kvm_pfn_t pfn;
 	bool logging_active = memslot_is_logging(memslot);
-	bool use_read_lock = false;
 	unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
 	unsigned long vma_pagesize, fault_granule;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
@@ -1171,8 +1190,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (logging_active) {
 		force_pte = true;
 		vma_shift = PAGE_SHIFT;
-		use_read_lock = (fault_status == FSC_PERM && write_fault &&
-				 fault_granule == PAGE_SIZE);
 	} else {
 		vma_shift = get_vma_page_shift(vma, hva);
 	}
@@ -1271,15 +1288,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (exec_fault && device)
 		return -ENOEXEC;
 
-	/*
-	 * To reduce MMU contentions and enhance concurrency during dirty
-	 * logging dirty logging, only acquire read lock for permission
-	 * relaxation.
-	 */
-	if (use_read_lock)
-		read_lock(&kvm->mmu_lock);
-	else
-		write_lock(&kvm->mmu_lock);
+	read_lock(&kvm->mmu_lock);
 	pgt = vcpu->arch.hw_mmu->pgt;
 	if (mmu_invalidate_retry(kvm, mmu_seq))
 		goto out_unlock;
@@ -1323,15 +1332,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
 	 * kvm_pgtable_stage2_map() should be called to change block size.
 	 */
-	if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
+	if (fault_status == FSC_PERM && vma_pagesize == fault_granule)
 		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
-	} else {
-		WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");
-
+	else
 		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
 					     __pfn_to_phys(pfn), prot,
-					     memcache);
-	}
+					     memcache, KVM_PGTABLE_WALK_SHARED);
 
 	/* Mark the page dirty only if the fault is handled successfully */
 	if (writable && !ret) {
@@ -1340,10 +1346,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	}
 
 out_unlock:
-	if (use_read_lock)
-		read_unlock(&kvm->mmu_lock);
-	else
-		write_unlock(&kvm->mmu_lock);
+	read_unlock(&kvm->mmu_lock);
 	kvm_set_pfn_accessed(pfn);
 	kvm_release_pfn_clean(pfn);
 	return ret != -EAGAIN ? ret : 0;
@@ -1549,7 +1552,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	 */
 	kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
 			       PAGE_SIZE, __pfn_to_phys(pfn),
-			       KVM_PGTABLE_PROT_R, NULL);
+			       KVM_PGTABLE_PROT_R, NULL, 0);
 
 	return false;
 }
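
[Editor's note] The call_rcu() deferral in stage2_free_removed_table() above is what makes the shared walks safe: a reader that raced into an unlinked subtree may keep dereferencing it until it leaves its read-side critical section, and only then is the memory freed. A toy userspace equivalent, assuming liburcu is available (build with -lurcu; the struct and names are invented for the demo), using a synchronous grace period where the kernel defers asynchronously:

#include <urcu.h>	/* userspace RCU */
#include <stdio.h>
#include <stdlib.h>

struct table { int level; };

static struct table *root;

int main(void)
{
	struct table *old = calloc(1, sizeof(*old));
	struct table *new = calloc(1, sizeof(*new));

	rcu_register_thread();
	rcu_assign_pointer(root, old);

	/* A reader traverses under the read lock, as a shared walk does. */
	rcu_read_lock();
	printf("reader sees level %d\n", rcu_dereference(root)->level);
	rcu_read_unlock();

	/* The writer unlinks the old subtree, waits out all readers, and
	 * only then frees it; the kernel defers this via call_rcu(). */
	rcu_assign_pointer(root, new);
	synchronize_rcu();
	free(old);

	rcu_unregister_thread();
	free(new);
	return 0;
}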