Commit 8f5a3eb7 authored by Ricardo Koller's avatar Ricardo Koller Committed by Oliver Upton
Browse files

KVM: arm64: Add kvm_pgtable_stage2_split()



Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a
range of huge pages. This will be used for eager-splitting huge pages
into PAGE_SIZE pages. The goal is to avoid having to split huge pages
on write-protection faults, and instead use this function to do it
ahead of time for large ranges (e.g., all guest memory in 1G chunks at
a time).

Signed-off-by: default avatarRicardo Koller <ricarkol@google.com>
Reviewed-by: default avatarShaoqin Huang <shahuang@redhat.com>
Reviewed-by: default avatarGavin Shan <gshan@redhat.com>
Link: https://lore.kernel.org/r/20230426172330.1439644-7-ricarkol@google.com


Signed-off-by: default avatarOliver Upton <oliver.upton@linux.dev>
parent 2f440b72
Loading
Loading
Loading
Loading
+19 −0
Original line number Diff line number Diff line
@@ -671,6 +671,25 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
 */
int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);

/**
 * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
 *				to PAGE_SIZE guest pages.
 * @pgt:	 Page-table structure initialised by kvm_pgtable_stage2_init().
 * @addr:	 Intermediate physical address from which to split.
 * @size:	 Size of the range.
 * @mc:		 Cache of pre-allocated and zeroed memory from which to allocate
 *		 page-table pages.
 *
 * The function tries to split any level 1 or 2 entry that overlaps
 * with the input range (given by @addr and @size).
 *
 * Return: 0 on success, negative error code on failure. Note that
 * kvm_pgtable_stage2_split() is best effort: it tries to break as many
 * blocks in the input range as allowed by @mc_capacity.
 */
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
			     struct kvm_mmu_memory_cache *mc);

/**
 * kvm_pgtable_walk() - Walk a page-table.
 * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
+103 −0
Original line number Diff line number Diff line
@@ -1276,6 +1276,109 @@ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
	return pgtable;
}

/*
 * Get the number of page-tables needed to replace a block with a
 * fully populated tree up to the PTE entries. Note that @level is
 * interpreted as in "level @level entry".
 */
static int stage2_block_get_nr_page_tables(u32 level)
{
	switch (level) {
	case 1:
		return PTRS_PER_PTE + 1;
	case 2:
		return 1;
	case 3:
		return 0;
	default:
		WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
			     level >= KVM_PGTABLE_MAX_LEVELS);
		return -EINVAL;
	};
}

static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
			       enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	struct kvm_mmu_memory_cache *mc = ctx->arg;
	struct kvm_s2_mmu *mmu;
	kvm_pte_t pte = ctx->old, new, *childp;
	enum kvm_pgtable_prot prot;
	u32 level = ctx->level;
	bool force_pte;
	int nr_pages;
	u64 phys;

	/* No huge-pages exist at the last level */
	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
		return 0;

	/* We only split valid block mappings */
	if (!kvm_pte_valid(pte))
		return 0;

	nr_pages = stage2_block_get_nr_page_tables(level);
	if (nr_pages < 0)
		return nr_pages;

	if (mc->nobjs >= nr_pages) {
		/* Build a tree mapped down to the PTE granularity. */
		force_pte = true;
	} else {
		/*
		 * Don't force PTEs, so create_unlinked() below does
		 * not populate the tree up to the PTE level. The
		 * consequence is that the call will require a single
		 * page of level 2 entries at level 1, or a single
		 * page of PTEs at level 2. If we are at level 1, the
		 * PTEs will be created recursively.
		 */
		force_pte = false;
		nr_pages = 1;
	}

	if (mc->nobjs < nr_pages)
		return -ENOMEM;

	mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache);
	phys = kvm_pte_to_phys(pte);
	prot = kvm_pgtable_stage2_pte_prot(pte);

	childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys,
						    level, prot, mc, force_pte);
	if (IS_ERR(childp))
		return PTR_ERR(childp);

	if (!stage2_try_break_pte(ctx, mmu)) {
		kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
		mm_ops->put_page(childp);
		return -EAGAIN;
	}

	/*
	 * Note, the contents of the page table are guaranteed to be made
	 * visible before the new PTE is assigned because stage2_make_pte()
	 * writes the PTE using smp_store_release().
	 */
	new = kvm_init_table_pte(childp, mm_ops);
	stage2_make_pte(ctx, new);
	dsb(ishst);
	return 0;
}

int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
			     struct kvm_mmu_memory_cache *mc)
{
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_split_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= mc,
	};

	return kvm_pgtable_walk(pgt, addr, size, &walker);
}

int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
			      struct kvm_pgtable_mm_ops *mm_ops,
			      enum kvm_pgtable_stage2_flags flags,