Commit 83510396 authored by Oliver Upton
Browse files

Merge branch kvm-arm64/eager-page-splitting into kvmarm/next



* kvm-arm64/eager-page-splitting:
  : Eager Page Splitting, courtesy of Ricardo Koller.
  :
  : Dirty logging performance is dominated by the cost of splitting
  : hugepages to PTE granularity. On systems that mere mortals can get their
  : hands on, each fault incurs the cost of a full break-before-make
  : pattern, wherein the broadcast invalidation and ensuing serialization
  : significantly increase fault latency.
  :
  : The goal of eager page splitting is to move the cost of hugepage
  : splitting out of the stage-2 fault path and instead into the ioctls
  : responsible for managing the dirty log:
  :
  :  - If manual protection is enabled for the VM, hugepage splitting
  :    happens in the KVM_CLEAR_DIRTY_LOG ioctl. This is desirable as it
  :    provides userspace granular control over hugepage splitting.
  :
  :  - Otherwise, if userspace relies on the legacy dirty log behavior
  :    (clear on collection), hugepage splitting is done at the moment dirty
  :    logging is enabled for a particular memslot.
  :
  : Support for eager page splitting requires explicit opt-in from
  : userspace, which is realized through the
  : KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE capability.
  arm64: kvm: avoid overflow in integer division
  KVM: arm64: Use local TLBI on permission relaxation
  KVM: arm64: Split huge pages during KVM_CLEAR_DIRTY_LOG
  KVM: arm64: Open-code kvm_mmu_write_protect_pt_masked()
  KVM: arm64: Split huge pages when dirty logging is enabled
  KVM: arm64: Add kvm_uninit_stage2_mmu()
  KVM: arm64: Refactor kvm_arch_commit_memory_region()
  KVM: arm64: Add kvm_pgtable_stage2_split()
  KVM: arm64: Add KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
  KVM: arm64: Export kvm_are_all_memslots_empty()
  KVM: arm64: Add helper for creating unlinked stage2 subtrees
  KVM: arm64: Add KVM_PGTABLE_WALK flags for skipping CMOs and BBM TLBIs
  KVM: arm64: Rename free_removed to free_unlinked

Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
parents 44c026a7 14c3555f
Loading
Loading
Loading
Loading
+27 −0
Original line number Diff line number Diff line
@@ -8445,6 +8445,33 @@ structure.
When getting the Modified Change Topology Report value, the attr->addr
must point to a byte where the value will be stored or retrieved from.

8.40 KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
---------------------------------------

:Capability: KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
:Architectures: arm64
:Type: vm
:Parameters: arg[0] is the new split chunk size.
:Returns: 0 on success, -EINVAL if any memslot was already created.

This capability sets the chunk size used in Eager Page Splitting.

Eager Page Splitting improves the performance of dirty-logging (used
in live migrations) when guest memory is backed by huge-pages.  It
avoids splitting huge-pages (into PAGE_SIZE pages) on fault, by doing
it eagerly when enabling dirty logging (with the
KVM_MEM_LOG_DIRTY_PAGES flag for a memory region), or when using
KVM_CLEAR_DIRTY_LOG.

The chunk size specifies how many pages to break at a time, using a
single allocation for each chunk. The bigger the chunk size, the more
pages need to be allocated ahead of time.

The chunk size needs to be a valid block size. The list of acceptable
block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a
64-bit bitmap (each bit describing a block size). The default value is
0, to disable the eager page splitting.

9. Known KVM API problems
=========================

+4 −0
Original line number Diff line number Diff line
@@ -68,6 +68,7 @@ enum __kvm_host_smccc_func {
	__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
	__KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context,
	__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa,
	__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa_nsh,
	__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid,
	__KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context,
	__KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
@@ -225,6 +226,9 @@ extern void __kvm_flush_vm_context(void);
extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu);
extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
				     int level);
extern void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
					 phys_addr_t ipa,
					 int level);
extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);

extern void __kvm_timer_set_cntvoff(u64 cntvoff);
+15 −0
Original line number Diff line number Diff line
@@ -159,6 +159,21 @@ struct kvm_s2_mmu {
	/* The last vcpu id that ran on each physical CPU */
	int __percpu *last_vcpu_ran;

#define KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT 0
	/*
	 * Memory cache used to split
	 * KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE worth of huge pages. It
	 * is used to allocate stage2 page tables while splitting huge
	 * pages. The choice of KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
	 * influences both the capacity of the split page cache, and
	 * how often KVM reschedules. Be wary of raising CHUNK_SIZE
	 * too high.
	 *
	 * Protected by kvm->slots_lock.
	 */
	struct kvm_mmu_memory_cache split_page_cache;
	uint64_t split_page_chunk_size;

	struct kvm_arch *arch;
};

+1 −0
Original line number Diff line number Diff line
@@ -172,6 +172,7 @@ void __init free_hyp_pgds(void);

void stage2_unmap_vm(struct kvm *kvm);
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
void kvm_uninit_stage2_mmu(struct kvm *kvm);
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable);
+75 −4
Original line number Diff line number Diff line
@@ -92,6 +92,24 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
	return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL;
}

/*
 * Build a bitmap of mapping sizes: for every level starting at
 * KVM_PGTABLE_MIN_BLOCK_LEVEL and stopping before KVM_PGTABLE_MAX_LEVELS,
 * the bit at position kvm_granule_shift(level) is set in the result.
 */
static inline u32 kvm_supported_block_sizes(void)
{
	u32 sizes = 0;
	u32 lvl;

	for (lvl = KVM_PGTABLE_MIN_BLOCK_LEVEL; lvl < KVM_PGTABLE_MAX_LEVELS; lvl++)
		sizes |= BIT(kvm_granule_shift(lvl));

	return sizes;
}

/*
 * Tell whether @size is acceptable as a block size.
 *
 * @size must first pass the IS_ALIGNED(size, size) test, which is used here
 * as the power-of-two check, and must then share a set bit with the bitmap
 * returned by kvm_supported_block_sizes().
 */
static inline bool kvm_is_block_size_supported(u64 size)
{
	if (!IS_ALIGNED(size, size))
		return false;

	return (size & kvm_supported_block_sizes()) != 0;
}

/**
 * struct kvm_pgtable_mm_ops - Memory management callbacks.
 * @zalloc_page:		Allocate a single zeroed memory page.
@@ -104,7 +122,7 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
 *				allocation is physically contiguous.
 * @free_pages_exact:		Free an exact number of memory pages previously
 *				allocated by zalloc_pages_exact.
 * @free_removed_table:		Free a removed paging structure by unlinking and
 * @free_unlinked_table:	Free an unlinked paging structure by unlinking and
 *				dropping references.
 * @get_page:			Increment the refcount on a page.
 * @put_page:			Decrement the refcount on a page. When the
@@ -124,7 +142,7 @@ struct kvm_pgtable_mm_ops {
	void*		(*zalloc_page)(void *arg);
	void*		(*zalloc_pages_exact)(size_t size);
	void		(*free_pages_exact)(void *addr, size_t size);
	void		(*free_removed_table)(void *addr, u32 level);
	void		(*free_unlinked_table)(void *addr, u32 level);
	void		(*get_page)(void *addr);
	void		(*put_page)(void *addr);
	int		(*page_count)(void *addr);
@@ -195,6 +213,12 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
 *					with other software walkers.
 * @KVM_PGTABLE_WALK_HANDLE_FAULT:	Indicates the page-table walk was
 *					invoked from a fault handler.
 * @KVM_PGTABLE_WALK_SKIP_BBM_TLBI:	Visit and update table entries
 *					without the TLB invalidation of
 *					break-before-make.
 * @KVM_PGTABLE_WALK_SKIP_CMO:		Visit and update table entries
 *					without performing cache
 *					maintenance operations.
 */
enum kvm_pgtable_walk_flags {
	KVM_PGTABLE_WALK_LEAF			= BIT(0),
@@ -202,6 +226,8 @@ enum kvm_pgtable_walk_flags {
	KVM_PGTABLE_WALK_TABLE_POST		= BIT(2),
	KVM_PGTABLE_WALK_SHARED			= BIT(3),
	KVM_PGTABLE_WALK_HANDLE_FAULT		= BIT(4),
	KVM_PGTABLE_WALK_SKIP_BBM_TLBI		= BIT(5),
	KVM_PGTABLE_WALK_SKIP_CMO		= BIT(6),
};

struct kvm_pgtable_visit_ctx {
@@ -441,7 +467,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);

/**
 * kvm_pgtable_stage2_free_removed() - Free a removed stage-2 paging structure.
 * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure.
 * @mm_ops:	Memory management callbacks.
 * @pgtable:	Unlinked stage-2 paging structure to be freed.
 * @level:	Level of the stage-2 paging structure to be freed.
@@ -449,7 +475,33 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
 * The page-table is assumed to be unreachable by any hardware walkers prior to
 * freeing and therefore no TLB invalidation is performed.
 */
void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level);
void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level);

/**
 * kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure.
 * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init*().
 * @phys:	Physical address of the memory to map.
 * @level:	Starting level of the stage-2 paging structure to be created.
 * @prot:	Permissions and attributes for the mapping.
 * @mc:		Cache of pre-allocated and zeroed memory from which to allocate
 *		page-table pages.
 * @force_pte:	Force mappings to PAGE_SIZE granularity.
 *
 * Returns an unlinked page-table tree.  This new page-table tree is
 * not reachable (i.e., it is unlinked) from the root pgd and it is
 * therefore unreachable by the hardware page-table walker. No TLB
 * invalidation or CMOs are performed.
 *
 * If device attributes are not explicitly requested in @prot, then the
 * mapping will be normal, cacheable.
 *
 * Return: The fully populated (unlinked) stage-2 paging structure, or
 * an ERR_PTR(error) on failure.
 */
kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
					      u64 phys, u32 level,
					      enum kvm_pgtable_prot prot,
					      void *mc, bool force_pte);

/**
 * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
@@ -620,6 +672,25 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
 */
int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);

/**
 * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
 *				to PAGE_SIZE guest pages.
 * @pgt:	 Page-table structure initialised by kvm_pgtable_stage2_init().
 * @addr:	 Intermediate physical address from which to split.
 * @size:	 Size of the range.
 * @mc:		 Cache of pre-allocated and zeroed memory from which to allocate
 *		 page-table pages.
 *
 * The function tries to split any level 1 or 2 entry that overlaps
 * with the input range (given by @addr and @size).
 *
 * Return: 0 on success, negative error code on failure. Note that
 * kvm_pgtable_stage2_split() is best effort: it tries to break as many
 * blocks in the input range as allowed by the capacity of @mc.
 */
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
			     struct kvm_mmu_memory_cache *mc);

/**
 * kvm_pgtable_walk() - Walk a page-table.
 * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
Loading