Merge branch kvm-arm64/eager-page-splitting into kvmarm/next (83510396) · Commits · EulixOS / Software / Kernel

Documentation/virt/kvm/api.rst

+27 −0

Original line number	Diff line number	Diff line
		@@ -8445,6 +8445,33 @@ structure.
		When getting the Modified Change Topology Report value, the attr->addr
		must point to a byte where the value will be stored or retrieved from.

		8.40 KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
		---------------------------------------

		:Capability: KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
		:Architectures: arm64
		:Type: vm
		:Parameters: arg[0] is the new split chunk size.
		:Returns: 0 on success, -EINVAL if any memslot was already created.

		This capability sets the chunk size used in Eager Page Splitting.

		Eager Page Splitting improves the performance of dirty-logging (used
		in live migrations) when guest memory is backed by huge-pages. It
		avoids splitting huge-pages (into PAGE_SIZE pages) on fault, by doing
		it eagerly when enabling dirty logging (with the
		KVM_MEM_LOG_DIRTY_PAGES flag for a memory region), or when using
		KVM_CLEAR_DIRTY_LOG.

		The chunk size specifies how many pages to break at a time, using a
		single allocation for each chunk. The bigger the chunk size, the more
		pages need to be allocated ahead of time.

		The chunk size needs to be a valid block size. The list of acceptable
		block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a
		64-bit bitmap (each bit describing a block size). The default value is
		0, to disable the eager page splitting.

		9. Known KVM API problems
		=========================

arch/arm64/include/asm/kvm_asm.h

+4 −0

Original line number	Diff line number	Diff line
		@@ -68,6 +68,7 @@ enum __kvm_host_smccc_func {
		__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
		__KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context,
		__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa,
		__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa_nsh,
		__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid,
		__KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context,
		__KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
		@@ -225,6 +226,9 @@ extern void __kvm_flush_vm_context(void);
		extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu);
		extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
		int level);
		extern void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
		phys_addr_t ipa,
		int level);
		extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);

		extern void __kvm_timer_set_cntvoff(u64 cntvoff);

arch/arm64/include/asm/kvm_host.h

+15 −0

Original line number	Diff line number	Diff line
		@@ -159,6 +159,21 @@ struct kvm_s2_mmu {
		/* The last vcpu id that ran on each physical CPU */
		int __percpu *last_vcpu_ran;

		#define KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT 0
		/*
		* Memory cache used to split
		* KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE worth of huge pages. It
		* is used to allocate stage2 page tables while splitting huge
		* pages. The choice of KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
		* influences both the capacity of the split page cache, and
		* how often KVM reschedules. Be wary of raising CHUNK_SIZE
		* too high.
		*
		* Protected by kvm->slots_lock.
		*/
		struct kvm_mmu_memory_cache split_page_cache;
		uint64_t split_page_chunk_size;

		struct kvm_arch *arch;
		};

arch/arm64/include/asm/kvm_mmu.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -172,6 +172,7 @@ void __init free_hyp_pgds(void);

		void stage2_unmap_vm(struct kvm *kvm);
		/* Takes the VM and its stage-2 MMU by pointer, like the neighbouring stage-2 helpers. */
		int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
		void kvm_uninit_stage2_mmu(struct kvm *kvm);
		void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
		int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
		phys_addr_t pa, unsigned long size, bool writable);

arch/arm64/include/asm/kvm_pgtable.h

+75 −4

Original line number	Diff line number	Diff line
		@@ -92,6 +92,24 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
		return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL;
		}

		/*
		 * Build a bitmap of the supported block sizes: for every level from
		 * KVM_PGTABLE_MIN_BLOCK_LEVEL up to (but excluding) KVM_PGTABLE_MAX_LEVELS,
		 * set the bit at position kvm_granule_shift(level).
		 *
		 * NOTE(review): the accumulator is a u32, so this presumably relies on
		 * every granule shift being below 32 -- confirm.
		 */
		static inline u32 kvm_supported_block_sizes(void)
		{
			u32 level = KVM_PGTABLE_MIN_BLOCK_LEVEL;
			u32 r = 0;

			for (; level < KVM_PGTABLE_MAX_LEVELS; level++)
				r |= BIT(kvm_granule_shift(level));

			return r;
		}

		/*
		 * Report whether @size is one of the block sizes advertised by
		 * kvm_supported_block_sizes(). The size must pass the power-of-two
		 * test and have its bit set in the supported-sizes bitmap.
		 */
		static inline bool kvm_is_block_size_supported(u64 size)
		{
			/* IS_ALIGNED(size, size) serves as the power-of-two test. */
			if (!IS_ALIGNED(size, size))
				return false;

			return (size & kvm_supported_block_sizes()) != 0;
		}

		/**
		* struct kvm_pgtable_mm_ops - Memory management callbacks.
		* @zalloc_page: Allocate a single zeroed memory page.
		@@ -104,7 +122,7 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
		* allocation is physically contiguous.
		* @free_pages_exact: Free an exact number of memory pages previously
		* allocated by zalloc_pages_exact.
		* @free_removed_table: Free a removed paging structure by unlinking and
		* @free_unlinked_table: Free an unlinked paging structure by unlinking and
		* dropping references.
		* @get_page: Increment the refcount on a page.
		* @put_page: Decrement the refcount on a page. When the
		@@ -124,7 +142,7 @@ struct kvm_pgtable_mm_ops {
		void* (zalloc_page)(void arg);
		void* (*zalloc_pages_exact)(size_t size);
		void (free_pages_exact)(void addr, size_t size);
		void (free_removed_table)(void addr, u32 level);
		void (free_unlinked_table)(void addr, u32 level);
		void (get_page)(void addr);
		void (put_page)(void addr);
		int (page_count)(void addr);
		@@ -195,6 +213,12 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
		* with other software walkers.
		* @KVM_PGTABLE_WALK_HANDLE_FAULT: Indicates the page-table walk was
		* invoked from a fault handler.
		* @KVM_PGTABLE_WALK_SKIP_BBM_TLBI: Visit and update table entries
		* without Break-before-make's
		* TLB invalidation.
		* @KVM_PGTABLE_WALK_SKIP_CMO: Visit and update table entries
		* without Cache maintenance
		* operations required.
		*/
		enum kvm_pgtable_walk_flags {
		KVM_PGTABLE_WALK_LEAF = BIT(0),
		@@ -202,6 +226,8 @@ enum kvm_pgtable_walk_flags {
		KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
		KVM_PGTABLE_WALK_SHARED = BIT(3),
		KVM_PGTABLE_WALK_HANDLE_FAULT = BIT(4),
		KVM_PGTABLE_WALK_SKIP_BBM_TLBI = BIT(5),
		KVM_PGTABLE_WALK_SKIP_CMO = BIT(6),
		};

		struct kvm_pgtable_visit_ctx {
		@@ -441,7 +467,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable pgt, struct kvm_s2_mmu mmu,
		void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);

		/**
		* kvm_pgtable_stage2_free_removed() - Free a removed stage-2 paging structure.
		* kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure.
		* @mm_ops: Memory management callbacks.
		* @pgtable: Unlinked stage-2 paging structure to be freed.
		* @level: Level of the stage-2 paging structure to be freed.
		@@ -449,7 +475,33 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
		* The page-table is assumed to be unreachable by any hardware walkers prior to
		* freeing and therefore no TLB invalidation is performed.
		*/
		void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level);
		void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level);

		/**
		* kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure.
		* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
		* @phys: Physical address of the memory to map.
		* @level: Starting level of the stage-2 paging structure to be created.
		* @prot: Permissions and attributes for the mapping.
		* @mc: Cache of pre-allocated and zeroed memory from which to allocate
		* page-table pages.
		* @force_pte: Force mappings to PAGE_SIZE granularity.
		*
		* Returns an unlinked page-table tree. This new page-table tree is
		* not reachable (i.e., it is unlinked) from the root pgd and it's
		* therefore unreachable by the hardware page-table walker. No TLB
		* invalidation or CMOs are performed.
		*
		* If device attributes are not explicitly requested in @prot, then the
		* mapping will be normal, cacheable.
		*
		* Return: The fully populated (unlinked) stage-2 paging structure, or
		* an ERR_PTR(error) on failure.
		*/
		kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
							      u64 phys, u32 level,
							      enum kvm_pgtable_prot prot,
							      void *mc, bool force_pte);

		/**
		* kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
		@@ -620,6 +672,25 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
		*/
		int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);

		/**
		* kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
		* to PAGE_SIZE guest pages.
		* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
		* @addr: Intermediate physical address from which to split.
		* @size: Size of the range.
		* @mc: Cache of pre-allocated and zeroed memory from which to allocate
		* page-table pages.
		*
		* The function tries to split any level 1 or 2 entry that overlaps
		* with the input range (given by @addr and @size).
		*
		* Return: 0 on success, negative error code on failure. Note that
		* kvm_pgtable_stage2_split() is best effort: it tries to break as many
		* blocks in the input range as allowed by the capacity of @mc.
		*/
		int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
		struct kvm_mmu_memory_cache *mc);

		/**
		* kvm_pgtable_walk() - Walk a page-table.
		* @pgt: Page-table structure initialised by kvm_pgtable_*_init().