mm/vmalloc: hugepage vmalloc mappings (121e6f32) · Commits · EulixOS / Software / Kernel

arch/Kconfig

+11 −0

Original line number	Diff line number	Diff line
		@@ -829,6 +829,17 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
		config HAVE_ARCH_HUGE_VMAP
		bool

		#
		# Archs that select this would be capable of PMD-sized vmaps (i.e.,
		# arch_vmap_pmd_supported() returns true), and they must make no assumptions
		# that vmalloc memory is mapped with PAGE_SIZE ptes. The VM_NO_HUGE_VMAP flag
		# can be used to prohibit arch-specific allocations from using hugepages to
		# help with this (e.g., modules may require it).
		#
		config HAVE_ARCH_HUGE_VMALLOC
		depends on HAVE_ARCH_HUGE_VMAP
		bool

		config ARCH_WANT_HUGE_PMD_SHARE
		bool

include/linux/vmalloc.h

+21 −0

Original line number	Diff line number	Diff line
		@@ -26,6 +26,7 @@ struct notifier_block; /* in notifier.h */
		#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
		#define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */
		#define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */
		#define VM_NO_HUGE_VMAP 0x00000400 /* force PAGE_SIZE pte mapping */

		/*
		* VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
		@@ -54,6 +55,9 @@ struct vm_struct {
		unsigned long size;
		unsigned long flags;
		struct page **pages;
		#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
		unsigned int page_order;
		#endif
		unsigned int nr_pages;
		phys_addr_t phys_addr;
		const void *caller;
		@@ -188,6 +192,22 @@ void free_vm_area(struct vm_struct *area);
		extern struct vm_struct remove_vm_area(const void addr);
		extern struct vm_struct find_vm_area(const void addr);

		static inline bool is_vm_area_hugepages(const void *addr)
		{
		/*
		* This may not 100% tell if the area is mapped with > PAGE_SIZE
		* page table entries, if for some reason the architecture indicates
		* larger sizes are available but decides not to use them, nothing
		* prevents that. This only indicates the size of the physical page
		* allocated in the vmalloc layer.
		*/
		#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
		return find_vm_area(addr)->page_order > 0;
		#else
		return false;
		#endif
		}

		#ifdef CONFIG_MMU
		int vmap_range(unsigned long addr, unsigned long end,
		phys_addr_t phys_addr, pgprot_t prot,
		@@ -205,6 +225,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
		if (vm)
		vm->flags \|= VM_FLUSH_RESET_PERMS;
		}

		#else
		static inline int
		map_kernel_range_noflush(unsigned long start, unsigned long size,

mm/page_alloc.c

+4 −1

Original line number	Diff line number	Diff line
		@@ -72,6 +72,7 @@
		#include <linux/padata.h>
		#include <linux/khugepaged.h>
		#include <linux/buffer_head.h>
		#include <linux/vmalloc.h>

		#include <asm/sections.h>
		#include <asm/tlbflush.h>
		@@ -8222,6 +8223,7 @@ void __init alloc_large_system_hash(const char tablename,
		void *table = NULL;
		gfp_t gfp_flags;
		bool virt;
		bool huge;

		/* allow the kernel cmdline to have a say */
		if (!numentries) {
		@@ -8289,6 +8291,7 @@ void __init alloc_large_system_hash(const char tablename,
		} else if (get_order(size) >= MAX_ORDER \|\| hashdist) {
		table = __vmalloc(size, gfp_flags);
		virt = true;
		huge = is_vm_area_hugepages(table);
		} else {
		/*
		* If bucketsize is not a power-of-two, we may free
		@@ -8305,7 +8308,7 @@ void __init alloc_large_system_hash(const char tablename,

		pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
		virt ? "vmalloc" : "linear");
		virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");

		if (_hash_shift)
		*_hash_shift = log2qty;

mm/vmalloc.c

+173 −47

Original line number	Diff line number	Diff line
		@@ -42,6 +42,19 @@
		#include "internal.h"
		#include "pgalloc-track.h"

		#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
		static bool __ro_after_init vmap_allow_huge = true;

		static int __init set_nohugevmalloc(char *str)
		{
		vmap_allow_huge = false;
		return 0;
		}
		early_param("nohugevmalloc", set_nohugevmalloc);
		#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
		static const bool vmap_allow_huge = false;
		#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */

		bool is_vmalloc_addr(const void *x)
		{
		unsigned long addr = (unsigned long)x;
		@@ -483,31 +496,12 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
		return 0;
		}

		/**
		* map_kernel_range_noflush - map kernel VM area with the specified pages
		* @addr: start of the VM area to map
		* @size: size of the VM area to map
		* @prot: page protection flags to use
		* @pages: pages to map
		*
		* Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
		* have been allocated using get_vm_area() and its friends.
		*
		* NOTE:
		* This function does NOT do any cache flushing. The caller is responsible for
		* calling flush_cache_vmap() on to-be-mapped areas before calling this
		* function.
		*
		* RETURNS:
		* 0 on success, -errno on failure.
		*/
		int map_kernel_range_noflush(unsigned long addr, unsigned long size,
		static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages)
		{
		unsigned long start = addr;
		unsigned long end = addr + size;
		unsigned long next;
		pgd_t *pgd;
		unsigned long next;
		int err = 0;
		int nr = 0;
		pgtbl_mod_mask mask = 0;
		@@ -529,6 +523,66 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
		return 0;
		}

		static int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
		{
		unsigned int i, nr = (end - addr) >> PAGE_SHIFT;

		WARN_ON(page_shift < PAGE_SHIFT);

		if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) \|\|
		page_shift == PAGE_SHIFT)
		return vmap_small_pages_range_noflush(addr, end, prot, pages);

		for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
		int err;

		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
		__pa(page_address(pages[i])), prot,
		page_shift);
		if (err)
		return err;

		addr += 1UL << page_shift;
		}

		return 0;
		}

		static int vmap_pages_range(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
		{
		int err;

		err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
		flush_cache_vmap(addr, end);
		return err;
		}

		/**
		* map_kernel_range_noflush - map kernel VM area with the specified pages
		* @addr: start of the VM area to map
		* @size: size of the VM area to map
		* @prot: page protection flags to use
		* @pages: pages to map
		*
		* Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
		* have been allocated using get_vm_area() and its friends.
		*
		* NOTE:
		* This function does NOT do any cache flushing. The caller is responsible for
		* calling flush_cache_vmap() on to-be-mapped areas before calling this
		* function.
		*
		* RETURNS:
		* 0 on success, -errno on failure.
		*/
		int map_kernel_range_noflush(unsigned long addr, unsigned long size,
		pgprot_t prot, struct page **pages)
		{
		return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT);
		}

		int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
		struct page **pages)
		{
		@@ -2112,6 +2166,24 @@ EXPORT_SYMBOL(vm_map_ram);

		static struct vm_struct *vmlist __initdata;

		static inline unsigned int vm_area_page_order(struct vm_struct *vm)
		{
		#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
		return vm->page_order;
		#else
		return 0;
		#endif
		}

		static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
		{
		#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
		vm->page_order = order;
		#else
		BUG_ON(order != 0);
		#endif
		}

		/**
		* vm_area_add_early - add vmap area early during boot
		* @vm: vm_struct to add
		@@ -2422,6 +2494,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
		{
		int i;

		/* HUGE_VMALLOC passes small pages to set_direct_map */
		for (i = 0; i < area->nr_pages; i++)
		if (page_address(area->pages[i]))
		set_direct_map(area->pages[i]);
		@@ -2431,6 +2504,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
		static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
		{
		unsigned long start = ULONG_MAX, end = 0;
		unsigned int page_order = vm_area_page_order(area);
		int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
		int flush_dmap = 0;
		int i;
		@@ -2455,11 +2529,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
		* map. Find the start and end range of the direct mappings to make sure
		* the vm_unmap_aliases() flush includes the direct map.
		*/
		for (i = 0; i < area->nr_pages; i++) {
		for (i = 0; i < area->nr_pages; i += 1U << page_order) {
		unsigned long addr = (unsigned long)page_address(area->pages[i]);
		if (addr) {
		unsigned long page_size;

		page_size = PAGE_SIZE << page_order;
		start = min(addr, start);
		end = max(addr + PAGE_SIZE, end);
		end = max(addr + page_size, end);
		flush_dmap = 1;
		}
		}
		@@ -2500,13 +2577,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
		vm_remove_mappings(area, deallocate_pages);

		if (deallocate_pages) {
		unsigned int page_order = vm_area_page_order(area);
		int i;

		for (i = 0; i < area->nr_pages; i++) {
		for (i = 0; i < area->nr_pages; i += 1U << page_order) {
		struct page *page = area->pages[i];

		BUG_ON(!page);
		__free_pages(page, 0);
		__free_pages(page, page_order);
		}
		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);

		@@ -2697,15 +2775,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn);
		#endif /* CONFIG_VMAP_PFN */

		static void __vmalloc_area_node(struct vm_struct area, gfp_t gfp_mask,
		pgprot_t prot, int node)
		pgprot_t prot, unsigned int page_shift,
		int node)
		{
		const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) \| __GFP_ZERO;
		unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
		unsigned long addr = (unsigned long)area->addr;
		unsigned long size = get_vm_area_size(area);
		unsigned long array_size;
		unsigned int i;
		unsigned int nr_small_pages = size >> PAGE_SHIFT;
		unsigned int page_order;
		struct page **pages;
		unsigned int i;

		array_size = (unsigned long)nr_pages * sizeof(struct page *);
		array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
		gfp_mask \|= __GFP_NOWARN;
		if (!(gfp_mask & (GFP_DMA \| GFP_DMA32)))
		gfp_mask \|= __GFP_HIGHMEM;
		@@ -2724,30 +2806,38 @@ static void __vmalloc_area_node(struct vm_struct area, gfp_t gfp_mask,
		}

		area->pages = pages;
		area->nr_pages = nr_pages;
		area->nr_pages = nr_small_pages;
		set_vm_area_page_order(area, page_shift - PAGE_SHIFT);

		for (i = 0; i < area->nr_pages; i++) {
		struct page *page;
		page_order = vm_area_page_order(area);

		if (node == NUMA_NO_NODE)
		page = alloc_page(gfp_mask);
		else
		page = alloc_pages_node(node, gfp_mask, 0);
		/*
		* Careful, we allocate and map page_order pages, but tracking is done
		* per PAGE_SIZE page so as to keep the vm_struct APIs independent of
		* the physical/mapped size.
		*/
		for (i = 0; i < area->nr_pages; i += 1U << page_order) {
		struct page *page;
		int p;

		/* Compound pages required for remap_vmalloc_page */
		page = alloc_pages_node(node, gfp_mask \| __GFP_COMP, page_order);
		if (unlikely(!page)) {
		/* Successfully allocated i pages, free them in __vfree() */
		area->nr_pages = i;
		atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
		goto fail;
		}
		area->pages[i] = page;

		for (p = 0; p < (1U << page_order); p++)
		area->pages[i + p] = page + p;

		if (gfpflags_allow_blocking(gfp_mask))
		cond_resched();
		}
		atomic_long_add(area->nr_pages, &nr_vmalloc_pages);

		if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
		prot, pages) < 0)
		if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0)
		goto fail;

		return area->addr;
		@@ -2755,7 +2845,7 @@ static void __vmalloc_area_node(struct vm_struct area, gfp_t gfp_mask,
		fail:
		warn_alloc(gfp_mask, NULL,
		"vmalloc: allocation failure, allocated %ld of %ld bytes",
		(area->nr_pages*PAGE_SIZE), area->size);
		(area->nr_pages*PAGE_SIZE), size);
		__vfree(area->addr);
		return NULL;
		}
		@@ -2786,19 +2876,45 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
		struct vm_struct *area;
		void *addr;
		unsigned long real_size = size;
		unsigned long real_align = align;
		unsigned int shift = PAGE_SHIFT;

		size = PAGE_ALIGN(size);
		if (!size \|\| (size >> PAGE_SHIFT) > totalram_pages())
		if (!size \|\| (size >> PAGE_SHIFT) > totalram_pages()) {
		area = NULL;
		goto fail;
		}

		if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
		arch_vmap_pmd_supported(prot)) {
		unsigned long size_per_node;

		area = __get_vm_area_node(real_size, align, VM_ALLOC \| VM_UNINITIALIZED \|
		/*
		* Try huge pages. Only try for PAGE_KERNEL allocations,
		* others like modules don't yet expect huge pages in
		* their allocations due to apply_to_page_range not
		* supporting them.
		*/

		size_per_node = size;
		if (node == NUMA_NO_NODE)
		size_per_node /= num_online_nodes();
		if (size_per_node >= PMD_SIZE) {
		shift = PMD_SHIFT;
		align = max(real_align, 1UL << shift);
		size = ALIGN(real_size, 1UL << shift);
		}
		}

		again:
		size = PAGE_ALIGN(size);
		area = __get_vm_area_node(size, align, VM_ALLOC \| VM_UNINITIALIZED \|
		vm_flags, start, end, node, gfp_mask, caller);
		if (!area)
		goto fail;

		addr = __vmalloc_area_node(area, gfp_mask, prot, node);
		addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
		if (!addr)
		return NULL;
		goto fail;

		/*
		* In this function, newly allocated vm_struct has VM_UNINITIALIZED
		@@ -2812,8 +2928,18 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
		return addr;

		fail:
		if (shift > PAGE_SHIFT) {
		shift = PAGE_SHIFT;
		align = real_align;
		size = real_size;
		goto again;
		}

		if (!area) {
		/* Warn for area allocation, page allocations already warn */
		warn_alloc(gfp_mask, NULL,
		"vmalloc: allocation failure: %lu bytes", real_size);
		}
		return NULL;
		}