Unverified commit 45c4db5b, authored by openeuler-ci-bot and committed by Gitee

!7997 mm: some misc bugfix

Merge Pull Request from: @ci-robot 
 
PR sync from: Kefeng Wang <wangkefeng.wang@huawei.com>
https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/DBOSC3I7HKUSQ3DPBCZCNNXFJNA3GCAR/ 
Most of these are bugfixes for large folios; one is about fault-around, one is about vmalloc, and two are small code optimizations.

Baolin Wang (1):
  mm: huge_memory: add the missing folio_test_pmd_mappable() for THP
    split statistics

Barry Song (1):
  mm: prohibit the last subpage from reusing the entire large folio

John Hubbard (1):
  mm/memory.c: do_numa_page(): remove a redundant page table read

Kefeng Wang (2):
  mm: memory: fix shift-out-of-bounds in fault_around_bytes_set
  Revert "mm: support multi-size THP numa balancing"

Matthew Wilcox (1):
  mm: simplify thp_vma_allowable_order

Uladzislau Rezki (Sony) (1):
  mm: vmalloc: bail out early in find_vmap_area() if vmap is not init

Zi Yan (1):
  mm/huge_memory: skip invalid debugfs new_order input for folio split


-- 
2.27.0
 
https://gitee.com/openeuler/kernel/issues/I9S4Z4 
 
Link: https://gitee.com/openeuler/kernel/pulls/7997
Reviewed-by: Zhang Peng <zhangpeng362@huawei.com>
Signed-off-by: Zhang Peng <zhangpeng362@huawei.com>
parents bdcd87c8 2ea2bf4a
fs/proc/task_mmu.c  +2 −2
@@ -869,8 +869,8 @@ static int show_smap(struct seq_file *m, void *v)
	__show_smap(m, &mss, false);

	seq_printf(m, "THPeligible:    %8u\n",
-		   !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false,
-					      true, THP_ORDERS_ALL));
+		   !!thp_vma_allowable_orders(vma, vma->vm_flags,
+			   TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));

	if (arch_pkeys_enabled())
		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
include/linux/huge_mm.h  +15 −14
@@ -89,8 +89,12 @@ extern struct kobj_attribute shmem_enabled_attr;
 */
#define THP_ORDERS_ALL		(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)

-#define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \
-	(!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order)))
+#define TVA_SMAPS		(1 << 0)	/* Will be used for procfs */
+#define TVA_IN_PF		(1 << 1)	/* Page fault handler */
+#define TVA_ENFORCE_SYSFS	(1 << 2)	/* Obey sysfs configuration */
+
+#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
+	(!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define HPAGE_PMD_SHIFT PMD_SHIFT
@@ -216,17 +220,15 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
}

unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
-					 unsigned long vm_flags, bool smaps,
-					 bool in_pf, bool enforce_sysfs,
+					 unsigned long vm_flags,
+					 unsigned long tva_flags,
					 unsigned long orders);

/**
 * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
 * @vma:  the vm area to check
 * @vm_flags: use these vm_flags instead of vma->vm_flags
- * @smaps: whether answer will be used for smaps file
- * @in_pf: whether answer will be used by page fault handler
- * @enforce_sysfs: whether sysfs config should be taken into account
+ * @tva_flags: Which TVA flags to honour
 * @orders: bitfield of all orders to consider
 *
 * Calculates the intersection of the requested hugepage orders and the allowed
@@ -239,12 +241,12 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 */
static inline
unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
-				       unsigned long vm_flags, bool smaps,
-				       bool in_pf, bool enforce_sysfs,
+				       unsigned long vm_flags,
+				       unsigned long tva_flags,
				       unsigned long orders)
{
	/* Optimization to check if required orders are enabled early. */
-	if (enforce_sysfs && vma_is_anonymous(vma)) {
+	if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
		unsigned long mask = READ_ONCE(huge_anon_orders_always);

		if (vm_flags & VM_HUGEPAGE)
@@ -258,8 +260,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
			return 0;
	}

-	return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
-					  enforce_sysfs, orders);
+	return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
}

enum mthp_stat_item {
@@ -437,8 +438,8 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
}

static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
-					unsigned long vm_flags, bool smaps,
-					bool in_pf, bool enforce_sysfs,
+					unsigned long vm_flags,
+					unsigned long tva_flags,
					unsigned long orders)
{
	return 0;
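
The conversion above replaces three positional booleans with named TVA_* flag bits. As a minimal illustration of the readability gain at call sites, here is a self-contained userspace sketch (plain C, not kernel code; the check_* helpers are invented for illustration, and only the TVA_* values are taken from the hunk above):

#include <stdbool.h>
#include <stdio.h>

/* Same bit values as the TVA_* flags introduced above. */
#define TVA_SMAPS		(1 << 0)
#define TVA_IN_PF		(1 << 1)
#define TVA_ENFORCE_SYSFS	(1 << 2)

/* Old style: the call site reads "false, true, true" and gives no hint of meaning. */
static void check_bools(bool smaps, bool in_pf, bool enforce_sysfs)
{
	printf("smaps=%d in_pf=%d enforce_sysfs=%d\n", smaps, in_pf, enforce_sysfs);
}

/* New style: each bit is named at the point where it is passed. */
static void check_flags(unsigned long tva_flags)
{
	printf("smaps=%d in_pf=%d enforce_sysfs=%d\n",
	       !!(tva_flags & TVA_SMAPS),
	       !!(tva_flags & TVA_IN_PF),
	       !!(tva_flags & TVA_ENFORCE_SYSFS));
}

int main(void)
{
	check_bools(false, true, true);              /* which argument is which? */
	check_flags(TVA_IN_PF | TVA_ENFORCE_SYSFS);  /* self-documenting */
	return 0;
}

Both calls print the same result; the flag form also lets the callee unpack the bits by masking, as the mm/huge_memory.c hunk below does.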
mm/huge_memory.c  +16 −4
@@ -77,10 +77,13 @@ unsigned long huge_anon_orders_inherit __read_mostly;
unsigned long huge_pcp_allow_orders __read_mostly;

unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
-					 unsigned long vm_flags, bool smaps,
-					 bool in_pf, bool enforce_sysfs,
+					 unsigned long vm_flags,
+					 unsigned long tva_flags,
					 unsigned long orders)
{
+	bool smaps = tva_flags & TVA_SMAPS;
+	bool in_pf = tva_flags & TVA_IN_PF;
+	bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
	/* Check the intersection of requested and supported orders. */
	orders &= vma_is_anonymous(vma) ?
			THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
@@ -3155,6 +3158,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
	XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
	struct anon_vma *anon_vma = NULL;
	struct address_space *mapping = NULL;
+	bool is_thp = folio_test_pmd_mappable(folio);
	int extra_pins, ret;
	pgoff_t end;
	bool is_hzp;
@@ -3333,6 +3337,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
		i_mmap_unlock_read(mapping);
out:
	xas_destroy(&xas);
-	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+	if (is_thp)
+		count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
	return ret;
}
@@ -3395,6 +3400,7 @@ void deferred_split_folio(struct folio *folio)

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	if (list_empty(&folio->_deferred_list)) {
-		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+		if (folio_test_pmd_mappable(folio))
+			count_vm_event(THP_DEFERRED_SPLIT_PAGE);
		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
		ds_queue->split_queue_len++;
@@ -3604,6 +3610,9 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
		if (!is_transparent_hugepage(folio))
			goto next;

+		if (new_order >= folio_order(folio))
+			goto next;
+
		total++;
		/*
		 * For folios with private, split_huge_page_to_list_to_order()
@@ -3671,6 +3680,9 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
		total++;
		nr_pages = folio_nr_pages(folio);

+		if (new_order >= folio_order(folio))
+			goto next;
+
		if (!folio_trylock(folio))
			goto next;

mm/khugepaged.c  +7 −9
@@ -459,7 +459,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
{
	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
	    hugepage_flags_enabled()) {
-		if (thp_vma_allowable_order(vma, vm_flags, false, false, true,
+		if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
					    PMD_ORDER))
			__khugepaged_enter(vma->vm_mm);
	}
@@ -925,6 +925,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
				   struct collapse_control *cc)
{
	struct vm_area_struct *vma;
+	unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;

	if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
		return SCAN_ANY_PROCESS;
@@ -935,8 +936,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,

	if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
		return SCAN_ADDRESS_RANGE;
-	if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
-				     cc->is_khugepaged, PMD_ORDER))
+	if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
		return SCAN_VMA_CHECK;
	/*
	 * Anon VMA expected, the address may be unmapped then
@@ -1527,8 +1527,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
	 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
	 * analogously elide sysfs THP settings here.
	 */
-	if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
-				     PMD_ORDER))
+	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
		return SCAN_VMA_CHECK;

	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -2403,8 +2402,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
			progress++;
			break;
		}
-		if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
-					     true, PMD_ORDER)) {
+		if (!thp_vma_allowable_order(vma, vma->vm_flags,
+					TVA_ENFORCE_SYSFS, PMD_ORDER)) {
skip:
			progress++;
			continue;
@@ -2741,8 +2740,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,

	*prev = vma;

-	if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
-				     PMD_ORDER))
+	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
		return -EINVAL;

	if (task_in_dynamic_pool(current))
mm/memory.c  +36 −61
@@ -3532,6 +3532,16 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
static bool wp_can_reuse_anon_folio(struct folio *folio,
				    struct vm_area_struct *vma)
{
+	/*
+	 * We could currently only reuse a subpage of a large folio if no
+	 * other subpages of the large folios are still mapped. However,
+	 * let's just consistently not reuse subpages even if we could
+	 * reuse in that scenario, and give back a large folio a bit
+	 * sooner.
+	 */
+	if (folio_test_large(folio))
+		return false;
+
	/*
	 * We have to verify under folio lock: these early checks are
	 * just an optimization to avoid locking the folio and freeing
@@ -4333,8 +4343,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
	 * for this vma. Then filter out the orders that can't be allocated over
	 * the faulting address and still be fully contained in the vma.
	 */
-	orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
-					  BIT(PMD_ORDER) - 1);
+	orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+			TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
	orders = thp_vma_suitable_orders(vma, vmf->address, orders);

	if (!orders)
@@ -4807,7 +4817,8 @@ static int fault_around_bytes_set(void *data, u64 val)
	 * The minimum value is 1 page, however this results in no fault-around
	 * at all. See should_fault_around().
	 */
-	fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL);
+	val = max(val, PAGE_SIZE);
+	fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT;

	return 0;
}
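
The hunk above fixes the reported shift-out-of-bounds: rounddown_pow_of_two(0) is undefined, so the written value is clamped to at least one page before it is rounded and converted to a page count. A minimal userspace sketch of the same clamping logic (not kernel code; it assumes 4 KiB pages and a 64-bit unsigned long, and rounddown_pow_of_two_ul() is a stand-in for the kernel's rounddown_pow_of_two()):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* Largest power of two <= n; the caller must guarantee n != 0. */
static unsigned long rounddown_pow_of_two_ul(unsigned long n)
{
	return 1UL << (63 - __builtin_clzl(n));	/* assumes 64-bit unsigned long */
}

static unsigned long fault_around_pages_from(uint64_t val)
{
	/* Clamp first: val == 0 would otherwise shift out of bounds. */
	if (val < PAGE_SIZE)
		val = PAGE_SIZE;
	return rounddown_pow_of_two_ul(val) >> PAGE_SHIFT;
}

int main(void)
{
	printf("%lu\n", fault_around_pages_from(0));		/* 1 page, no UB */
	printf("%lu\n", fault_around_pages_from(65536));	/* 16 pages */
	return 0;
}

With the clamp in place, writing 0 via debugfs still ends up as one page of fault-around instead of undefined behaviour.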
@@ -5071,51 +5082,17 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
}

static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
-					unsigned long fault_addr, pte_t *fault_pte,
					bool writable)
{
	pte_t pte, old_pte;

-	old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte);
+	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
	pte = pte_modify(old_pte, vma->vm_page_prot);
	pte = pte_mkyoung(pte);
	if (writable)
		pte = pte_mkwrite(pte, vma);
-	ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte);
-	update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
-}
-
-static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
-				       struct folio *folio, pte_t fault_pte,
-				       bool ignore_writable, bool pte_write_upgrade)
-{
-	int nr = pte_pfn(fault_pte) - folio_pfn(folio);
-	unsigned long start = max(vmf->address - nr * PAGE_SIZE, vma->vm_start);
-	unsigned long end = min(vmf->address + (folio_nr_pages(folio) - nr) * PAGE_SIZE, vma->vm_end);
-	pte_t *start_ptep = vmf->pte - (vmf->address - start) / PAGE_SIZE;
-	unsigned long addr;
-
-	/* Restore all PTEs' mapping of the large folio */
-	for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) {
-		pte_t ptent = ptep_get(start_ptep);
-		bool writable = false;
-
-		if (!pte_present(ptent) || !pte_protnone(ptent))
-			continue;
-
-		if (pfn_folio(pte_pfn(ptent)) != folio)
-			continue;
-
-		if (!ignore_writable) {
-			ptent = pte_modify(ptent, vma->vm_page_prot);
-			writable = pte_write(ptent);
-			if (!writable && pte_write_upgrade &&
-			    can_change_pte_writable(vma, addr, ptent))
-				writable = true;
-		}
-
-		numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable);
-	}
+	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
+	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
}

static vm_fault_t do_numa_page(struct vm_fault *vmf)
@@ -5123,26 +5100,25 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = NULL;
	int nid = NUMA_NO_NODE;
-	bool writable = false, ignore_writable = false;
-	bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
+	bool writable = false;
	int last_cpupid;
	int target_nid;
	pte_t pte, old_pte;
-	int flags = 0, nr_pages;
+	int flags = 0;

	/*
-	 * The "pte" at this point cannot be used safely without
-	 * validation through pte_unmap_same(). It's of NUMA type but
-	 * the pfn may be screwed if the read is non atomic.
+	 * The pte cannot be used safely until we verify, while holding the page
+	 * table lock, that its contents have not changed during fault handling.
	 */
	spin_lock(vmf->ptl);
-	if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+	/* Read the live PTE from the page tables: */
+	old_pte = ptep_get(vmf->pte);
+
+	if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		goto out;
	}

-	/* Get the normal PTE  */
-	old_pte = ptep_get(vmf->pte);
	pte = pte_modify(old_pte, vma->vm_page_prot);

	/*
@@ -5150,7 +5126,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
	 * is only valid while holding the PT lock.
	 */
	writable = pte_write(pte);
-	if (!writable && pte_write_upgrade &&
+	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
	    can_change_pte_writable(vma, vmf->address, pte))
		writable = true;

@@ -5158,6 +5134,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
	if (!folio || folio_is_zone_device(folio))
		goto out_map;

+	/* TODO: handle PTE-mapped THP */
+	if (folio_test_large(folio))
+		goto out_map;
+
	/*
	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state. This misses
@@ -5177,7 +5157,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
		flags |= TNF_SHARED;

	nid = folio_nid(folio);
-	nr_pages = folio_nr_pages(folio);
	/*
	 * For memory tiering mode, cpupid of slow memory page is used
	 * to record page access time.  So use default value.
@@ -5194,7 +5173,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
	}
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	writable = false;
-	ignore_writable = true;

	/* Migrate to the requested node */
	if (migrate_misplaced_folio(folio, vma, target_nid)) {
@@ -5215,19 +5193,14 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)

out:
	if (nid != NUMA_NO_NODE)
-		task_numa_fault(last_cpupid, nid, nr_pages, flags);
+		task_numa_fault(last_cpupid, nid, 1, flags);
	return 0;
out_map:
	/*
	 * Make it present again, depending on how arch implements
	 * non-accessible ptes, some can allow access by kernel mode.
	 */
-	if (folio && folio_test_large(folio))
-		numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
-					   pte_write_upgrade);
-	else
-		numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
-					    writable);
+	numa_rebuild_single_mapping(vmf, vma, writable);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	goto out;
}
@@ -5434,7 +5407,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
		return VM_FAULT_OOM;
retry_pud:
	if (pud_none(*vmf.pud) &&
-	    thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER) &&
+	    thp_vma_allowable_order(vma, vm_flags,
+				TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER) &&
	    !task_in_dynamic_pool(current)) {
		ret = create_huge_pud(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
@@ -5469,7 +5443,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
		goto retry_pud;

	if (pmd_none(*vmf.pmd) &&
-	    thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER) &&
+	    thp_vma_allowable_order(vma, vm_flags,
+				TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER) &&
	    !task_in_dynamic_pool(current)) {
		ret = create_huge_pmd(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))