!12631 some bugfixs for khuegpaged (836fd984) · Commits · EulixOS / Software / Kernel

include/trace/events/huge_memory.h

+8 −8

Original line number	Diff line number	Diff line
		@@ -174,10 +174,10 @@ TRACE_EVENT(mm_collapse_huge_page_swapin,

		TRACE_EVENT(mm_khugepaged_scan_file,

		TP_PROTO(struct mm_struct mm, struct page page, struct file *file,
		TP_PROTO(struct mm_struct mm, struct folio folio, struct file *file,
		int present, int swap, int result),

		TP_ARGS(mm, page, file, present, swap, result),
		TP_ARGS(mm, folio, file, present, swap, result),

		TP_STRUCT__entry(
		__field(struct mm_struct *, mm)
		@@ -190,7 +190,7 @@ TRACE_EVENT(mm_khugepaged_scan_file,

		TP_fast_assign(
		__entry->mm = mm;
		__entry->pfn = page ? page_to_pfn(page) : -1;
		__entry->pfn = folio ? folio_pfn(folio) : -1;
		__assign_str(filename, file->f_path.dentry->d_iname);
		__entry->present = present;
		__entry->swap = swap;
		@@ -207,10 +207,10 @@ TRACE_EVENT(mm_khugepaged_scan_file,
		);

		TRACE_EVENT(mm_khugepaged_collapse_file,
		TP_PROTO(struct mm_struct mm, struct page hpage, pgoff_t index,
		bool is_shmem, unsigned long addr, struct file *file,
		TP_PROTO(struct mm_struct mm, struct folio new_folio, pgoff_t index,
		unsigned long addr, bool is_shmem, struct file *file,
		int nr, int result),
		TP_ARGS(mm, hpage, index, addr, is_shmem, file, nr, result),
		TP_ARGS(mm, new_folio, index, addr, is_shmem, file, nr, result),
		TP_STRUCT__entry(
		__field(struct mm_struct *, mm)
		__field(unsigned long, hpfn)
		@@ -224,7 +224,7 @@ TRACE_EVENT(mm_khugepaged_collapse_file,

		TP_fast_assign(
		__entry->mm = mm;
		__entry->hpfn = hpage ? page_to_pfn(hpage) : -1;
		__entry->hpfn = new_folio ? folio_pfn(new_folio) : -1;
		__entry->index = index;
		__entry->addr = addr;
		__entry->is_shmem = is_shmem;
		@@ -233,7 +233,7 @@ TRACE_EVENT(mm_khugepaged_collapse_file,
		__entry->result = result;
		),

		TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%ld, is_shmem=%d, filename=%s, nr=%d, result=%s",
		TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%lx, is_shmem=%d, filename=%s, nr=%d, result=%s",
		__entry->mm,
		__entry->hpfn,
		__entry->index,

mm/khugepaged.c

+160 −181

Original line number	Diff line number	Diff line
		@@ -555,12 +555,14 @@ static void release_pte_pages(pte_t pte, pte_t _pte,

		static bool is_refcount_suitable(struct folio *folio)
		{
		int expected_refcount;
		int expected_refcount = folio_mapcount(folio);

		expected_refcount = folio_mapcount(folio);
		if (folio_test_swapcache(folio))
		if (!folio_test_anon(folio) \|\| folio_test_swapcache(folio))
		expected_refcount += folio_nr_pages(folio);

		if (folio_test_private(folio))
		expected_refcount++;

		return folio_ref_count(folio) == expected_refcount;
		}

		@@ -794,7 +796,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
		* Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
		*
		* @pte: starting of the PTEs to copy from
		* @page: the new hugepage to copy contents to
		* @folio: the new hugepage to copy contents to
		* @pmd: pointer to the new hugepage's PMD
		* @orig_pmd: the original raw pages' PMD
		* @vma: the original raw pages' virtual memory area
		@@ -802,33 +804,29 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
		* @ptl: lock on raw pages' PTEs
		* @compound_pagelist: list that stores compound pages
		*/
		static int __collapse_huge_page_copy(pte_t *pte,
		struct page *page,
		pmd_t *pmd,
		pmd_t orig_pmd,
		struct vm_area_struct *vma,
		unsigned long address,
		spinlock_t *ptl,
		static int __collapse_huge_page_copy(pte_t pte, struct folio folio,
		pmd_t pmd, pmd_t orig_pmd, struct vm_area_struct vma,
		unsigned long address, spinlock_t *ptl,
		struct list_head *compound_pagelist)
		{
		struct page *src_page;
		pte_t *_pte;
		pte_t pteval;
		unsigned long _address;
		unsigned int i;
		int result = SCAN_SUCCEED;

		/*
		* Copying pages' contents is subject to memory poison at any iteration.
		*/
		for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR;
		_pte++, page++, _address += PAGE_SIZE) {
		pteval = ptep_get(_pte);
		for (i = 0; i < HPAGE_PMD_NR; i++) {
		pte_t pteval = ptep_get(pte + i);
		struct page *page = folio_page(folio, i);
		unsigned long src_addr = address + i * PAGE_SIZE;
		struct page *src_page;

		if (pte_none(pteval) \|\| is_zero_pfn(pte_pfn(pteval))) {
		clear_user_highpage(page, _address);
		clear_user_highpage(page, src_addr);
		continue;
		}
		src_page = pte_page(pteval);
		if (copy_mc_user_highpage(page, src_page, _address, vma)) {
		if (copy_mc_user_highpage(page, src_page, src_addr, vma)) {
		result = SCAN_COPY_MC;
		break;
		}
		@@ -919,20 +917,6 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
		}
		#endif

		static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node,
		nodemask_t *nmask)
		{
		*folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask);

		if (unlikely(!*folio)) {
		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
		return false;
		}

		count_vm_event(THP_COLLAPSE_ALLOC);
		return true;
		}

		/*
		* If mmap_lock temporarily dropped, revalidate vma
		* before taking mmap_lock.
		@@ -1087,7 +1071,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
		return result;
		}

		static int alloc_charge_hpage(struct page *hpage, struct mm_struct mm,
		static int alloc_charge_folio(struct folio *foliop, struct mm_struct mm,
		struct collapse_control *cc)
		{
		gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
		@@ -1098,20 +1082,23 @@ static int alloc_charge_hpage(struct page *hpage, struct mm_struct mm,
		if (cc->reliable)
		gfp \|= GFP_RELIABLE;

		if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) {
		*hpage = NULL;
		folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
		if (!folio) {
		*foliop = NULL;
		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
		return SCAN_ALLOC_HUGE_PAGE_FAIL;
		}

		count_vm_event(THP_COLLAPSE_ALLOC);
		if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
		folio_put(folio);
		*hpage = NULL;
		*foliop = NULL;
		return SCAN_CGROUP_CHARGE_FAIL;
		}

		count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);

		*hpage = folio_page(folio, 0);
		*foliop = folio;
		return SCAN_SUCCEED;
		}

		@@ -1124,7 +1111,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
		pte_t *pte;
		pgtable_t pgtable;
		struct folio *folio;
		struct page *hpage;
		spinlock_t pmd_ptl, pte_ptl;
		int result = SCAN_FAIL;
		struct vm_area_struct *vma;
		@@ -1140,7 +1126,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
		*/
		mmap_read_unlock(mm);

		result = alloc_charge_hpage(&hpage, mm, cc);
		result = alloc_charge_folio(&folio, mm, cc);
		if (result != SCAN_SUCCEED)
		goto out_nolock;

		@@ -1236,14 +1222,13 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
		*/
		anon_vma_unlock_write(vma->anon_vma);

		result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
		result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
		vma, address, pte_ptl,
		&compound_pagelist);
		pte_unmap(pte);
		if (unlikely(result != SCAN_SUCCEED))
		goto out_up_write;

		folio = page_folio(hpage);
		/*
		* The smp_wmb() inside __folio_mark_uptodate() ensures the
		* copy_huge_page writes become visible before the set_pmd_at()
		@@ -1252,12 +1237,12 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
		__folio_mark_uptodate(folio);
		pgtable = pmd_pgtable(_pmd);

		_pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
		_pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
		_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);

		spin_lock(pmd_ptl);
		BUG_ON(!pmd_none(*pmd));
		add_reliable_page_counter(hpage, vma->vm_mm, HPAGE_PMD_NR);
		add_reliable_folio_counter(folio, vma->vm_mm, HPAGE_PMD_NR);
		folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, vma);
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		@@ -1265,14 +1250,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
		update_mmu_cache_pmd(vma, address, pmd);
		spin_unlock(pmd_ptl);

		hpage = NULL;
		folio = NULL;

		result = SCAN_SUCCEED;
		out_up_write:
		mmap_write_unlock(mm);
		out_nolock:
		if (hpage)
		put_page(hpage);
		if (folio)
		folio_put(folio);
		trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
		return result;
		}
		@@ -1835,29 +1820,26 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		struct collapse_control *cc)
		{
		struct address_space *mapping = file->f_mapping;
		struct page *hpage;
		struct page *page;
		struct page *tmp;
		struct folio *folio;
		struct page *dst;
		struct folio folio, tmp, *new_folio;
		pgoff_t index = 0, end = start + HPAGE_PMD_NR;
		LIST_HEAD(pagelist);
		XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
		int nr_none = 0, result = SCAN_SUCCEED;
		bool is_shmem = shmem_file(file);
		int nr = 0;

		VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
		VM_BUG_ON(start & (HPAGE_PMD_NR - 1));

		result = alloc_charge_hpage(&hpage, mm, cc);
		result = alloc_charge_folio(&new_folio, mm, cc);
		if (result != SCAN_SUCCEED)
		goto out;

		__SetPageLocked(hpage);
		__folio_set_locked(new_folio);
		if (is_shmem)
		__SetPageSwapBacked(hpage);
		hpage->index = start;
		hpage->mapping = mapping;
		__folio_set_swapbacked(new_folio);
		new_folio->index = start;
		new_folio->mapping = mapping;

		/*
		* Ensure we have slots for all the pages in the range. This is
		@@ -1875,13 +1857,13 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		}
		} while (1);

		for (index = start; index < end; index++) {
		for (index = start; index < end;) {
		xas_set(&xas, index);
		page = xas_load(&xas);
		folio = xas_load(&xas);

		VM_BUG_ON(index != xas.xa_index);
		if (is_shmem) {
		if (!page) {
		if (!folio) {
		/*
		* Stop if extent has been truncated or
		* hole-punched, and is now completely
		@@ -1894,10 +1876,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		}
		}
		nr_none++;
		index++;
		continue;
		}

		if (xa_is_value(page) \|\| !PageUptodate(page)) {
		if (xa_is_value(folio) \|\| !folio_test_uptodate(folio)) {
		xas_unlock_irq(&xas);
		/* swap in or instantiate fallocated page */
		if (shmem_get_folio(mapping->host, index, 0,
		@@ -1907,28 +1890,27 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		}
		/* drain lru cache to help isolate_lru_page() */
		lru_add_drain();
		page = folio_file_page(folio, index);
		} else if (trylock_page(page)) {
		get_page(page);
		} else if (folio_trylock(folio)) {
		folio_get(folio);
		xas_unlock_irq(&xas);
		} else {
		result = SCAN_PAGE_LOCK;
		goto xa_locked;
		}
		} else { /* !is_shmem */
		if (!page \|\| xa_is_value(page)) {
		if (!folio \|\| xa_is_value(folio)) {
		xas_unlock_irq(&xas);
		page_cache_sync_readahead(mapping, &file->f_ra,
		file, index,
		end - index);
		/* drain lru cache to help isolate_lru_page() */
		lru_add_drain();
		page = find_lock_page(mapping, index);
		if (unlikely(page == NULL)) {
		folio = filemap_lock_folio(mapping, index);
		if (IS_ERR(folio)) {
		result = SCAN_FAIL;
		goto xa_unlocked;
		}
		} else if (PageDirty(page)) {
		} else if (folio_test_dirty(folio)) {
		/*
		* khugepaged only works on read-only fd,
		* so this page is dirty because it hasn't
		@@ -1946,12 +1928,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		filemap_flush(mapping);
		result = SCAN_FAIL;
		goto xa_unlocked;
		} else if (PageWriteback(page)) {
		} else if (folio_test_writeback(folio)) {
		xas_unlock_irq(&xas);
		result = SCAN_FAIL;
		goto xa_unlocked;
		} else if (trylock_page(page)) {
		get_page(page);
		} else if (folio_trylock(folio)) {
		folio_get(folio);
		xas_unlock_irq(&xas);
		} else {
		result = SCAN_PAGE_LOCK;
		@@ -1960,35 +1942,29 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		}

		/*
		* The page must be locked, so we can drop the i_pages lock
		* The folio must be locked, so we can drop the i_pages lock
		* without racing with truncate.
		*/
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

		/* make sure the page is up to date */
		if (unlikely(!PageUptodate(page))) {
		/* make sure the folio is up to date */
		if (unlikely(!folio_test_uptodate(folio))) {
		result = SCAN_FAIL;
		goto out_unlock;
		}

		/*
		* If file was truncated then extended, or hole-punched, before
		* we locked the first page, then a THP might be there already.
		* we locked the first folio, then a THP might be there already.
		* This will be discovered on the first iteration.
		*/
		if (PageTransCompound(page)) {
		struct page *head = compound_head(page);

		result = compound_order(head) == HPAGE_PMD_ORDER &&
		head->index == start
		if (folio_order(folio) == HPAGE_PMD_ORDER &&
		folio->index == start) {
		/* Maybe PMD-mapped */
		? SCAN_PTE_MAPPED_HUGEPAGE
		: SCAN_PAGE_COMPOUND;
		result = SCAN_PTE_MAPPED_HUGEPAGE;
		goto out_unlock;
		}

		folio = page_folio(page);

		if (folio_mapping(folio) != mapping) {
		result = SCAN_TRUNCATED;
		goto out_unlock;
		@@ -1998,7 +1974,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		folio_test_writeback(folio))) {
		/*
		* khugepaged only works on read-only fd, so this
		* page is dirty because it hasn't been flushed
		* folio is dirty because it hasn't been flushed
		* since first write.
		*/
		result = SCAN_FAIL;
		@@ -2022,33 +1998,35 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,

		xas_lock_irq(&xas);

		VM_BUG_ON_PAGE(page != xa_load(xas.xa, index), page);
		VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio);

		/*
		* We control three references to the page:
		* We control 2 + nr_pages references to the folio:
		* - we hold a pin on it;
		* - one reference from page cache;
		* - one from isolate_lru_page;
		* If those are the only references, then any new usage of the
		* page will have to fetch it from the page cache. That requires
		* locking the page to handle truncate, so any new usage will be
		* blocked until we unlock page after collapse/during rollback.
		*/
		if (page_count(page) != 3) {
		* - nr_pages reference from page cache;
		* - one from lru_isolate_folio;
		* If those are the only references, then any new usage
		* of the folio will have to fetch it from the page
		* cache. That requires locking the folio to handle
		* truncate, so any new usage will be blocked until we
		* unlock folio after collapse/during rollback.
		*/
		if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) {
		result = SCAN_PAGE_COUNT;
		xas_unlock_irq(&xas);
		putback_lru_page(page);
		folio_putback_lru(folio);
		goto out_unlock;
		}

		/*
		* Accumulate the pages that are being collapsed.
		* Accumulate the folios that are being collapsed.
		*/
		list_add_tail(&page->lru, &pagelist);
		list_add_tail(&folio->lru, &pagelist);
		index += folio_nr_pages(folio);
		continue;
		out_unlock:
		unlock_page(page);
		put_page(page);
		folio_unlock(folio);
		folio_put(folio);
		goto xa_unlocked;
		}

		@@ -2087,23 +2065,32 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		}

		/*
		* The old pages are locked, so they won't change anymore.
		* The old folios are locked, so they won't change anymore.
		*/
		index = start;
		list_for_each_entry(page, &pagelist, lru) {
		while (index < page->index) {
		clear_highpage(hpage + (index % HPAGE_PMD_NR));
		dst = folio_page(new_folio, 0);
		list_for_each_entry(folio, &pagelist, lru) {
		int i, nr_pages = folio_nr_pages(folio);

		while (index < folio->index) {
		clear_highpage(dst);
		index++;
		dst++;
		}
		if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page)) {

		for (i = 0; i < nr_pages; i++) {
		if (copy_mc_highpage(dst, folio_page(folio, i))) {
		result = SCAN_COPY_MC;
		goto rollback;
		}
		index++;
		dst++;
		}
		}
		while (index < end) {
		clear_highpage(hpage + (index % HPAGE_PMD_NR));
		clear_highpage(dst);
		index++;
		dst++;
		}

		if (nr_none) {
		@@ -2131,16 +2118,17 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		}

		/*
		* If userspace observed a missing page in a VMA with a MODE_MISSING
		* userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
		* page. If so, we need to roll back to avoid suppressing such an
		* event. Since wp/minor userfaultfds don't give userspace any
		* guarantees that the kernel doesn't fill a missing page with a zero
		* page, so they don't matter here.
		* If userspace observed a missing page in a VMA with
		* a MODE_MISSING userfaultfd, then it might expect a
		* UFFD_EVENT_PAGEFAULT for that page. If so, we need to
		* roll back to avoid suppressing such an event. Since
		* wp/minor userfaultfds don't give userspace any
		* guarantees that the kernel doesn't fill a missing
		* page with a zero page, so they don't matter here.
		*
		* Any userfaultfds registered after this point will not be able to
		* observe any missing pages due to the previously inserted retry
		* entries.
		* Any userfaultfds registered after this point will
		* not be able to observe any missing pages due to the
		* previously inserted retry entries.
		*/
		vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
		if (userfaultfd_missing(vma)) {
		@@ -2165,33 +2153,32 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		xas_lock_irq(&xas);
		}

		folio = page_folio(hpage);
		nr = folio_nr_pages(folio);
		if (is_shmem)
		__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
		__lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
		else
		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr);
		__lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);

		if (nr_none) {
		__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_none);
		__lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
		/* nr_none is always 0 for non-shmem. */
		__lruvec_stat_mod_folio(folio, NR_SHMEM, nr_none);
		__lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
		}

		/*
		* Mark hpage as uptodate before inserting it into the page cache so
		* that it isn't mistaken for an fallocated but unwritten page.
		* Mark new_folio as uptodate before inserting it into the
		* page cache so that it isn't mistaken for an fallocated but
		* unwritten page.
		*/
		folio_mark_uptodate(folio);
		folio_ref_add(folio, HPAGE_PMD_NR - 1);
		folio_mark_uptodate(new_folio);
		folio_ref_add(new_folio, HPAGE_PMD_NR - 1);

		if (is_shmem)
		folio_mark_dirty(folio);
		folio_add_lru(folio);
		folio_mark_dirty(new_folio);
		folio_add_lru(new_folio);

		/* Join all the small entries into a single multi-index entry. */
		xas_set_order(&xas, start, HPAGE_PMD_ORDER);
		xas_store(&xas, folio);
		xas_store(&xas, new_folio);
		WARN_ON_ONCE(xas_error(&xas));
		xas_unlock_irq(&xas);

		@@ -2202,18 +2189,18 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		retract_page_tables(mapping, start);
		if (cc && !cc->is_khugepaged)
		result = SCAN_PTE_MAPPED_HUGEPAGE;
		folio_unlock(folio);
		folio_unlock(new_folio);

		/*
		* The collapse has succeeded, so free the old pages.
		* The collapse has succeeded, so free the old folios.
		*/
		list_for_each_entry_safe(page, tmp, &pagelist, lru) {
		list_del(&page->lru);
		page->mapping = NULL;
		ClearPageActive(page);
		ClearPageUnevictable(page);
		unlock_page(page);
		folio_put_refs(page_folio(page), 3);
		list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
		list_del(&folio->lru);
		folio->mapping = NULL;
		folio_clear_active(folio);
		folio_clear_unevictable(folio);
		folio_unlock(folio);
		folio_put_refs(folio, 2 + folio_nr_pages(folio));
		}

		goto out;
		@@ -2227,11 +2214,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		shmem_uncharge(mapping->host, nr_none);
		}

		list_for_each_entry_safe(page, tmp, &pagelist, lru) {
		list_del(&page->lru);
		unlock_page(page);
		putback_lru_page(page);
		put_page(page);
		list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
		list_del(&folio->lru);
		folio_unlock(folio);
		folio_putback_lru(folio);
		folio_put(folio);
		}
		/*
		* Undo the updates of filemap_nr_thps_inc for non-SHMEM
		@@ -2247,13 +2234,13 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		smp_mb();
		}

		hpage->mapping = NULL;
		new_folio->mapping = NULL;

		unlock_page(hpage);
		put_page(hpage);
		folio_unlock(new_folio);
		folio_put(new_folio);
		out:
		VM_BUG_ON(!list_empty(&pagelist));
		trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
		trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result);
		return result;
		}

		@@ -2261,7 +2248,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
		struct file *file, pgoff_t start,
		struct collapse_control *cc)
		{
		struct page *page = NULL;
		struct folio *folio = NULL;
		struct address_space *mapping = file->f_mapping;
		XA_STATE(xas, &mapping->i_pages, start);
		int present, swap;
		@@ -2274,12 +2261,12 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
		nodes_clear(cc->alloc_nmask);
		cc->reliable = false;
		rcu_read_lock();
		xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
		if (xas_retry(&xas, page))
		xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
		if (xas_retry(&xas, folio))
		continue;

		if (xa_is_value(page)) {
		++swap;
		if (xa_is_value(folio)) {
		swap += 1 << xas_get_order(&xas);
		if (cc->is_khugepaged &&
		swap > khugepaged_max_ptes_swap) {
		result = SCAN_EXCEED_SWAP_PTE;
		@@ -2289,18 +2276,10 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
		continue;
		}

		/*
		* TODO: khugepaged should compact smaller compound pages
		* into a PMD sized page
		*/
		if (PageTransCompound(page)) {
		struct page *head = compound_head(page);

		result = compound_order(head) == HPAGE_PMD_ORDER &&
		head->index == start
		if (folio_order(folio) == HPAGE_PMD_ORDER &&
		folio->index == start) {
		/* Maybe PMD-mapped */
		? SCAN_PTE_MAPPED_HUGEPAGE
		: SCAN_PAGE_COMPOUND;
		result = SCAN_PTE_MAPPED_HUGEPAGE;
		/*
		* For SCAN_PTE_MAPPED_HUGEPAGE, further processing
		* by the caller won't touch the page cache, and so
		@@ -2310,43 +2289,43 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
		break;
		}

		node = page_to_nid(page);
		node = folio_nid(folio);
		if (hpage_collapse_scan_abort(node, cc)) {
		result = SCAN_SCAN_ABORT;
		break;
		}
		cc->node_load[node]++;

		if (!PageLRU(page)) {
		if (!folio_test_lru(folio)) {
		result = SCAN_PAGE_LRU;
		break;
		}

		if (page_count(page) !=
		1 + page_mapcount(page) + page_has_private(page)) {
		if (!is_refcount_suitable(folio)) {
		result = SCAN_PAGE_COUNT;
		break;
		}

		if (page_from_dynamic_pool(page)) {
		if (page_from_dynamic_pool(&folio->page)) {
		result = SCAN_FAIL;
		break;
		}

		/*
		* We probably should check if the page is referenced here, but
		* nobody would transfer pte_young() to PageReferenced() for us.
		* And rmap walk here is just too costly...
		* We probably should check if the folio is referenced
		* here, but nobody would transfer pte_young() to
		* folio_test_referenced() for us. And rmap walk here
		* is just too costly...
		*/

		present++;
		present += folio_nr_pages(folio);

		if (need_resched()) {
		xas_pause(&xas);
		cond_resched_rcu();
		}

		if (page_reliable(page))
		if (folio_reliable(folio))
		cc->reliable = true;
		}
		rcu_read_unlock();
		@@ -2361,7 +2340,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
		}
		}

		trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result);
		trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
		return result;
		}
		#else

tools/testing/selftests/mm/khugepaged.c

+3 −1

Original line number	Diff line number	Diff line
		@@ -1095,7 +1095,7 @@ static void usage(void)
		fprintf(stderr, "\n\tSupported Options:\n");
		fprintf(stderr, "\t\t-h: This help message.\n");
		fprintf(stderr, "\t\t-s: mTHP size, expressed as page order.\n");
		fprintf(stderr, "\t\t Defaults to 0. Use this size for anon allocations.\n");
		fprintf(stderr, "\t\t Defaults to 0. Use this size for anon or shmem allocations.\n");
		exit(1);
		}

		@@ -1209,6 +1209,8 @@ int main(int argc, char **argv)
		default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
		default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT;
		default_settings.hugepages[anon_order].enabled = THP_ALWAYS;
		default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT;
		default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS;

		save_settings();
		thp_push_settings(&default_settings);

tools/testing/selftests/mm/thp_settings.c

+40 −6

Original line number	Diff line number	Diff line
		@@ -33,10 +33,11 @@ static const char * const thp_defrag_strings[] = {
		};

		static const char * const shmem_enabled_strings[] = {
		"never",
		"always",
		"within_size",
		"advise",
		"never",
		"inherit",
		"deny",
		"force",
		NULL
		@@ -200,6 +201,7 @@ void thp_write_num(const char *name, unsigned long num)
		void thp_read_settings(struct thp_settings *settings)
		{
		unsigned long orders = thp_supported_orders();
		unsigned long shmem_orders = thp_shmem_supported_orders();
		char path[PATH_MAX];
		int i;

		@@ -234,12 +236,24 @@ void thp_read_settings(struct thp_settings *settings)
		settings->hugepages[i].enabled =
		thp_read_string(path, thp_enabled_strings);
		}

		for (i = 0; i < NR_ORDERS; i++) {
		if (!((1 << i) & shmem_orders)) {
		settings->shmem_hugepages[i].enabled = SHMEM_NEVER;
		continue;
		}
		snprintf(path, PATH_MAX, "hugepages-%ukB/shmem_enabled",
		(getpagesize() >> 10) << i);
		settings->shmem_hugepages[i].enabled =
		thp_read_string(path, shmem_enabled_strings);
		}
		}

		void thp_write_settings(struct thp_settings *settings)
		{
		struct khugepaged_settings *khugepaged = &settings->khugepaged;
		unsigned long orders = thp_supported_orders();
		unsigned long shmem_orders = thp_shmem_supported_orders();
		char path[PATH_MAX];
		int enabled;
		int i;
		@@ -271,6 +285,15 @@ void thp_write_settings(struct thp_settings *settings)
		enabled = settings->hugepages[i].enabled;
		thp_write_string(path, thp_enabled_strings[enabled]);
		}

		for (i = 0; i < NR_ORDERS; i++) {
		if (!((1 << i) & shmem_orders))
		continue;
		snprintf(path, PATH_MAX, "hugepages-%ukB/shmem_enabled",
		(getpagesize() >> 10) << i);
		enabled = settings->shmem_hugepages[i].enabled;
		thp_write_string(path, shmem_enabled_strings[enabled]);
		}
		}

		struct thp_settings *thp_current_settings(void)
		@@ -324,17 +347,18 @@ void thp_set_read_ahead_path(char *path)
		dev_queue_read_ahead_path[sizeof(dev_queue_read_ahead_path) - 1] = '\0';
		}

		unsigned long thp_supported_orders(void)
		static unsigned long __thp_supported_orders(bool is_shmem)
		{
		unsigned long orders = 0;
		char path[PATH_MAX];
		char buf[256];
		int ret;
		int i;
		int ret, i;
		char anon_dir[] = "enabled";
		char shmem_dir[] = "shmem_enabled";

		for (i = 0; i < NR_ORDERS; i++) {
		ret = snprintf(path, PATH_MAX, THP_SYSFS "hugepages-%ukB/enabled",
		(getpagesize() >> 10) << i);
		ret = snprintf(path, PATH_MAX, THP_SYSFS "hugepages-%ukB/%s",
		(getpagesize() >> 10) << i, is_shmem ? shmem_dir : anon_dir);
		if (ret >= PATH_MAX) {
		printf("%s: Pathname is too long\n", __func__);
		exit(EXIT_FAILURE);
		@@ -347,3 +371,13 @@ unsigned long thp_supported_orders(void)

		return orders;
		}

		unsigned long thp_supported_orders(void)
		{
		return __thp_supported_orders(false);
		}

		unsigned long thp_shmem_supported_orders(void)
		{
		return __thp_supported_orders(true);
		}

tools/testing/selftests/mm/thp_settings.h

+8 −1

Original line number	Diff line number	Diff line
		@@ -22,10 +22,11 @@ enum thp_defrag {
		};

		enum shmem_enabled {
		SHMEM_NEVER,
		SHMEM_ALWAYS,
		SHMEM_WITHIN_SIZE,
		SHMEM_ADVISE,
		SHMEM_NEVER,
		SHMEM_INHERIT,
		SHMEM_DENY,
		SHMEM_FORCE,
		};
		@@ -46,6 +47,10 @@ struct khugepaged_settings {
		unsigned long pages_to_scan;
		};

		struct shmem_hugepages_settings {
		enum shmem_enabled enabled;
		};

		struct thp_settings {
		enum thp_enabled thp_enabled;
		enum thp_defrag thp_defrag;
		@@ -54,6 +59,7 @@ struct thp_settings {
		struct khugepaged_settings khugepaged;
		unsigned long read_ahead_kb;
		struct hugepages_settings hugepages[NR_ORDERS];
		struct shmem_hugepages_settings shmem_hugepages[NR_ORDERS];
		};

		int read_file(const char path, char buf, size_t buflen);
		@@ -76,5 +82,6 @@ void thp_save_settings(void);

		void thp_set_read_ahead_path(char *path);
		unsigned long thp_supported_orders(void);
		unsigned long thp_shmem_supported_orders(void);

		#endif /* __THP_SETTINGS_H__ */