Unverified Commit 836fd984 authored by openeuler-ci-bot's avatar openeuler-ci-bot Committed by Gitee
Browse files

!12631 some bugfixs for khuegpaged

Merge Pull Request from: @ci-robot 
 
PR sync from: Nanyong Sun <sunnanyong@huawei.com>
https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/IGZDSM5FM6RVWTBUWQSVD2BVR7IBYSEO/ 
some bugfixs for khuegpaged, after this, khugepaged support collapse
large folios of pagecache.

Baolin Wang (6):
  mm: khugepaged: expand the is_refcount_suitable() to support file
    folios
  mm: khugepaged: use the number of pages in the folio to check the
    reference count
  mm: khugepaged: support shmem mTHP copy
  mm: khugepaged: support shmem mTHP collapse
  selftests: mm: support shmem mTHP collapse testing
  mm: khugepaged: fix the incorrect statistics when collapsing large
    file folios

Matthew Wilcox (Oracle) (7):
  khugepaged: inline hpage_collapse_alloc_folio()
  khugepaged: convert alloc_charge_hpage to alloc_charge_folio
  khugepaged: remove hpage from collapse_huge_page()
  khugepaged: pass a folio to __collapse_huge_page_copy()
  khugepaged: remove hpage from collapse_file()
  khugepaged: use a folio throughout collapse_file()
  khugepaged: use a folio throughout hpage_collapse_scan_file()

Yang Shi (1):
  mm: khugepaged: fix the arguments order in khugepaged_collapse_file
    trace point


-- 
2.33.0
 
https://gitee.com/openeuler/kernel/issues/IAXCD2 
 
Link:https://gitee.com/openeuler/kernel/pulls/12631

 

Reviewed-by: default avatarKefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: default avatarZhang Peng <zhangpeng362@huawei.com>
parents a8e395e1 eb3b796a
Loading
Loading
Loading
Loading
+8 −8
Original line number Diff line number Diff line
@@ -174,10 +174,10 @@ TRACE_EVENT(mm_collapse_huge_page_swapin,

TRACE_EVENT(mm_khugepaged_scan_file,

	TP_PROTO(struct mm_struct *mm, struct page *page, struct file *file,
	TP_PROTO(struct mm_struct *mm, struct folio *folio, struct file *file,
		 int present, int swap, int result),

	TP_ARGS(mm, page, file, present, swap, result),
	TP_ARGS(mm, folio, file, present, swap, result),

	TP_STRUCT__entry(
		__field(struct mm_struct *, mm)
@@ -190,7 +190,7 @@ TRACE_EVENT(mm_khugepaged_scan_file,

	TP_fast_assign(
		__entry->mm = mm;
		__entry->pfn = page ? page_to_pfn(page) : -1;
		__entry->pfn = folio ? folio_pfn(folio) : -1;
		__assign_str(filename, file->f_path.dentry->d_iname);
		__entry->present = present;
		__entry->swap = swap;
@@ -207,10 +207,10 @@ TRACE_EVENT(mm_khugepaged_scan_file,
);

TRACE_EVENT(mm_khugepaged_collapse_file,
	TP_PROTO(struct mm_struct *mm, struct page *hpage, pgoff_t index,
			bool is_shmem, unsigned long addr, struct file *file,
	TP_PROTO(struct mm_struct *mm, struct folio *new_folio, pgoff_t index,
			unsigned long addr, bool is_shmem, struct file *file,
			int nr, int result),
	TP_ARGS(mm, hpage, index, addr, is_shmem, file, nr, result),
	TP_ARGS(mm, new_folio, index, addr, is_shmem, file, nr, result),
	TP_STRUCT__entry(
		__field(struct mm_struct *, mm)
		__field(unsigned long, hpfn)
@@ -224,7 +224,7 @@ TRACE_EVENT(mm_khugepaged_collapse_file,

	TP_fast_assign(
		__entry->mm = mm;
		__entry->hpfn = hpage ? page_to_pfn(hpage) : -1;
		__entry->hpfn = new_folio ? folio_pfn(new_folio) : -1;
		__entry->index = index;
		__entry->addr = addr;
		__entry->is_shmem = is_shmem;
@@ -233,7 +233,7 @@ TRACE_EVENT(mm_khugepaged_collapse_file,
		__entry->result = result;
	),

	TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%ld, is_shmem=%d, filename=%s, nr=%d, result=%s",
	TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%lx, is_shmem=%d, filename=%s, nr=%d, result=%s",
		__entry->mm,
		__entry->hpfn,
		__entry->index,
+160 −181
Original line number Diff line number Diff line
@@ -555,12 +555,14 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,

static bool is_refcount_suitable(struct folio *folio)
{
	int expected_refcount;
	int expected_refcount = folio_mapcount(folio);

	expected_refcount = folio_mapcount(folio);
	if (folio_test_swapcache(folio))
	if (!folio_test_anon(folio) || folio_test_swapcache(folio))
		expected_refcount += folio_nr_pages(folio);

	if (folio_test_private(folio))
		expected_refcount++;

	return folio_ref_count(folio) == expected_refcount;
}

@@ -794,7 +796,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
 * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
 *
 * @pte: starting of the PTEs to copy from
 * @page: the new hugepage to copy contents to
 * @folio: the new hugepage to copy contents to
 * @pmd: pointer to the new hugepage's PMD
 * @orig_pmd: the original raw pages' PMD
 * @vma: the original raw pages' virtual memory area
@@ -802,33 +804,29 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
 * @ptl: lock on raw pages' PTEs
 * @compound_pagelist: list that stores compound pages
 */
static int __collapse_huge_page_copy(pte_t *pte,
				     struct page *page,
				     pmd_t *pmd,
				     pmd_t orig_pmd,
				     struct vm_area_struct *vma,
				     unsigned long address,
				     spinlock_t *ptl,
static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
		unsigned long address, spinlock_t *ptl,
		struct list_head *compound_pagelist)
{
	struct page *src_page;
	pte_t *_pte;
	pte_t pteval;
	unsigned long _address;
	unsigned int i;
	int result = SCAN_SUCCEED;

	/*
	 * Copying pages' contents is subject to memory poison at any iteration.
	 */
	for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR;
	     _pte++, page++, _address += PAGE_SIZE) {
		pteval = ptep_get(_pte);
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		pte_t pteval = ptep_get(pte + i);
		struct page *page = folio_page(folio, i);
		unsigned long src_addr = address + i * PAGE_SIZE;
		struct page *src_page;

		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			clear_user_highpage(page, _address);
			clear_user_highpage(page, src_addr);
			continue;
		}
		src_page = pte_page(pteval);
		if (copy_mc_user_highpage(page, src_page, _address, vma)) {
		if (copy_mc_user_highpage(page, src_page, src_addr, vma)) {
			result = SCAN_COPY_MC;
			break;
		}
@@ -919,20 +917,6 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
}
#endif

static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node,
				      nodemask_t *nmask)
{
	*folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask);

	if (unlikely(!*folio)) {
		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
		return false;
	}

	count_vm_event(THP_COLLAPSE_ALLOC);
	return true;
}

/*
 * If mmap_lock temporarily dropped, revalidate vma
 * before taking mmap_lock.
@@ -1087,7 +1071,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
	return result;
}

static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
			      struct collapse_control *cc)
{
	gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
@@ -1098,20 +1082,23 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
	if (cc->reliable)
		gfp |= GFP_RELIABLE;

	if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) {
		*hpage = NULL;
	folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
	if (!folio) {
		*foliop = NULL;
		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
		return SCAN_ALLOC_HUGE_PAGE_FAIL;
	}

	count_vm_event(THP_COLLAPSE_ALLOC);
	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
		folio_put(folio);
		*hpage = NULL;
		*foliop = NULL;
		return SCAN_CGROUP_CHARGE_FAIL;
	}

	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);

	*hpage = folio_page(folio, 0);
	*foliop = folio;
	return SCAN_SUCCEED;
}

@@ -1124,7 +1111,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
	pte_t *pte;
	pgtable_t pgtable;
	struct folio *folio;
	struct page *hpage;
	spinlock_t *pmd_ptl, *pte_ptl;
	int result = SCAN_FAIL;
	struct vm_area_struct *vma;
@@ -1140,7 +1126,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
	 */
	mmap_read_unlock(mm);

	result = alloc_charge_hpage(&hpage, mm, cc);
	result = alloc_charge_folio(&folio, mm, cc);
	if (result != SCAN_SUCCEED)
		goto out_nolock;

@@ -1236,14 +1222,13 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
	 */
	anon_vma_unlock_write(vma->anon_vma);

	result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
					   vma, address, pte_ptl,
					   &compound_pagelist);
	pte_unmap(pte);
	if (unlikely(result != SCAN_SUCCEED))
		goto out_up_write;

	folio = page_folio(hpage);
	/*
	 * The smp_wmb() inside __folio_mark_uptodate() ensures the
	 * copy_huge_page writes become visible before the set_pmd_at()
@@ -1252,12 +1237,12 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
	__folio_mark_uptodate(folio);
	pgtable = pmd_pgtable(_pmd);

	_pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
	_pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);

	spin_lock(pmd_ptl);
	BUG_ON(!pmd_none(*pmd));
	add_reliable_page_counter(hpage, vma->vm_mm, HPAGE_PMD_NR);
	add_reliable_folio_counter(folio, vma->vm_mm, HPAGE_PMD_NR);
	folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
	folio_add_lru_vma(folio, vma);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
@@ -1265,14 +1250,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
	update_mmu_cache_pmd(vma, address, pmd);
	spin_unlock(pmd_ptl);

	hpage = NULL;
	folio = NULL;

	result = SCAN_SUCCEED;
out_up_write:
	mmap_write_unlock(mm);
out_nolock:
	if (hpage)
		put_page(hpage);
	if (folio)
		folio_put(folio);
	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
	return result;
}
@@ -1835,29 +1820,26 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
			 struct collapse_control *cc)
{
	struct address_space *mapping = file->f_mapping;
	struct page *hpage;
	struct page *page;
	struct page *tmp;
	struct folio *folio;
	struct page *dst;
	struct folio *folio, *tmp, *new_folio;
	pgoff_t index = 0, end = start + HPAGE_PMD_NR;
	LIST_HEAD(pagelist);
	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
	int nr_none = 0, result = SCAN_SUCCEED;
	bool is_shmem = shmem_file(file);
	int nr = 0;

	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));

	result = alloc_charge_hpage(&hpage, mm, cc);
	result = alloc_charge_folio(&new_folio, mm, cc);
	if (result != SCAN_SUCCEED)
		goto out;

	__SetPageLocked(hpage);
	__folio_set_locked(new_folio);
	if (is_shmem)
		__SetPageSwapBacked(hpage);
	hpage->index = start;
	hpage->mapping = mapping;
		__folio_set_swapbacked(new_folio);
	new_folio->index = start;
	new_folio->mapping = mapping;

	/*
	 * Ensure we have slots for all the pages in the range.  This is
@@ -1875,13 +1857,13 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		}
	} while (1);

	for (index = start; index < end; index++) {
	for (index = start; index < end;) {
		xas_set(&xas, index);
		page = xas_load(&xas);
		folio = xas_load(&xas);

		VM_BUG_ON(index != xas.xa_index);
		if (is_shmem) {
			if (!page) {
			if (!folio) {
				/*
				 * Stop if extent has been truncated or
				 * hole-punched, and is now completely
@@ -1894,10 +1876,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
					}
				}
				nr_none++;
				index++;
				continue;
			}

			if (xa_is_value(page) || !PageUptodate(page)) {
			if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
				xas_unlock_irq(&xas);
				/* swap in or instantiate fallocated page */
				if (shmem_get_folio(mapping->host, index, 0,
@@ -1907,28 +1890,27 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
				}
				/* drain lru cache to help isolate_lru_page() */
				lru_add_drain();
				page = folio_file_page(folio, index);
			} else if (trylock_page(page)) {
				get_page(page);
			} else if (folio_trylock(folio)) {
				folio_get(folio);
				xas_unlock_irq(&xas);
			} else {
				result = SCAN_PAGE_LOCK;
				goto xa_locked;
			}
		} else {	/* !is_shmem */
			if (!page || xa_is_value(page)) {
			if (!folio || xa_is_value(folio)) {
				xas_unlock_irq(&xas);
				page_cache_sync_readahead(mapping, &file->f_ra,
							  file, index,
							  end - index);
				/* drain lru cache to help isolate_lru_page() */
				lru_add_drain();
				page = find_lock_page(mapping, index);
				if (unlikely(page == NULL)) {
				folio = filemap_lock_folio(mapping, index);
				if (IS_ERR(folio)) {
					result = SCAN_FAIL;
					goto xa_unlocked;
				}
			} else if (PageDirty(page)) {
			} else if (folio_test_dirty(folio)) {
				/*
				 * khugepaged only works on read-only fd,
				 * so this page is dirty because it hasn't
@@ -1946,12 +1928,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
				filemap_flush(mapping);
				result = SCAN_FAIL;
				goto xa_unlocked;
			} else if (PageWriteback(page)) {
			} else if (folio_test_writeback(folio)) {
				xas_unlock_irq(&xas);
				result = SCAN_FAIL;
				goto xa_unlocked;
			} else if (trylock_page(page)) {
				get_page(page);
			} else if (folio_trylock(folio)) {
				folio_get(folio);
				xas_unlock_irq(&xas);
			} else {
				result = SCAN_PAGE_LOCK;
@@ -1960,35 +1942,29 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		}

		/*
		 * The page must be locked, so we can drop the i_pages lock
		 * The folio must be locked, so we can drop the i_pages lock
		 * without racing with truncate.
		 */
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

		/* make sure the page is up to date */
		if (unlikely(!PageUptodate(page))) {
		/* make sure the folio is up to date */
		if (unlikely(!folio_test_uptodate(folio))) {
			result = SCAN_FAIL;
			goto out_unlock;
		}

		/*
		 * If file was truncated then extended, or hole-punched, before
		 * we locked the first page, then a THP might be there already.
		 * we locked the first folio, then a THP might be there already.
		 * This will be discovered on the first iteration.
		 */
		if (PageTransCompound(page)) {
			struct page *head = compound_head(page);

			result = compound_order(head) == HPAGE_PMD_ORDER &&
					head->index == start
		if (folio_order(folio) == HPAGE_PMD_ORDER &&
		    folio->index == start) {
			/* Maybe PMD-mapped */
					? SCAN_PTE_MAPPED_HUGEPAGE
					: SCAN_PAGE_COMPOUND;
			result = SCAN_PTE_MAPPED_HUGEPAGE;
			goto out_unlock;
		}

		folio = page_folio(page);

		if (folio_mapping(folio) != mapping) {
			result = SCAN_TRUNCATED;
			goto out_unlock;
@@ -1998,7 +1974,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
				  folio_test_writeback(folio))) {
			/*
			 * khugepaged only works on read-only fd, so this
			 * page is dirty because it hasn't been flushed
			 * folio is dirty because it hasn't been flushed
			 * since first write.
			 */
			result = SCAN_FAIL;
@@ -2022,33 +1998,35 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,

		xas_lock_irq(&xas);

		VM_BUG_ON_PAGE(page != xa_load(xas.xa, index), page);
		VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio);

		/*
		 * We control three references to the page:
		 * We control 2 + nr_pages references to the folio:
		 *  - we hold a pin on it;
		 *  - one reference from page cache;
		 *  - one from isolate_lru_page;
		 * If those are the only references, then any new usage of the
		 * page will have to fetch it from the page cache. That requires
		 * locking the page to handle truncate, so any new usage will be
		 * blocked until we unlock page after collapse/during rollback.
		 */
		if (page_count(page) != 3) {
		 *  - nr_pages reference from page cache;
		 *  - one from lru_isolate_folio;
		 * If those are the only references, then any new usage
		 * of the folio will have to fetch it from the page
		 * cache. That requires locking the folio to handle
		 * truncate, so any new usage will be blocked until we
		 * unlock folio after collapse/during rollback.
		 */
		if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) {
			result = SCAN_PAGE_COUNT;
			xas_unlock_irq(&xas);
			putback_lru_page(page);
			folio_putback_lru(folio);
			goto out_unlock;
		}

		/*
		 * Accumulate the pages that are being collapsed.
		 * Accumulate the folios that are being collapsed.
		 */
		list_add_tail(&page->lru, &pagelist);
		list_add_tail(&folio->lru, &pagelist);
		index += folio_nr_pages(folio);
		continue;
out_unlock:
		unlock_page(page);
		put_page(page);
		folio_unlock(folio);
		folio_put(folio);
		goto xa_unlocked;
	}

@@ -2087,23 +2065,32 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
	}

	/*
	 * The old pages are locked, so they won't change anymore.
	 * The old folios are locked, so they won't change anymore.
	 */
	index = start;
	list_for_each_entry(page, &pagelist, lru) {
		while (index < page->index) {
			clear_highpage(hpage + (index % HPAGE_PMD_NR));
	dst = folio_page(new_folio, 0);
	list_for_each_entry(folio, &pagelist, lru) {
		int i, nr_pages = folio_nr_pages(folio);

		while (index < folio->index) {
			clear_highpage(dst);
			index++;
			dst++;
		}
		if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page)) {

		for (i = 0; i < nr_pages; i++) {
			if (copy_mc_highpage(dst, folio_page(folio, i))) {
				result = SCAN_COPY_MC;
				goto rollback;
			}
			index++;
			dst++;
		}
	}
	while (index < end) {
		clear_highpage(hpage + (index % HPAGE_PMD_NR));
		clear_highpage(dst);
		index++;
		dst++;
	}

	if (nr_none) {
@@ -2131,16 +2118,17 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		}

		/*
		 * If userspace observed a missing page in a VMA with a MODE_MISSING
		 * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
		 * page. If so, we need to roll back to avoid suppressing such an
		 * event. Since wp/minor userfaultfds don't give userspace any
		 * guarantees that the kernel doesn't fill a missing page with a zero
		 * page, so they don't matter here.
		 * If userspace observed a missing page in a VMA with
		 * a MODE_MISSING userfaultfd, then it might expect a
		 * UFFD_EVENT_PAGEFAULT for that page. If so, we need to
		 * roll back to avoid suppressing such an event. Since
		 * wp/minor userfaultfds don't give userspace any
		 * guarantees that the kernel doesn't fill a missing
		 * page with a zero page, so they don't matter here.
		 *
		 * Any userfaultfds registered after this point will not be able to
		 * observe any missing pages due to the previously inserted retry
		 * entries.
		 * Any userfaultfds registered after this point will
		 * not be able to observe any missing pages due to the
		 * previously inserted retry entries.
		 */
		vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
			if (userfaultfd_missing(vma)) {
@@ -2165,33 +2153,32 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		xas_lock_irq(&xas);
	}

	folio = page_folio(hpage);
	nr = folio_nr_pages(folio);
	if (is_shmem)
		__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
		__lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
	else
		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr);
		__lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);

	if (nr_none) {
		__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_none);
		__lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
		/* nr_none is always 0 for non-shmem. */
		__lruvec_stat_mod_folio(folio, NR_SHMEM, nr_none);
		__lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
	}

	/*
	 * Mark hpage as uptodate before inserting it into the page cache so
	 * that it isn't mistaken for an fallocated but unwritten page.
	 * Mark new_folio as uptodate before inserting it into the
	 * page cache so that it isn't mistaken for an fallocated but
	 * unwritten page.
	 */
	folio_mark_uptodate(folio);
	folio_ref_add(folio, HPAGE_PMD_NR - 1);
	folio_mark_uptodate(new_folio);
	folio_ref_add(new_folio, HPAGE_PMD_NR - 1);

	if (is_shmem)
		folio_mark_dirty(folio);
	folio_add_lru(folio);
		folio_mark_dirty(new_folio);
	folio_add_lru(new_folio);

	/* Join all the small entries into a single multi-index entry. */
	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
	xas_store(&xas, folio);
	xas_store(&xas, new_folio);
	WARN_ON_ONCE(xas_error(&xas));
	xas_unlock_irq(&xas);

@@ -2202,18 +2189,18 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
	retract_page_tables(mapping, start);
	if (cc && !cc->is_khugepaged)
		result = SCAN_PTE_MAPPED_HUGEPAGE;
	folio_unlock(folio);
	folio_unlock(new_folio);

	/*
	 * The collapse has succeeded, so free the old pages.
	 * The collapse has succeeded, so free the old folios.
	 */
	list_for_each_entry_safe(page, tmp, &pagelist, lru) {
		list_del(&page->lru);
		page->mapping = NULL;
		ClearPageActive(page);
		ClearPageUnevictable(page);
		unlock_page(page);
		folio_put_refs(page_folio(page), 3);
	list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
		list_del(&folio->lru);
		folio->mapping = NULL;
		folio_clear_active(folio);
		folio_clear_unevictable(folio);
		folio_unlock(folio);
		folio_put_refs(folio, 2 + folio_nr_pages(folio));
	}

	goto out;
@@ -2227,11 +2214,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		shmem_uncharge(mapping->host, nr_none);
	}

	list_for_each_entry_safe(page, tmp, &pagelist, lru) {
		list_del(&page->lru);
		unlock_page(page);
		putback_lru_page(page);
		put_page(page);
	list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
		list_del(&folio->lru);
		folio_unlock(folio);
		folio_putback_lru(folio);
		folio_put(folio);
	}
	/*
	 * Undo the updates of filemap_nr_thps_inc for non-SHMEM
@@ -2247,13 +2234,13 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
		smp_mb();
	}

	hpage->mapping = NULL;
	new_folio->mapping = NULL;

	unlock_page(hpage);
	put_page(hpage);
	folio_unlock(new_folio);
	folio_put(new_folio);
out:
	VM_BUG_ON(!list_empty(&pagelist));
	trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
	trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result);
	return result;
}

@@ -2261,7 +2248,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
				    struct file *file, pgoff_t start,
				    struct collapse_control *cc)
{
	struct page *page = NULL;
	struct folio *folio = NULL;
	struct address_space *mapping = file->f_mapping;
	XA_STATE(xas, &mapping->i_pages, start);
	int present, swap;
@@ -2274,12 +2261,12 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
	nodes_clear(cc->alloc_nmask);
	cc->reliable = false;
	rcu_read_lock();
	xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
		if (xas_retry(&xas, page))
	xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
		if (xas_retry(&xas, folio))
			continue;

		if (xa_is_value(page)) {
			++swap;
		if (xa_is_value(folio)) {
			swap += 1 << xas_get_order(&xas);
			if (cc->is_khugepaged &&
			    swap > khugepaged_max_ptes_swap) {
				result = SCAN_EXCEED_SWAP_PTE;
@@ -2289,18 +2276,10 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
			continue;
		}

		/*
		 * TODO: khugepaged should compact smaller compound pages
		 * into a PMD sized page
		 */
		if (PageTransCompound(page)) {
			struct page *head = compound_head(page);

			result = compound_order(head) == HPAGE_PMD_ORDER &&
					head->index == start
		if (folio_order(folio) == HPAGE_PMD_ORDER &&
		    folio->index == start) {
			/* Maybe PMD-mapped */
					? SCAN_PTE_MAPPED_HUGEPAGE
					: SCAN_PAGE_COMPOUND;
			result = SCAN_PTE_MAPPED_HUGEPAGE;
			/*
			 * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
			 * by the caller won't touch the page cache, and so
@@ -2310,43 +2289,43 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
			break;
		}

		node = page_to_nid(page);
		node = folio_nid(folio);
		if (hpage_collapse_scan_abort(node, cc)) {
			result = SCAN_SCAN_ABORT;
			break;
		}
		cc->node_load[node]++;

		if (!PageLRU(page)) {
		if (!folio_test_lru(folio)) {
			result = SCAN_PAGE_LRU;
			break;
		}

		if (page_count(page) !=
		    1 + page_mapcount(page) + page_has_private(page)) {
		if (!is_refcount_suitable(folio)) {
			result = SCAN_PAGE_COUNT;
			break;
		}

		if (page_from_dynamic_pool(page)) {
		if (page_from_dynamic_pool(&folio->page)) {
			result = SCAN_FAIL;
			break;
		}

		/*
		 * We probably should check if the page is referenced here, but
		 * nobody would transfer pte_young() to PageReferenced() for us.
		 * And rmap walk here is just too costly...
		 * We probably should check if the folio is referenced
		 * here, but nobody would transfer pte_young() to
		 * folio_test_referenced() for us.  And rmap walk here
		 * is just too costly...
		 */

		present++;
		present += folio_nr_pages(folio);

		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}

		if (page_reliable(page))
		if (folio_reliable(folio))
			cc->reliable = true;
	}
	rcu_read_unlock();
@@ -2361,7 +2340,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
		}
	}

	trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result);
	trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
	return result;
}
#else
+3 −1
Original line number Diff line number Diff line
@@ -1095,7 +1095,7 @@ static void usage(void)
	fprintf(stderr,	"\n\tSupported Options:\n");
	fprintf(stderr,	"\t\t-h: This help message.\n");
	fprintf(stderr,	"\t\t-s: mTHP size, expressed as page order.\n");
	fprintf(stderr,	"\t\t    Defaults to 0. Use this size for anon allocations.\n");
	fprintf(stderr,	"\t\t    Defaults to 0. Use this size for anon or shmem allocations.\n");
	exit(1);
}

@@ -1209,6 +1209,8 @@ int main(int argc, char **argv)
	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
	default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT;
	default_settings.hugepages[anon_order].enabled = THP_ALWAYS;
	default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT;
	default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS;

	save_settings();
	thp_push_settings(&default_settings);
+40 −6
Original line number Diff line number Diff line
@@ -33,10 +33,11 @@ static const char * const thp_defrag_strings[] = {
};

static const char * const shmem_enabled_strings[] = {
	"never",
	"always",
	"within_size",
	"advise",
	"never",
	"inherit",
	"deny",
	"force",
	NULL
@@ -200,6 +201,7 @@ void thp_write_num(const char *name, unsigned long num)
void thp_read_settings(struct thp_settings *settings)
{
	unsigned long orders = thp_supported_orders();
	unsigned long shmem_orders = thp_shmem_supported_orders();
	char path[PATH_MAX];
	int i;

@@ -234,12 +236,24 @@ void thp_read_settings(struct thp_settings *settings)
		settings->hugepages[i].enabled =
			thp_read_string(path, thp_enabled_strings);
	}

	for (i = 0; i < NR_ORDERS; i++) {
		if (!((1 << i) & shmem_orders)) {
			settings->shmem_hugepages[i].enabled = SHMEM_NEVER;
			continue;
		}
		snprintf(path, PATH_MAX, "hugepages-%ukB/shmem_enabled",
			(getpagesize() >> 10) << i);
		settings->shmem_hugepages[i].enabled =
			thp_read_string(path, shmem_enabled_strings);
	}
}

void thp_write_settings(struct thp_settings *settings)
{
	struct khugepaged_settings *khugepaged = &settings->khugepaged;
	unsigned long orders = thp_supported_orders();
	unsigned long shmem_orders = thp_shmem_supported_orders();
	char path[PATH_MAX];
	int enabled;
	int i;
@@ -271,6 +285,15 @@ void thp_write_settings(struct thp_settings *settings)
		enabled = settings->hugepages[i].enabled;
		thp_write_string(path, thp_enabled_strings[enabled]);
	}

	for (i = 0; i < NR_ORDERS; i++) {
		if (!((1 << i) & shmem_orders))
			continue;
		snprintf(path, PATH_MAX, "hugepages-%ukB/shmem_enabled",
			(getpagesize() >> 10) << i);
		enabled = settings->shmem_hugepages[i].enabled;
		thp_write_string(path, shmem_enabled_strings[enabled]);
	}
}

struct thp_settings *thp_current_settings(void)
@@ -324,17 +347,18 @@ void thp_set_read_ahead_path(char *path)
	dev_queue_read_ahead_path[sizeof(dev_queue_read_ahead_path) - 1] = '\0';
}

unsigned long thp_supported_orders(void)
static unsigned long __thp_supported_orders(bool is_shmem)
{
	unsigned long orders = 0;
	char path[PATH_MAX];
	char buf[256];
	int ret;
	int i;
	int ret, i;
	char anon_dir[] = "enabled";
	char shmem_dir[] = "shmem_enabled";

	for (i = 0; i < NR_ORDERS; i++) {
		ret = snprintf(path, PATH_MAX, THP_SYSFS "hugepages-%ukB/enabled",
			(getpagesize() >> 10) << i);
		ret = snprintf(path, PATH_MAX, THP_SYSFS "hugepages-%ukB/%s",
			       (getpagesize() >> 10) << i, is_shmem ? shmem_dir : anon_dir);
		if (ret >= PATH_MAX) {
			printf("%s: Pathname is too long\n", __func__);
			exit(EXIT_FAILURE);
@@ -347,3 +371,13 @@ unsigned long thp_supported_orders(void)

	return orders;
}

unsigned long thp_supported_orders(void)
{
	return __thp_supported_orders(false);
}

unsigned long thp_shmem_supported_orders(void)
{
	return __thp_supported_orders(true);
}
+8 −1
Original line number Diff line number Diff line
@@ -22,10 +22,11 @@ enum thp_defrag {
};

enum shmem_enabled {
	SHMEM_NEVER,
	SHMEM_ALWAYS,
	SHMEM_WITHIN_SIZE,
	SHMEM_ADVISE,
	SHMEM_NEVER,
	SHMEM_INHERIT,
	SHMEM_DENY,
	SHMEM_FORCE,
};
@@ -46,6 +47,10 @@ struct khugepaged_settings {
	unsigned long pages_to_scan;
};

struct shmem_hugepages_settings {
	enum shmem_enabled enabled;
};

struct thp_settings {
	enum thp_enabled thp_enabled;
	enum thp_defrag thp_defrag;
@@ -54,6 +59,7 @@ struct thp_settings {
	struct khugepaged_settings khugepaged;
	unsigned long read_ahead_kb;
	struct hugepages_settings hugepages[NR_ORDERS];
	struct shmem_hugepages_settings shmem_hugepages[NR_ORDERS];
};

int read_file(const char *path, char *buf, size_t buflen);
@@ -76,5 +82,6 @@ void thp_save_settings(void);

void thp_set_read_ahead_path(char *path);
unsigned long thp_supported_orders(void);
unsigned long thp_shmem_supported_orders(void);

#endif /* __THP_SETTINGS_H__ */