Unverified Commit 171c8e9d authored by openeuler-ci-bot, committed by Gitee

!4199 Support large folio for mlock

Merge Pull Request from: @ci-robot 
 
PR sync from: Peng Zhang <zhangpeng362@huawei.com>
https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/5NUJOHQRM7ZDTVRMOIVQH43TLLEXW46F/ 
From: ZhangPeng <zhangpeng362@huawei.com>

Support large folio for mlock. Backport from v6.7-rc1.

ChangeLog:

v1->v2:
- Add a bugfix patch for patch 2

Hugh Dickins (1):
  mm: mlock: avoid folio_within_range() on KSM pages

Yin Fengwei (3):
  mm: add functions folio_in_range() and folio_within_vma()
  mm: handle large folio when large folio in VM_LOCKED VMA range
  mm: mlock: update mlock_pte_range to handle large folio


-- 
2.25.1
 
https://gitee.com/openeuler/kernel/issues/I8YQMW 
 
Link: https://gitee.com/openeuler/kernel/pulls/4199

 

Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
parents 6188d510 1e71dccd
+63 −10
@@ -592,6 +592,56 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma,
				   bool write, int *locked);
extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
			       unsigned long bytes);

/*
 * NOTE: This function can't tell whether the folio is "fully mapped" in the
 * range.
 * "fully mapped" means all the pages of the folio are associated with the
 * page table of the range, while this function just checks whether the
 * folio range is within the range [start, end). The caller needs to do a
 * page table check if it cares about the page table association.
 *
 * Typical usage (like mlock or madvise) is:
 * The caller knows at least one page of the folio is associated with the
 * page table of the VMA and that the range [start, end) intersects the VMA
 * range. The caller wants to know whether the folio is fully associated
 * with the range. It calls this function first to check whether the folio
 * is in the range, and then checks the page table to know whether the
 * folio is fully mapped to the range.
 */
static inline bool
folio_within_range(struct folio *folio, struct vm_area_struct *vma,
		unsigned long start, unsigned long end)
{
	pgoff_t pgoff, addr;
	unsigned long vma_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

	VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio);
	if (start > end)
		return false;

	if (start < vma->vm_start)
		start = vma->vm_start;

	if (end > vma->vm_end)
		end = vma->vm_end;

	pgoff = folio_pgoff(folio);

	/* if folio start address is not in vma range */
	if (!in_range(pgoff, vma->vm_pgoff, vma_pglen))
		return false;

	addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);

	return !(addr < start || end - addr < folio_size(folio));
}

static inline bool
folio_within_vma(struct folio *folio, struct vm_area_struct *vma)
{
	return folio_within_range(folio, vma, vma->vm_start, vma->vm_end);
}
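
/*
 * A minimal sketch of the typical caller pattern described above (not part
 * of this series; folio_ptes_all_present() is a hypothetical stand-in for
 * the caller's own page table check):
 *
 *	if (folio_within_range(folio, vma, start, end) &&
 *	    folio_ptes_all_present(folio, vma, addr))
 *		mlock_folio(folio);
 *
 * The real page table check used by mlock is folio_mlock_step() in
 * mm/mlock.c, whose result is compared against folio_nr_pages() in
 * allow_mlock_munlock().
 */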

/*
 * mlock_vma_folio() and munlock_vma_folio():
 * should be called with vma's mmap_lock held for read or write,
@@ -600,14 +650,10 @@ extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
 * mlock is usually called at the end of page_add_*_rmap(), munlock at
 * the end of page_remove_rmap(); but new anon folios are managed by
 * folio_add_lru_vma() calling mlock_new_folio().
 *
 * @compound is used to include pmd mappings of THPs, but filter out
 * pte mappings of THPs, which cannot be consistently counted: a pte
 * mapping of the THP head cannot be distinguished by the page alone.
 */
void mlock_folio(struct folio *folio);
static inline void mlock_vma_folio(struct folio *folio,
			struct vm_area_struct *vma, bool compound)
				struct vm_area_struct *vma)
{
	/*
	 * The VM_SPECIAL check here serves two purposes.
@@ -617,17 +663,24 @@ static inline void mlock_vma_folio(struct folio *folio,
	 *    file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
	 *    still be set while VM_SPECIAL bits are added: so ignore it then.
	 */
	if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
	    (compound || !folio_test_large(folio)))
	if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED))
		mlock_folio(folio);
}

void munlock_folio(struct folio *folio);
static inline void munlock_vma_folio(struct folio *folio,
			struct vm_area_struct *vma, bool compound)
					struct vm_area_struct *vma)
{
	if (unlikely(vma->vm_flags & VM_LOCKED) &&
	    (compound || !folio_test_large(folio)))
	/*
	 * Always munlock when this function is called. Ideally, we should
	 * only munlock if a page of the folio was unmapped from the VMA,
	 * leaving the folio not fully mapped to the VMA.
	 *
	 * But it's not easy to confirm that is the situation, so we always
	 * munlock the folio and let page reclaim correct it if that was
	 * wrong.
	 */
	if (unlikely(vma->vm_flags & VM_LOCKED))
		munlock_folio(folio);
}

+68 −2
@@ -305,6 +305,62 @@ void munlock_folio(struct folio *folio)
	local_unlock(&mlock_fbatch.lock);
}

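/*
 * folio_mlock_step() returns the number of consecutive present ptes,
 * starting at @pte, that map pages of @folio within [@addr, @end).
 *
 * Worked example (illustrative): for a 16-page folio whose first page has
 * pfn P, if @pte maps pfn P + 4 then count = (P + 16) - (P + 4) = 12 ptes
 * remain for this folio; count is then clamped to the ptes left before
 * @end, and the scan stops early at the first pte that is not present or
 * falls outside the folio.
 */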
static inline unsigned int folio_mlock_step(struct folio *folio,
		pte_t *pte, unsigned long addr, unsigned long end)
{
	unsigned int count, i, nr = folio_nr_pages(folio);
	unsigned long pfn = folio_pfn(folio);
	pte_t ptent = ptep_get(pte);

	if (!folio_test_large(folio))
		return 1;

	count = pfn + nr - pte_pfn(ptent);
	count = min_t(unsigned int, count, (end - addr) >> PAGE_SHIFT);

	for (i = 0; i < count; i++, pte++) {
		pte_t entry = ptep_get(pte);

		if (!pte_present(entry))
			break;
		if (pte_pfn(entry) - pfn >= nr)
			break;
	}

	return i;
}

static inline bool allow_mlock_munlock(struct folio *folio,
		struct vm_area_struct *vma, unsigned long start,
		unsigned long end, unsigned int step)
{
	/*
	 * For unlock, allow munlock of a large folio which is only
	 * partially mapped to the VMA, since it's possible that the large
	 * folio was mlocked and the VMA was split later.
	 *
	 * Under memory pressure, such a large folio can be split, and the
	 * pages that are not in a VM_LOCKED VMA can then be reclaimed.
	 */
	if (!(vma->vm_flags & VM_LOCKED))
		return true;

	/* folio_within_range() cannot take KSM, but any small folio is OK */
	if (!folio_test_large(folio))
		return true;

	/* folio not in range [start, end), skip mlock */
	if (!folio_within_range(folio, vma, start, end))
		return false;

	/* folio is not fully mapped, skip mlock */
	if (step != folio_nr_pages(folio))
		return false;

	return true;
}

static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)

@@ -314,6 +370,8 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
	pte_t *start_pte, *pte;
	pte_t ptent;
	struct folio *folio;
	unsigned int step = 1;
	unsigned long start = addr;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
@@ -334,6 +392,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
		walk->action = ACTION_AGAIN;
		return 0;
	}

	for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);
		if (!pte_present(ptent))
@@ -341,12 +400,19 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		if (folio_test_large(folio))
			continue;

		step = folio_mlock_step(folio, pte, addr, end);
		if (!allow_mlock_munlock(folio, vma, start, end, step))
			goto next_entry;

		if (vma->vm_flags & VM_LOCKED)
			mlock_folio(folio);
		else
			munlock_folio(folio);

next_entry:
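		/*
		 * Skip the remaining ptes already accounted for by this
		 * step; the loop increment then advances to the first pte
		 * after them.
		 */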
		pte += step - 1;
		addr += (step - 1) << PAGE_SHIFT;
	}
	pte_unmap(start_pte);
out:
+55 −11
@@ -798,6 +798,7 @@ struct folio_referenced_arg {
	unsigned long vm_flags;
	struct mem_cgroup *memcg;
};

/*
 * arg: folio_referenced_arg will be passed
 */
@@ -807,18 +808,34 @@ static bool folio_referenced_one(struct folio *folio,
	struct folio_referenced_arg *pra = arg;
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
	int referenced = 0;
	unsigned long start = address, ptes = 0;

	while (page_vma_mapped_walk(&pvmw)) {
		address = pvmw.address;

		if ((vma->vm_flags & VM_LOCKED) &&
		    (!folio_test_large(folio) || !pvmw.pte)) {
		if (vma->vm_flags & VM_LOCKED) {
			if (!folio_test_large(folio) || !pvmw.pte) {
				/* Restore the mlock which got missed */
			mlock_vma_folio(folio, vma, !pvmw.pte);
				mlock_vma_folio(folio, vma);
				page_vma_mapped_walk_done(&pvmw);
				pra->vm_flags |= VM_LOCKED;
				return false; /* To break the loop */
			}
			/*
			 * A large folio fully mapped to the VMA will be
			 * handled after the pvmw loop.
			 *
			 * A large folio crossing VMA boundaries is expected
			 * to be picked up by page reclaim, but references to
			 * pages inside the VM_LOCKED VMA range should be
			 * skipped: page reclaim should only count references
			 * to pages outside the VM_LOCKED VMA range.
			 */
			ptes++;
			pra->mapcount--;
			continue;
		}

		if (pvmw.pte) {
			if (lru_gen_enabled() &&
@@ -842,6 +859,23 @@ static bool folio_referenced_one(struct folio *folio,
		pra->mapcount--;
	}

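	/*
	 * Example (assuming 4K pages and a 2M PMD): a 64K folio mapped at
	 * 0x200000 gives s_align == e_align == 0x200000, so it sits within a
	 * single page table; if every pte was seen in the loop above, the
	 * missed mlock is restored. The same folio mapped at 0x3f8000 would
	 * straddle the 0x400000 boundary and is left to page reclaim instead.
	 */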
	if ((vma->vm_flags & VM_LOCKED) &&
			folio_test_large(folio) &&
			folio_within_vma(folio, vma)) {
		unsigned long s_align, e_align;

		s_align = ALIGN_DOWN(start, PMD_SIZE);
		e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE);

		/* folio doesn't cross a page table boundary and is fully mapped */
		if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) {
			/* Restore the mlock which got missed */
			mlock_vma_folio(folio, vma);
			pra->vm_flags |= VM_LOCKED;
			return false; /* To break the loop */
		}
	}

	if (referenced)
		folio_clear_idle(folio);
	if (folio_test_clear_young(folio))
@@ -1253,7 +1287,14 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
			__page_check_anon_rmap(folio, page, vma, address);
	}

	mlock_vma_folio(folio, vma, compound);
	/*
	 * For a large folio, only mlock it if it's fully mapped to the VMA.
	 * It's not easy to check whether the large folio is fully mapped to
	 * the VMA here. Only mlock normal 4K folios and leave large folios
	 * to page reclaim.
	 */
	if (!folio_test_large(folio))
		mlock_vma_folio(folio, vma);
}

/**
@@ -1352,7 +1393,9 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page,
	if (nr)
		__lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr);

	mlock_vma_folio(folio, vma, compound);
	/* See comments in page_add_anon_rmap() */
	if (!folio_test_large(folio))
		mlock_vma_folio(folio, vma);
}

/**
@@ -1463,7 +1506,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
	 * it's only reliable while mapped.
	 */

	munlock_vma_folio(folio, vma, compound);
	munlock_vma_folio(folio, vma);
}

/*
@@ -1528,7 +1571,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
		if (!(flags & TTU_IGNORE_MLOCK) &&
		    (vma->vm_flags & VM_LOCKED)) {
			/* Restore the mlock which got missed */
			mlock_vma_folio(folio, vma, false);
			if (!folio_test_large(folio))
				mlock_vma_folio(folio, vma);
			page_vma_mapped_walk_done(&pvmw);
			ret = false;
			break;