Unverified Commit 171c8e9d authored by openeuler-ci-bot, committed by Gitee

!4199 Support large folio for mlock

Merge Pull Request from: @ci-robot 
 
PR sync from: Peng Zhang <zhangpeng362@huawei.com>
https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/5NUJOHQRM7ZDTVRMOIVQH43TLLEXW46F/ 
From: ZhangPeng <zhangpeng362@huawei.com>

Support large folio for mlock. Backport from v6.7-rc1.

ChangeLog:

v1->v2:
- Add a bugfix patch for patch 2

Hugh Dickins (1):
  mm: mlock: avoid folio_within_range() on KSM pages

Yin Fengwei (3):
  mm: add functions folio_in_range() and folio_within_vma()
  mm: handle large folio when large folio in VM_LOCKED VMA range
  mm: mlock: update mlock_pte_range to handle large folio


-- 
2.25.1
 
https://gitee.com/openeuler/kernel/issues/I8YQMW 
 
Link: https://gitee.com/openeuler/kernel/pulls/4199

 

Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
parents 6188d510 1e71dccd
+63 −10
@@ -592,6 +592,56 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma,
				   bool write, int *locked);
extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
			       unsigned long bytes);

/*
 * NOTE: This function can't tell whether the folio is "fully mapped" in the
 * range.
 * "fully mapped" means all the pages of the folio are associated with the
 * page table of the range, while this function just checks whether the
 * folio range is within the range [start, end). The caller needs to do a
 * page table check if it cares about the page table association.
 *
 * Typical usage (like mlock or madvise) is:
 * The caller knows at least one page of the folio is associated with the
 * page table of the VMA and that the range [start, end) intersects the VMA
 * range. The caller wants to know whether the folio is fully associated
 * with the range. It calls this function first to check whether the folio
 * is in the range, and then checks the page table to know whether the
 * folio is fully mapped to the range.
 */
static inline bool
folio_within_range(struct folio *folio, struct vm_area_struct *vma,
		unsigned long start, unsigned long end)
{
	pgoff_t pgoff, addr;
	unsigned long vma_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

	VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio);
	if (start > end)
		return false;

	if (start < vma->vm_start)
		start = vma->vm_start;

	if (end > vma->vm_end)
		end = vma->vm_end;

	pgoff = folio_pgoff(folio);

	/* if folio start address is not in vma range */
	if (!in_range(pgoff, vma->vm_pgoff, vma_pglen))
		return false;

	addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);

	return !(addr < start || end - addr < folio_size(folio));
}

static inline bool
folio_within_vma(struct folio *folio, struct vm_area_struct *vma)
{
	return folio_within_range(folio, vma, vma->vm_start, vma->vm_end);
}
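
/*
 * A minimal sketch of the typical caller pattern described above (not part
 * of this series; folio_ptes_all_present() is a hypothetical stand-in for
 * the caller's own page table check):
 *
 *	if (folio_within_range(folio, vma, start, end) &&
 *	    folio_ptes_all_present(folio, vma, addr))
 *		mlock_folio(folio);
 *
 * The real page table check used by mlock is folio_mlock_step() in
 * mm/mlock.c, whose result is compared against folio_nr_pages() in
 * allow_mlock_munlock().
 */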

/*
 * mlock_vma_folio() and munlock_vma_folio():
 * should be called with vma's mmap_lock held for read or write,
@@ -600,14 +650,10 @@ extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
 * mlock is usually called at the end of page_add_*_rmap(), munlock at
 * the end of page_remove_rmap(); but new anon folios are managed by
 * folio_add_lru_vma() calling mlock_new_folio().
 *
 * @compound is used to include pmd mappings of THPs, but filter out
 * pte mappings of THPs, which cannot be consistently counted: a pte
 * mapping of the THP head cannot be distinguished by the page alone.
 */
void mlock_folio(struct folio *folio);
static inline void mlock_vma_folio(struct folio *folio,
			struct vm_area_struct *vma, bool compound)
				struct vm_area_struct *vma)
{
	/*
	 * The VM_SPECIAL check here serves two purposes.
@@ -617,17 +663,24 @@ static inline void mlock_vma_folio(struct folio *folio,
	 *    file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
	 *    still be set while VM_SPECIAL bits are added: so ignore it then.
	 */
	if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
	    (compound || !folio_test_large(folio)))
	if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED))
		mlock_folio(folio);
}

void munlock_folio(struct folio *folio);
static inline void munlock_vma_folio(struct folio *folio,
			struct vm_area_struct *vma, bool compound)
					struct vm_area_struct *vma)
{
	if (unlikely(vma->vm_flags & VM_LOCKED) &&
	    (compound || !folio_test_large(folio)))
	/*
	 * Always munlock when this function is called. Ideally, we should
	 * only munlock if a page of the folio was unmapped from the VMA,
	 * leaving the folio not fully mapped to the VMA.
	 *
	 * But it's not easy to confirm that is the situation, so we always
	 * munlock the folio and let page reclaim correct it if that was
	 * wrong.
	 */
	if (unlikely(vma->vm_flags & VM_LOCKED))
		munlock_folio(folio);
}

+68 −2
@@ -305,6 +305,62 @@ void munlock_folio(struct folio *folio)
	local_unlock(&mlock_fbatch.lock);
}

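/*
 * folio_mlock_step() returns the number of consecutive present ptes,
 * starting at @pte, that map pages of @folio within [@addr, @end).
 *
 * Worked example (illustrative): for a 16-page folio whose first page has
 * pfn P, if @pte maps pfn P + 4 then count = (P + 16) - (P + 4) = 12 ptes
 * remain for this folio; count is then clamped to the ptes left before
 * @end, and the scan stops early at the first pte that is not present or
 * falls outside the folio.
 */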
static inline unsigned int folio_mlock_step(struct folio *folio,
		pte_t *pte, unsigned long addr, unsigned long end)
{
	unsigned int count, i, nr = folio_nr_pages(folio);
	unsigned long pfn = folio_pfn(folio);
	pte_t ptent = ptep_get(pte);

	if (!folio_test_large(folio))
		return 1;

	count = pfn + nr - pte_pfn(ptent);
	count = min_t(unsigned int, count, (end - addr) >> PAGE_SHIFT);

	for (i = 0; i < count; i++, pte++) {
		pte_t entry = ptep_get(pte);

		if (!pte_present(entry))
			break;
		if (pte_pfn(entry) - pfn >= nr)
			break;
	}

	return i;
}

static inline bool allow_mlock_munlock(struct folio *folio,
		struct vm_area_struct *vma, unsigned long start,
		unsigned long end, unsigned int step)
{
	/*
	 * For unlock, allow munlock of a large folio which is only
	 * partially mapped to the VMA, since it's possible that the large
	 * folio was mlocked and the VMA was split later.
	 *
	 * Under memory pressure, such a large folio can be split, and the
	 * pages that are not in a VM_LOCKED VMA can then be reclaimed.
	 */
	if (!(vma->vm_flags & VM_LOCKED))
		return true;

	/* folio_within_range() cannot take KSM, but any small folio is OK */
	if (!folio_test_large(folio))
		return true;

	/* folio not in range [start, end), skip mlock */
	if (!folio_within_range(folio, vma, start, end))
		return false;

	/* folio is not fully mapped, skip mlock */
	if (step != folio_nr_pages(folio))
		return false;

	return true;
}

static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)

@@ -314,6 +370,8 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
	pte_t *start_pte, *pte;
	pte_t ptent;
	struct folio *folio;
	unsigned int step = 1;
	unsigned long start = addr;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
@@ -334,6 +392,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
		walk->action = ACTION_AGAIN;
		return 0;
	}

	for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);
		if (!pte_present(ptent))
@@ -341,12 +400,19 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		if (folio_test_large(folio))
			continue;

		step = folio_mlock_step(folio, pte, addr, end);
		if (!allow_mlock_munlock(folio, vma, start, end, step))
			goto next_entry;

		if (vma->vm_flags & VM_LOCKED)
			mlock_folio(folio);
		else
			munlock_folio(folio);

next_entry:
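		/*
		 * Skip the remaining ptes already accounted for by this
		 * step; the loop increment then advances to the first pte
		 * after them.
		 */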
		pte += step - 1;
		addr += (step - 1) << PAGE_SHIFT;
	}
	pte_unmap(start_pte);
out:
+55 −11
@@ -798,6 +798,7 @@ struct folio_referenced_arg {
	unsigned long vm_flags;
	struct mem_cgroup *memcg;
};

/*
 * arg: folio_referenced_arg will be passed
 */
@@ -807,18 +808,34 @@ static bool folio_referenced_one(struct folio *folio,
	struct folio_referenced_arg *pra = arg;
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
	int referenced = 0;
	unsigned long start = address, ptes = 0;

	while (page_vma_mapped_walk(&pvmw)) {
		address = pvmw.address;

		if ((vma->vm_flags & VM_LOCKED) &&
		    (!folio_test_large(folio) || !pvmw.pte)) {
		if (vma->vm_flags & VM_LOCKED) {
			if (!folio_test_large(folio) || !pvmw.pte) {
				/* Restore the mlock which got missed */
			mlock_vma_folio(folio, vma, !pvmw.pte);
				mlock_vma_folio(folio, vma);
				page_vma_mapped_walk_done(&pvmw);
				pra->vm_flags |= VM_LOCKED;
				return false; /* To break the loop */
			}
			/*
			 * A large folio fully mapped to the VMA will be
			 * handled after the pvmw loop.
			 *
			 * A large folio crossing VMA boundaries is expected
			 * to be picked up by page reclaim, but references to
			 * pages inside the VM_LOCKED VMA range should be
			 * skipped: page reclaim should only count references
			 * to pages outside the VM_LOCKED VMA range.
			 */
			ptes++;
			pra->mapcount--;
			continue;
		}

		if (pvmw.pte) {
			if (lru_gen_enabled() &&
@@ -842,6 +859,23 @@ static bool folio_referenced_one(struct folio *folio,
		pra->mapcount--;
	}

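	/*
	 * Example (assuming 4K pages and a 2M PMD): a 64K folio mapped at
	 * 0x200000 gives s_align == e_align == 0x200000, so it sits within a
	 * single page table; if every pte was seen in the loop above, the
	 * missed mlock is restored. The same folio mapped at 0x3f8000 would
	 * straddle the 0x400000 boundary and is left to page reclaim instead.
	 */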
	if ((vma->vm_flags & VM_LOCKED) &&
			folio_test_large(folio) &&
			folio_within_vma(folio, vma)) {
		unsigned long s_align, e_align;

		s_align = ALIGN_DOWN(start, PMD_SIZE);
		e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE);

		/* folio doesn't cross a page table boundary and is fully mapped */
		if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) {
			/* Restore the mlock which got missed */
			mlock_vma_folio(folio, vma);
			pra->vm_flags |= VM_LOCKED;
			return false; /* To break the loop */
		}
	}

	if (referenced)
		folio_clear_idle(folio);
	if (folio_test_clear_young(folio))
@@ -1253,7 +1287,14 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
			__page_check_anon_rmap(folio, page, vma, address);
	}

	mlock_vma_folio(folio, vma, compound);
	/*
	 * For a large folio, only mlock it if it's fully mapped to the VMA.
	 * It's not easy to check whether the large folio is fully mapped to
	 * the VMA here. Only mlock normal 4K folios and leave large folios
	 * to page reclaim.
	 */
	if (!folio_test_large(folio))
		mlock_vma_folio(folio, vma);
}

/**
@@ -1352,7 +1393,9 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page,
	if (nr)
		__lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr);

	mlock_vma_folio(folio, vma, compound);
	/* See comments in page_add_anon_rmap() */
	if (!folio_test_large(folio))
		mlock_vma_folio(folio, vma);
}

/**
@@ -1463,7 +1506,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
	 * it's only reliable while mapped.
	 */

	munlock_vma_folio(folio, vma, compound);
	munlock_vma_folio(folio, vma);
}

/*
@@ -1528,7 +1571,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
		if (!(flags & TTU_IGNORE_MLOCK) &&
		    (vma->vm_flags & VM_LOCKED)) {
			/* Restore the mlock which got missed */
			mlock_vma_folio(folio, vma, false);
			if (!folio_test_large(folio))
				mlock_vma_folio(folio, vma);
			page_vma_mapped_walk_done(&pvmw);
			ret = false;
			break;