mm/memory: optimize fork() with PTE-mapped THP (1741ac63) · Commits · EulixOS / Software / Kernel

include/linux/pgtable.h

+31 −0

Original line number	Diff line number	Diff line
		@@ -622,6 +622,37 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
		}
		#endif

		#ifndef wrprotect_ptes
		/**
		* wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
		* folio.
		* @mm: Address space the pages are mapped into.
		* @addr: Address the first page is mapped at.
		* @ptep: Page table pointer for the first entry.
		* @nr: Number of entries to write-protect.
		*
		* May be overridden by the architecture; otherwise, implemented as a simple
		* loop over ptep_set_wrprotect().
		*
		* Note that PTE bits in the PTE range besides the PFN can differ. For example,
		* some PTEs might be write-protected.
		*
		* Context: The caller holds the page table lock. The PTEs map consecutive
		* pages that belong to the same folio. The PTEs are all in the same PMD.
		*/
		static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
		pte_t *ptep, unsigned int nr)
		{
		for (;;) {
		ptep_set_wrprotect(mm, addr, ptep);
		if (--nr == 0)
		break;
		ptep++;
		addr += PAGE_SIZE;
		}
		}
		#endif

		/*
		* On some architectures hardware does not set page access bit when accessing
		* memory page, it is responsibility of software setting this bit. It brings

mm/memory.c

+93 −19

Original line number	Diff line number	Diff line
		@@ -935,15 +935,15 @@ copy_present_page(struct vm_area_struct dst_vma, struct vm_area_struct src_vma
		return 0;
		}

		static inline void __copy_present_pte(struct vm_area_struct *dst_vma,
		static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
		struct vm_area_struct src_vma, pte_t dst_pte, pte_t *src_pte,
		pte_t pte, unsigned long addr)
		pte_t pte, unsigned long addr, int nr)
		{
		struct mm_struct *src_mm = src_vma->vm_mm;

		/* If it's a COW mapping, write protect it both processes. */
		if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		wrprotect_ptes(src_mm, addr, src_pte, nr);
		pte = pte_wrprotect(pte);
		}

		@@ -955,26 +955,93 @@ static inline void __copy_present_pte(struct vm_area_struct *dst_vma,
		if (!userfaultfd_wp(dst_vma))
		pte = pte_clear_uffd_wp(pte);

		set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
		set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
		}

		/*
		* Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
		* is required to copy this pte.
		* Detect a PTE batch: consecutive (present) PTEs that map consecutive
		* pages of the same folio.
		*
		* All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN.
		*/
		static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
		pte_t *start_ptep, pte_t pte, int max_nr)
		{
		unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
		const pte_t *end_ptep = start_ptep + max_nr;
		pte_t expected_pte = pte_next_pfn(pte);
		pte_t *ptep = start_ptep + 1;

		VM_WARN_ON_FOLIO(!pte_present(pte), folio);

		while (ptep != end_ptep) {
		pte = ptep_get(ptep);

		if (!pte_same(pte, expected_pte))
		break;

		/*
		* Stop immediately once we reached the end of the folio. In
		* corner cases the next PFN might fall into a different
		* folio.
		*/
		if (pte_pfn(pte) == folio_end_pfn)
		break;

		expected_pte = pte_next_pfn(expected_pte);
		ptep++;
		}

		return ptep - start_ptep;
		}

		/*
		* Copy one present PTE, trying to batch-process subsequent PTEs that map
		* consecutive pages of the same folio by copying them as well.
		*
		* Returns -EAGAIN if one preallocated page is required to copy the next PTE.
		* Otherwise, returns the number of copied PTEs (at least 1).
		*/
		static inline int
		copy_present_pte(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
		copy_present_ptes(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
		pte_t dst_pte, pte_t src_pte, pte_t pte, unsigned long addr,
		int rss, struct folio *prealloc)
		int max_nr, int rss, struct folio *prealloc)
		{
		struct page *page;
		struct folio *folio;
		int err, nr;

		page = vm_normal_page(src_vma, addr, pte);
		if (unlikely(!page))
		goto copy_pte;

		folio = page_folio(page);

		/*
		* If we likely have to copy, just don't bother with batching. Make
		* sure that the common "small folio" case is as fast as possible
		* by keeping the batching logic separate.
		*/
		if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
		nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr);
		folio_ref_add(folio, nr);
		if (folio_test_anon(folio)) {
		if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
		nr, src_vma))) {
		folio_ref_sub(folio, nr);
		return -EAGAIN;
		}
		rss[MM_ANONPAGES] += nr;
		VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
		} else {
		folio_dup_file_rmap_ptes(folio, page, nr);
		rss[mm_counter_file(folio)] += nr;
		}
		__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
		addr, nr);
		return nr;
		}

		folio_get(folio);
		if (folio_test_anon(folio)) {
		/*
		@@ -986,8 +1053,9 @@ copy_present_pte(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
		if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
		/* Page may be pinned, we have to copy. */
		folio_put(folio);
		return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
		err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
		addr, rss, prealloc, page);
		return err ? err : 1;
		}
		rss[MM_ANONPAGES]++;
		VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
		@@ -998,8 +1066,8 @@ copy_present_pte(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
		}

		copy_pte:
		__copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, pte, addr);
		return 0;
		__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
		return 1;
		}

		static inline struct folio page_copy_prealloc(struct mm_struct src_mm,
		@@ -1031,10 +1099,11 @@ copy_pte_range(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
		pte_t src_pte, dst_pte;
		pte_t ptent;
		spinlock_t src_ptl, dst_ptl;
		int progress, ret = 0;
		int progress, max_nr, ret = 0;
		int rss[NR_MM_COUNTERS];
		swp_entry_t entry = (swp_entry_t){0};
		struct folio *prealloc = NULL;
		int nr;

		again:
		progress = 0;
		@@ -1065,6 +1134,8 @@ copy_pte_range(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
		arch_enter_lazy_mmu_mode();

		do {
		nr = 1;

		/*
		* We are holding two locks at this point - either of them
		* could generate latencies in another task on another CPU.
		@@ -1103,9 +1174,10 @@ copy_pte_range(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
		*/
		WARN_ON_ONCE(ret != -ENOENT);
		}
		/* copy_present_pte() will clear `prealloc' if consumed /
		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
		ptent, addr, rss, &prealloc);
		/* copy_present_ptes() will clear `prealloc' if consumed /
		max_nr = (end - addr) / PAGE_SIZE;
		ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
		ptent, addr, max_nr, rss, &prealloc);
		/*
		* If we need a pre-allocated page for this pte, drop the
		* locks, allocate, and try again.
		@@ -1122,8 +1194,10 @@ copy_pte_range(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
		folio_put(prealloc);
		prealloc = NULL;
		}
		progress += 8;
		} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
		nr = ret;
		progress += 8 * nr;
		} while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
		addr != end);

		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(orig_src_pte, src_ptl);
		@@ -1144,7 +1218,7 @@ copy_pte_range(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
		prealloc = page_copy_prealloc(src_mm, src_vma, addr);
		if (!prealloc)
		return -ENOMEM;
		} else if (ret) {
		} else if (ret < 0) {
		VM_WARN_ON_ONCE(1);
		}