mm/khugepaged: retract_page_tables() without mmap or vma lock (1d65b771) · Commits · EulixOS / Software / Kernel

mm/khugepaged.c

+69 −103

Original line number	Diff line number	Diff line
		@@ -1617,9 +1617,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
		break;
		case SCAN_PMD_NONE:
		/*
		* In MADV_COLLAPSE path, possible race with khugepaged where
		* all pte entries have been removed and pmd cleared. If so,
		* skip all the pte checks and just update the pmd mapping.
		* All pte entries have been removed and pmd cleared.
		* Skip all the pte checks and just update the pmd mapping.
		*/
		goto maybe_install_pmd;
		default:
		@@ -1750,123 +1749,88 @@ static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_sl
		mmap_write_unlock(mm);
		}

		static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
		struct mm_struct *target_mm,
		unsigned long target_addr, struct page *hpage,
		struct collapse_control *cc)
		static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
		{
		struct vm_area_struct *vma;
		int target_result = SCAN_FAIL;

		i_mmap_lock_write(mapping);
		i_mmap_lock_read(mapping);
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		int result = SCAN_FAIL;
		struct mm_struct *mm = NULL;
		unsigned long addr = 0;
		pmd_t *pmd;
		bool is_target = false;
		struct mmu_notifier_range range;
		struct mm_struct *mm;
		unsigned long addr;
		pmd_t *pmd, pgt_pmd;
		spinlock_t *pml;
		spinlock_t *ptl;
		bool skipped_uffd = false;

		/*
		* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
		* got written to. These VMAs are likely not worth investing
		* mmap_write_lock(mm) as PMD-mapping is likely to be split
		* later.
		*
		* Note that vma->anon_vma check is racy: it can be set up after
		* the check but before we took mmap_lock by the fault path.
		* But page lock would prevent establishing any new ptes of the
		* page, so we are safe.
		*
		* An alternative would be drop the check, but check that page
		* table is clear before calling pmdp_collapse_flush() under
		* ptl. It has higher chance to recover THP for the VMA, but
		* has higher cost too. It would also probably require locking
		* the anon_vma.
		* got written to. These VMAs are likely not worth removing
		* page tables from, as PMD-mapping is likely to be split later.
		*/
		if (READ_ONCE(vma->anon_vma)) {
		result = SCAN_PAGE_ANON;
		goto next;
		}
		if (READ_ONCE(vma->anon_vma))
		continue;

		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
		if (addr & ~HPAGE_PMD_MASK ||
		vma->vm_end < addr + HPAGE_PMD_SIZE) {
		result = SCAN_VMA_CHECK;
		goto next;
		}
		vma->vm_end < addr + HPAGE_PMD_SIZE)
		continue;

		mm = vma->vm_mm;
		is_target = mm == target_mm && addr == target_addr;
		result = find_pmd_or_thp_or_none(mm, addr, &pmd);
		if (result != SCAN_SUCCEED)
		goto next;
		/*
		* We need exclusive mmap_lock to retract page table.
		*
		* We use trylock due to lock inversion: we need to acquire
		* mmap_lock while holding page lock. Fault path does it in
		* reverse order. Trylock is a way to avoid deadlock.
		*
		* Also, it's not MADV_COLLAPSE's job to collapse other
		* mappings - let khugepaged take care of them later.
		*/
		result = SCAN_PTE_MAPPED_HUGEPAGE;
		if ((cc->is_khugepaged || is_target) &&
		mmap_write_trylock(mm)) {
		/* trylock for the same lock inversion as above */
		if (!vma_try_start_write(vma))
		goto unlock_next;
		if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
		continue;

		if (hpage_collapse_test_exit(mm))
		continue;
		/*
		* Re-check whether we have an ->anon_vma, because
		* collapse_and_free_pmd() requires that either no
		* ->anon_vma exists or the anon_vma is locked.
		* We already checked ->anon_vma above, but that check
		* is racy because ->anon_vma can be populated under the
		* mmap lock in read mode.
		* When a vma is registered with uffd-wp, we cannot recycle
		* the page table because there may be pte markers installed.
		* Other vmas can still have the same file mapped hugely, but
		* skip this one: it will always be mapped in small page size
		* for uffd-wp registered ranges.
		*/
		if (vma->anon_vma) {
		result = SCAN_PAGE_ANON;
		goto unlock_next;
		}
		if (userfaultfd_wp(vma))
		continue;

		/* PTEs were notified when unmapped; but now for the PMD? */
		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
		addr, addr + HPAGE_PMD_SIZE);
		mmu_notifier_invalidate_range_start(&range);

		pml = pmd_lock(mm, pmd);
		ptl = pte_lockptr(mm, pmd);
		if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

		/*
		* When a vma is registered with uffd-wp, we can't
		* recycle the pmd pgtable because there can be pte
		* markers installed. Skip it only, so the rest mm/vma
		* can still have the same file mapped hugely, however
		* it'll always mapped in small page size for uffd-wp
		* registered ranges.
		* Huge page lock is still held, so normally the page table
		* must remain empty; and we have already skipped anon_vma
		* and userfaultfd_wp() vmas. But since the mmap_lock is not
		* held, it is still possible for a racing userfaultfd_ioctl()
		* to have inserted ptes or markers. Now that we hold ptlock,
		* repeating the anon_vma check protects from one category,
		* and repeating the userfaultfd_wp() check from another.
		*/
		if (hpage_collapse_test_exit(mm)) {
		result = SCAN_ANY_PROCESS;
		goto unlock_next;
		}
		if (userfaultfd_wp(vma)) {
		result = SCAN_PTE_UFFD_WP;
		goto unlock_next;
		if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
		skipped_uffd = true;
		} else {
		pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
		pmdp_get_lockless_sync();
		}
		collapse_and_free_pmd(mm, vma, addr, pmd);
		if (!cc->is_khugepaged && is_target)
		result = set_huge_pmd(vma, addr, pmd, hpage);
		else
		result = SCAN_SUCCEED;

		unlock_next:
		mmap_write_unlock(mm);
		goto next;
		}
		/*
		* Calling context will handle target mm/addr. Otherwise, let
		* khugepaged try again later.
		*/
		if (!is_target) {
		khugepaged_add_pte_mapped_thp(mm, addr);
		continue;
		if (ptl != pml)
		spin_unlock(ptl);
		spin_unlock(pml);

		mmu_notifier_invalidate_range_end(&range);

		if (!skipped_uffd) {
		mm_dec_nr_ptes(mm);
		page_table_check_pte_clear_range(mm, addr, pgt_pmd);
		pte_free_defer(mm, pmd_pgtable(pgt_pmd));
		}
		next:
		if (is_target)
		target_result = result;
		}
		i_mmap_unlock_write(mapping);
		return target_result;
		i_mmap_unlock_read(mapping);
		}

		/**
		@@ -2260,9 +2224,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,

		/*
		* Remove pte page tables, so we can re-fault the page as huge.
		* If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
		*/
		result = retract_page_tables(mapping, start, mm, addr, hpage,
		cc);
		retract_page_tables(mapping, start);
		if (cc && !cc->is_khugepaged)
		result = SCAN_PTE_MAPPED_HUGEPAGE;
		unlock_page(hpage);

		/*