Commit 9c67a207 authored by Peter Xu, committed by Andrew Morton
Browse files

mm/hugetlb: introduce hugetlb_walk()

huge_pte_offset() is the main walker function for hugetlb pgtables.  The
name does not really represent what it does, though.

Instead of renaming it, introduce a wrapper function called hugetlb_walk()
which will use huge_pte_offset() inside.  Assert on the locks when walking
the pgtable.

Note, the vma lock assertion will be a no-op for private mappings.

Document the last special case in the page_vma_mapped_walk() path where we
don't need any more lock to call hugetlb_walk().

Taking vma lock there is not needed because either: (1) potential callers
of hugetlb pvmw hold i_mmap_rwsem already (from one rmap_walk()), or (2)
the caller will not walk a hugetlb vma at all, so the hugetlb code path is
not reachable (e.g.  in ksm or uprobe paths).

That lock requirement is slightly implicit for future
page_vma_mapped_walk() callers.  But anyway, when one day this rule breaks,
one will get a straightforward warning in hugetlb_walk() with lockdep, then
there'll be a way out.

[akpm@linux-foundation.org: coding-style cleanups]
Link: https://lkml.kernel.org/r/20221216155229.2043750-1-peterx@redhat.com


Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent dd361e50
Loading
Loading
Loading
Loading
+1 −3
Original line number Diff line number Diff line
@@ -388,9 +388,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
{
	pte_t *ptep, pte;

	ptep = huge_pte_offset(vma->vm_mm, addr,
			huge_page_size(hstate_vma(vma)));

	ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma)));
	if (!ptep)
		return false;

+2 −4
Original line number Diff line number Diff line
@@ -252,14 +252,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					 unsigned long flags,
					 unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pte_t *ptep, pte;
	bool ret = true;

	mmap_assert_locked(mm);

	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
	mmap_assert_locked(ctx->mm);

	ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
	if (!ptep)
		goto out;

+37 −0
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@
#ifndef _LINUX_HUGETLB_H
#define _LINUX_HUGETLB_H

#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/fs.h>
@@ -196,6 +197,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE.
 * Returns the pte_t* if found, or NULL if the address is not mapped.
 *
 * IMPORTANT: we should normally not directly call this function, instead
 * this is only a common interface to implement arch-specific
 * walker. Please use hugetlb_walk() instead, because that will attempt to
 * verify the locking for you.
 *
 * Since this function will walk all the pgtable pages (including not only
 * high-level pgtable page, but also PUD entry that can be unshared
 * concurrently for VM_SHARED), the caller of this function should be
@@ -1229,4 +1235,35 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
#define flush_hugetlb_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
#endif

static inline bool __vma_shareable_lock(struct vm_area_struct *vma)
{
	return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
}

/*
 * Lock-checking front end for huge_pte_offset().  The walk itself is
 * delegated to huge_pte_offset() on vma->vm_mm and its result is returned
 * unchanged.  See the comment above huge_pte_offset() for the locking
 * rules.
 */
static inline pte_t *
hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
{
#if defined(CONFIG_HUGETLB_PAGE) && \
	defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
	/*
	 * When pmd sharing is possible, walking the hugetlb pgtables
	 * safely requires locking; the comment above huge_pte_offset() in
	 * this file has the details.
	 *
	 * The check is compiled only under CONFIG_LOCKDEP because
	 * lockdep_is_held() is not defined otherwise.
	 */
	if (__vma_shareable_lock(vma)) {
		struct hugetlb_vma_lock *lock = vma->vm_private_data;

		/* Complain unless the vma lock or i_mmap_rwsem is held. */
		WARN_ON_ONCE(!(lockdep_is_held(&lock->rw_sema) ||
			       lockdep_is_held(
				 &vma->vm_file->f_mapping->i_mmap_rwsem)));
	}
#endif
	return huge_pte_offset(vma->vm_mm, addr, sz);
}

#endif /* _LINUX_HUGETLB_H */
+13 −18
Original line number Diff line number Diff line
@@ -260,11 +260,6 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
/*
 * hugetlb vma_lock helper routines
 */
static bool __vma_shareable_lock(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_MAYSHARE && vma->vm_private_data;
}

void hugetlb_vma_lock_read(struct vm_area_struct *vma)
{
	if (__vma_shareable_lock(vma)) {
@@ -4980,7 +4975,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
	} else {
		/*
		 * For shared mappings the vma lock must be held before
		 * calling huge_pte_offset in the src vma. Otherwise, the
		 * calling hugetlb_walk() in the src vma. Otherwise, the
		 * returned ptep could go away if part of a shared pmd and
		 * another thread calls huge_pmd_unshare.
		 */
@@ -4990,7 +4985,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
	last_addr_mask = hugetlb_mask_last_page(h);
	for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
		spinlock_t *src_ptl, *dst_ptl;
		src_pte = huge_pte_offset(src, addr, sz);
		src_pte = hugetlb_walk(src_vma, addr, sz);
		if (!src_pte) {
			addr |= last_addr_mask;
			continue;
@@ -5197,7 +5192,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
	hugetlb_vma_lock_write(vma);
	i_mmap_lock_write(mapping);
	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
		src_pte = huge_pte_offset(mm, old_addr, sz);
		src_pte = hugetlb_walk(vma, old_addr, sz);
		if (!src_pte) {
			old_addr |= last_addr_mask;
			new_addr |= last_addr_mask;
@@ -5260,7 +5255,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
	last_addr_mask = hugetlb_mask_last_page(h);
	address = start;
	for (; address < end; address += sz) {
		ptep = huge_pte_offset(mm, address, sz);
		ptep = hugetlb_walk(vma, address, sz);
		if (!ptep) {
			address |= last_addr_mask;
			continue;
@@ -5573,7 +5568,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
			mutex_lock(&hugetlb_fault_mutex_table[hash]);
			hugetlb_vma_lock_read(vma);
			spin_lock(ptl);
			ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
			ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
			if (likely(ptep &&
				   pte_same(huge_ptep_get(ptep), pte)))
				goto retry_avoidcopy;
@@ -5611,7 +5606,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
	 * before the page tables are altered
	 */
	spin_lock(ptl);
	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
	ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
		/* Break COW or unshare */
		huge_ptep_clear_flush(vma, haddr, ptep);
@@ -6397,7 +6392,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
		return NULL;

	hugetlb_vma_lock_read(vma);
	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
	pte = hugetlb_walk(vma, haddr, huge_page_size(h));
	if (!pte)
		goto out_unlock;

@@ -6462,7 +6457,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
		 *
		 * Note that page table lock is not held when pte is null.
		 */
		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
		pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
				   huge_page_size(h));
		if (pte)
			ptl = huge_pte_lock(h, mm, pte);
@@ -6654,7 +6649,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
	last_addr_mask = hugetlb_mask_last_page(h);
	for (; address < end; address += psize) {
		spinlock_t *ptl;
		ptep = huge_pte_offset(mm, address, psize);
		ptep = hugetlb_walk(vma, address, psize);
		if (!ptep) {
			if (!uffd_wp) {
				address |= last_addr_mask;
@@ -7064,7 +7059,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,

		saddr = page_table_shareable(svma, vma, addr, idx);
		if (saddr) {
			spte = huge_pte_offset(svma->vm_mm, saddr,
			spte = hugetlb_walk(svma, saddr,
					    vma_mmu_pagesize(svma));
			if (spte) {
				get_page(virt_to_page(spte));
@@ -7377,7 +7372,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
	hugetlb_vma_lock_write(vma);
	i_mmap_lock_write(vma->vm_file->f_mapping);
	for (address = start; address < end; address += PUD_SIZE) {
		ptep = huge_pte_offset(mm, address, sz);
		ptep = hugetlb_walk(vma, address, sz);
		if (!ptep)
			continue;
		ptl = huge_pte_lock(h, mm, ptep);
+6 −3
Original line number Diff line number Diff line
@@ -168,9 +168,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
		/* The only possible mapping was handled on last iteration */
		if (pvmw->pte)
			return not_found(pvmw);

		/* when pud is not present, pte will be NULL */
		pvmw->pte = huge_pte_offset(mm, pvmw->address, size);
		/*
		 * All callers that get here will already hold the
		 * i_mmap_rwsem.  Therefore, no additional locks need to be
		 * taken before calling hugetlb_walk().
		 */
		pvmw->pte = hugetlb_walk(vma, pvmw->address, size);
		if (!pvmw->pte)
			return false;

Loading