Commit 49b06385 authored by Suren Baghdasaryan, committed by Andrew Morton

mm: enable page walking API to lock vmas during the walk

walk_page_range() and friends often operate under a write-locked mmap_lock. 
With the introduction of vma locks, the vmas have to be locked as well during
such walks to prevent concurrent page faults in these areas.  Add an
additional member to mm_walk_ops to indicate the locking requirements for the
walk.

The change ensures that page walks which prevent concurrent page faults
by write-locking mmap_lock operate correctly after the introduction of
per-vma locks.  With per-vma locks, page faults can be handled under the vma
lock without taking mmap_lock at all, so write-locking mmap_lock alone would
not stop them.  The change ensures vmas are properly locked during such
walks.

A sample issue this solves is do_mbind() performing queue_pages_range()
to queue pages for migration.  Without this change, a page can be
concurrently faulted into the area and be left out of migration.
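
For illustration, the caller-side pattern looks like the sketch below.  The
ops and callback names are placeholders rather than code from this commit;
only the .walk_lock field reflects the new API.  A walk such as
queue_pages_range(), which relies on a write-locked mmap_lock to exclude
concurrent faults, additionally asks for each vma to be write-locked:

/* Hypothetical walker that must not race with page faults. */
static const struct mm_walk_ops example_migrate_walk_ops = {
	.pmd_entry	= example_migrate_pmd_entry,	/* placeholder callback */
	.test_walk	= example_migrate_test_walk,	/* placeholder callback */
	/* Write-lock each vma so faults handled under per-vma locks wait. */
	.walk_lock	= PGWALK_WRLOCK,
};

The walk is then started, with mmap_write_lock(mm) already held, via
walk_page_range(mm, start, end, &example_migrate_walk_ops, priv); the core
walker applies the requested vma locking before invoking the callbacks.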

Link: https://lkml.kernel.org/r/20230804152724.3090321-2-surenb@google.com


Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Suggested-by: Jann Horn <jannh@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michel Lespinasse <michel@lespinasse.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 8b9c1cc0
+1 −0
@@ -145,6 +145,7 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,

static const struct mm_walk_ops subpage_walk_ops = {
	.pmd_entry	= subpage_walk_pmd_entry,
	.walk_lock	= PGWALK_WRLOCK_VERIFY,
};

static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+1 −0
@@ -102,6 +102,7 @@ static const struct mm_walk_ops pageattr_ops = {
	.pmd_entry = pageattr_pmd_entry,
	.pte_entry = pageattr_pte_entry,
	.pte_hole = pageattr_pte_hole,
	.walk_lock = PGWALK_RDLOCK,
};

static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
+5 −0
@@ -2514,6 +2514,7 @@ static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,

static const struct mm_walk_ops thp_split_walk_ops = {
	.pmd_entry	= thp_split_walk_pmd_entry,
	.walk_lock	= PGWALK_WRLOCK_VERIFY,
};

static inline void thp_split_mm(struct mm_struct *mm)
@@ -2565,6 +2566,7 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start,

static const struct mm_walk_ops zap_zero_walk_ops = {
	.pmd_entry	= __zap_zero_pages,
	.walk_lock	= PGWALK_WRLOCK,
};

/*
@@ -2655,6 +2657,7 @@ static const struct mm_walk_ops enable_skey_walk_ops = {
	.hugetlb_entry		= __s390_enable_skey_hugetlb,
	.pte_entry		= __s390_enable_skey_pte,
	.pmd_entry		= __s390_enable_skey_pmd,
	.walk_lock		= PGWALK_WRLOCK,
};

int s390_enable_skey(void)
@@ -2692,6 +2695,7 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr,

static const struct mm_walk_ops reset_cmma_walk_ops = {
	.pte_entry		= __s390_reset_cmma,
	.walk_lock		= PGWALK_WRLOCK,
};

void s390_reset_cmma(struct mm_struct *mm)
@@ -2728,6 +2732,7 @@ static int s390_gather_pages(pte_t *ptep, unsigned long addr,

static const struct mm_walk_ops gather_pages_ops = {
	.pte_entry = s390_gather_pages,
	.walk_lock = PGWALK_RDLOCK,
};

/*
+5 −0
@@ -757,12 +757,14 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
static const struct mm_walk_ops smaps_walk_ops = {
	.pmd_entry		= smaps_pte_range,
	.hugetlb_entry		= smaps_hugetlb_range,
	.walk_lock		= PGWALK_RDLOCK,
};

static const struct mm_walk_ops smaps_shmem_walk_ops = {
	.pmd_entry		= smaps_pte_range,
	.hugetlb_entry		= smaps_hugetlb_range,
	.pte_hole		= smaps_pte_hole,
	.walk_lock		= PGWALK_RDLOCK,
};

/*
@@ -1244,6 +1246,7 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
static const struct mm_walk_ops clear_refs_walk_ops = {
	.pmd_entry		= clear_refs_pte_range,
	.test_walk		= clear_refs_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};

static ssize_t clear_refs_write(struct file *file, const char __user *buf,
@@ -1621,6 +1624,7 @@ static const struct mm_walk_ops pagemap_ops = {
	.pmd_entry	= pagemap_pmd_range,
	.pte_hole	= pagemap_pte_hole,
	.hugetlb_entry	= pagemap_hugetlb_range,
	.walk_lock	= PGWALK_RDLOCK,
};

/*
@@ -1934,6 +1938,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
static const struct mm_walk_ops show_numa_ops = {
	.hugetlb_entry = gather_hugetlb_stats,
	.pmd_entry = gather_pte_stats,
	.walk_lock = PGWALK_RDLOCK,
};

/*
+11 −0
@@ -6,6 +6,16 @@

struct mm_walk;

/* Locking requirement during a page walk. */
enum page_walk_lock {
	/* mmap_lock should be locked for read to stabilize the vma tree */
	PGWALK_RDLOCK = 0,
	/* vma will be write-locked during the walk */
	PGWALK_WRLOCK = 1,
	/* vma is expected to be already write-locked during the walk */
	PGWALK_WRLOCK_VERIFY = 2,
};

/**
 * struct mm_walk_ops - callbacks for walk_page_range
 * @pgd_entry:		if set, called for each non-empty PGD (top-level) entry
@@ -66,6 +76,7 @@ struct mm_walk_ops {
	int (*pre_vma)(unsigned long start, unsigned long end,
		       struct mm_walk *walk);
	void (*post_vma)(struct mm_walk *walk);
	enum page_walk_lock walk_lock;
};

/*
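
Not shown in this excerpt is the mm/pagewalk.c side that acts on the new
field.  A minimal sketch of how the walker could enforce the requirement per
vma follows; the helper name and its exact placement are assumptions, but the
behavior mirrors the enum's documentation: write-lock the vma for
PGWALK_WRLOCK, assert it is already write-locked for PGWALK_WRLOCK_VERIFY,
and rely on the read-locked mmap_lock for PGWALK_RDLOCK.

/*
 * Sketch only: how the core walker might apply walk_lock before
 * invoking callbacks on a vma.  The helper name is illustrative.
 */
static void example_walk_lock_vma(struct vm_area_struct *vma,
				  enum page_walk_lock walk_lock)
{
	switch (walk_lock) {
	case PGWALK_WRLOCK:
		vma_start_write(vma);		/* block per-vma-lock faults */
		break;
	case PGWALK_WRLOCK_VERIFY:
		vma_assert_write_locked(vma);	/* caller already locked it */
		break;
	case PGWALK_RDLOCK:
		/* mmap_lock held for read stabilizes the vma tree */
		break;
	}
}

The _VERIFY variant matches the powerpc and s390 hunks above, where the
caller is expected to have write-locked the vma before the walk starts.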