Commit 40d49a3c authored by Matthew Wilcox (Oracle), committed by Andrew Morton

mm: allow ->huge_fault() to be called without the mmap_lock held

Remove the checks for the VMA lock being held, allowing the page fault
path to call into the filesystem instead of retrying with the mmap_lock
held.  This will improve scalability for DAX page faults.  Also update the
documentation to match (and fix some other changes that have happened
recently).

Link: https://lkml.kernel.org/r/20230818202335.2739663-3-willy@infradead.org


Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 051ddcfe
Documentation/filesystems/locking.rst  +23 −13
@@ -630,24 +630,27 @@ prototypes::

	void (*open)(struct vm_area_struct *);
	void (*close)(struct vm_area_struct *);
-	vm_fault_t (*fault)(struct vm_area_struct*, struct vm_fault *);
+	vm_fault_t (*fault)(struct vm_fault *);
+	vm_fault_t (*huge_fault)(struct vm_fault *, unsigned int order);
+	vm_fault_t (*map_pages)(struct vm_fault *, pgoff_t start, pgoff_t end);
	vm_fault_t (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
	vm_fault_t (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *);
	int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);

locking rules:

-=============	=========	===========================
+=============	==========	===========================
ops		mmap_lock	PageLocked(page)
-=============	=========	===========================
-open:		yes
-close:		yes
-fault:		yes		can return with page locked
-map_pages:	read
-page_mkwrite:	yes		can return with page locked
-pfn_mkwrite:	yes
-access:		yes
-=============	=========	===========================
+=============	==========	===========================
+open:		write
+close:		read/write
+fault:		read		can return with page locked
+huge_fault:	maybe-read
+map_pages:	maybe-read
+page_mkwrite:	read		can return with page locked
+pfn_mkwrite:	read
+access:		read
+=============	==========	===========================

->fault() is called when a previously not present pte is about to be faulted
in. The filesystem must find and return the page associated with the passed in
@@ -657,6 +660,13 @@ then ensure the page is not already truncated (invalidate_lock will block
subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
locked. The VM will unlock the page.

+->huge_fault() is called when there is no PUD or PMD entry present.  This
+gives the filesystem the opportunity to install a PUD or PMD sized page.
+Filesystems can also use the ->fault method to return a PMD sized page,
+so implementing this function may not be necessary.  In particular,
+filesystems should not call filemap_fault() from ->huge_fault().
+The mmap_lock may not be held when this method is called.
+
->map_pages() is called when VM asks to map easy accessible pages.
Filesystem should find and map pages associated with offsets from "start_pgoff"
till "end_pgoff". ->map_pages() is called with the RCU lock held and must
Documentation/filesystems/porting.rst  +11 −0
@@ -943,3 +943,14 @@ file pointer instead of struct dentry pointer. d_tmpfile() is similarly
changed to simplify callers.  The passed file is in a non-open state and on
success must be opened before returning (e.g. by calling
finish_open_simple()).
+
+---
+
+**mandatory**
+
+Calling convention for ->huge_fault has changed.  It now takes a page
+order instead of an enum page_entry_size, and it may be called without the
+mmap_lock held.  All in-tree users have been audited and do not seem to
+depend on the mmap_lock being held, but out of tree users should verify
+for themselves.  If they do need it, they can return VM_FAULT_RETRY to
+be called with the mmap_lock held.
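
A hedged sketch of that escape hatch for an out-of-tree handler that still
depends on the mmap_lock; legacy_do_huge_fault() is a hypothetical helper,
and the bail-out mirrors the exact pattern this patch removes from
mm/memory.c below:

#include <linux/mm.h>

/* Hypothetical helper that requires the mmap_lock to be held. */
static vm_fault_t legacy_do_huge_fault(struct vm_fault *vmf, unsigned int order);

static vm_fault_t legacy_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	/*
	 * Called under the per-VMA lock only: drop it and return
	 * VM_FAULT_RETRY so the fault is repeated with the mmap_lock held.
	 */
	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
		vma_end_read(vmf->vma);
		return VM_FAULT_RETRY;
	}

	/* The mmap_lock is held for read from here on. */
	return legacy_do_huge_fault(vmf, order);
}
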
mm/memory.c  +2 −20
@@ -4854,13 +4854,8 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
	struct vm_area_struct *vma = vmf->vma;
	if (vma_is_anonymous(vma))
		return do_huge_pmd_anonymous_page(vmf);
-	if (vma->vm_ops->huge_fault) {
-		if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-			vma_end_read(vma);
-			return VM_FAULT_RETRY;
-		}
+	if (vma->vm_ops->huge_fault)
		return vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
-	}
	return VM_FAULT_FALLBACK;
}

@@ -4880,10 +4875,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)

	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
		if (vma->vm_ops->huge_fault) {
-			if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-				vma_end_read(vma);
-				return VM_FAULT_RETRY;
-			}
			ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
			if (!(ret & VM_FAULT_FALLBACK))
				return ret;
@@ -4904,13 +4895,8 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vma))
		return VM_FAULT_FALLBACK;
-	if (vma->vm_ops->huge_fault) {
-		if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-			vma_end_read(vma);
-			return VM_FAULT_RETRY;
-		}
+	if (vma->vm_ops->huge_fault)
		return vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
-	}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
	return VM_FAULT_FALLBACK;
}
@@ -4927,10 +4913,6 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
		goto split;
	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
		if (vma->vm_ops->huge_fault) {
-			if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-				vma_end_read(vma);
-				return VM_FAULT_RETRY;
-			}
			ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
			if (!(ret & VM_FAULT_FALLBACK))
				return ret;
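
For reference, the first hunk leaves create_huge_pmd() reading as follows
(reconstructed from the context lines above; the other three call sites are
simplified the same way):

static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	if (vma_is_anonymous(vma))
		return do_huge_pmd_anonymous_page(vmf);
	if (vma->vm_ops->huge_fault)
		return vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
	return VM_FAULT_FALLBACK;
}

With the checks gone from the core, each ->huge_fault() implementation
decides for itself whether it can run under the per-VMA lock, which is what
enables the DAX scalability improvement mentioned in the commit message.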