Commit f6191471 authored by Axel Rasmussen's avatar Axel Rasmussen Committed by Linus Torvalds
Browse files

userfaultfd: add UFFDIO_CONTINUE ioctl

This ioctl is how userspace ought to resolve "minor" userfaults.  The
idea is, userspace is notified that a minor fault has occurred.  It
might change the contents of the page using its second non-UFFD mapping,
or not.  Then, it calls UFFDIO_CONTINUE to tell the kernel "I have
ensured the page contents are correct, carry on setting up the mapping".

Note that it doesn't make much sense to use UFFDIO_{COPY,ZEROPAGE} for
MINOR registered VMAs.  ZEROPAGE maps the VMA to the zero page; but in
the minor fault case, we already have some pre-existing underlying page.
Likewise, UFFDIO_COPY isn't useful if we have a second non-UFFD mapping.
We'd just use memcpy() or similar instead.

It turns out hugetlb_mcopy_atomic_pte() already does very close to what
we want, if an existing page is provided via `struct page **pagep`.  We
already special-case the behavior a bit for the UFFDIO_ZEROPAGE case, so
just extend that design: add an enum for the three modes of operation,
and make the small adjustments needed for the MCOPY_ATOMIC_CONTINUE
case.  (Basically, look up the existing page, and avoid adding the
existing page to the page cache or calling set_page_huge_active() on
it.)

Link: https://lkml.kernel.org/r/20210301222728.176417-5-axelrasmussen@google.com


Signed-off-by: default avatarAxel Rasmussen <axelrasmussen@google.com>
Reviewed-by: default avatarPeter Xu <peterx@redhat.com>
Cc: Adam Ruprecht <ruprecht@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michal Koutn" <mkoutny@suse.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shawn Anastasio <shawn@anastas.io>
Cc: Steven Price <steven.price@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 714c1891
Loading
Loading
Loading
Loading
+67 −0
Original line number Diff line number Diff line
@@ -1487,6 +1487,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);

		/* CONTINUE ioctl is only supported for MINOR ranges. */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);

		/*
		 * Now that we scanned all vmas we can already tell
		 * userland which ioctls methods are guaranteed to
@@ -1840,6 +1844,66 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
	return ret;
}

static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
	__s64 ret;
	struct uffdio_continue uffdio_continue;
	struct uffdio_continue __user *user_uffdio_continue;
	struct userfaultfd_wake_range range;

	user_uffdio_continue = (struct uffdio_continue __user *)arg;

	ret = -EAGAIN;
	if (READ_ONCE(ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
			   /* don't copy the output fields */
			   sizeof(uffdio_continue) - (sizeof(__s64))))
		goto out;

	ret = validate_range(ctx->mm, &uffdio_continue.range.start,
			     uffdio_continue.range.len);
	if (ret)
		goto out;

	ret = -EINVAL;
	/* double check for wraparound just in case. */
	if (uffdio_continue.range.start + uffdio_continue.range.len <=
	    uffdio_continue.range.start) {
		goto out;
	}
	if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
		goto out;

	if (mmget_not_zero(ctx->mm)) {
		ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
				     uffdio_continue.range.len,
				     &ctx->mmap_changing);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}

	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
		return -EFAULT;
	if (ret < 0)
		goto out;

	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
		range.start = uffdio_continue.range.start;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;

out:
	return ret;
}

static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	/*
@@ -1927,6 +1991,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
	case UFFDIO_WRITEPROTECT:
		ret = userfaultfd_writeprotect(ctx, arg);
		break;
	case UFFDIO_CONTINUE:
		ret = userfaultfd_continue(ctx, arg);
		break;
	}
	return ret;
}
+3 −0
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@
#include <linux/kref.h>
#include <linux/pgtable.h>
#include <linux/gfp.h>
#include <linux/userfaultfd_k.h>

struct ctl_table;
struct user_struct;
@@ -139,6 +140,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
				struct vm_area_struct *dst_vma,
				unsigned long dst_addr,
				unsigned long src_addr,
				enum mcopy_atomic_mode mode,
				struct page **pagep);
#endif /* CONFIG_USERFAULTFD */
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
@@ -318,6 +320,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
						struct vm_area_struct *dst_vma,
						unsigned long dst_addr,
						unsigned long src_addr,
						enum mcopy_atomic_mode mode,
						struct page **pagep)
{
	BUG();
+18 −0
Original line number Diff line number Diff line
@@ -37,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd;

extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);

/*
 * The mode of operation for __mcopy_atomic and its helpers.
 *
 * This is almost an implementation detail (mcopy_atomic below doesn't take this
 * as a parameter), but it's exposed here because memory-kind-specific
 * implementations (e.g. hugetlbfs) need to know the mode of operation.
 */
enum mcopy_atomic_mode {
	/* A normal copy_from_user into the destination range. */
	MCOPY_ATOMIC_NORMAL,
	/* Don't copy; map the destination range to the zero page. */
	MCOPY_ATOMIC_ZEROPAGE,
	/* Just install pte(s) with the existing page(s) in the page cache. */
	MCOPY_ATOMIC_CONTINUE,
};

extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
			    unsigned long src_start, unsigned long len,
			    bool *mmap_changing, __u64 mode);
@@ -44,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
			      unsigned long dst_start,
			      unsigned long len,
			      bool *mmap_changing);
extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
			      unsigned long len, bool *mmap_changing);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
			       unsigned long start, unsigned long len,
			       bool enable_wp, bool *mmap_changing);
+19 −2
Original line number Diff line number Diff line
@@ -40,10 +40,12 @@
	((__u64)1 << _UFFDIO_WAKE |		\
	 (__u64)1 << _UFFDIO_COPY |		\
	 (__u64)1 << _UFFDIO_ZEROPAGE |		\
	 (__u64)1 << _UFFDIO_WRITEPROTECT)
	 (__u64)1 << _UFFDIO_WRITEPROTECT |	\
	 (__u64)1 << _UFFDIO_CONTINUE)
#define UFFD_API_RANGE_IOCTLS_BASIC		\
	((__u64)1 << _UFFDIO_WAKE |		\
	 (__u64)1 << _UFFDIO_COPY)
	 (__u64)1 << _UFFDIO_COPY |		\
	 (__u64)1 << _UFFDIO_CONTINUE)

/*
 * Valid ioctl command number range with this API is from 0x00 to
@@ -59,6 +61,7 @@
#define _UFFDIO_COPY			(0x03)
#define _UFFDIO_ZEROPAGE		(0x04)
#define _UFFDIO_WRITEPROTECT		(0x06)
#define _UFFDIO_CONTINUE		(0x07)
#define _UFFDIO_API			(0x3F)

/* userfaultfd ioctl ids */
@@ -77,6 +80,8 @@
				      struct uffdio_zeropage)
#define UFFDIO_WRITEPROTECT	_IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
				      struct uffdio_writeprotect)
#define UFFDIO_CONTINUE		_IOR(UFFDIO, _UFFDIO_CONTINUE,	\
				     struct uffdio_continue)

/* read() structure */
struct uffd_msg {
@@ -268,6 +273,18 @@ struct uffdio_writeprotect {
	__u64 mode;
};

struct uffdio_continue {
	struct uffdio_range range;
#define UFFDIO_CONTINUE_MODE_DONTWAKE		((__u64)1<<0)
	__u64 mode;

	/*
	 * Fields below here are written by the ioctl and must be at the end:
	 * the copy_from_user will not read past here.
	 */
	__s64 mapped;
};

/*
 * Flags for the userfaultfd(2) system call itself.
 */
+26 −14
Original line number Diff line number Diff line
@@ -39,7 +39,6 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include "internal.h"

@@ -4865,8 +4864,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
			    struct vm_area_struct *dst_vma,
			    unsigned long dst_addr,
			    unsigned long src_addr,
			    enum mcopy_atomic_mode mode,
			    struct page **pagep)
{
	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
	struct address_space *mapping;
	pgoff_t idx;
	unsigned long size;
@@ -4876,8 +4877,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
	spinlock_t *ptl;
	int ret;
	struct page *page;
	int writable;

	if (!*pagep) {
	mapping = dst_vma->vm_file->f_mapping;
	idx = vma_hugecache_offset(h, dst_vma, dst_addr);

	if (is_continue) {
		ret = -EFAULT;
		page = find_lock_page(mapping, idx);
		if (!page)
			goto out;
	} else if (!*pagep) {
		ret = -ENOMEM;
		page = alloc_huge_page(dst_vma, dst_addr, 0);
		if (IS_ERR(page))
@@ -4906,13 +4916,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
	 */
	__SetPageUptodate(page);

	mapping = dst_vma->vm_file->f_mapping;
	idx = vma_hugecache_offset(h, dst_vma, dst_addr);

	/*
	 * If shared, add to page cache
	 */
	if (vm_shared) {
	/* Add shared, newly allocated pages to the page cache. */
	if (vm_shared && !is_continue) {
		size = i_size_read(mapping->host) >> huge_page_shift(h);
		ret = -EFAULT;
		if (idx >= size)
@@ -4957,8 +4962,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
	}

	_dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
	if (dst_vma->vm_flags & VM_WRITE)
	/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
	if (is_continue && !vm_shared)
		writable = 0;
	else
		writable = dst_vma->vm_flags & VM_WRITE;

	_dst_pte = make_huge_pte(dst_vma, page, writable);
	if (writable)
		_dst_pte = huge_pte_mkdirty(_dst_pte);
	_dst_pte = pte_mkyoung(_dst_pte);

@@ -4972,15 +4983,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
	update_mmu_cache(dst_vma, dst_addr, dst_pte);

	spin_unlock(ptl);
	if (!is_continue)
		SetHPageMigratable(page);
	if (vm_shared)
	if (vm_shared || is_continue)
		unlock_page(page);
	ret = 0;
out:
	return ret;
out_release_unlock:
	spin_unlock(ptl);
	if (vm_shared)
	if (vm_shared || is_continue)
		unlock_page(page);
out_release_nounlock:
	put_page(page);
Loading