Commit 22e5fe2a authored by Nadav Amit's avatar Nadav Amit Committed by Linus Torvalds
Browse files

userfaultfd: prevent concurrent API initialization

userfaultfd assumes that the enabled features are set once and never
changed after UFFDIO_API ioctl succeeded.

However, currently, UFFDIO_API can be called concurrently from two
different threads, succeed on both threads and leave userfaultfd's
features in non-deterministic state.  Theoretically, other uffd operations
(ioctl's and page-faults) can be dispatched while adversely affected by
such changes of features.

Moreover, the writes to ctx->state and ctx->features are not ordered,
which can - theoretically, again - let userfaultfd_ioctl() think that
userfaultfd API completed, while the features are still not initialized.

To avoid races, it is arguably best to get rid of ctx->state.  Since there
are only 2 states, record the API initialization in ctx->features as the
uppermost bit and remove ctx->state.

Link: https://lkml.kernel.org/r/20210808020724.1022515-3-namit@vmware.com


Fixes: 9cd75c3c ("userfaultfd: non-cooperative: add ability to report non-PF events from uffd descriptor")
Signed-off-by: default avatarNadav Amit <namit@vmware.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent a759a909
Loading
Loading
Loading
Loading
+44 −47
Original line number Diff line number Diff line
@@ -33,11 +33,6 @@ int sysctl_unprivileged_userfaultfd __read_mostly;

static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;

enum userfaultfd_state {
	UFFD_STATE_WAIT_API,
	UFFD_STATE_RUNNING,
};

/*
 * Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
@@ -69,8 +64,6 @@ struct userfaultfd_ctx {
	unsigned int flags;
	/* features requested from the userspace */
	unsigned int features;
	/* state machine */
	enum userfaultfd_state state;
	/* released */
	bool released;
	/* memory mappings are changing because of non-cooperative event */
@@ -104,6 +97,14 @@ struct userfaultfd_wake_range {
	unsigned long len;
};

/* internal indication that UFFD_API ioctl was successfully executed */
#define UFFD_FEATURE_INITIALIZED		(1u << 31)

static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
{
	return ctx->features & UFFD_FEATURE_INITIALIZED;
}

static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				     int wake_flags, void *key)
{
@@ -667,7 +668,6 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)

		refcount_set(&ctx->refcount, 1);
		ctx->flags = octx->flags;
		ctx->state = UFFD_STATE_RUNNING;
		ctx->features = octx->features;
		ctx->released = false;
		atomic_set(&ctx->mmap_changing, 0);
@@ -944,10 +944,9 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)

	poll_wait(file, &ctx->fd_wqh, wait);

	switch (ctx->state) {
	case UFFD_STATE_WAIT_API:
	if (!userfaultfd_is_initialized(ctx))
		return EPOLLERR;
	case UFFD_STATE_RUNNING:

	/*
	 * poll() never guarantees that read won't block.
	 * userfaults can be waken before they're read().
@@ -972,10 +971,6 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
		ret = EPOLLIN;

	return ret;
	default:
		WARN_ON_ONCE(1);
		return EPOLLERR;
	}
}

static const struct file_operations userfaultfd_fops;
@@ -1170,7 +1165,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
	int no_wait = file->f_flags & O_NONBLOCK;
	struct inode *inode = file_inode(file);

	if (ctx->state == UFFD_STATE_WAIT_API)
	if (!userfaultfd_is_initialized(ctx))
		return -EINVAL;

	for (;;) {
@@ -1909,9 +1904,10 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	/*
	 * For the current set of features the bits just coincide
	 * For the current set of features the bits just coincide. Set
	 * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
	 */
	return (unsigned int)user_features;
	return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}

/*
@@ -1924,12 +1920,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
{
	struct uffdio_api uffdio_api;
	void __user *buf = (void __user *)arg;
	unsigned int ctx_features;
	int ret;
	__u64 features;

	ret = -EINVAL;
	if (ctx->state != UFFD_STATE_WAIT_API)
		goto out;
	ret = -EFAULT;
	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
		goto out;
@@ -1953,9 +1947,13 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
	ret = -EFAULT;
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		goto out;
	ctx->state = UFFD_STATE_RUNNING;

	/* only enable the requested features for this uffd context */
	ctx->features = uffd_ctx_features(features);
	ctx_features = uffd_ctx_features(features);
	ret = -EINVAL;
	if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
		goto err_out;

	ret = 0;
out:
	return ret;
@@ -1972,7 +1970,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
	int ret = -EINVAL;
	struct userfaultfd_ctx *ctx = file->private_data;

	if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
	if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
		return -EINVAL;

	switch(cmd) {
@@ -2086,7 +2084,6 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
	refcount_set(&ctx->refcount, 1);
	ctx->flags = flags;
	ctx->features = 0;
	ctx->state = UFFD_STATE_WAIT_API;
	ctx->released = false;
	atomic_set(&ctx->mmap_changing, 0);
	ctx->mm = current->mm;