Commit ca60ad6a authored by Linus Torvalds
Browse files

Merge tag 'io_uring-5.6-2020-02-14' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:
 "Here's a set of fixes for io_uring:

   - Various fixes with cleanups from Pavel, fixing corner cases where
     we're not correctly dealing with iovec cleanup.

   - Clarify that statx/openat/openat2 don't accept fixed files

   - Buffered raw device write EOPNOTSUPP fix

   - Ensure async workers grab current->fs

   - A few task exit fixes with pending requests that grab the file
     table

   - send/recvmsg async load fix

   - io-wq offline node setup fix

   - CQ overflow flush in poll"

* tag 'io_uring-5.6-2020-02-14' of git://git.kernel.dk/linux-block: (21 commits)
  io_uring: prune request from overflow list on flush
  io-wq: don't call kXalloc_node() with non-online node
  io_uring: retain sockaddr_storage across send/recvmsg async punt
  io_uring: cancel pending async work if task exits
  io-wq: add io_wq_cancel_pid() to cancel based on a specific pid
  io-wq: make io_wqe_cancel_work() take a match handler
  io_uring: fix openat/statx's filename leak
  io_uring: fix double prep iovec leak
  io_uring: fix async close() with f_op->flush()
  io_uring: allow AT_FDCWD for non-file openat/openat2/statx
  io_uring: grab ->fs as part of async preparation
  io-wq: add support for inheriting ->fs
  io_uring: retry raw bdev writes if we hit -EOPNOTSUPP
  io_uring: add cleanup for openat()/statx()
  io_uring: fix iovec leaks
  io_uring: remove unused struct io_async_open
  io_uring: flush overflowed CQ events in the io_uring_poll()
  io_uring: statx/openat/openat2 don't support fixed files
  io_uring: fix deferred req iovec leak
  io_uring: fix 1-bit bitfields to be unsigned
  ...
parents 2019fc96 2ca10259
Loading
Loading
Loading
Loading
+77 −15
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/rculist_nulls.h>
#include <linux/fs_struct.h>

#include "io-wq.h"

@@ -59,6 +60,7 @@ struct io_worker {
	const struct cred *cur_creds;
	const struct cred *saved_creds;
	struct files_struct *restore_files;
	struct fs_struct *restore_fs;
};

#if BITS_PER_LONG == 64
@@ -151,6 +153,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
		task_unlock(current);
	}

	if (current->fs != worker->restore_fs)
		current->fs = worker->restore_fs;

	/*
	 * If we have an active mm, we need to drop the wq lock before unusing
	 * it. If we do, return true and let the caller retry the idle loop.
@@ -311,6 +316,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)

	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
	worker->restore_files = current->files;
	worker->restore_fs = current->fs;
	io_wqe_inc_running(wqe, worker);
}

@@ -481,6 +487,8 @@ static void io_worker_handle_work(struct io_worker *worker)
			current->files = work->files;
			task_unlock(current);
		}
		if (work->fs && current->fs != work->fs)
			current->fs = work->fs;
		if (work->mm != worker->mm)
			io_wq_switch_mm(worker, work);
		if (worker->cur_creds != work->creds)
@@ -691,11 +699,16 @@ static int io_wq_manager(void *data)
	/* create fixed workers */
	refcount_set(&wq->refs, workers_to_create);
	for_each_node(node) {
		if (!node_online(node))
			continue;
		if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
			goto err;
		workers_to_create--;
	}

	while (workers_to_create--)
		refcount_dec(&wq->refs);

	complete(&wq->done);

	while (!kthread_should_stop()) {
@@ -703,6 +716,9 @@ static int io_wq_manager(void *data)
			struct io_wqe *wqe = wq->wqes[node];
			bool fork_worker[2] = { false, false };

			if (!node_online(node))
				continue;

			spin_lock_irq(&wqe->lock);
			if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
				fork_worker[IO_WQ_ACCT_BOUND] = true;
@@ -821,6 +837,8 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,

	list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
		if (io_worker_get(worker)) {
			/* no task if node is/was offline */
			if (worker->task)
				ret = func(worker, data);
			io_worker_release(worker);
			if (ret)
@@ -929,17 +947,19 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
	return ret;
}

/*
 * Pairs a predicate with the opaque argument handed to it, so the cancel
 * helpers can select work items by an arbitrary criterion.
 */
struct work_match {
	/* returns true when @work is the item the caller is looking for */
	bool (*fn)(struct io_wq_work *work, void *data);
	/* passed unchanged as the second argument of ->fn */
	void *data;
};

static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
	struct io_wq_work *work = data;
	struct work_match *match = data;
	unsigned long flags;
	bool ret = false;

	if (worker->cur_work != work)
		return false;

	spin_lock_irqsave(&worker->lock, flags);
	if (worker->cur_work == work &&
	if (match->fn(worker->cur_work, match->data) &&
	    !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
		send_sig(SIGINT, worker->task, 1);
		ret = true;
@@ -950,15 +970,13 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
}

static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
					    struct io_wq_work *cwork)
					    struct work_match *match)
{
	struct io_wq_work_node *node, *prev;
	struct io_wq_work *work;
	unsigned long flags;
	bool found = false;

	cwork->flags |= IO_WQ_WORK_CANCEL;

	/*
	 * First check pending list, if we're lucky we can just remove it
	 * from there. CANCEL_OK means that the work is returned as-new,
@@ -968,7 +986,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
	wq_list_for_each(node, prev, &wqe->work_list) {
		work = container_of(node, struct io_wq_work, list);

		if (work == cwork) {
		if (match->fn(work, match->data)) {
			wq_node_del(&wqe->work_list, node, prev);
			found = true;
			break;
@@ -989,20 +1007,60 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
	 * completion will run normally in this case.
	 */
	rcu_read_lock();
	found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, cwork);
	found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
	rcu_read_unlock();
	return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
}

/*
 * Match handler that selects exactly one work item: @data is the work
 * pointer being searched for and is compared by address only.
 */
static bool io_wq_work_match(struct io_wq_work *work, void *data)
{
	struct io_wq_work *wanted = data;

	return wanted == work;
}

/*
 * Try to cancel one specific work item.
 *
 * @wq:    the io-wq instance to search
 * @cwork: the work item to cancel; IO_WQ_WORK_CANCEL is set in its flags
 *         before the search starts
 *
 * Every node visited by for_each_node() is asked to cancel the item through
 * io_wqe_cancel_work(), using an address-equality match. The walk stops at
 * the first node that reports anything other than IO_WQ_CANCEL_NOTFOUND.
 *
 * Returns the result of that node, or IO_WQ_CANCEL_NOTFOUND when no node
 * knew about the work.
 */
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
{
	struct work_match match = {
		.fn	= io_wq_work_match,
		.data	= cwork
	};
	enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
	int node;

	cwork->flags |= IO_WQ_WORK_CANCEL;

	for_each_node(node) {
		struct io_wqe *wqe = wq->wqes[node];

		/* io_wqe_cancel_work() takes the match descriptor, not the work */
		ret = io_wqe_cancel_work(wqe, &match);
		if (ret != IO_WQ_CANCEL_NOTFOUND)
			break;
	}

	return ret;
}

/*
 * Match handler that selects work by submitting pid. @data is not a real
 * pointer: it carries the pid value cast through unsigned long. A NULL
 * @work never matches.
 */
static bool io_wq_pid_match(struct io_wq_work *work, void *data)
{
	pid_t wanted = (pid_t) (unsigned long) data;

	return work && work->task_pid == wanted;
}

enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
{
	struct work_match match = {
		.fn	= io_wq_pid_match,
		.data	= (void *) (unsigned long) pid
	};
	enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
	int node;

	for_each_node(node) {
		struct io_wqe *wqe = wq->wqes[node];

		ret = io_wqe_cancel_work(wqe, &match);
		if (ret != IO_WQ_CANCEL_NOTFOUND)
			break;
	}
@@ -1036,6 +1094,8 @@ void io_wq_flush(struct io_wq *wq)
	for_each_node(node) {
		struct io_wqe *wqe = wq->wqes[node];

		if (!node_online(node))
			continue;
		init_completion(&data.done);
		INIT_IO_WORK(&data.work, io_wq_flush_func);
		data.work.flags |= IO_WQ_WORK_INTERNAL;
@@ -1067,12 +1127,15 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)

	for_each_node(node) {
		struct io_wqe *wqe;
		int alloc_node = node;

		wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node);
		if (!node_online(alloc_node))
			alloc_node = NUMA_NO_NODE;
		wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
		if (!wqe)
			goto err;
		wq->wqes[node] = wqe;
		wqe->node = node;
		wqe->node = alloc_node;
		wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
		atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
		if (wq->user) {
@@ -1080,7 +1143,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
					task_rlimit(current, RLIMIT_NPROC);
		}
		atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
		wqe->node = node;
		wqe->wq = wq;
		spin_lock_init(&wqe->lock);
		INIT_WQ_LIST(&wqe->work_list);
+5 −1
Original line number Diff line number Diff line
@@ -74,17 +74,20 @@ struct io_wq_work {
	struct files_struct *files;
	struct mm_struct *mm;
	const struct cred *creds;
	struct fs_struct *fs;
	unsigned flags;
	pid_t task_pid;
};

/*
 * Put a work item into its pristine state and install @_func as its handler.
 *
 * Every field is reset exactly once. ->task_pid must be cleared too, because
 * io_req_work_grab_env() only records the submitter's pid when the field is
 * still zero; leaving stale memory there would make pid-based cancellation
 * act on garbage.
 */
#define INIT_IO_WORK(work, _func)			\
	do {						\
		(work)->list.next = NULL;		\
		(work)->func = (_func);			\
		(work)->files = NULL;			\
		(work)->mm = NULL;			\
		(work)->creds = NULL;			\
		(work)->fs = NULL;			\
		(work)->flags = 0;			\
		(work)->task_pid = 0;			\
	} while (0)

typedef void (get_work_fn)(struct io_wq_work *);
@@ -107,6 +110,7 @@ void io_wq_flush(struct io_wq *wq);

void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid);

typedef bool (work_cancel_fn)(struct io_wq_work *, void *);

+202 −97
Original line number Diff line number Diff line
@@ -75,6 +75,7 @@
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/fs_struct.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -204,11 +205,11 @@ struct io_ring_ctx {

	struct {
		unsigned int		flags;
		int			compat: 1;
		int			account_mem: 1;
		int			cq_overflow_flushed: 1;
		int			drain_next: 1;
		int			eventfd_async: 1;
		unsigned int		compat: 1;
		unsigned int		account_mem: 1;
		unsigned int		cq_overflow_flushed: 1;
		unsigned int		drain_next: 1;
		unsigned int		eventfd_async: 1;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
@@ -441,6 +442,7 @@ struct io_async_msghdr {
	struct iovec			*iov;
	struct sockaddr __user		*uaddr;
	struct msghdr			msg;
	struct sockaddr_storage		addr;
};

struct io_async_rw {
@@ -450,17 +452,12 @@ struct io_async_rw {
	ssize_t				size;
};

struct io_async_open {
	struct filename			*filename;
};

struct io_async_ctx {
	union {
		struct io_async_rw	rw;
		struct io_async_msghdr	msg;
		struct io_async_connect	connect;
		struct io_timeout_data	timeout;
		struct io_async_open	open;
	};
};

@@ -483,6 +480,8 @@ enum {
	REQ_F_MUST_PUNT_BIT,
	REQ_F_TIMEOUT_NOSEQ_BIT,
	REQ_F_COMP_LOCKED_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_OVERFLOW_BIT,
};

enum {
@@ -521,6 +520,10 @@ enum {
	REQ_F_TIMEOUT_NOSEQ	= BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
	/* completion under lock */
	REQ_F_COMP_LOCKED	= BIT(REQ_F_COMP_LOCKED_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
	/* in overflow list */
	REQ_F_OVERFLOW		= BIT(REQ_F_OVERFLOW_BIT),
};

/*
@@ -553,7 +556,6 @@ struct io_kiocb {
	 * llist_node is only used for poll deferred completions
	 */
	struct llist_node		llist_node;
	bool				has_user;
	bool				in_async;
	bool				needs_fixed_file;
	u8				opcode;
@@ -614,6 +616,8 @@ struct io_op_def {
	unsigned		not_supported : 1;
	/* needs file table */
	unsigned		file_table : 1;
	/* needs ->fs */
	unsigned		needs_fs : 1;
};

static const struct io_op_def io_op_defs[] = {
@@ -656,12 +660,14 @@ static const struct io_op_def io_op_defs[] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.needs_fs		= 1,
	},
	[IORING_OP_RECVMSG] = {
		.async_ctx		= 1,
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.needs_fs		= 1,
	},
	[IORING_OP_TIMEOUT] = {
		.async_ctx		= 1,
@@ -692,6 +698,7 @@ static const struct io_op_def io_op_defs[] = {
		.needs_file		= 1,
		.fd_non_neg		= 1,
		.file_table		= 1,
		.needs_fs		= 1,
	},
	[IORING_OP_CLOSE] = {
		.needs_file		= 1,
@@ -705,6 +712,7 @@ static const struct io_op_def io_op_defs[] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.fd_non_neg		= 1,
		.needs_fs		= 1,
	},
	[IORING_OP_READ] = {
		.needs_mm		= 1,
@@ -736,6 +744,7 @@ static const struct io_op_def io_op_defs[] = {
		.needs_file		= 1,
		.fd_non_neg		= 1,
		.file_table		= 1,
		.needs_fs		= 1,
	},
	[IORING_OP_EPOLL_CTL] = {
		.unbound_nonreg_file	= 1,
@@ -754,6 +763,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
static void io_ring_file_ref_flush(struct fixed_file_data *data);
static void io_cleanup_req(struct io_kiocb *req);

static struct kmem_cache *req_cachep;

@@ -909,6 +919,18 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
	}
	if (!req->work.creds)
		req->work.creds = get_current_cred();
	if (!req->work.fs && def->needs_fs) {
		spin_lock(&current->fs->lock);
		if (!current->fs->in_exec) {
			req->work.fs = current->fs;
			req->work.fs->users++;
		} else {
			req->work.flags |= IO_WQ_WORK_CANCEL;
		}
		spin_unlock(&current->fs->lock);
	}
	if (!req->work.task_pid)
		req->work.task_pid = task_pid_vnr(current);
}

static inline void io_req_work_drop_env(struct io_kiocb *req)
@@ -921,6 +943,16 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
		put_cred(req->work.creds);
		req->work.creds = NULL;
	}
	if (req->work.fs) {
		struct fs_struct *fs = req->work.fs;

		spin_lock(&req->work.fs->lock);
		if (--fs->users)
			fs = NULL;
		spin_unlock(&req->work.fs->lock);
		if (fs)
			free_fs_struct(fs);
	}
}

static inline bool io_prep_async_work(struct io_kiocb *req,
@@ -1074,6 +1106,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
		req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
						list);
		list_move(&req->list, &list);
		req->flags &= ~REQ_F_OVERFLOW;
		if (cqe) {
			WRITE_ONCE(cqe->user_data, req->user_data);
			WRITE_ONCE(cqe->res, req->result);
@@ -1126,6 +1159,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
			set_bit(0, &ctx->sq_check_overflow);
			set_bit(0, &ctx->cq_check_overflow);
		}
		req->flags |= REQ_F_OVERFLOW;
		refcount_inc(&req->refs);
		req->result = res;
		list_add_tail(&req->list, &ctx->cq_overflow_list);
@@ -1241,6 +1275,9 @@ static void __io_free_req(struct io_kiocb *req)
{
	__io_req_aux_free(req);

	if (req->flags & REQ_F_NEED_CLEANUP)
		io_cleanup_req(req);

	if (req->flags & REQ_F_INFLIGHT) {
		struct io_ring_ctx *ctx = req->ctx;
		unsigned long flags;
@@ -2056,9 +2093,6 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
		return iorw->size;
	}

	if (!req->has_user)
		return -EFAULT;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
@@ -2137,6 +2171,8 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
		req->io->rw.iov = req->io->rw.fast_iov;
		memcpy(req->io->rw.iov, fast_iov,
			sizeof(struct iovec) * iter->nr_segs);
	} else {
		req->flags |= REQ_F_NEED_CLEANUP;
	}
}

@@ -2148,17 +2184,6 @@ static int io_alloc_async_ctx(struct io_kiocb *req)
	return req->io == NULL;
}

/*
 * Work handler wrapping io_wq_submit_work() for read/write requests: it
 * remembers a separately held iovec before the work runs and kfree()s it
 * once the work has returned.
 */
static void io_rw_async(struct io_wq_work **workptr)
{
	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
	struct iovec *iov = NULL;

	/* iov stays NULL when the request points at its own fast_iov */
	if (req->io->rw.iov != req->io->rw.fast_iov)
		iov = req->io->rw.iov;
	/*
	 * NOTE(review): the pointer is captured before submission, presumably
	 * because req must not be touched once the work has run - confirm.
	 */
	io_wq_submit_work(workptr);
	kfree(iov);
}

static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
			     struct iovec *iovec, struct iovec *fast_iov,
			     struct iov_iter *iter)
@@ -2171,7 +2196,6 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,

		io_req_map_rw(req, io_size, iovec, fast_iov, iter);
	}
	req->work.func = io_rw_async;
	return 0;
}

@@ -2189,7 +2213,8 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
	if (unlikely(!(req->file->f_mode & FMODE_READ)))
		return -EBADF;

	if (!req->io)
	/* either don't need iovec imported or already have it */
	if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
		return 0;

	io = req->io;
@@ -2258,8 +2283,8 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
		}
	}
out_free:
	if (!io_wq_current_is_worker())
	kfree(iovec);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	return ret;
}

@@ -2277,7 +2302,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
		return -EBADF;

	if (!req->io)
	/* either don't need iovec imported or already have it */
	if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
		return 0;

	io = req->io;
@@ -2352,6 +2378,12 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
			ret2 = call_write_iter(req->file, kiocb, &iter);
		else
			ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
		/*
		 * Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just
		 * retry them without IOCB_NOWAIT.
		 */
		if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
			ret2 = -EAGAIN;
		if (!force_nonblock || ret2 != -EAGAIN) {
			kiocb_done(kiocb, ret2, nxt, req->in_async);
		} else {
@@ -2364,7 +2396,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
		}
	}
out_free:
	if (!io_wq_current_is_worker())
	req->flags &= ~REQ_F_NEED_CLEANUP;
	kfree(iovec);
	return ret;
}
@@ -2534,6 +2566,10 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)

	if (sqe->ioprio || sqe->buf_index)
		return -EINVAL;
	if (sqe->flags & IOSQE_FIXED_FILE)
		return -EBADF;
	if (req->flags & REQ_F_NEED_CLEANUP)
		return 0;

	req->open.dfd = READ_ONCE(sqe->fd);
	req->open.how.mode = READ_ONCE(sqe->len);
@@ -2547,6 +2583,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
		return ret;
	}

	req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

@@ -2559,6 +2596,10 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)

	if (sqe->ioprio || sqe->buf_index)
		return -EINVAL;
	if (sqe->flags & IOSQE_FIXED_FILE)
		return -EBADF;
	if (req->flags & REQ_F_NEED_CLEANUP)
		return 0;

	req->open.dfd = READ_ONCE(sqe->fd);
	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -2583,6 +2624,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
		return ret;
	}

	req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

@@ -2614,6 +2656,7 @@ static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
	}
err:
	putname(req->open.filename);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < 0)
		req_set_fail_links(req);
	io_cqring_add_event(req, ret);
@@ -2754,6 +2797,10 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)

	if (sqe->ioprio || sqe->buf_index)
		return -EINVAL;
	if (sqe->flags & IOSQE_FIXED_FILE)
		return -EBADF;
	if (req->flags & REQ_F_NEED_CLEANUP)
		return 0;

	req->open.dfd = READ_ONCE(sqe->fd);
	req->open.mask = READ_ONCE(sqe->len);
@@ -2771,6 +2818,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
		return ret;
	}

	req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

@@ -2808,6 +2856,7 @@ static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
		ret = cp_statx(&stat, ctx->buffer);
err:
	putname(ctx->filename);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < 0)
		req_set_fail_links(req);
	io_cqring_add_event(req, ret);
@@ -2827,7 +2876,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	    sqe->rw_flags || sqe->buf_index)
		return -EINVAL;
	if (sqe->flags & IOSQE_FIXED_FILE)
		return -EINVAL;
		return -EBADF;

	req->close.fd = READ_ONCE(sqe->fd);
	if (req->file->f_op == &io_uring_fops ||
@@ -2837,24 +2886,25 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	return 0;
}

static void io_close_finish(struct io_wq_work **workptr)
/* only called when __close_fd_get_file() is done */
static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt)
{
	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
	struct io_kiocb *nxt = NULL;

	/* Invoked with files, we need to do the close */
	if (req->work.files) {
	int ret;

	ret = filp_close(req->close.put_file, req->work.files);
	if (ret < 0)
		req_set_fail_links(req);
	io_cqring_add_event(req, ret);
	fput(req->close.put_file);
	io_put_req_find_next(req, nxt);
}

	fput(req->close.put_file);
static void io_close_finish(struct io_wq_work **workptr)
{
	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
	struct io_kiocb *nxt = NULL;

	io_put_req_find_next(req, &nxt);
	__io_close_finish(req, &nxt);
	if (nxt)
		io_wq_assign_next(workptr, nxt);
}
@@ -2877,22 +2927,8 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
	 * No ->flush(), safely close from here and just punt the
	 * fput() to async context.
	 */
	ret = filp_close(req->close.put_file, current->files);

	if (ret < 0)
		req_set_fail_links(req);
	io_cqring_add_event(req, ret);

	if (io_wq_current_is_worker()) {
		struct io_wq_work *old_work, *work;

		old_work = work = &req->work;
		io_close_finish(&work);
		if (work && work != old_work)
			*nxt = container_of(work, struct io_kiocb, work);
	__io_close_finish(req, nxt);
	return 0;
	}

eagain:
	req->work.func = io_close_finish;
	/*
@@ -2960,24 +2996,12 @@ static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
	return 0;
}

#if defined(CONFIG_NET)
/*
 * Work handler wrapping io_wq_submit_work() for sendmsg/recvmsg requests:
 * it remembers a separately held iovec before the work runs and kfree()s it
 * once the work has returned.
 */
static void io_sendrecv_async(struct io_wq_work **workptr)
{
	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
	struct iovec *iov = NULL;

	/*
	 * Test the same union member that gets freed (msg, not rw), matching
	 * the check io_cleanup_req() performs for these opcodes.
	 */
	if (req->io->msg.iov != req->io->msg.fast_iov)
		iov = req->io->msg.iov;
	/*
	 * NOTE(review): the pointer is captured before submission, presumably
	 * because req must not be touched once the work has run - confirm.
	 */
	io_wq_submit_work(workptr);
	kfree(iov);
}
#endif

static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
	struct io_sr_msg *sr = &req->sr_msg;
	struct io_async_ctx *io = req->io;
	int ret;

	sr->msg_flags = READ_ONCE(sqe->msg_flags);
	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -2985,10 +3009,16 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)

	if (!io || req->opcode == IORING_OP_SEND)
		return 0;
	/* iovec is already imported */
	if (req->flags & REQ_F_NEED_CLEANUP)
		return 0;

	io->msg.iov = io->msg.fast_iov;
	return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
	ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
					&io->msg.iov);
	if (!ret)
		req->flags |= REQ_F_NEED_CLEANUP;
	return ret;
#else
	return -EOPNOTSUPP;
#endif
@@ -3008,12 +3038,11 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
	sock = sock_from_file(req->file, &ret);
	if (sock) {
		struct io_async_ctx io;
		struct sockaddr_storage addr;
		unsigned flags;

		if (req->io) {
			kmsg = &req->io->msg;
			kmsg->msg.msg_name = &addr;
			kmsg->msg.msg_name = &req->io->msg.addr;
			/* if iov is set, it's allocated already */
			if (!kmsg->iov)
				kmsg->iov = kmsg->fast_iov;
@@ -3022,7 +3051,7 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
			struct io_sr_msg *sr = &req->sr_msg;

			kmsg = &io.msg;
			kmsg->msg.msg_name = &addr;
			kmsg->msg.msg_name = &io.msg.addr;

			io.msg.iov = io.msg.fast_iov;
			ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
@@ -3041,18 +3070,22 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
		if (force_nonblock && ret == -EAGAIN) {
			if (req->io)
				return -EAGAIN;
			if (io_alloc_async_ctx(req))
			if (io_alloc_async_ctx(req)) {
				if (kmsg && kmsg->iov != kmsg->fast_iov)
					kfree(kmsg->iov);
				return -ENOMEM;
			}
			req->flags |= REQ_F_NEED_CLEANUP;
			memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
			req->work.func = io_sendrecv_async;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
	}

	if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
	if (kmsg && kmsg->iov != kmsg->fast_iov)
		kfree(kmsg->iov);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	io_cqring_add_event(req, ret);
	if (ret < 0)
		req_set_fail_links(req);
@@ -3120,6 +3153,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
#if defined(CONFIG_NET)
	struct io_sr_msg *sr = &req->sr_msg;
	struct io_async_ctx *io = req->io;
	int ret;

	sr->msg_flags = READ_ONCE(sqe->msg_flags);
	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -3127,10 +3161,16 @@ static int io_recvmsg_prep(struct io_kiocb *req,

	if (!io || req->opcode == IORING_OP_RECV)
		return 0;
	/* iovec is already imported */
	if (req->flags & REQ_F_NEED_CLEANUP)
		return 0;

	io->msg.iov = io->msg.fast_iov;
	return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
	ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
					&io->msg.uaddr, &io->msg.iov);
	if (!ret)
		req->flags |= REQ_F_NEED_CLEANUP;
	return ret;
#else
	return -EOPNOTSUPP;
#endif
@@ -3150,12 +3190,11 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
	sock = sock_from_file(req->file, &ret);
	if (sock) {
		struct io_async_ctx io;
		struct sockaddr_storage addr;
		unsigned flags;

		if (req->io) {
			kmsg = &req->io->msg;
			kmsg->msg.msg_name = &addr;
			kmsg->msg.msg_name = &req->io->msg.addr;
			/* if iov is set, it's allocated already */
			if (!kmsg->iov)
				kmsg->iov = kmsg->fast_iov;
@@ -3164,7 +3203,7 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
			struct io_sr_msg *sr = &req->sr_msg;

			kmsg = &io.msg;
			kmsg->msg.msg_name = &addr;
			kmsg->msg.msg_name = &io.msg.addr;

			io.msg.iov = io.msg.fast_iov;
			ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
@@ -3185,18 +3224,22 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
		if (force_nonblock && ret == -EAGAIN) {
			if (req->io)
				return -EAGAIN;
			if (io_alloc_async_ctx(req))
			if (io_alloc_async_ctx(req)) {
				if (kmsg && kmsg->iov != kmsg->fast_iov)
					kfree(kmsg->iov);
				return -ENOMEM;
			}
			memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
			req->work.func = io_sendrecv_async;
			req->flags |= REQ_F_NEED_CLEANUP;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
	}

	if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
	if (kmsg && kmsg->iov != kmsg->fast_iov)
		kfree(kmsg->iov);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	io_cqring_add_event(req, ret);
	if (ret < 0)
		req_set_fail_links(req);
@@ -4207,6 +4250,35 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	return -EIOCBQUEUED;
}

static void io_cleanup_req(struct io_kiocb *req)
{
	struct io_async_ctx *io = req->io;

	switch (req->opcode) {
	case IORING_OP_READV:
	case IORING_OP_READ_FIXED:
	case IORING_OP_READ:
	case IORING_OP_WRITEV:
	case IORING_OP_WRITE_FIXED:
	case IORING_OP_WRITE:
		if (io->rw.iov != io->rw.fast_iov)
			kfree(io->rw.iov);
		break;
	case IORING_OP_SENDMSG:
	case IORING_OP_RECVMSG:
		if (io->msg.iov != io->msg.fast_iov)
			kfree(io->msg.iov);
		break;
	case IORING_OP_OPENAT:
	case IORING_OP_OPENAT2:
	case IORING_OP_STATX:
		putname(req->open.filename);
		break;
	}

	req->flags &= ~REQ_F_NEED_CLEANUP;
}

static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
			struct io_kiocb **nxt, bool force_nonblock)
{
@@ -4446,7 +4518,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
	}

	if (!ret) {
		req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
		req->in_async = true;
		do {
			ret = io_issue_sqe(req, NULL, &nxt, false);
@@ -4479,7 +4550,7 @@ static int io_req_needs_file(struct io_kiocb *req, int fd)
{
	if (!io_op_defs[req->opcode].needs_file)
		return 0;
	if (fd == -1 && io_op_defs[req->opcode].fd_non_neg)
	if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg)
		return 0;
	return 1;
}
@@ -4950,6 +5021,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
	for (i = 0; i < nr; i++) {
		const struct io_uring_sqe *sqe;
		struct io_kiocb *req;
		int err;

		req = io_get_req(ctx, statep);
		if (unlikely(!req)) {
@@ -4966,20 +5038,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
		submitted++;

		if (unlikely(req->opcode >= IORING_OP_LAST)) {
			io_cqring_add_event(req, -EINVAL);
			err = -EINVAL;
fail_req:
			io_cqring_add_event(req, err);
			io_double_put_req(req);
			break;
		}

		if (io_op_defs[req->opcode].needs_mm && !*mm) {
			mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
			if (!mm_fault) {
			if (unlikely(mm_fault)) {
				err = -EFAULT;
				goto fail_req;
			}
			use_mm(ctx->sqo_mm);
			*mm = ctx->sqo_mm;
		}
		}

		req->has_user = *mm != NULL;
		req->in_async = async;
		req->needs_fixed_file = async;
		trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
@@ -6301,7 +6376,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
	if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
	    ctx->rings->sq_ring_entries)
		mask |= EPOLLOUT | EPOLLWRNORM;
	if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
	if (io_cqring_events(ctx, false))
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
@@ -6393,6 +6468,29 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
		if (!cancel_req)
			break;

		if (cancel_req->flags & REQ_F_OVERFLOW) {
			spin_lock_irq(&ctx->completion_lock);
			list_del(&cancel_req->list);
			cancel_req->flags &= ~REQ_F_OVERFLOW;
			if (list_empty(&ctx->cq_overflow_list)) {
				clear_bit(0, &ctx->sq_check_overflow);
				clear_bit(0, &ctx->cq_check_overflow);
			}
			spin_unlock_irq(&ctx->completion_lock);

			WRITE_ONCE(ctx->rings->cq_overflow,
				atomic_inc_return(&ctx->cached_cq_overflow));

			/*
			 * Put inflight ref and overflow ref. If that's
			 * all we had, then we're done with this request.
			 */
			if (refcount_sub_and_test(2, &cancel_req->refs)) {
				io_put_req(cancel_req);
				continue;
			}
		}

		io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
		io_put_req(cancel_req);
		schedule();
@@ -6405,6 +6503,13 @@ static int io_uring_flush(struct file *file, void *data)
	struct io_ring_ctx *ctx = file->private_data;

	io_uring_cancel_files(ctx, data);

	/*
	 * If the task is going away, cancel work it may have pending
	 */
	if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
		io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));

	return 0;
}