Commit 96f7e448 authored by Linus Torvalds

Merge tag 'for-6.2/io_uring-next-2022-12-08' of git://git.kernel.dk/linux

Pull io_uring updates part two from Jens Axboe:

 - Misc fixes (me, Lin)

 - Series from Pavel extending the single task exclusive ring mode,
   yielding nice improvements for the common case of having a single
   ring per thread (Pavel)

 - Cleanup for MSG_RING, removing our IOPOLL hack (Pavel)

 - Further poll cleanups and fixes (Pavel)

 - Misc cleanups and fixes (Pavel)

* tag 'for-6.2/io_uring-next-2022-12-08' of git://git.kernel.dk/linux: (22 commits)
  io_uring/msg_ring: flag target ring as having task_work, if needed
  io_uring: skip spinlocking for ->task_complete
  io_uring: do msg_ring in target task via tw
  io_uring: extract a io_msg_install_complete helper
  io_uring: get rid of double locking
  io_uring: never run tw and fallback in parallel
  io_uring: use tw for putting rsrc
  io_uring: force multishot CQEs into task context
  io_uring: complete all requests in task context
  io_uring: don't check overflow flush failures
  io_uring: skip overflow CQE posting for dying ring
  io_uring: improve io_double_lock_ctx fail handling
  io_uring: dont remove file from msg_ring reqs
  io_uring: reshuffle issue_flags
  io_uring: don't reinstall quiesce node for each tw
  io_uring: improve rsrc quiesce refs checks
  io_uring: don't raw spin unlock to match cq_lock
  io_uring: combine poll tw handlers
  io_uring: improve poll warning handling
  io_uring: remove ctx variable in io_poll_check_events
  ...
parents 54e60e50 761c61c1
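
The centerpiece of this pull is the new ->task_complete mode: as the io_uring_create() hunk below shows, rings set up with IORING_SETUP_DEFER_TASKRUN (which in turn requires IORING_SETUP_SINGLE_ISSUER) and without IOPOLL or SQPOLL now post every CQE from the submitter task, which lets the completion path skip completion_lock. A minimal liburing sketch of creating such a ring follows; it assumes only the standard io_uring_queue_init_params() helper and the existing uapi setup flags.

/* Hedged example: set up a ring that qualifies for task_complete.
 * DEFER_TASKRUN requires SINGLE_ISSUER; neither IOPOLL nor SQPOLL is set,
 * matching the condition added in io_uring_create() below.
 */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring_params p = { };
	struct io_uring ring;
	int ret;

	p.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;

	ret = io_uring_queue_init_params(8, &ring, &p);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %d\n", ret);
		return 1;
	}

	/* With DEFER_TASKRUN, only the task that created the ring may submit
	 * and reap completions, e.g. via io_uring_submit_and_wait(). */
	io_uring_queue_exit(&ring);
	return 0;
}
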
+7 −6
@@ -9,16 +9,17 @@
enum io_uring_cmd_flags {
	IO_URING_F_COMPLETE_DEFER	= 1,
	IO_URING_F_UNLOCKED		= 2,
	/* the request is executed from poll, it should not be freed */
	IO_URING_F_MULTISHOT		= 4,
	/* executed by io-wq */
	IO_URING_F_IOWQ			= 8,
	/* int's last bit, sign checks are usually faster than a bit test */
	IO_URING_F_NONBLOCK		= INT_MIN,

	/* ctx state flags, for URING_CMD */
	IO_URING_F_SQE128		= 4,
	IO_URING_F_CQE32		= 8,
	IO_URING_F_IOPOLL		= 16,

	/* the request is executed from poll, it should not be freed */
	IO_URING_F_MULTISHOT		= 32,
	IO_URING_F_SQE128		= (1 << 8),
	IO_URING_F_CQE32		= (1 << 9),
	IO_URING_F_IOPOLL		= (1 << 10),
};

struct io_uring_cmd {
+3 −0
@@ -208,6 +208,8 @@ struct io_ring_ctx {
		unsigned int		drain_disabled: 1;
		unsigned int		has_evfd: 1;
		unsigned int		syscall_iopoll: 1;
		/* all CQEs should be posted only by the submitter task */
		unsigned int		task_complete: 1;
	} ____cacheline_aligned_in_smp;

	/* submission data */
@@ -326,6 +328,7 @@ struct io_ring_ctx {
	struct io_rsrc_data		*buf_data;

	struct delayed_work		rsrc_put_work;
	struct callback_head		rsrc_put_tw;
	struct llist_head		rsrc_put_llist;
	struct list_head		rsrc_ref_list;
	spinlock_t			rsrc_ref_lock;
+110 −57
@@ -149,6 +149,7 @@ static void io_clean_op(struct io_kiocb *req);
static void io_queue_sqe(struct io_kiocb *req);
static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static __cold void io_fallback_tw(struct io_uring_task *tctx);

static struct kmem_cache *req_cachep;

@@ -326,6 +327,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
	spin_lock_init(&ctx->rsrc_ref_lock);
	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
	init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw);
	init_llist_head(&ctx->rsrc_put_llist);
	init_llist_head(&ctx->work_llist);
	INIT_LIST_HEAD(&ctx->tctx_list);
@@ -582,13 +584,25 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
		io_eventfd_flush_signal(ctx);
}

static inline void __io_cq_lock(struct io_ring_ctx *ctx)
	__acquires(ctx->completion_lock)
{
	if (!ctx->task_complete)
		spin_lock(&ctx->completion_lock);
}

static inline void __io_cq_unlock(struct io_ring_ctx *ctx)
{
	if (!ctx->task_complete)
		spin_unlock(&ctx->completion_lock);
}

/* keep it inlined for io_submit_flush_completions() */
static inline void io_cq_unlock_post_inline(struct io_ring_ctx *ctx)
static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
	__releases(ctx->completion_lock)
{
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);

	__io_cq_unlock(ctx);
	io_commit_cqring_flush(ctx);
	io_cqring_wake(ctx);
}
@@ -596,17 +610,37 @@ static inline void io_cq_unlock_post_inline(struct io_ring_ctx *ctx)
void io_cq_unlock_post(struct io_ring_ctx *ctx)
	__releases(ctx->completion_lock)
{
	io_cq_unlock_post_inline(ctx);
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_commit_cqring_flush(ctx);
	io_cqring_wake(ctx);
}

/* Returns true if there are no backlogged entries after the flush */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
{
	struct io_overflow_cqe *ocqe;
	LIST_HEAD(list);

	io_cq_lock(ctx);
	list_splice_init(&ctx->cq_overflow_list, &list);
	clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
	io_cq_unlock(ctx);

	while (!list_empty(&list)) {
		ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
		list_del(&ocqe->list);
		kfree(ocqe);
	}
}

/* Returns true if there are no backlogged entries after the flush */
static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
	bool all_flushed;
	size_t cqe_size = sizeof(struct io_uring_cqe);

	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
		return false;
	if (__io_cqring_events(ctx) == ctx->cq_entries)
		return;

	if (ctx->flags & IORING_SETUP_CQE32)
		cqe_size <<= 1;
@@ -616,43 +650,32 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
		struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
		struct io_overflow_cqe *ocqe;

		if (!cqe && !force)
		if (!cqe)
			break;
		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
		if (cqe)
		memcpy(cqe, &ocqe->cqe, cqe_size);
		else
			io_account_cq_overflow(ctx);

		list_del(&ocqe->list);
		kfree(ocqe);
	}

	all_flushed = list_empty(&ctx->cq_overflow_list);
	if (all_flushed) {
	if (list_empty(&ctx->cq_overflow_list)) {
		clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
		atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
	}

	io_cq_unlock_post(ctx);
	return all_flushed;
}

static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
	bool ret = true;

	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
		/* iopoll syncs against uring_lock, not completion_lock */
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_lock(&ctx->uring_lock);
		ret = __io_cqring_overflow_flush(ctx, false);
		__io_cqring_overflow_flush(ctx);
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_unlock(&ctx->uring_lock);
	}

	return ret;
}

void __io_put_task(struct task_struct *task, int nr)
@@ -777,11 +800,12 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
	return &rings->cqes[off];
}

static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
			    bool allow_overflow)
static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
			      u32 cflags)
{
	struct io_uring_cqe *cqe;

	if (!ctx->task_complete)
		lockdep_assert_held(&ctx->completion_lock);

	ctx->cq_extra++;
@@ -805,10 +829,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32
		}
		return true;
	}

	if (allow_overflow)
		return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);

	return false;
}

@@ -822,7 +842,17 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
	for (i = 0; i < state->cqes_count; i++) {
		struct io_uring_cqe *cqe = &state->cqes[i];

		io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags, true);
		if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
			if (ctx->task_complete) {
				spin_lock(&ctx->completion_lock);
				io_cqring_event_overflow(ctx, cqe->user_data,
							cqe->res, cqe->flags, 0, 0);
				spin_unlock(&ctx->completion_lock);
			} else {
				io_cqring_event_overflow(ctx, cqe->user_data,
							cqe->res, cqe->flags, 0, 0);
			}
		}
	}
	state->cqes_count = 0;
}
@@ -833,7 +863,10 @@ static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u
	bool filled;

	io_cq_lock(ctx);
	filled = io_fill_cqe_aux(ctx, user_data, res, cflags, allow_overflow);
	filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
	if (!filled && allow_overflow)
		filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);

	io_cq_unlock_post(ctx);
	return filled;
}
@@ -857,10 +890,10 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32
	lockdep_assert_held(&ctx->uring_lock);

	if (ctx->submit_state.cqes_count == length) {
		io_cq_lock(ctx);
		__io_cq_lock(ctx);
		__io_flush_post_cqes(ctx);
		/* no need to flush - flush is deferred */
		spin_unlock(&ctx->completion_lock);
		__io_cq_unlock_post(ctx);
	}

	/* For defered completions this is not as strict as it is otherwise,
@@ -915,7 +948,10 @@ static void __io_req_complete_post(struct io_kiocb *req)

void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
	if (!(issue_flags & IO_URING_F_UNLOCKED) ||
	if (req->ctx->task_complete && (issue_flags & IO_URING_F_IOWQ)) {
		req->io_task_work.func = io_req_task_complete;
		io_req_task_work_add(req);
	} else if (!(issue_flags & IO_URING_F_UNLOCKED) ||
		   !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
		__io_req_complete_post(req);
	} else {
@@ -1139,10 +1175,17 @@ void tctx_task_work(struct callback_head *cb)
	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
						  task_work);
	struct llist_node fake = {};
	struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);
	struct llist_node *node;
	unsigned int loops = 1;
	unsigned int count = handle_tw_list(node, &ctx, &uring_locked, NULL);
	unsigned int count;

	if (unlikely(current->flags & PF_EXITING)) {
		io_fallback_tw(tctx);
		return;
	}

	node = io_llist_xchg(&tctx->task_list, &fake);
	count = handle_tw_list(node, &ctx, &uring_locked, NULL);
	node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
	while (node != &fake) {
		loops++;
@@ -1385,7 +1428,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
	struct io_wq_work_node *node, *prev;
	struct io_submit_state *state = &ctx->submit_state;

	io_cq_lock(ctx);
	__io_cq_lock(ctx);
	/* must come first to preserve CQE ordering in failure cases */
	if (state->cqes_count)
		__io_flush_post_cqes(ctx);
@@ -1393,10 +1436,18 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
		struct io_kiocb *req = container_of(node, struct io_kiocb,
					    comp_list);

		if (!(req->flags & REQ_F_CQE_SKIP))
			__io_fill_cqe_req(ctx, req);
		if (!(req->flags & REQ_F_CQE_SKIP) &&
		    unlikely(!__io_fill_cqe_req(ctx, req))) {
			if (ctx->task_complete) {
				spin_lock(&ctx->completion_lock);
				io_req_cqe_overflow(req);
				spin_unlock(&ctx->completion_lock);
			} else {
				io_req_cqe_overflow(req);
			}
		}
	io_cq_unlock_post_inline(ctx);
	}
	__io_cq_unlock_post(ctx);

	if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
		io_free_batch_list(ctx, state->compl_reqs.first);
@@ -1467,7 +1518,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
	check_cq = READ_ONCE(ctx->check_cq);
	if (unlikely(check_cq)) {
		if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
			__io_cqring_overflow_flush(ctx, false);
			__io_cqring_overflow_flush(ctx);
		/*
		 * Similarly do not spin if we have not informed the user of any
		 * dropped CQE.
@@ -1799,7 +1850,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
		return ret;

	/* If the op doesn't have a file, we're not polling for it */
	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
		io_iopoll_req_issued(req, issue_flags);

	return 0;
@@ -1808,8 +1859,6 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
int io_poll_issue(struct io_kiocb *req, bool *locked)
{
	io_tw_lock(req->ctx, locked);
	if (unlikely(req->task->flags & PF_EXITING))
		return -EFAULT;
	return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT|
				 IO_URING_F_COMPLETE_DEFER);
}
@@ -1826,7 +1875,7 @@ void io_wq_submit_work(struct io_wq_work *work)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
	const struct io_op_def *def = &io_op_defs[req->opcode];
	unsigned int issue_flags = IO_URING_F_UNLOCKED;
	unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ;
	bool needs_poll = false;
	int ret = 0, err = -ECANCELED;

@@ -2482,11 +2531,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,

	trace_io_uring_cqring_wait(ctx, min_events);
	do {
		/* if we can't even flush overflow, don't wait for more */
		if (!io_cqring_overflow_flush(ctx)) {
			ret = -EBUSY;
			break;
		}
		io_cqring_overflow_flush(ctx);
		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
						TASK_INTERRUPTIBLE);
		ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
@@ -2637,8 +2682,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
		__io_sqe_buffers_unregister(ctx);
	if (ctx->file_data)
		__io_sqe_files_unregister(ctx);
	if (ctx->rings)
		__io_cqring_overflow_flush(ctx, true);
	io_cqring_overflow_kill(ctx);
	io_eventfd_unregister(ctx);
	io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
@@ -2781,6 +2825,12 @@ static __cold void io_ring_exit_work(struct work_struct *work)
	 * as nobody else will be looking for them.
	 */
	do {
		if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
			mutex_lock(&ctx->uring_lock);
			io_cqring_overflow_kill(ctx);
			mutex_unlock(&ctx->uring_lock);
		}

		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
			io_move_task_work_from_local(ctx);

@@ -2846,8 +2896,6 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)

	mutex_lock(&ctx->uring_lock);
	percpu_ref_kill(&ctx->refs);
	if (ctx->rings)
		__io_cqring_overflow_flush(ctx, true);
	xa_for_each(&ctx->personalities, index, creds)
		io_unregister_personality(ctx, index);
	if (ctx->rings)
@@ -3489,6 +3537,11 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
	if (!ctx)
		return -ENOMEM;

	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
	    !(ctx->flags & IORING_SETUP_IOPOLL) &&
	    !(ctx->flags & IORING_SETUP_SQPOLL))
		ctx->task_complete = true;

	/*
	 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
	 * space applications don't need to do io completion events
+14 −1
@@ -93,6 +93,11 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx)
	spin_lock(&ctx->completion_lock);
}

static inline void io_cq_unlock(struct io_ring_ctx *ctx)
{
	spin_unlock(&ctx->completion_lock);
}

void io_cq_unlock_post(struct io_ring_ctx *ctx);

static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx,
@@ -128,7 +133,7 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
	 */
	cqe = io_get_cqe(ctx);
	if (unlikely(!cqe))
		return io_req_cqe_overflow(req);
		return false;

	trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
				req->cqe.res, req->cqe.flags,
@@ -151,6 +156,14 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
	return true;
}

static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
				   struct io_kiocb *req)
{
	if (likely(__io_fill_cqe_req(ctx, req)))
		return true;
	return io_req_cqe_overflow(req);
}

static inline void req_set_fail(struct io_kiocb *req)
{
	req->flags |= REQ_F_FAIL;
+117 −47
@@ -15,6 +15,8 @@

struct io_msg {
	struct file			*file;
	struct file			*src_file;
	struct callback_head		tw;
	u64 user_data;
	u32 len;
	u32 cmd;
@@ -23,6 +25,34 @@ struct io_msg {
	u32 flags;
};

void io_msg_ring_cleanup(struct io_kiocb *req)
{
	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);

	if (WARN_ON_ONCE(!msg->src_file))
		return;

	fput(msg->src_file);
	msg->src_file = NULL;
}

static void io_msg_tw_complete(struct callback_head *head)
{
	struct io_msg *msg = container_of(head, struct io_msg, tw);
	struct io_kiocb *req = cmd_to_io_kiocb(msg);
	struct io_ring_ctx *target_ctx = req->file->private_data;
	int ret = 0;

	if (current->flags & PF_EXITING)
		ret = -EOWNERDEAD;
	else if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
		ret = -EOVERFLOW;

	if (ret < 0)
		req_set_fail(req);
	io_req_queue_tw_complete(req, ret);
}

static int io_msg_ring_data(struct io_kiocb *req)
{
	struct io_ring_ctx *target_ctx = req->file->private_data;
@@ -31,23 +61,29 @@ static int io_msg_ring_data(struct io_kiocb *req)
	if (msg->src_fd || msg->dst_fd || msg->flags)
		return -EINVAL;

	if (target_ctx->task_complete && current != target_ctx->submitter_task) {
		init_task_work(&msg->tw, io_msg_tw_complete);
		if (task_work_add(target_ctx->submitter_task, &msg->tw,
				  TWA_SIGNAL_NO_IPI))
			return -EOWNERDEAD;

		atomic_or(IORING_SQ_TASKRUN, &target_ctx->rings->sq_flags);
		return IOU_ISSUE_SKIP_COMPLETE;
	}

	if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
		return 0;

	return -EOVERFLOW;
}

static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
				 struct io_ring_ctx *octx,
static void io_double_unlock_ctx(struct io_ring_ctx *octx,
				 unsigned int issue_flags)
{
	if (issue_flags & IO_URING_F_UNLOCKED)
		mutex_unlock(&ctx->uring_lock);
	mutex_unlock(&octx->uring_lock);
}

static int io_double_lock_ctx(struct io_ring_ctx *ctx,
			      struct io_ring_ctx *octx,
static int io_double_lock_ctx(struct io_ring_ctx *octx,
			      unsigned int issue_flags)
{
	/*
@@ -60,56 +96,49 @@ static int io_double_lock_ctx(struct io_ring_ctx *ctx,
			return -EAGAIN;
		return 0;
	}

	/* Always grab smallest value ctx first. We know ctx != octx. */
	if (ctx < octx) {
		mutex_lock(&ctx->uring_lock);
		mutex_lock(&octx->uring_lock);
	} else {
	mutex_lock(&octx->uring_lock);
		mutex_lock(&ctx->uring_lock);
	}

	return 0;
}

static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *target_ctx = req->file->private_data;
	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = NULL;
	unsigned long file_ptr;
	struct file *src_file;
	int ret;

	if (target_ctx == ctx)
		return -EINVAL;

	ret = io_double_lock_ctx(ctx, target_ctx, issue_flags);
	if (unlikely(ret))
		return ret;

	ret = -EBADF;
	if (unlikely(msg->src_fd >= ctx->nr_user_files))
		goto out_unlock;
	int idx = msg->src_fd;

	io_ring_submit_lock(ctx, issue_flags);
	if (likely(idx < ctx->nr_user_files)) {
		idx = array_index_nospec(idx, ctx->nr_user_files);
		file_ptr = io_fixed_file_slot(&ctx->file_table, idx)->file_ptr;
		file = (struct file *) (file_ptr & FFS_MASK);
		if (file)
			get_file(file);
	}
	io_ring_submit_unlock(ctx, issue_flags);
	return file;
}

	msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files);
	file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr;
	if (!file_ptr)
		goto out_unlock;
static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *target_ctx = req->file->private_data;
	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
	struct file *src_file = msg->src_file;
	int ret;

	src_file = (struct file *) (file_ptr & FFS_MASK);
	get_file(src_file);
	if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
		return -EAGAIN;

	ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
	if (ret < 0) {
		fput(src_file);
	if (ret < 0)
		goto out_unlock;
	}

	msg->src_file = NULL;
	req->flags &= ~REQ_F_NEED_CLEANUP;

	if (msg->flags & IORING_MSG_RING_CQE_SKIP)
		goto out_unlock;

	/*
	 * If this fails, the target still received the file descriptor but
	 * wasn't notified of the fact. This means that if this request
@@ -119,10 +148,51 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
	if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
		ret = -EOVERFLOW;
out_unlock:
	io_double_unlock_ctx(ctx, target_ctx, issue_flags);
	io_double_unlock_ctx(target_ctx, issue_flags);
	return ret;
}

static void io_msg_tw_fd_complete(struct callback_head *head)
{
	struct io_msg *msg = container_of(head, struct io_msg, tw);
	struct io_kiocb *req = cmd_to_io_kiocb(msg);
	int ret = -EOWNERDEAD;

	if (!(current->flags & PF_EXITING))
		ret = io_msg_install_complete(req, IO_URING_F_UNLOCKED);
	if (ret < 0)
		req_set_fail(req);
	io_req_queue_tw_complete(req, ret);
}

static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *target_ctx = req->file->private_data;
	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
	struct io_ring_ctx *ctx = req->ctx;
	struct file *src_file = msg->src_file;

	if (target_ctx == ctx)
		return -EINVAL;
	if (!src_file) {
		src_file = io_msg_grab_file(req, issue_flags);
		if (!src_file)
			return -EBADF;
		msg->src_file = src_file;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (target_ctx->task_complete && current != target_ctx->submitter_task) {
		init_task_work(&msg->tw, io_msg_tw_fd_complete);
		if (task_work_add(target_ctx->submitter_task, &msg->tw,
				  TWA_SIGNAL))
			return -EOWNERDEAD;

		return IOU_ISSUE_SKIP_COMPLETE;
	}
	return io_msg_install_complete(req, issue_flags);
}

int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
@@ -130,6 +200,7 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	if (unlikely(sqe->buf_index || sqe->personality))
		return -EINVAL;

	msg->src_file = NULL;
	msg->user_data = READ_ONCE(sqe->off);
	msg->len = READ_ONCE(sqe->len);
	msg->cmd = READ_ONCE(sqe->addr);
@@ -164,12 +235,11 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
	}

done:
	if (ret < 0)
	if (ret < 0) {
		if (ret == -EAGAIN || ret == IOU_ISSUE_SKIP_COMPLETE)
			return ret;
		req_set_fail(req);
	}
	io_req_set_res(req, ret, 0);
	/* put file to avoid an attempt to IOPOLL the req */
	if (!(req->flags & REQ_F_FIXED_FILE))
		io_put_file(req->file);
	req->file = NULL;
	return IOU_OK;
}
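
The msg_ring changes above reroute CQE posting to the target ring's submitter task when that ring uses task_complete, so nothing changes on the userspace side. For reference, a hedged liburing sketch of sending a message into another ring with IORING_OP_MSG_RING; the helper names are standard liburing, and the values passed are arbitrary.

/* Illustrative only: post len/user_data into another ring's CQ.
 * On a DEFER_TASKRUN target, the kernel now bounces the CQE posting to the
 * target's submitter task via task_work (see io_msg_ring_data() above).
 */
#include <liburing.h>

static int msg_other_ring(struct io_uring *me, int target_ring_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(me);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EAGAIN;

	/* The target ring observes a CQE with res = 42, user_data = 0xcafe. */
	io_uring_prep_msg_ring(sqe, target_ring_fd, 42, 0xcafe, 0);

	ret = io_uring_submit(me);
	if (ret < 0)
		return ret;

	ret = io_uring_wait_cqe(me, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;		/* 0 on success, negative errno otherwise */
	io_uring_cqe_seen(me, cqe);
	return ret;
}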