Commit 54e60e50 authored by Linus Torvalds

Merge tag 'for-6.2/io_uring-2022-12-08' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:

 - Always ensure proper ordering in case of CQ ring overflow, which then
   means we can remove some work-arounds for that (Dylan)

 - Support completion batching for multishot, greatly increasing the
   efficiency for those (Dylan)

 - Flag epoll/eventfd wakeups done from io_uring, so that we can easily
   tell if we're recursing into io_uring again.

   Previously, this would have resulted in repeated multishot
   notifications if we had a dependency there. That could happen if an
   eventfd was registered as the ring eventfd, and we multishot polled
   for events on it. Or if an io_uring fd was added to epoll, and
   io_uring had a multishot request for the epoll fd.

   Test cases here:
	https://git.kernel.dk/cgit/liburing/commit/?id=919755a7d0096fda08fb6d65ac54ad8d0fe027cd

   Previously these requests got terminated when the CQ ring eventually
   overflowed; now the recursion is handled gracefully (me). A minimal
   liburing sketch of the eventfd scenario follows this list.

 - Tightening of the IOPOLL-based completions (Pavel)

 - Optimizations of the networking zero-copy paths (Pavel)

 - Various tweaks and fixes (Dylan, Pavel)
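
For reference, a minimal liburing sketch of the first eventfd scenario above
(error handling omitted; the liburing calls here are long-standing API, not
something added by this series; see the test cases linked above for the real
regression tests):

    #include <liburing.h>
    #include <poll.h>
    #include <sys/eventfd.h>

    int main(void)
    {
            struct io_uring ring;
            struct io_uring_sqe *sqe;
            int efd = eventfd(0, 0);

            io_uring_queue_init(8, &ring, 0);
            /* CQEs posted to this ring will signal efd... */
            io_uring_register_eventfd(&ring, efd);

            /*
             * ...and the same ring multishot-polls efd, so every signal
             * posts a CQE, which signals efd again. EPOLL_URING_WAKE is
             * what lets the kernel spot this recursion instead of the
             * request looping until the CQ ring overflows.
             */
            sqe = io_uring_get_sqe(&ring);
            io_uring_prep_poll_multishot(sqe, efd, POLLIN);
            io_uring_submit(&ring);
            return 0;
    }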

* tag 'for-6.2/io_uring-2022-12-08' of git://git.kernel.dk/linux: (41 commits)
  io_uring: keep unlock_post inlined in hot path
  io_uring: don't use complete_post in kbuf
  io_uring: spelling fix
  io_uring: remove io_req_complete_post_tw
  io_uring: allow multishot polled reqs to defer completion
  io_uring: remove overflow param from io_post_aux_cqe
  io_uring: add lockdep assertion in io_fill_cqe_aux
  io_uring: make io_fill_cqe_aux static
  io_uring: add io_aux_cqe which allows deferred completion
  io_uring: allow defer completion for aux posted cqes
  io_uring: defer all io_req_complete_failed
  io_uring: always lock in io_apoll_task_func
  io_uring: remove iopoll spinlock
  io_uring: iopoll protect complete_post
  io_uring: inline __io_req_complete_put()
  io_uring: remove io_req_tw_post_queue
  io_uring: use io_req_task_complete() in timeout
  io_uring: hold locks for io_req_complete_failed
  io_uring: add completion locking for iopoll
  io_uring: kill io_cqring_ev_posted() and __io_cq_unlock_post()
  ...
parents d523ec4c 5d772916
fs/eventfd.c  +21 −16
@@ -43,21 +43,7 @@ struct eventfd_ctx {
	int id;
};

-/**
- * eventfd_signal - Adds @n to the eventfd counter.
- * @ctx: [in] Pointer to the eventfd context.
- * @n: [in] Value of the counter to be added to the eventfd internal counter.
- *          The value cannot be negative.
- *
- * This function is supposed to be called by the kernel in paths that do not
- * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returning a EPOLLERR
- * to poll(2).
- *
- * Returns the amount by which the counter was incremented.  This will be less
- * than @n if the counter has overflowed.
- */
-__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
{
	unsigned long flags;

@@ -78,12 +64,31 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
		n = ULLONG_MAX - ctx->count;
	ctx->count += n;
	if (waitqueue_active(&ctx->wqh))
-		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+		wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
	current->in_eventfd = 0;
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return n;
}

+/**
+ * eventfd_signal - Adds @n to the eventfd counter.
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ *          The value cannot be negative.
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returning a EPOLLERR
+ * to poll(2).
+ *
+ * Returns the amount by which the counter was incremented.  This will be less
+ * than @n if the counter has overflowed.
+ */
+__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+{
+	return eventfd_signal_mask(ctx, n, 0);
+}
EXPORT_SYMBOL_GPL(eventfd_signal);

static void eventfd_free_ctx(struct eventfd_ctx *ctx)
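
On the io_uring side (not part of this diff excerpt), this new helper is what
tags ring-eventfd signals. Roughly the following call pattern, assuming
io_uring's struct io_ev_fd context (seen further down in this diff) is at
hand; the exact call site lives in io_uring/io_uring.c:

    /*
     * Signal the registered CQ eventfd, tagging the wakeup so that a
     * poll handler recursing back into io_uring can recognize it.
     */
    eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
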
fs/eventpoll.c  +10 −8
@@ -491,7 +491,8 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC

-static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
+			     unsigned pollflags)
{
	struct eventpoll *ep_src;
	unsigned long flags;
@@ -522,16 +523,17 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
	}
	spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
	ep->nests = nests + 1;
-	wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+	wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
	ep->nests = 0;
	spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}

#else

-static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
+			     unsigned pollflags)
{
-	wake_up_poll(&ep->poll_wait, EPOLLIN);
+	wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
}

#endif
@@ -742,7 +744,7 @@ static void ep_free(struct eventpoll *ep)

	/* We need to release all tasks waiting for these file */
	if (waitqueue_active(&ep->poll_wait))
-		ep_poll_safewake(ep, NULL);
+		ep_poll_safewake(ep, NULL, 0);

	/*
	 * We need to lock this because we could be hit by
@@ -1208,7 +1210,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v

	/* We have to call this outside the lock */
	if (pwake)
-		ep_poll_safewake(ep, epi);
+		ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);

	if (!(epi->event.events & EPOLLEXCLUSIVE))
		ewake = 1;
@@ -1553,7 +1555,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,

	/* We have to call this outside the lock */
	if (pwake)
-		ep_poll_safewake(ep, NULL);
+		ep_poll_safewake(ep, NULL, 0);

	return 0;
}
@@ -1629,7 +1631,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,

	/* We have to call this outside the lock */
	if (pwake)
-		ep_poll_safewake(ep, NULL);
+		ep_poll_safewake(ep, NULL, 0);

	return 0;
}
include/linux/eventfd.h  +7 −0
@@ -40,6 +40,7 @@ struct file *eventfd_fget(int fd);
struct eventfd_ctx *eventfd_ctx_fdget(int fd);
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
+__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask);
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
				  __u64 *cnt);
void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
@@ -66,6 +67,12 @@ static inline int eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
	return -ENOSYS;
}

+static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n,
+				      unsigned mask)
+{
+	return -ENOSYS;
+}

static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
{

include/linux/io_uring_types.h  +2 −0
@@ -174,7 +174,9 @@ struct io_submit_state {
	bool			plug_started;
	bool			need_plug;
	unsigned short		submit_nr;
+	unsigned int		cqes_count;
	struct blk_plug		plug;
+	struct io_uring_cqe	cqes[16];
};

struct io_ev_fd {
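
The two fields added above back the multishot completion batching: auxiliary
CQEs are staged in the submit state and flushed in one locked section rather
than posted one at a time. An illustrative sketch of the flush step follows;
flush_deferred_cqes is a made-up name and the io_fill_cqe_aux() signature is
assumed, the real logic lives in io_uring/io_uring.c:

    /*
     * Hypothetical sketch: drain the staged CQEs into the ring, then
     * reset the count. Assumes the caller holds the completion lock.
     */
    static void flush_deferred_cqes(struct io_ring_ctx *ctx)
    {
            struct io_submit_state *state = &ctx->submit_state;
            unsigned int i;

            for (i = 0; i < state->cqes_count; i++) {
                    struct io_uring_cqe *cqe = &state->cqes[i];

                    io_fill_cqe_aux(ctx, cqe->user_data, cqe->res,
                                    cqe->flags);
            }
            state->cqes_count = 0;
    }
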
include/uapi/linux/eventpoll.h  +6 −0
@@ -41,6 +41,12 @@
#define EPOLLMSG	(__force __poll_t)0x00000400
#define EPOLLRDHUP	(__force __poll_t)0x00002000

+/*
+ * Internal flag - wakeup generated by io_uring, used to detect recursion back
+ * into the io_uring poll handler.
+ */
+#define EPOLL_URING_WAKE	((__force __poll_t)(1U << 27))
+
/* Set exclusive wakeup mode for the target file descriptor */
#define EPOLLEXCLUSIVE	((__force __poll_t)(1U << 28))
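
A waitqueue wake callback can then test this bit in the wakeup key to spot
io_uring-generated wakeups. An illustrative check (my_poll_wake is a made-up
handler, not the actual io_uring poll callback):

    static int my_poll_wake(wait_queue_entry_t *wait, unsigned mode,
                            int sync, void *key)
    {
            __poll_t mask = key_to_poll(key);

            if (mask & EPOLL_URING_WAKE) {
                    /*
                     * The wakeup originated from io_uring itself; avoid
                     * recursing back into the ring from this context.
                     */
            }
            return 0;
    }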
