Commit 2af89abd authored by Pavel Begunkov's avatar Pavel Begunkov Committed by Jens Axboe
Browse files

io_uring: add option to remove SQ indirection



Not many aware, but io_uring submission queue has two levels. The first
level usually appears as sq_array and stores indexes into the actual SQ.

To my knowledge, no one has ever seriously used it, nor liburing exposes
it to users. Add IORING_SETUP_NO_SQARRAY, when set we don't bother
creating and using the sq_array and SQ heads/tails will be pointing
directly into the SQ. Improves memory footprint, in term of both
allocations as well as cache usage, and also should make io_get_sqe()
less branchy in the end.

Signed-off-by: default avatarPavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0ffa3268a5ef61d326201ff43a233315c96312e0.1692916914.git.asml.silence@gmail.com


Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent e5598d6a
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -185,6 +185,11 @@ enum {
 */
#define IORING_SETUP_REGISTERED_FD_ONLY	(1U << 15)

/*
 * Removes indirection through the SQ index array.
 */
#define IORING_SETUP_NO_SQARRAY		(1U << 16)

enum io_uring_op {
	IORING_OP_NOP,
	IORING_OP_READV,
+32 −20
Original line number Diff line number Diff line
@@ -2339,8 +2339,21 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
 */
static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
{
	unsigned head, mask = ctx->sq_entries - 1;
	unsigned sq_idx = ctx->cached_sq_head++ & mask;
	unsigned mask = ctx->sq_entries - 1;
	unsigned head = ctx->cached_sq_head++ & mask;

	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) {
		head = READ_ONCE(ctx->sq_array[head]);
		if (unlikely(head >= ctx->sq_entries)) {
			/* drop invalid entries */
			spin_lock(&ctx->completion_lock);
			ctx->cq_extra--;
			spin_unlock(&ctx->completion_lock);
			WRITE_ONCE(ctx->rings->sq_dropped,
				   READ_ONCE(ctx->rings->sq_dropped) + 1);
			return false;
		}
	}

	/*
	 * The cached sq head (or cq tail) serves two purposes:
@@ -2350,8 +2363,7 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
	 * 2) allows the kernel side to track the head on its own, even
	 *    though the application is the one updating it.
	 */
	head = READ_ONCE(ctx->sq_array[sq_idx]);
	if (likely(head < ctx->sq_entries)) {

	/* double index for 128-byte SQEs, twice as long */
	if (ctx->flags & IORING_SETUP_SQE128)
		head <<= 1;
@@ -2359,15 +2371,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
	return true;
}

	/* drop invalid entries */
	spin_lock(&ctx->completion_lock);
	ctx->cq_extra--;
	spin_unlock(&ctx->completion_lock);
	WRITE_ONCE(ctx->rings->sq_dropped,
		   READ_ONCE(ctx->rings->sq_dropped) + 1);
	return false;
}

int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
	__must_hold(&ctx->uring_lock)
{
@@ -2734,6 +2737,12 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
		return SIZE_MAX;
#endif

	if (ctx->flags & IORING_SETUP_NO_SQARRAY) {
		if (sq_offset)
			*sq_offset = SIZE_MAX;
		return off;
	}

	if (sq_offset)
		*sq_offset = off;

@@ -3710,6 +3719,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
		return PTR_ERR(rings);

	ctx->rings = rings;
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
	rings->sq_ring_mask = p->sq_entries - 1;
	rings->cq_ring_mask = p->cq_entries - 1;
@@ -3921,6 +3931,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
	p->sq_off.resv1 = 0;
	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
@@ -4010,7 +4021,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY))
			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
			IORING_SETUP_NO_SQARRAY))
		return -EINVAL;

	return io_uring_create(entries, &p, params);