Commit 6e76ac59 authored by Josh Triplett's avatar Josh Triplett Committed by Jens Axboe
Browse files

io_uring: Add io_uring_setup flag to pre-register ring fd and never install it



With IORING_REGISTER_USE_REGISTERED_RING, an application can register
the ring fd and use it via registered index rather than installed fd.
This allows using a registered ring for everything *except* the initial
mmap.

With IORING_SETUP_NO_MMAP, io_uring_setup uses buffers allocated by the
user, rather than requiring a subsequent mmap.

The combination of the two allows a user to operate *entirely* via a
registered ring fd, making it unnecessary to ever install the fd in the
first place. So, add a flag IORING_SETUP_REGISTERED_FD_ONLY to make
io_uring_setup register the fd and return a registered index, without
installing the fd.

This allows an application to avoid touching the fd table at all, and
allows a library to never even momentarily install a file descriptor.

This splits out an io_ring_add_registered_file helper from
io_ring_add_registered_fd, for use by io_uring_setup.

Signed-off-by: default avatarJosh Triplett <josh@joshtriplett.org>
Link: https://lore.kernel.org/r/bc8f431bada371c183b95a83399628b605e978a3.1682699803.git.josh@joshtriplett.org


Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent 03d89a2d
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -178,6 +178,13 @@ enum {
 */
#define IORING_SETUP_NO_MMAP		(1U << 14)

/*
 * Register the ring fd in itself for use with
 * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather
 * than an fd.
 */
#define IORING_SETUP_REGISTERED_FD_ONLY	(1U << 15)

enum io_uring_op {
	IORING_OP_NOP,
	IORING_OP_READV,
+22 −15
Original line number Diff line number Diff line
@@ -3788,19 +3788,13 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
	return 0;
}

static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
static int io_uring_install_fd(struct file *file)
{
	int ret, fd;
	int fd;

	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
	if (fd < 0)
		return fd;

	ret = __io_uring_add_tctx_node(ctx);
	if (ret) {
		put_unused_fd(fd);
		return ret;
	}
	fd_install(fd, file);
	return fd;
}
@@ -3840,6 +3834,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
				  struct io_uring_params __user *params)
{
	struct io_ring_ctx *ctx;
	struct io_uring_task *tctx;
	struct file *file;
	int ret;

@@ -3851,6 +3846,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
		entries = IORING_MAX_ENTRIES;
	}

	if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
	    && !(p->flags & IORING_SETUP_NO_MMAP))
		return -EINVAL;

	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
@@ -4007,22 +4006,30 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
		goto err;
	}

	ret = __io_uring_add_tctx_node(ctx);
	if (ret)
		goto err_fput;
	tctx = current->io_uring;

	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
	ret = io_uring_install_fd(ctx, file);
	if (ret < 0) {
		/* fput will clean it up */
		fput(file);
		return ret;
	}
	if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
		ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX);
	else
		ret = io_uring_install_fd(file);
	if (ret < 0)
		goto err_fput;

	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
	return ret;
err:
	io_ring_ctx_wait_and_kill(ctx);
	return ret;
err_fput:
	fput(file);
	return ret;
}

/*
@@ -4049,7 +4056,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
			IORING_SETUP_NO_MMAP))
			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY))
		return -EINVAL;

	return io_uring_create(entries, &p, params);
+3 −0
Original line number Diff line number Diff line
@@ -75,6 +75,9 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
int io_uring_alloc_task_context(struct task_struct *task,
				struct io_ring_ctx *ctx);

int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
				     int start, int end);

int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
+20 −11
Original line number Diff line number Diff line
@@ -208,17 +208,27 @@ void io_uring_unreg_ringfd(void)
	}
}

static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
				     int start, int end)
{
	struct file *file;
	int offset;

	for (offset = start; offset < end; offset++) {
		offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
		if (tctx->registered_rings[offset])
			continue;

		tctx->registered_rings[offset] = file;
		return offset;
	}
	return -EBUSY;
}

static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
				     int start, int end)
{
	struct file *file;
	int offset;

	file = fget(fd);
	if (!file) {
		return -EBADF;
@@ -226,13 +236,12 @@ static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
		fput(file);
		return -EOPNOTSUPP;
	}
		tctx->registered_rings[offset] = file;
	offset = io_ring_add_registered_file(tctx, file, start, end);
	if (offset < 0)
		fput(file);
	return offset;
}

	return -EBUSY;
}

/*
 * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
 * invocation. User passes in an array of struct io_uring_rsrc_update