Commit 7ccc3ebf authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'io_uring-6.6-2023-09-08' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe:
 "A few fixes that should go into the 6.6-rc merge window:

   - Fix for a regression this merge window caused by the SQPOLL
     affinity patch, where we can race with SQPOLL thread shutdown and
     cause an oops when trying to set affinity (Gabriel)

   - Fix for a regression this merge window where fdinfo reading with
     for a ring setup with IORING_SETUP_NO_SQARRAY will attempt to
     deference the non-existing SQ ring array (me)

   - Add the patch that allows more finegrained control over who can use
     io_uring (Matteo)

   - Locking fix for a regression added this merge window for IOPOLL
     overflow (Pavel)

   - IOPOLL fix for stable, breaking our loop if helper threads are
     exiting (Pavel)

  Also had a fix for unreaped iopoll requests from io-wq from Ming, but
  we found an issue with that and hence it got reverted. Will get this
  sorted for a future rc"

* tag 'io_uring-6.6-2023-09-08' of git://git.kernel.dk/linux:
  Revert "io_uring: fix IO hang in io_wq_put_and_exit from do_exit()"
  io_uring: fix unprotected iopoll overflow
  io_uring: break out of iowq iopoll on teardown
  io_uring: add a sysctl to disable io_uring system-wide
  io_uring/fdinfo: only print ->sq_array[] if it's there
  io_uring: fix IO hang in io_wq_put_and_exit from do_exit()
  io_uring: Don't set affinity on a dying sqpoll thread
parents 32bf43e4 023464fe
Loading
Loading
Loading
Loading
+29 −0
Original line number Diff line number Diff line
@@ -450,6 +450,35 @@ this allows system administrators to override the
``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded.


io_uring_disabled
=================

Prevents all processes from creating new io_uring instances. Enabling this
shrinks the kernel's attack surface.

= ======================================================================
0 All processes can create io_uring instances as normal. This is the
  default setting.
1 io_uring creation is disabled (io_uring_setup() will fail with
  -EPERM) for unprivileged processes not in the io_uring_group group.
  Existing io_uring instances can still be used.  See the
  documentation for io_uring_group for more information.
2 io_uring creation is disabled for all processes. io_uring_setup()
  always fails with -EPERM. Existing io_uring instances can still be
  used.
= ======================================================================


io_uring_group
==============

When io_uring_disabled is set to 1, a process must either be
privileged (CAP_SYS_ADMIN) or be in the io_uring_group group in order
to create an io_uring instance.  If io_uring_group is set to -1 (the
default), only processes with the CAP_SYS_ADMIN capability may create
io_uring instances.


kexec_load_disabled
===================

+2 −0
Original line number Diff line number Diff line
@@ -93,6 +93,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
		struct io_uring_sqe *sqe;
		unsigned int sq_idx;

		if (ctx->flags & IORING_SETUP_NO_SQARRAY)
			break;
		sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
		if (sq_idx > sq_mask)
			continue;
+10 −0
Original line number Diff line number Diff line
@@ -174,6 +174,16 @@ static void io_worker_ref_put(struct io_wq *wq)
		complete(&wq->worker_done);
}

bool io_wq_worker_stopped(void)
{
	struct io_worker *worker = current->worker_private;

	if (WARN_ON_ONCE(!io_wq_current_is_worker()))
		return true;

	return test_bit(IO_WQ_BIT_EXIT, &worker->wq->state);
}

static void io_worker_cancel_cb(struct io_worker *worker)
{
	struct io_wq_acct *acct = io_wq_get_acct(worker);
+1 −0
Original line number Diff line number Diff line
@@ -52,6 +52,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val);

int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);
bool io_wq_worker_stopped(void);

static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
+54 −2
Original line number Diff line number Diff line
@@ -150,6 +150,31 @@ static void io_queue_sqe(struct io_kiocb *req);

struct kmem_cache *req_cachep;

static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group = -1;

#ifdef CONFIG_SYSCTL
static struct ctl_table kernel_io_uring_disabled_table[] = {
	{
		.procname	= "io_uring_disabled",
		.data		= &sysctl_io_uring_disabled,
		.maxlen		= sizeof(sysctl_io_uring_disabled),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "io_uring_group",
		.data		= &sysctl_io_uring_group,
		.maxlen		= sizeof(gid_t),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{},
};
#endif

struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
@@ -883,7 +908,7 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
		struct io_uring_cqe *cqe = &ctx->completion_cqes[i];

		if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
			if (ctx->task_complete) {
			if (ctx->lockless_cq) {
				spin_lock(&ctx->completion_lock);
				io_cqring_event_overflow(ctx, cqe->user_data,
							cqe->res, cqe->flags, 0, 0);
@@ -1541,7 +1566,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)

		if (!(req->flags & REQ_F_CQE_SKIP) &&
		    unlikely(!io_fill_cqe_req(ctx, req))) {
			if (ctx->task_complete) {
			if (ctx->lockless_cq) {
				spin_lock(&ctx->completion_lock);
				io_req_cqe_overflow(req);
				spin_unlock(&ctx->completion_lock);
@@ -1950,6 +1975,8 @@ void io_wq_submit_work(struct io_wq_work *work)
		if (!needs_poll) {
			if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
				break;
			if (io_wq_worker_stopped())
				break;
			cond_resched();
			continue;
		}
@@ -4038,9 +4065,30 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
	return io_uring_create(entries, &p, params);
}

static inline bool io_uring_allowed(void)
{
	int disabled = READ_ONCE(sysctl_io_uring_disabled);
	kgid_t io_uring_group;

	if (disabled == 2)
		return false;

	if (disabled == 0 || capable(CAP_SYS_ADMIN))
		return true;

	io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group);
	if (!gid_valid(io_uring_group))
		return false;

	return in_group_p(io_uring_group);
}

SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	if (!io_uring_allowed())
		return -EPERM;

	return io_uring_setup(entries, params);
}

@@ -4634,6 +4682,10 @@ static int __init io_uring_init(void)
				offsetof(struct io_kiocb, cmd.data),
				sizeof_field(struct io_kiocb, cmd.data), NULL);

#ifdef CONFIG_SYSCTL
	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
#endif

	return 0;
};
__initcall(io_uring_init);
Loading