Commit f3a78227 authored by Linus Torvalds

Merge tag 'io_uring-5.17-2022-01-21' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:

 - Fix the io_uring POLLFREE handling, similarly to how it was done for
   aio (Pavel)

 - Remove (now) unused function (Jiapeng)

 - Small series fixing an issue with work cancelations: a window exists
   where work is neither locatable in the pending list nor yet active in
   a worker. A sketch of the fix follows the shortlog below. (me)

* tag 'io_uring-5.17-2022-01-21' of git://git.kernel.dk/linux-block:
  io-wq: delete dead lock shuffling code
  io_uring: perform poll removal even if async work removal is successful
  io-wq: add intermediate work step between pending list and active work
  io-wq: perform both unstarted and started work cancelations in one go
  io-wq: invoke work cancelation with wqe->lock held
  io-wq: make io_worker lock a raw spinlock
  io-wq: remove useless 'work' argument to __io_worker_busy()
  io_uring: fix UAF due to missing POLLFREE handling
  io_uring: Remove unused function req_ref_put
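
The cancelation-window fix is easiest to see outside the kernel. Below is a
minimal userspace sketch of the same pattern, using pthread mutexes in place
of the kernel's raw spinlocks; all names (struct worker, pending, try_cancel,
and so on) are illustrative, not the io-wq identifiers. The point is the
ordering: a dequeued item is published as next_work under the worker lock
before the list lock is dropped, so a canceller holding the list lock always
finds the item in exactly one of three places: the pending list, next_work,
or cur_work.

#include <pthread.h>
#include <stddef.h>

struct work {
	struct work *next;
	int canceled;
};

struct worker {
	pthread_mutex_t lock;	/* plays the role of io_worker->lock */
	struct work *cur_work;	/* item actively being processed */
	struct work *next_work;	/* dequeued, but not yet active */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER; /* wqe->lock role */
static struct work *pending;	/* singly linked pending list */

/* Worker side: publish next_work before the list lock is dropped. */
static struct work *worker_get_work(struct worker *w)
{
	struct work *item;

	pthread_mutex_lock(&list_lock);
	item = pending;
	if (item) {
		pending = item->next;
		/* close the window: item is now discoverable as next_work */
		pthread_mutex_lock(&w->lock);
		w->next_work = item;
		pthread_mutex_unlock(&w->lock);
	}
	pthread_mutex_unlock(&list_lock);
	return item;
}

/* Promote next_work to the active slot, as io_assign_current_work() does. */
static void worker_start_work(struct worker *w, struct work *item)
{
	pthread_mutex_lock(&w->lock);
	w->cur_work = item;
	w->next_work = NULL;
	pthread_mutex_unlock(&w->lock);
}

/* Cancel side: with list_lock held, the item can never be "nowhere". */
static int try_cancel(struct worker *w, struct work *target)
{
	struct work **pp;
	int found = 0;

	pthread_mutex_lock(&list_lock);
	for (pp = &pending; *pp; pp = &(*pp)->next) {
		if (*pp == target) {
			*pp = target->next;	/* unstarted: just unlink it */
			found = 1;
			break;
		}
	}
	if (!found) {
		pthread_mutex_lock(&w->lock);
		/* started, or about to start: flag it for the worker */
		if (w->cur_work == target || w->next_work == target) {
			target->canceled = 1;
			found = 1;
		}
		pthread_mutex_unlock(&w->lock);
	}
	pthread_mutex_unlock(&list_lock);
	return found;
}

In the kernel patch the same invariant is kept with raw spinlocks, and the
worker clears next_work at the moment it installs cur_work, exactly as
worker_start_work() does here.
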
parents 1f40caa0 73031f76
fs/io-wq.c (+56 −35)
@@ -48,7 +48,8 @@ struct io_worker {
 	struct io_wqe *wqe;
 
 	struct io_wq_work *cur_work;
-	spinlock_t lock;
+	struct io_wq_work *next_work;
+	raw_spinlock_t lock;
 
 	struct completion ref_done;
 
@@ -405,8 +406,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
  * Worker will start processing some work. Move it to the busy list, if
  * it's currently on the freelist
  */
-static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
-			     struct io_wq_work *work)
+static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker)
 	__must_hold(wqe->lock)
 {
 	if (worker->flags & IO_WORKER_F_FREE) {
@@ -529,9 +529,10 @@ static void io_assign_current_work(struct io_worker *worker,
 		cond_resched();
 	}
 
-	spin_lock(&worker->lock);
+	raw_spin_lock(&worker->lock);
 	worker->cur_work = work;
-	spin_unlock(&worker->lock);
+	worker->next_work = NULL;
+	raw_spin_unlock(&worker->lock);
 }
 
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
@@ -546,7 +547,7 @@ static void io_worker_handle_work(struct io_worker *worker)
 
 	do {
 		struct io_wq_work *work;
-get_next:
+
 		/*
 		 * If we got some work, mark us as busy. If we didn't, but
 		 * the list isn't empty, it means we stalled on hashed work.
@@ -555,9 +556,20 @@ static void io_worker_handle_work(struct io_worker *worker)
 		 * clear the stalled flag.
 		 */
 		work = io_get_next_work(acct, worker);
-		if (work)
-			__io_worker_busy(wqe, worker, work);
+		if (work) {
+			__io_worker_busy(wqe, worker);
+
+			/*
+			 * Make sure cancelation can find this, even before
+			 * it becomes the active work. That avoids a window
+			 * where the work has been removed from our general
+			 * work list, but isn't yet discoverable as the
+			 * current work item for this worker.
+			 */
+			raw_spin_lock(&worker->lock);
+			worker->next_work = work;
+			raw_spin_unlock(&worker->lock);
+		}
 		raw_spin_unlock(&wqe->lock);
 		if (!work)
 			break;
@@ -594,11 +606,6 @@ static void io_worker_handle_work(struct io_worker *worker)
 				spin_unlock_irq(&wq->hash->wait.lock);
 				if (wq_has_sleeper(&wq->hash->wait))
 					wake_up(&wq->hash->wait);
-				raw_spin_lock(&wqe->lock);
-				/* skip unnecessary unlock-lock wqe->lock */
-				if (!work)
-					goto get_next;
-				raw_spin_unlock(&wqe->lock);
 			}
 		} while (work);
 
@@ -815,7 +822,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 
 	refcount_set(&worker->ref, 1);
 	worker->wqe = wqe;
-	spin_lock_init(&worker->lock);
+	raw_spin_lock_init(&worker->lock);
 	init_completion(&worker->ref_done);
 
 	if (index == IO_WQ_ACCT_BOUND)
@@ -973,6 +980,19 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
 	work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
 }
 
+static bool __io_wq_worker_cancel(struct io_worker *worker,
+				  struct io_cb_cancel_data *match,
+				  struct io_wq_work *work)
+{
+	if (work && match->fn(work, match->data)) {
+		work->flags |= IO_WQ_WORK_CANCEL;
+		set_notify_signal(worker->task);
+		return true;
+	}
+
+	return false;
+}
+
 static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 {
 	struct io_cb_cancel_data *match = data;
@@ -981,13 +1001,11 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 	 * Hold the lock to avoid ->cur_work going out of scope, caller
 	 * may dereference the passed in work.
 	 */
-	spin_lock(&worker->lock);
-	if (worker->cur_work &&
-	    match->fn(worker->cur_work, match->data)) {
-		set_notify_signal(worker->task);
+	raw_spin_lock(&worker->lock);
+	if (__io_wq_worker_cancel(worker, match, worker->cur_work) ||
+	    __io_wq_worker_cancel(worker, match, worker->next_work))
 		match->nr_running++;
-	}
-	spin_unlock(&worker->lock);
+	raw_spin_unlock(&worker->lock);
 
 	return match->nr_running && !match->cancel_all;
 }
@@ -1039,17 +1057,16 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
 {
 	int i;
 retry:
-	raw_spin_lock(&wqe->lock);
 	for (i = 0; i < IO_WQ_ACCT_NR; i++) {
 		struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
 
 		if (io_acct_cancel_pending_work(wqe, acct, match)) {
+			raw_spin_lock(&wqe->lock);
 			if (match->cancel_all)
 				goto retry;
-			return;
+			break;
 		}
 	}
-	raw_spin_unlock(&wqe->lock);
 }
 
 static void io_wqe_cancel_running_work(struct io_wqe *wqe,
@@ -1074,25 +1091,27 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 	 * First check pending list, if we're lucky we can just remove it
 	 * from there. CANCEL_OK means that the work is returned as-new,
 	 * no completion will be posted for it.
+	 *
+	 * Then check if a free (going busy) or busy worker has the work
+	 * currently running. If we find it there, we'll return CANCEL_RUNNING
+	 * as an indication that we attempt to signal cancellation. The
+	 * completion will run normally in this case.
+	 *
+	 * Do both of these while holding the wqe->lock, to ensure that
+	 * we'll find a work item regardless of state.
 	 */
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
+		raw_spin_lock(&wqe->lock);
 		io_wqe_cancel_pending_work(wqe, &match);
-		if (match.nr_pending && !match.cancel_all)
+		if (match.nr_pending && !match.cancel_all) {
+			raw_spin_unlock(&wqe->lock);
 			return IO_WQ_CANCEL_OK;
-	}
+		}
 
-	/*
-	 * Now check if a free (going busy) or busy worker has the work
-	 * currently running. If we find it there, we'll return CANCEL_RUNNING
-	 * as an indication that we attempt to signal cancellation. The
-	 * completion will run normally in this case.
-	 */
-	for_each_node(node) {
-		struct io_wqe *wqe = wq->wqes[node];
-
 		io_wqe_cancel_running_work(wqe, &match);
+		raw_spin_unlock(&wqe->lock);
 		if (match.nr_running && !match.cancel_all)
 			return IO_WQ_CANCEL_RUNNING;
 	}
@@ -1263,7 +1282,9 @@ static void io_wq_destroy(struct io_wq *wq)
 			.fn		= io_wq_work_match_all,
 			.cancel_all	= true,
 		};
+		raw_spin_lock(&wqe->lock);
 		io_wqe_cancel_pending_work(wqe, &match);
+		raw_spin_unlock(&wqe->lock);
 		free_cpumask_var(wqe->cpu_mask);
 		kfree(wqe);
 	}
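
One subtlety worth calling out in the hunks above: after this change,
io_wqe_cancel_pending_work() is entered with wqe->lock already held, while
io_acct_cancel_pending_work() drops that lock whenever it actually cancels an
item, so the canceled work can be completed without the lock. A hedged sketch
of that contract, again with a pthread mutex standing in for the raw spinlock
and invented names (cancel_one_pending, cancel_pending):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stub standing in for io_acct_cancel_pending_work(): when it cancels
 * something, it returns true *with list_lock dropped*. This stub never
 * matches; a real version would unlink the work, unlock list_lock,
 * complete the work, and return true. */
static bool cancel_one_pending(const void *match)
{
	(void)match;
	return false;
}

/* Called with list_lock held; also returns with list_lock held. */
static void cancel_pending(const void *match, bool cancel_all)
{
retry:
	if (cancel_one_pending(match)) {
		/* the helper dropped the lock; take it back before looping */
		pthread_mutex_lock(&list_lock);
		if (cancel_all)
			goto retry;
	}
}

The "held on entry, held on exit" contract is what lets io_wq_cancel_cb()
above check the pending list and the running workers in a single pass under
one lock acquisition per node.
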
fs/io_uring.c (+60 −19)
@@ -1192,12 +1192,6 @@ static inline bool req_ref_put_and_test(struct io_kiocb *req)
 	return atomic_dec_and_test(&req->refs);
 }
 
-static inline void req_ref_put(struct io_kiocb *req)
-{
-	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
-	WARN_ON_ONCE(req_ref_put_and_test(req));
-}
-
 static inline void req_ref_get(struct io_kiocb *req)
 {
 	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
@@ -5468,23 +5462,41 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
 
 static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
 {
-	struct wait_queue_head *head = poll->head;
+	struct wait_queue_head *head = smp_load_acquire(&poll->head);
 
 	if (head) {
 		spin_lock_irq(&head->lock);
 		list_del_init(&poll->wait.entry);
 		poll->head = NULL;
 		spin_unlock_irq(&head->lock);
 	}
 }
 
 static void io_poll_remove_entries(struct io_kiocb *req)
 {
 	struct io_poll_iocb *poll = io_poll_get_single(req);
 	struct io_poll_iocb *poll_double = io_poll_get_double(req);
 
-	if (poll->head)
-		io_poll_remove_entry(poll);
-	if (poll_double && poll_double->head)
+	/*
+	 * While we hold the waitqueue lock and the waitqueue is nonempty,
+	 * wake_up_pollfree() will wait for us.  However, taking the waitqueue
+	 * lock in the first place can race with the waitqueue being freed.
+	 *
+	 * We solve this as eventpoll does: by taking advantage of the fact that
+	 * all users of wake_up_pollfree() will RCU-delay the actual free.  If
+	 * we enter rcu_read_lock() and see that the pointer to the queue is
+	 * non-NULL, we can then lock it without the memory being freed out from
+	 * under us.
+	 *
+	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+	 * case the caller deletes the entry from the queue, leaving it empty.
+	 * In that case, only RCU prevents the queue memory from being freed.
+	 */
+	rcu_read_lock();
+	io_poll_remove_entry(poll);
+	if (poll_double)
 		io_poll_remove_entry(poll_double);
+	rcu_read_unlock();
 }
 
 /*
@@ -5624,6 +5636,30 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 						 wait);
 	__poll_t mask = key_to_poll(key);
 
+	if (unlikely(mask & POLLFREE)) {
+		io_poll_mark_cancelled(req);
+		/* we have to kick tw in case it's not already */
+		io_poll_execute(req, 0);
+
+		/*
+		 * If the waitqueue is being freed early but someone already
+		 * holds ownership over it, we have to tear down the request as
+		 * best we can. That means immediately removing the request from
+		 * its waitqueue and preventing all further accesses to the
+		 * waitqueue via the request.
+		 */
+		list_del_init(&poll->wait.entry);
+
+		/*
+		 * Careful: this *must* be the last step, since as soon
+		 * as req->head is NULL'ed out, the request can be
+		 * completed and freed, since aio_poll_complete_work()
+		 * will no longer need to take the waitqueue lock.
+		 */
+		smp_store_release(&poll->head, NULL);
+		return 1;
+	}
+
 	/* for instances that support it check for an event match first */
 	if (mask && !(mask & poll->events))
 		return 0;
@@ -6350,16 +6386,21 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
 	WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
 
 	ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
-	if (ret != -ENOENT)
-		return ret;
+	/*
+	 * Fall through even for -EALREADY, as we may have a poll armed
+	 * that needs unarming.
+	 */
+	if (!ret)
+		return 0;
 
 	spin_lock(&ctx->completion_lock);
+	ret = io_poll_cancel(ctx, sqe_addr, false);
+	if (ret != -ENOENT)
+		goto out;
+
 	spin_lock_irq(&ctx->timeout_lock);
 	ret = io_timeout_cancel(ctx, sqe_addr);
 	spin_unlock_irq(&ctx->timeout_lock);
-	if (ret != -ENOENT)
-		goto out;
-	ret = io_poll_cancel(ctx, sqe_addr, false);
 out:
 	spin_unlock(&ctx->completion_lock);
 	return ret;
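
The two POLLFREE hunks above form an acquire/release pair: io_poll_wake()
publishes NULL to poll->head with smp_store_release() only after tearing the
entry off the queue, and io_poll_remove_entry() reads the head with
smp_load_acquire(), so a non-NULL value guarantees the earlier teardown steps
are visible and the queue is still safe to lock. Here is a userspace sketch
of just that pairing, using C11 atomics and a pthread mutex; the names and
types are illustrative, and the RCU protection that keeps the queue memory
alive is reduced to comments:

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct waitqueue {
	pthread_mutex_t lock;	/* assume PTHREAD_MUTEX_INITIALIZER elsewhere */
	/* the entry list lives here in the real thing */
};

struct poll_entry {
	_Atomic(struct waitqueue *) head;
};

/* Wake side on POLLFREE: detach first, publish NULL last. */
static void pollfree_wake(struct poll_entry *p)
{
	/* the waker still owns the entry here and unlinks it from the
	 * queue (list_del_init() in the kernel) ... */

	/* ... and only then releases head. After this store the request
	 * may be completed and freed, so it really must be last. */
	atomic_store_explicit(&p->head, NULL, memory_order_release);
}

/* Removal side: acquire-load head. Seeing non-NULL means the queue was
 * not freed at the time of the load; because every wake_up_pollfree()
 * user RCU-delays the free, holding the RCU read lock (elided here)
 * keeps it alive while we take its lock. */
static void poll_remove(struct poll_entry *p)
{
	struct waitqueue *q =
		atomic_load_explicit(&p->head, memory_order_acquire);

	if (q) {
		pthread_mutex_lock(&q->lock);
		/* unlink the entry, then clear head under the lock */
		atomic_store_explicit(&p->head, NULL, memory_order_relaxed);
		pthread_mutex_unlock(&q->lock);
	}
}

The io_try_cancel_userdata() hunk is related housekeeping: poll cancelation
is now attempted before timeout cancelation, and a partially successful
async-work cancel (-EALREADY) no longer short-circuits the poll unarming.
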