Commit 6c2450ae authored by Pavel Begunkov, committed by Jens Axboe

io_uring: allocate memory for overflowed CQEs



Instead of using a request itself for overflowed CQE stashing, allocate a
separate entry. The disadvantage is that the allocation may fail and it
will be accounted as lost (see rings->cq_overflow), so we lose reliability
in case of memory pressure if the application is driving the CQ ring into
overflow. However, it opens the way for multiple CQEs per SQE and even
for generating SQE-less CQEs.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
[axboe: use GFP_ATOMIC | __GFP_ACCOUNT]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 464dca61
Showing 1 changed file with 46 additions and 55 deletions
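
As a reading aid, here is a small, self-contained userspace model of the idea in
this patch: completions that do not fit into the fixed-size CQ ring are stashed
in separately allocated overflow nodes and copied back into the ring on flush;
if the node allocation fails, the completion is only counted as lost. This is an
illustrative sketch, not kernel code: the names (cq_post(), cq_flush_overflow(),
struct overflow_cqe's fields) are made up, and it omits the locking, the
IORING_SQ_CQ_OVERFLOW / check-overflow flag handling, and the cancel/kill paths
that the real __io_cqring_fill_event() and __io_cqring_overflow_flush() handle.

/* overflow_model.c: illustrative userspace model only; not kernel code. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CQ_ENTRIES 4				/* fixed ring size, power of two */

struct cqe {					/* stand-in for struct io_uring_cqe */
	unsigned long long user_data;
	int res;
};

struct overflow_cqe {				/* analogue of struct io_overflow_cqe */
	struct cqe cqe;
	struct overflow_cqe *next;
};

struct cq {
	struct cqe ring[CQ_ENTRIES];
	unsigned int head, tail;		/* tail - head = filled entries */
	struct overflow_cqe *ofl_head, **ofl_tail;
	unsigned long lost;			/* analogue of rings->cq_overflow */
};

static void cq_init(struct cq *cq)
{
	memset(cq, 0, sizeof(*cq));
	cq->ofl_tail = &cq->ofl_head;
}

/* Next free CQE slot, or NULL if the ring is full. */
static struct cqe *cq_get_entry(struct cq *cq)
{
	if (cq->tail - cq->head == CQ_ENTRIES)
		return NULL;
	return &cq->ring[cq->tail++ & (CQ_ENTRIES - 1)];
}

/* Post a completion; stash it in a heap-allocated node if the ring is full. */
static void cq_post(struct cq *cq, unsigned long long user_data, int res)
{
	struct cqe *cqe = cq_get_entry(cq);
	struct overflow_cqe *ocqe;

	if (cqe) {
		cqe->user_data = user_data;
		cqe->res = res;
		return;
	}
	ocqe = malloc(sizeof(*ocqe));		/* kernel: GFP_ATOMIC | __GFP_ACCOUNT */
	if (!ocqe) {
		cq->lost++;			/* completion dropped, only counted */
		return;
	}
	ocqe->cqe.user_data = user_data;
	ocqe->cqe.res = res;
	ocqe->next = NULL;
	*cq->ofl_tail = ocqe;
	cq->ofl_tail = &ocqe->next;
}

/* Copy stashed completions back into the ring; true if the backlog is empty. */
static bool cq_flush_overflow(struct cq *cq)
{
	while (cq->ofl_head) {
		struct cqe *cqe = cq_get_entry(cq);
		struct overflow_cqe *ocqe = cq->ofl_head;

		if (!cqe)
			return false;		/* ring still full, keep backlog */
		memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
		cq->ofl_head = ocqe->next;
		if (!cq->ofl_head)
			cq->ofl_tail = &cq->ofl_head;
		free(ocqe);
	}
	return true;
}

int main(void)
{
	struct cq cq;
	unsigned long long i;

	cq_init(&cq);
	for (i = 0; i < 6; i++)			/* six completions, ring holds four */
		cq_post(&cq, i, 0);
	printf("in ring: %u, lost: %lu\n", cq.tail - cq.head, cq.lost);
	cq.head += CQ_ENTRIES;			/* consumer reaps the full ring */
	printf("flushed all: %s\n", cq_flush_overflow(&cq) ? "yes" : "no");
	return 0;
}

In the actual patch, the allocation uses GFP_ATOMIC | __GFP_ACCOUNT so it can be
done under completion_lock and is charged to the task, and a failed allocation
falls through to the same "drop on the floor" accounting (rings->cq_overflow) as
the overflow-flush and task-cancel modes.
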
@@ -202,6 +202,11 @@ struct io_mapped_ubuf {
 
 struct io_ring_ctx;
 
+struct io_overflow_cqe {
+	struct io_uring_cqe cqe;
+	struct list_head list;
+};
+
 struct io_rsrc_put {
 	struct list_head list;
 	union {
@@ -1401,41 +1406,33 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
 }
 
 /* Returns true if there are no backlogged entries after the flush */
-static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
-				       struct task_struct *tsk,
-				       struct files_struct *files)
+static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 {
 	struct io_rings *rings = ctx->rings;
-	struct io_kiocb *req, *tmp;
-	struct io_uring_cqe *cqe;
 	unsigned long flags;
 	bool all_flushed, posted;
-	LIST_HEAD(list);
 
 	if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
 		return false;
 
 	posted = false;
 	spin_lock_irqsave(&ctx->completion_lock, flags);
-	list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
-		if (!io_match_task(req, tsk, files))
-			continue;
+	while (!list_empty(&ctx->cq_overflow_list)) {
+		struct io_uring_cqe *cqe = io_get_cqring(ctx);
+		struct io_overflow_cqe *ocqe;
 
-		cqe = io_get_cqring(ctx);
 		if (!cqe && !force)
 			break;
-
-		list_move(&req->compl.list, &list);
-		if (cqe) {
-			WRITE_ONCE(cqe->user_data, req->user_data);
-			WRITE_ONCE(cqe->res, req->result);
-			WRITE_ONCE(cqe->flags, req->compl.cflags);
-		} else {
-			ctx->cached_cq_overflow++;
+		ocqe = list_first_entry(&ctx->cq_overflow_list,
+					struct io_overflow_cqe, list);
+		if (cqe)
+			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
+		else
 			WRITE_ONCE(ctx->rings->cq_overflow,
-				   ctx->cached_cq_overflow);
-		}
+				   ++ctx->cached_cq_overflow);
 		posted = true;
+		list_del(&ocqe->list);
+		kfree(ocqe);
 	}
 
 	all_flushed = list_empty(&ctx->cq_overflow_list);
@@ -1450,19 +1447,10 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	if (posted)
 		io_cqring_ev_posted(ctx);
-
-	while (!list_empty(&list)) {
-		req = list_first_entry(&list, struct io_kiocb, compl.list);
-		list_del(&req->compl.list);
-		io_put_req(req);
-	}
-
 	return all_flushed;
 }
 
-static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
-				     struct task_struct *tsk,
-				     struct files_struct *files)
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 {
 	bool ret = true;
 
@@ -1470,7 +1458,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
 		/* iopoll syncs against uring_lock, not completion_lock */
 		if (ctx->flags & IORING_SETUP_IOPOLL)
 			mutex_lock(&ctx->uring_lock);
-		ret = __io_cqring_overflow_flush(ctx, force, tsk, files);
+		ret = __io_cqring_overflow_flush(ctx, force);
 		if (ctx->flags & IORING_SETUP_IOPOLL)
 			mutex_unlock(&ctx->uring_lock);
 	}
@@ -1531,29 +1519,33 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res,
 		WRITE_ONCE(cqe->user_data, req->user_data);
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, cflags);
-	} else if (ctx->cq_overflow_flushed ||
-		   atomic_read(&req->task->io_uring->in_idle)) {
-		/*
-		 * If we're in ring overflow flush mode, or in task cancel mode,
-		 * then we cannot store the request for later flushing, we need
-		 * to drop it on the floor.
-		 */
-		ctx->cached_cq_overflow++;
-		WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
-	} else {
+		return;
+	}
+	if (!ctx->cq_overflow_flushed &&
+	    !atomic_read(&req->task->io_uring->in_idle)) {
+		struct io_overflow_cqe *ocqe;
+
+		ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
+		if (!ocqe)
+			goto overflow;
 		if (list_empty(&ctx->cq_overflow_list)) {
 			set_bit(0, &ctx->sq_check_overflow);
 			set_bit(0, &ctx->cq_check_overflow);
 			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
 		}
-		if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
-			io_clean_op(req);
-
-		req->result = res;
-		req->compl.cflags = cflags;
-		req_ref_get(req);
-		list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
+		ocqe->cqe.user_data = req->user_data;
+		ocqe->cqe.res = res;
+		ocqe->cqe.flags = cflags;
+		list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
+		return;
 	}
+overflow:
+	/*
+	 * If we're in ring overflow flush mode, or in task cancel mode,
+	 * or cannot allocate an overflow entry, then we need to drop it
+	 * on the floor.
+	 */
+	WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow);
 }
 
 static void io_cqring_fill_event(struct io_kiocb *req, long res)
@@ -2398,7 +2390,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 		 * already triggered a CQE (eg in error).
 		 */
 		if (test_bit(0, &ctx->cq_check_overflow))
-			__io_cqring_overflow_flush(ctx, false, NULL, NULL);
+			__io_cqring_overflow_flush(ctx, false);
 		if (io_cqring_events(ctx))
 			break;
 
@@ -6581,7 +6573,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 
 	/* if we have a backlog and couldn't flush it all, return BUSY */
 	if (test_bit(0, &ctx->sq_check_overflow)) {
-		if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL))
+		if (!__io_cqring_overflow_flush(ctx, false))
 			return -EBUSY;
 	}
 
@@ -6881,7 +6873,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	int ret;
 
 	do {
-		io_cqring_overflow_flush(ctx, false, NULL, NULL);
+		io_cqring_overflow_flush(ctx, false);
 		if (io_cqring_events(ctx) >= min_events)
 			return 0;
 		if (!io_run_task_work())
@@ -6913,7 +6905,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
 		/* if we can't even flush overflow, don't wait for more */
-		if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) {
+		if (!io_cqring_overflow_flush(ctx, false)) {
 			ret = -EBUSY;
 			break;
 		}
@@ -8616,7 +8608,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	/* if force is set, the ring is going away. always drop after that */
 	ctx->cq_overflow_flushed = 1;
 	if (ctx->rings)
-		__io_cqring_overflow_flush(ctx, true, NULL, NULL);
+		__io_cqring_overflow_flush(ctx, true);
 	xa_for_each(&ctx->personalities, index, creds)
 		io_unregister_personality(ctx, index);
 	mutex_unlock(&ctx->uring_lock);
@@ -8766,7 +8758,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 		ret |= io_kill_timeouts(ctx, task, files);
 		ret |= io_run_task_work();
 		ret |= io_run_ctx_fallback(ctx);
-		io_cqring_overflow_flush(ctx, true, task, files);
 		if (!ret)
 			break;
 		cond_resched();
@@ -9185,7 +9176,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	 */
 	ret = 0;
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
-		io_cqring_overflow_flush(ctx, false, NULL, NULL);
+		io_cqring_overflow_flush(ctx, false);
 
 		ret = -EOWNERDEAD;
 		if (unlikely(ctx->sq_data->thread == NULL)) {