Commit cec53f4c authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'io_uring-6.0-2022-09-02' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:

 - A single fix for over-eager retries for networking (Pavel)

 - Revert the notification slot support for zerocopy sends.

   It turns out that even after more than a year of development and
   testing, there's not full agreement on whether just using plain
   ordered notifications is Good Enough to avoid the complexity of using
   the notifications slots. Because of that, we decided that it's best
   left to a future final decision.

   We can always bring back this feature, but we can't really change it
   or remove it once we've released 6.0 with it enabled. The reverts
   leave the usual CQE notifications as the primary interface for
   knowing when data was sent, and when it was acked. (Pavel)

* tag 'io_uring-6.0-2022-09-02' of git://git.kernel.dk/linux-block:
  selftests/net: return back io_uring zc send tests
  io_uring/net: simplify zerocopy send user API
  io_uring/notif: remove notif registration
  Revert "io_uring: rename IORING_OP_FILES_UPDATE"
  Revert "io_uring: add zc notification flush requests"
  selftests/net: temporarily disable io_uring zc test
  io_uring/net: fix overexcessive retries
parents 1551f8f2 916d72c1
Loading
Loading
Loading
Loading
+6 −22
Original line number Diff line number Diff line
@@ -71,8 +71,8 @@ struct io_uring_sqe {
		__s32	splice_fd_in;
		__u32	file_index;
		struct {
			__u16	notification_idx;
			__u16	addr_len;
			__u16	__pad3[1];
		};
	};
	union {
@@ -178,8 +178,7 @@ enum io_uring_op {
	IORING_OP_FALLOCATE,
	IORING_OP_OPENAT,
	IORING_OP_CLOSE,
	IORING_OP_RSRC_UPDATE,
	IORING_OP_FILES_UPDATE = IORING_OP_RSRC_UPDATE,
	IORING_OP_FILES_UPDATE,
	IORING_OP_STATX,
	IORING_OP_READ,
	IORING_OP_WRITE,
@@ -206,7 +205,7 @@ enum io_uring_op {
	IORING_OP_GETXATTR,
	IORING_OP_SOCKET,
	IORING_OP_URING_CMD,
	IORING_OP_SENDZC_NOTIF,
	IORING_OP_SEND_ZC,

	/* this goes last, obviously */
	IORING_OP_LAST,
@@ -228,7 +227,6 @@ enum io_uring_op {
#define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
#define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)

/*
 * sqe->splice_flags
 * extends splice(2) flags
@@ -281,29 +279,16 @@ enum io_uring_op {
 *
 * IORING_RECVSEND_FIXED_BUF	Use registered buffers, the index is stored in
 *				the buf_index field.
 *
 * IORING_RECVSEND_NOTIF_FLUSH	Flush a notification after a successful
 *				send. Only for zerocopy sends.
 */
#define IORING_RECVSEND_POLL_FIRST	(1U << 0)
#define IORING_RECV_MULTISHOT		(1U << 1)
#define IORING_RECVSEND_FIXED_BUF	(1U << 2)
#define IORING_RECVSEND_NOTIF_FLUSH	(1U << 3)

/*
 * accept flags stored in sqe->ioprio
 */
#define IORING_ACCEPT_MULTISHOT	(1U << 0)


/*
 * IORING_OP_RSRC_UPDATE flags
 */
enum {
	/* NOTE(review): presumably targets the registered file table — confirm */
	IORING_RSRC_UPDATE_FILES,
	/* NOTE(review): presumably targets zerocopy notification slots — confirm */
	IORING_RSRC_UPDATE_NOTIF,
};

/*
 * IORING_OP_MSG_RING command types, stored in sqe->addr
 */
@@ -341,10 +326,13 @@ struct io_uring_cqe {
 * IORING_CQE_F_BUFFER	If set, the upper 16 bits are the buffer ID
 * IORING_CQE_F_MORE	If set, parent SQE will generate more CQE entries
 * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
 * IORING_CQE_F_NOTIF	Set for notification CQEs. Can be used to distinguish
 * 			them from sends.
 */
#define IORING_CQE_F_BUFFER		(1U << 0)
#define IORING_CQE_F_MORE		(1U << 1)
#define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
#define IORING_CQE_F_NOTIF		(1U << 3)

enum {
	IORING_CQE_BUFFER_SHIFT		= 16,
@@ -485,10 +473,6 @@ enum {
	/* register a range of fixed file slots for automatic slot allocation */
	IORING_REGISTER_FILE_ALLOC_RANGE	= 25,

	/* zerocopy notification API */
	IORING_REGISTER_NOTIFIERS		= 26,
	IORING_UNREGISTER_NOTIFIERS		= 27,

	/* this goes last */
	IORING_REGISTER_LAST
};
+2 −12
Original line number Diff line number Diff line
@@ -2640,7 +2640,6 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
		io_unregister_personality(ctx, index);
	if (ctx->rings)
		io_poll_remove_all(ctx, NULL, true);
	io_notif_unregister(ctx);
	mutex_unlock(&ctx->uring_lock);

	/* failed during ring init, it couldn't have issued any requests */
@@ -3839,15 +3838,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_NOTIFIERS:
		ret = io_notif_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_NOTIFIERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_notif_unregister(ctx);
		break;
	default:
		ret = -EINVAL;
		break;
@@ -3933,8 +3923,8 @@ static int __init io_uring_init(void)
	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
	BUILD_BUG_SQE_ELEM(44, __u16,  notification_idx);
	BUILD_BUG_SQE_ELEM(46, __u16,  addr_len);
	BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
+35 −24
Original line number Diff line number Diff line
@@ -65,12 +65,12 @@ struct io_sendzc {
	struct file			*file;
	void __user			*buf;
	size_t				len;
	u16				slot_idx;
	unsigned			msg_flags;
	unsigned			flags;
	unsigned			addr_len;
	void __user			*addr;
	size_t				done_io;
	struct io_kiocb 		*notif;
};

#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
@@ -879,17 +879,31 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
	return ret;
}

/*
 * Cleanup handler for a zerocopy send request: disposes of the notification
 * request stored in the command's io_sendzc data.
 *
 * NOTE(review): zc->notif is dereferenced without a NULL check. This relies
 * on the handler only running while a notif is attached (io_sendzc_prep sets
 * REQ_F_NEED_CLEANUP right after allocating it) — confirm against callers.
 */
void io_sendzc_cleanup(struct io_kiocb *req)
{
	struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc);

	/*
	 * Flag the notif with REQ_F_CQE_SKIP before flushing it; presumably
	 * this suppresses its completion event — confirm.
	 */
	zc->notif->flags |= REQ_F_CQE_SKIP;
	io_notif_flush(zc->notif);
	/* the request no longer references the notif after the flush */
	zc->notif = NULL;
}

int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *notif;

	if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))
	if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3) ||
	    READ_ONCE(sqe->__pad3[0]))
		return -EINVAL;
	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
	if (req->flags & REQ_F_CQE_SKIP)
		return -EINVAL;

	zc->flags = READ_ONCE(sqe->ioprio);
	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST |
			  IORING_RECVSEND_FIXED_BUF | IORING_RECVSEND_NOTIF_FLUSH))
			  IORING_RECVSEND_FIXED_BUF))
		return -EINVAL;
	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
		unsigned idx = READ_ONCE(sqe->buf_index);
@@ -900,11 +914,17 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
		req->imu = READ_ONCE(ctx->user_bufs[idx]);
		io_req_set_rsrc_node(req, ctx, 0);
	}
	notif = zc->notif = io_alloc_notif(ctx);
	if (!notif)
		return -ENOMEM;
	notif->cqe.user_data = req->cqe.user_data;
	notif->cqe.res = 0;
	notif->cqe.flags = IORING_CQE_F_NOTIF;
	req->flags |= REQ_F_NEED_CLEANUP;

	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
	zc->len = READ_ONCE(sqe->len);
	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	zc->slot_idx = READ_ONCE(sqe->notification_idx);
	if (zc->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

@@ -956,7 +976,7 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
	shinfo->nr_frags = frag;
	from->bvec += bi.bi_idx;
	from->nr_segs -= bi.bi_idx;
	from->count = bi.bi_size;
	from->count -= copied;
	from->iov_offset = bi.bi_bvec_done;

	skb->data_len += copied;
@@ -976,33 +996,20 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct sockaddr_storage __address, *addr = NULL;
	struct io_ring_ctx *ctx = req->ctx;
	struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc);
	struct io_notif_slot *notif_slot;
	struct io_kiocb *notif;
	struct msghdr msg;
	struct iovec iov;
	struct socket *sock;
	unsigned msg_flags;
	unsigned msg_flags, cflags;
	int ret, min_ret = 0;

	if (!(req->flags & REQ_F_POLLED) &&
	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	if (issue_flags & IO_URING_F_UNLOCKED)
		return -EAGAIN;
	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	notif_slot = io_get_notif_slot(ctx, zc->slot_idx);
	if (!notif_slot)
		return -EINVAL;
	notif = io_get_notif(ctx, notif_slot);
	if (!notif)
		return -ENOMEM;

	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
@@ -1033,7 +1040,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
					  &msg.msg_iter);
		if (unlikely(ret))
			return ret;
		ret = io_notif_account_mem(notif, zc->len);
		ret = io_notif_account_mem(zc->notif, zc->len);
		if (unlikely(ret))
			return ret;
	}
@@ -1045,7 +1052,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
		min_ret = iov_iter_count(&msg.msg_iter);

	msg.msg_flags = msg_flags;
	msg.msg_ubuf = &io_notif_to_data(notif)->uarg;
	msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
	msg.sg_from_iter = io_sg_from_iter;
	ret = sock_sendmsg(sock, &msg);

@@ -1060,18 +1067,22 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_addr(req, addr, issue_flags);
		}
		if (ret < 0 && !zc->done_io)
			zc->notif->flags |= REQ_F_CQE_SKIP;
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (zc->flags & IORING_RECVSEND_NOTIF_FLUSH) {
		io_notif_slot_flush_submit(notif_slot, 0);
	}

	if (ret >= 0)
		ret += zc->done_io;
	else if (zc->done_io)
		ret = zc->done_io;
	io_req_set_res(req, ret, 0);

	io_notif_flush(zc->notif);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	cflags = ret >= 0 ? IORING_CQE_F_MORE : 0;
	io_req_set_res(req, ret, cflags);
	return IOU_OK;
}

+1 −0
Original line number Diff line number Diff line
@@ -55,6 +55,7 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags);

int io_sendzc(struct io_kiocb *req, unsigned int issue_flags);
int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
void io_sendzc_cleanup(struct io_kiocb *req);

void io_netmsg_cache_free(struct io_cache_entry *entry);
#else
+2 −81
Original line number Diff line number Diff line
@@ -42,8 +42,7 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
	}
}

struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
				struct io_notif_slot *slot)
struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct io_kiocb *notif;
@@ -59,101 +58,23 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
	io_get_task_refs(1);
	notif->rsrc_node = NULL;
	io_req_set_rsrc_node(notif, ctx, 0);
	notif->cqe.user_data = slot->tag;
	notif->cqe.flags = slot->seq++;
	notif->cqe.res = 0;

	nd = io_notif_to_data(notif);
	nd->account_pages = 0;
	nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
	nd->uarg.callback = io_uring_tx_zerocopy_callback;
	/* master ref owned by io_notif_slot, will be dropped on flush */
	refcount_set(&nd->uarg.refcnt, 1);
	return notif;
}

void io_notif_slot_flush(struct io_notif_slot *slot)
void io_notif_flush(struct io_kiocb *notif)
	__must_hold(&slot->notif->ctx->uring_lock)
{
	struct io_kiocb *notif = slot->notif;
	struct io_notif_data *nd = io_notif_to_data(notif);

	slot->notif = NULL;

	/* drop slot's master ref */
	if (refcount_dec_and_test(&nd->uarg.refcnt)) {
		notif->io_task_work.func = __io_notif_complete_tw;
		io_req_task_work_add(notif);
	}
}

/*
 * Tear down the ring's notification slot array.
 *
 * For every slot that still holds a notif, the slot's reference on it is
 * dropped; if that was the last reference, the notif is queued for completion
 * via task work. The slot array is then freed and the slot count reset.
 *
 * Annotated as requiring ctx->uring_lock to be held.
 *
 * Returns 0 on success, or -ENXIO if no slot array is currently registered.
 */
__cold int io_notif_unregister(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	int i;

	if (!ctx->notif_slots)
		return -ENXIO;

	for (i = 0; i < ctx->nr_notif_slots; i++) {
		struct io_notif_slot *slot = &ctx->notif_slots[i];
		struct io_kiocb *notif = slot->notif;
		struct io_notif_data *nd;

		/* empty slot, nothing to release */
		if (!notif)
			continue;
		nd = io_notif_to_data(notif);
		slot->notif = NULL;
		/* other references remain; the last holder will complete it */
		if (!refcount_dec_and_test(&nd->uarg.refcnt))
			continue;
		/* last reference dropped here: hand completion off to task work */
		notif->io_task_work.func = __io_notif_complete_tw;
		io_req_task_work_add(notif);
	}

	kvfree(ctx->notif_slots);
	ctx->notif_slots = NULL;
	ctx->nr_notif_slots = 0;
	return 0;
}

/*
 * Register an array of notification slots described by a userspace
 * struct io_uring_notification_register.
 *
 * @ctx:  ring context; annotated as requiring ctx->uring_lock to be held
 * @arg:  user pointer to the registration descriptor
 * @size: must equal sizeof(struct io_uring_notification_register)
 *
 * Returns 0 on success, or:
 *   -EBUSY   if slots are already registered
 *   -EINVAL  on size mismatch, a zero or too large nr_slots, or any non-zero
 *            reserved field in the descriptor or in a slot entry
 *   -EFAULT  if copying the descriptor or a slot entry from userspace fails
 *   -ENOMEM  if the slot array cannot be allocated
 */
__cold int io_notif_register(struct io_ring_ctx *ctx,
			     void __user *arg, unsigned int size)
	__must_hold(&ctx->uring_lock)
{
	struct io_uring_notification_slot __user *slots;
	struct io_uring_notification_slot slot;
	struct io_uring_notification_register reg;
	unsigned i;

	if (ctx->nr_notif_slots)
		return -EBUSY;
	if (size != sizeof(reg))
		return -EINVAL;
	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (!reg.nr_slots || reg.nr_slots > IORING_MAX_NOTIF_SLOTS)
		return -EINVAL;
	/* reserved fields must be zero */
	if (reg.resv || reg.resv2 || reg.resv3)
		return -EINVAL;

	/* reg.data carries the user address of the per-slot descriptor array */
	slots = u64_to_user_ptr(reg.data);
	ctx->notif_slots = kvcalloc(reg.nr_slots, sizeof(ctx->notif_slots[0]),
				GFP_KERNEL_ACCOUNT);
	if (!ctx->notif_slots)
		return -ENOMEM;

	/*
	 * nr_notif_slots advances together with i, so it always counts the
	 * slots initialised so far. On failure io_notif_unregister() frees
	 * the array and resets the count.
	 */
	for (i = 0; i < reg.nr_slots; i++, ctx->nr_notif_slots++) {
		struct io_notif_slot *notif_slot = &ctx->notif_slots[i];

		if (copy_from_user(&slot, &slots[i], sizeof(slot))) {
			io_notif_unregister(ctx);
			return -EFAULT;
		}
		/* per-slot reserved fields must be zero as well */
		if (slot.resv[0] | slot.resv[1] | slot.resv[2]) {
			io_notif_unregister(ctx);
			return -EINVAL;
		}
		notif_slot->tag = slot.tag;
	}
	return 0;
}
Loading