Commit 493108d9 authored by Pavel Begunkov's avatar Pavel Begunkov Committed by Jens Axboe
Browse files

io_uring/net: zerocopy sendmsg

Add a new opcode, IORING_OP_SENDMSG_ZC, providing zerocopy sends via the
sendmsg API. It reuses io_send_zc_prep() for preparation (rejecting the
SEND_ZC-only addr2/file_index/FIXED_BUF fields) and adds a dedicated
issue handler, io_sendmsg_zc(), which attaches the request's
notification uarg before calling __sys_sendmsg_sock().

parent c4c0009e
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -213,6 +213,7 @@ enum io_uring_op {
	IORING_OP_SOCKET,
	IORING_OP_URING_CMD,
	IORING_OP_SEND_ZC,
	IORING_OP_SENDMSG_ZC,

	/* this goes last, obviously */
	IORING_OP_LAST,
+86 −5
Original line number Diff line number Diff line
@@ -909,7 +909,12 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
/*
 * Cleanup for a zerocopy send request that will not complete normally:
 * release the async-copied iovec (if one was allocated) and detach the
 * notification without posting its CQE.
 */
void io_send_zc_cleanup(struct io_kiocb *req)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *io;

	if (req_has_async_data(req)) {
		io = req->async_data;
		/* iovec duplicated by the async sendmsg path; NULL kfree is a no-op */
		kfree(io->free_iov);
	}
	/* no data was sent on this path, so suppress the notification CQE */
	zc->notif->flags |= REQ_F_CQE_SKIP;
	io_notif_flush(zc->notif);
	zc->notif = NULL;
@@ -921,8 +926,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *notif;

	if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3) ||
	    READ_ONCE(sqe->__pad3[0]))
	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
		return -EINVAL;
	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
	if (req->flags & REQ_F_CQE_SKIP)
@@ -949,14 +953,24 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
		io_req_set_rsrc_node(notif, ctx, 0);
	}

	if (req->opcode == IORING_OP_SEND_ZC) {
		if (READ_ONCE(sqe->__pad3[0]))
			return -EINVAL;
		zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
		zc->addr_len = READ_ONCE(sqe->addr_len);
	} else {
		if (unlikely(sqe->addr2 || sqe->file_index))
			return -EINVAL;
		if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
			return -EINVAL;
	}

	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
	zc->len = READ_ONCE(sqe->len);
	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (zc->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

	zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	zc->addr_len = READ_ONCE(sqe->addr_len);
	zc->done_io = 0;

#ifdef CONFIG_COMPAT
@@ -1118,6 +1132,73 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
	return IOU_OK;
}

/*
 * Issue handler for IORING_OP_SENDMSG_ZC: zerocopy sendmsg.
 *
 * Mirrors the regular sendmsg issue path but wires the request's
 * notification uarg into the msghdr before calling __sys_sendmsg_sock(),
 * so the network stack pins the user pages instead of copying them.
 * On success the completion carries IORING_CQE_F_MORE, telling userspace
 * that a second (notification) CQE will follow once the kernel is done
 * with the buffers.  Returns IOU_OK; the result is posted via
 * io_req_set_res().
 */
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	unsigned flags, cflags;
	int ret, min_ret = 0;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	/* reuse the msghdr copied on a previous async attempt, if any */
	if (req_has_async_data(req)) {
		kmsg = req->async_data;
	} else {
		ret = io_sendmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	/* POLL_FIRST: don't try inline until the socket has signalled pollout */
	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_msg(req, kmsg, issue_flags);

	flags = sr->msg_flags | MSG_ZEROCOPY;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	/* attach the notification's ubuf so the stack references our pages */
	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
	kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return io_setup_async_msg(req, kmsg, issue_flags);

		/* partial send: record progress and retry asynchronously */
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_msg(req, kmsg, issue_flags);
		}
		/* hard failure with nothing sent: skip the notification CQE */
		if (ret < 0 && !sr->done_io)
			sr->notif->flags |= REQ_F_CQE_SKIP;
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	/* fast path, check for non-NULL to avoid function call */
	if (kmsg->free_iov)
		kfree(kmsg->free_iov);

	io_netmsg_recycle(req, issue_flags);
	/* fold previously-sent bytes into the final result */
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	io_notif_flush(sr->notif);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	/* F_MORE: a notification CQE will follow this completion */
	cflags = ret >= 0 ? IORING_CQE_F_MORE : 0;
	io_req_set_res(req, ret, cflags);
	return IOU_OK;
}

void io_sendrecv_fail(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
@@ -1127,7 +1208,7 @@ void io_sendrecv_fail(struct io_kiocb *req)
	if (req->flags & REQ_F_PARTIAL_IO)
		res = sr->done_io;
	if ((req->flags & REQ_F_NEED_CLEANUP) &&
	    req->opcode == IORING_OP_SEND_ZC) {
	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC)) {
		/* preserve notification for partial I/O */
		if (res < 0)
			sr->notif->flags |= REQ_F_CQE_SKIP;
+1 −0
Original line number Diff line number Diff line
@@ -57,6 +57,7 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_connect(struct io_kiocb *req, unsigned int issue_flags);

int io_send_zc(struct io_kiocb *req, unsigned int issue_flags);
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags);
int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
void io_send_zc_cleanup(struct io_kiocb *req);

+19 −0
Original line number Diff line number Diff line
@@ -503,6 +503,25 @@ const struct io_op_def io_op_defs[] = {
		.fail			= io_sendrecv_fail,
#else
		.prep			= io_eopnotsupp_prep,
#endif
	},
	/*
	 * Zerocopy sendmsg: shares prep (io_send_zc_prep) and cleanup with
	 * SEND_ZC, but issues through the sendmsg-style path.
	 */
	[IORING_OP_SENDMSG_ZC] = {
		.name			= "SENDMSG_ZC",
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.audit_skip		= 1,
		.ioprio			= 1,
		.manual_alloc		= 1,
#if defined(CONFIG_NET)
		.async_size		= sizeof(struct io_async_msghdr),
		.prep			= io_send_zc_prep,
		.issue			= io_sendmsg_zc,
		.prep_async		= io_sendmsg_prep_async,
		.cleanup		= io_send_zc_cleanup,
		.fail			= io_sendrecv_fail,
#else
		.prep			= io_eopnotsupp_prep,
#endif
	},
};