Commit fd5f4d7d authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge branch...

Merge branch 'splice-net-rewrite-splice-to-socket-fix-splice_f_more-and-handle-msg_splice_pages-in-af_tls'

David Howells says:

====================
splice, net: Rewrite splice-to-socket, fix SPLICE_F_MORE and handle MSG_SPLICE_PAGES in AF_TLS

Here are patches to do the following:

 (1) Block MSG_SENDPAGE_* flags from leaking into ->sendmsg() from
     userspace, whilst allowing splice_to_socket() to pass them in.

 (2) Allow MSG_SPLICE_PAGES to be passed into tls_*_sendmsg().  Until
     support is added, it will be ignored and a splice-driven sendmsg()
     will be treated like a normal sendmsg().  TCP, UDP, AF_UNIX and
     Chelsio-TLS already handle the flag in net-next.

 (3) Replace a chain of functions to splice-to-sendpage with a single
     function to splice via sendmsg() with MSG_SPLICE_PAGES.  This allows a
     bunch of pages to be spliced from a pipe in a single call using a
     bio_vec[] and pushes the main processing loop down into the bowels of
     the protocol driver rather than repeatedly calling in with a page at a
     time.

 (4) Provide a ->splice_eof() op[2] that allows splice to signal to its
     output that the input observed a premature EOF and that the caller
     didn't flag SPLICE_F_MORE, thereby allowing a corked socket to be
     flushed.  This attempts to maintain the current behaviour.  It is also
     not called if we didn't manage to read any data and so didn't called
     the actor function.

     This needs routing though several layers to get it down to the network
     protocol.

     [!] Note that I chose not to pass in any flags - I'm not sure it's
     	 particularly useful to pass in the splice flags; I also elected
     	 not to return any error code - though we might actually want to do
     	 that.

 (5) Provide tls_{device,sw}_splice_eof() to flush a pending TLS record if
     there is one.

 (6) Provide splice_eof() for UDP, TCP, Chelsio-TLS and AF_KCM.  AF_UNIX
     doesn't seem to pay attention to the MSG_MORE or MSG_SENDPAGE_NOTLAST
     flags.

 (7) Alter the behaviour of sendfile() and fix SPLICE_F_MORE/MSG_MORE
     signalling[1] such SPLICE_F_MORE is always signalled until we have
     read sufficient data to finish the request.  If we get a zero-length
     before we've managed to splice sufficient data, we now leave the
     socket expecting more data and leave it to userspace to deal with it.

 (8) Make AF_TLS handle the MSG_SPLICE_PAGES internal sendmsg flag.
     MSG_SPLICE_PAGES is an internal hint that tells the protocol that it
     should splice the pages supplied if it can.  Its sendpage
     implementations are then turned into wrappers around that.

Link: https://lore.kernel.org/r/499791.1685485603@warthog.procyon.org.uk/ [1]
Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/ [2]
Link: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=51c78a4d532efe9543a4df019ff405f05c6157f6 # part 1
Link: https://lore.kernel.org/r/20230524153311.3625329-1-dhowells@redhat.com/ # v1
====================

Link: https://lore.kernel.org/r/20230607181920.2294972-1-dhowells@redhat.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 73601329 3dc8976c
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -568,6 +568,7 @@ void chtls_destroy_sock(struct sock *sk);
int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
int chtls_recvmsg(struct sock *sk, struct msghdr *msg,
		  size_t len, int flags, int *addr_len);
void chtls_splice_eof(struct socket *sock);
int chtls_sendpage(struct sock *sk, struct page *page,
		   int offset, size_t size, int flags);
int send_tx_flowc_wr(struct sock *sk, int compl,
+9 −0
Original line number Diff line number Diff line
@@ -1237,6 +1237,15 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
	goto done;
}

void chtls_splice_eof(struct socket *sock)
{
	struct sock *sk = sock->sk;

	lock_sock(sk);
	chtls_tcp_push(sk, 0);
	release_sock(sk);
}

int chtls_sendpage(struct sock *sk, struct page *page,
		   int offset, size_t size, int flags)
{
+1 −0
Original line number Diff line number Diff line
@@ -606,6 +606,7 @@ static void __init chtls_init_ulp_ops(void)
	chtls_cpl_prot.destroy		= chtls_destroy_sock;
	chtls_cpl_prot.shutdown		= chtls_shutdown;
	chtls_cpl_prot.sendmsg		= chtls_sendmsg;
	chtls_cpl_prot.splice_eof	= chtls_splice_eof;
	chtls_cpl_prot.sendpage		= chtls_sendpage;
	chtls_cpl_prot.recvmsg		= chtls_recvmsg;
	chtls_cpl_prot.setsockopt	= chtls_setsockopt;
+167 −40
Original line number Diff line number Diff line
@@ -33,6 +33,7 @@
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/sched/signal.h>

@@ -448,30 +449,6 @@ const struct pipe_buf_operations nosteal_pipe_buf_ops = {
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);

/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	loff_t pos = sd->pos;
	int more;

	if (!likely(file->f_op->sendpage))
		return -EINVAL;

	more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;

	if (sd->len < sd->total_len &&
	    pipe_occupancy(pipe->head, pipe->tail) > 1)
		more |= MSG_SENDPAGE_NOTLAST;

	return file->f_op->sendpage(file, buf->page, buf->offset,
				    sd->len, &pos, more);
}

static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
	smp_mb();
@@ -652,7 +629,7 @@ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_des
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 *    the desired destination. See pipe_to_file, pipe_to_sendmsg, or
 *    pipe_to_user.
 *
 */
@@ -833,8 +810,9 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,

EXPORT_SYMBOL(iter_file_splice_write);

#ifdef CONFIG_NET
/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * splice_to_socket - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
@@ -846,13 +824,131 @@ EXPORT_SYMBOL(iter_file_splice_write);
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
	struct socket *sock = sock_from_file(out);
	struct bio_vec bvec[16];
	struct msghdr msg = {};
	ssize_t ret = 0;
	size_t spliced = 0;
	bool need_wakeup = false;

	pipe_lock(pipe);

	while (len > 0) {
		unsigned int head, tail, mask, bc = 0;
		size_t remain = len;

		/*
		 * Check for signal early to make process killable when there
		 * are always buffers available
		 */
		ret = -ERESTARTSYS;
		if (signal_pending(current))
			break;

		while (pipe_empty(pipe->head, pipe->tail)) {
			ret = 0;
			if (!pipe->writers)
				goto out;

			if (spliced)
				goto out;

			ret = -EAGAIN;
			if (flags & SPLICE_F_NONBLOCK)
				goto out;

			ret = -ERESTARTSYS;
			if (signal_pending(current))
				goto out;

			if (need_wakeup) {
				wakeup_pipe_writers(pipe);
				need_wakeup = false;
			}

EXPORT_SYMBOL(generic_splice_sendpage);
			pipe_wait_readable(pipe);
		}

		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		while (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t seg;

			if (!buf->len) {
				tail++;
				continue;
			}

			seg = min_t(size_t, remain, buf->len);
			seg = min_t(size_t, seg, PAGE_SIZE);

			ret = pipe_buf_confirm(pipe, buf);
			if (unlikely(ret)) {
				if (ret == -ENODATA)
					ret = 0;
				break;
			}

			bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
			remain -= seg;
			if (seg >= buf->len)
				tail++;
			if (bc >= ARRAY_SIZE(bvec))
				break;
		}

		if (!bc)
			break;

		msg.msg_flags = MSG_SPLICE_PAGES;
		if (flags & SPLICE_F_MORE)
			msg.msg_flags |= MSG_MORE;
		if (remain && pipe_occupancy(pipe->head, tail) > 0)
			msg.msg_flags |= MSG_MORE;

		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
			      len - remain);
		ret = sock_sendmsg(sock, &msg);
		if (ret <= 0)
			break;

		spliced += ret;
		len -= ret;
		tail = pipe->tail;
		while (ret > 0) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t seg = min_t(size_t, ret, buf->len);

			buf->offset += seg;
			buf->len -= seg;
			ret -= seg;

			if (!buf->len) {
				pipe_buf_release(pipe, buf);
				tail++;
			}
		}

		if (tail != pipe->tail) {
			pipe->tail = tail;
			if (pipe->files)
				need_wakeup = true;
		}
	}

out:
	pipe_unlock(pipe);
	if (need_wakeup)
		wakeup_pipe_writers(pipe);
	return spliced ?: ret;
}
#endif

static int warn_unsupported(struct file *file, const char *op)
{
@@ -873,6 +969,17 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
 * Indicate to the caller that there was a premature EOF when reading from the
 * source and the caller didn't indicate they would be sending more data after
 * this.
 */
static void do_splice_eof(struct splice_desc *sd)
{
	if (sd->splice_eof)
		sd->splice_eof(sd);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
@@ -956,13 +1063,17 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
	 */
	bytes = 0;
	len = sd->total_len;

	/* Don't block on output, we have to drain the direct pipe. */
	flags = sd->flags;
	sd->flags &= ~SPLICE_F_NONBLOCK;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 * We signal MORE until we've read sufficient data to fulfill the
	 * request and we keep signalling it if the caller set it.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;
	more = sd->flags & SPLICE_F_MORE;
	sd->flags |= SPLICE_F_MORE;

	WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));

@@ -972,20 +1083,18 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;
			goto read_failure;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * If more data is pending, set SPLICE_F_MORE
		 * If this is the last data and SPLICE_F_MORE was not set
		 * initially, clears it.
		 * If we now have sufficient data to fulfill the request then
		 * we clear SPLICE_F_MORE if it was not set initially.
		 */
		if (read_len < len)
			sd->flags |= SPLICE_F_MORE;
		else if (!more)
		if (read_len >= len && !more)
			sd->flags &= ~SPLICE_F_MORE;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
@@ -1012,6 +1121,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
	file_accessed(in);
	return bytes;

read_failure:
	/*
	 * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
	 * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
	 * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
	 * least 1 byte *then* we will also do the ->splice_eof() call.
	 */
	if (ret == 0 && !more && len > 0 && bytes)
		do_splice_eof(sd);
out_release:
	/*
	 * If we did an incomplete transfer we must release
@@ -1040,6 +1158,14 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
			      sd->flags);
}

static void direct_file_splice_eof(struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	if (file->f_op->splice_eof)
		file->f_op->splice_eof(file);
}

/**
 * do_splice_direct - splices data directly between two files
 * @in:		file to splice from
@@ -1065,6 +1191,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		.flags		= flags,
		.pos		= *ppos,
		.u.file		= out,
		.splice_eof	= direct_file_splice_eof,
		.opos		= opos,
	};
	long ret;
+1 −2
Original line number Diff line number Diff line
@@ -1796,6 +1796,7 @@ struct file_operations {
	int (*flock) (struct file *, int, struct file_lock *);
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
	void (*splice_eof)(struct file *file);
	int (*setlease)(struct file *, long, struct file_lock **, void **);
	long (*fallocate)(struct file *file, int mode, loff_t offset,
			  loff_t len);
@@ -2759,8 +2760,6 @@ extern ssize_t generic_file_splice_read(struct file *, loff_t *,
		struct pipe_inode_info *, size_t, unsigned int);
extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
		struct file *, loff_t *, size_t, unsigned int);
extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
		struct file *out, loff_t *, size_t len, unsigned int flags);
extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		loff_t *opos, size_t len, unsigned int flags);

Loading