Commit 4a85a6a3 authored by Chuck Lever's avatar Chuck Lever
Browse files

SUNRPC: Handle TCP socket sends with kernel_sendpage() again



Daire Byrne reports a ~50% aggregate throughput regression on his
Linux NFS server after commit da1661b9 ("SUNRPC: Teach server to
use xprt_sock_sendmsg for socket sends"), which replaced
kernel_sendpage() calls in NFSD's socket send path with calls to
sock_sendmsg() using iov_iter.

Investigation showed that tcp_sendmsg() was not using zero-copy to
send the xdr_buf's bvec pages, but instead was relying on memcpy.
This means copying every byte of a large NFS READ payload.

It looks like TLS sockets do indeed support a ->sendpage method,
so it's really not necessary to use xprt_sock_sendmsg() to support
TLS fully on the server. A mechanical reversion of da1661b9 is
not possible at this point, but we can re-implement the server's
TCP socket sendmsg path using kernel_sendpage().

Reported-by: Daire Byrne <daire@dneg.com>
BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=209439


Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
parent d6c9e436
Loading
Loading
Loading
Loading
+85 −1
Original line number Diff line number Diff line
@@ -1062,6 +1062,90 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
	return 0;	/* record not complete */
}

/*
 * Pass the single buffer described by @vec to kernel_sendpage(): the
 * page is the one containing vec->iov_base, and the offset is the
 * position of vec->iov_base within that page. @flags is forwarded
 * unchanged, and the result of kernel_sendpage() is returned as-is.
 */
static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec,
			      int flags)
{
	void *base = vec->iov_base;
	struct page *page = virt_to_page(base);
	unsigned long pgoff = offset_in_page(base);

	return kernel_sendpage(sock, page, pgoff, vec->iov_len, flags);
}

/*
 * Send one RPC record on @sock: the record marker is copied out with
 * kernel_sendmsg(), then the head, page and tail sections of @xdr are
 * handed to kernel_sendpage().
 *
 * kernel_sendpage() is used exclusively to reduce the number of
 * copy operations in this path. Therefore the caller must ensure
 * that the pages backing @xdr are unchanging.
 *
 * @msg is only used for sending the record marker; its msg_flags
 * field is overwritten with MSG_MORE.
 *
 * On return, *@sentp holds the number of bytes the socket accepted.
 *
 * Returns a negative errno if bvec allocation or any send call fails,
 * -EAGAIN if the record marker was only partially sent, and zero
 * otherwise. Zero is also returned when the head or a page chunk is
 * sent short, so the caller must compare *@sentp against the expected
 * record length to detect a short send.
 */
static int svc_tcp_sendmsg(struct socket *sock, struct msghdr *msg,
			   struct xdr_buf *xdr, rpc_fraghdr marker,
			   unsigned int *sentp)
{
	const struct kvec *head = xdr->head;
	const struct kvec *tail = xdr->tail;
	struct kvec rm = {
		.iov_base	= &marker,
		.iov_len	= sizeof(marker),
	};
	int flags, ret;

	*sentp = 0;

	/* Without a populated xdr->bvec the page loop below cannot run. */
	ret = xdr_alloc_bvec(xdr, GFP_KERNEL);
	if (ret < 0)
		return ret;

	/* The record marker lives on the stack, so it is copied, not paged. */
	msg->msg_flags = MSG_MORE;
	ret = kernel_sendmsg(sock, msg, &rm, 1, rm.iov_len);
	if (ret < 0)
		return ret;
	*sentp += ret;
	if (ret != rm.iov_len)
		return -EAGAIN;

	/* More data follows the head unless the head is the whole message. */
	flags = head->iov_len < xdr->len ? MSG_MORE | MSG_SENDPAGE_NOTLAST : 0;
	ret = svc_tcp_send_kvec(sock, head, flags);
	if (ret < 0)
		return ret;
	*sentp += ret;
	if (ret != head->iov_len)
		goto out;

	if (xdr->page_len) {
		unsigned int offset, len, remaining;
		struct bio_vec *bvec;

		bvec = xdr->bvec;
		offset = xdr->page_base;
		remaining = xdr->page_len;

		/*
		 * Skip any bvec that lies entirely before page_base.
		 * NOTE(review): assumes xdr->bvec covers page_base +
		 * page_len bytes with non-empty entries -- confirm
		 * against xdr_alloc_bvec().
		 */
		while (offset >= bvec->bv_len) {
			offset -= bvec->bv_len;
			bvec++;
		}

		while (remaining > 0) {
			/* Never send past the end of the current bvec. */
			len = min(remaining, bvec->bv_len - offset);

			/* Only the final chunk of the message clears MSG_MORE. */
			flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
			if (len == remaining && tail->iov_len == 0)
				flags = 0;

			ret = kernel_sendpage(sock, bvec->bv_page,
					      bvec->bv_offset + offset,
					      len, flags);
			if (ret < 0)
				return ret;
			*sentp += ret;
			if (ret != len)
				goto out;
			remaining -= len;
			offset = 0;
			bvec++;
		}
	}

	if (tail->iov_len) {
		ret = svc_tcp_send_kvec(sock, tail, 0);
		if (ret < 0)
			return ret;
		*sentp += ret;
	}

out:
	return 0;
}

/**
 * svc_tcp_sendto - Send out a reply on a TCP socket
 * @rqstp: completed svc_rqst
@@ -1089,7 +1173,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
	mutex_lock(&xprt->xpt_mutex);
	if (svc_xprt_is_dead(xprt))
		goto out_notconn;
	err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent);
	err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker, &sent);
	xdr_free_bvec(xdr);
	trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
	if (err < 0 || sent != (xdr->len + sizeof(marker)))