Commit 5a8c8b72 authored by David S. Miller
Browse files

Merge branch 'vsock-sockmap-support'



Bobby Eshleman says:

====================
Add support for sockmap to vsock.

We're testing usage of vsock as a way to redirect guest-local UDS
requests to the host and this patch series greatly improves the
performance of such a setup.

Compared to copying packets via userspace, this improves throughput by
121% in basic testing.

Tested as follows.

Setup: guest unix dgram sender -> guest vsock redirector -> host vsock
       server
Threads: 1
Payload: 64k
No sockmap:
- 76.3 MB/s
- The guest vsock redirector was
  "socat VSOCK-CONNECT:2:1234 UNIX-RECV:/path/to/sock"
Using sockmap (this patch):
- 168.8 MB/s (+121%)
- The guest redirector was a simple sockmap echo server,
  redirecting unix ingress to vsock 2:1234 egress.
- Same sender and server programs

*Note: these numbers are from RFC v1

Only the virtio transport has been tested. The loopback transport was
used in writing bpf/selftests, but not thoroughly tested otherwise.

This series requires the skb patch.

Changes in v4:
- af_vsock: fix parameter alignment in vsock_dgram_recvmsg()
- af_vsock: add TCP_ESTABLISHED comment in vsock_dgram_connect()
- vsock/bpf: change ret type to bool

Changes in v3:
- vsock/bpf: Refactor wait logic in vsock_bpf_recvmsg() to avoid
  backwards goto
- vsock/bpf: Check psock before acquiring slock
- vsock/bpf: Return bool instead of int of 0 or 1
- vsock/bpf: Wrap macro args __sk/__psock in parens
- vsock/bpf: Place comment trailer */ on separate line

Changes in v2:
- vsock/bpf: rename vsock_dgram_* -> vsock_*
- vsock/bpf: change sk_psock_{get,put} and {lock,release}_sock() order
  to minimize slock hold time
- vsock/bpf: use "new style" wait
- vsock/bpf: fix bug in wait logic
- vsock/bpf: add check that recvmsg sk_type is one of dgram, seqpacket,
  or stream.  Return an error if it is not one of the three.
- virtio/vsock: comment __skb_recv_datagram() usage
- virtio/vsock: do not init copied in read_skb()
- vsock/bpf: add ifdef guard around struct proto in dgram_recvmsg()
- selftests/bpf: add vsock loopback config for aarch64
- selftests/bpf: add vsock loopback config for s390x
- selftests/bpf: remove vsock device from vmtest.sh qemu machine
- selftests/bpf: remove CONFIG_VIRTIO_VSOCKETS=y from config.x86_64
- vsock/bpf: move transport-related (e.g., if (!vsk->transport)) checks
  out of fast path
====================

Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 24265c2c d61bd8c1
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -439,6 +439,7 @@ static struct virtio_transport vhost_transport = {
		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
		.notify_buffer_size       = virtio_transport_notify_buffer_size,

		.read_skb = virtio_transport_read_skb,
	},

	.send_pkt = vhost_transport_send_pkt,
+1 −0
Original line number Diff line number Diff line
@@ -245,4 +245,5 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor);
#endif /* _LINUX_VIRTIO_VSOCK_H */
+17 −0
Original line number Diff line number Diff line
@@ -75,6 +75,7 @@ struct vsock_sock {
	void *trans;
};

s64 vsock_connectible_has_data(struct vsock_sock *vsk);
s64 vsock_stream_has_data(struct vsock_sock *vsk);
s64 vsock_stream_has_space(struct vsock_sock *vsk);
struct sock *vsock_create_connected(struct sock *parent);
@@ -173,6 +174,9 @@ struct vsock_transport {

	/* Addressing. */
	u32 (*get_local_cid)(void);

	/* Read a single skb */
	int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
};

/**** CORE ****/
@@ -225,5 +229,18 @@ int vsock_init_tap(void);
int vsock_add_tap(struct vsock_tap *vt);
int vsock_remove_tap(struct vsock_tap *vt);
void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque);
int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
			      int flags);
int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			size_t len, int flags);

#ifdef CONFIG_BPF_SYSCALL
/* vsock's base proto. Exposed so that sk->sk_prot can be compared against
 * &vsock_proto (the recvmsg paths in af_vsock.c do this).
 */
extern struct proto vsock_proto;
/* Installed as vsock_proto.psock_update_sk_prot in af_vsock.c. */
int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
/* Called once from vsock_init(). */
void __init vsock_bpf_build_proto(void);
#else
/* No-op stub so that vsock_init() can call vsock_bpf_build_proto()
 * unconditionally when CONFIG_BPF_SYSCALL is not set.
 */
static inline void __init vsock_bpf_build_proto(void)
{}
#endif

#endif /* __AF_VSOCK_H__ */
+1 −0
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o
obj-$(CONFIG_VSOCKETS_LOOPBACK) += vsock_loopback.o

vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o
vsock-$(CONFIG_BPF_SYSCALL) += vsock_bpf.o

vsock_diag-y += diag.o

+58 −6
Original line number Diff line number Diff line
@@ -116,10 +116,13 @@ static void vsock_sk_destruct(struct sock *sk);
static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);

/* Protocol family. */
/* Not static: af_vsock.h declares it extern under CONFIG_BPF_SYSCALL, and the
 * recvmsg paths compare sk->sk_prot against &vsock_proto to tell whether the
 * socket's proto has been replaced.
 */
static struct proto vsock_proto = {
struct proto vsock_proto = {
	.name = "AF_VSOCK",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct vsock_sock),
#ifdef CONFIG_BPF_SYSCALL
	/* Hook taking (sk, psock, restore); presumably invoked by sockmap to
	 * install or restore the socket's proto -- confirm against
	 * vsock_bpf.c.
	 */
	.psock_update_sk_prot = vsock_bpf_update_proto,
#endif
};

/* The default peer timeout indicates how long we will wait for a peer response
@@ -865,7 +868,7 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk)
}
EXPORT_SYMBOL_GPL(vsock_stream_has_data);

static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
s64 vsock_connectible_has_data(struct vsock_sock *vsk)
{
	struct sock *sk = sk_vsock(vsk);

@@ -874,6 +877,7 @@ static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
	else
		return vsock_stream_has_data(vsk);
}
EXPORT_SYMBOL_GPL(vsock_connectible_has_data);

s64 vsock_stream_has_space(struct vsock_sock *vsk)
{
@@ -1131,6 +1135,13 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
	return mask;
}

/* proto_ops->read_skb handler shared by the vsock dgram, stream and seqpacket
 * ops tables: forwards the request to the read_skb() callback of the
 * transport assigned to the socket.
 *
 * Returns the transport's read_skb() result, or -ENODEV when the socket has
 * no transport assigned.
 */
static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* Do not dereference a NULL transport.
	 * NOTE(review): confirm which paths can reach here with no transport
	 * assigned (e.g. a socket whose transport was never set or was
	 * released).
	 */
	if (!vsk->transport)
		return -ENODEV;

	return vsk->transport->read_skb(vsk, read_actor);
}

static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
@@ -1242,18 +1253,42 @@ static int vsock_dgram_connect(struct socket *sock,
	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
	sock->state = SS_CONNECTED;

	/* sock map disallows redirection of non-TCP sockets with sk_state !=
	 * TCP_ESTABLISHED (see sock_map_redirect_allowed()), so we set
	 * TCP_ESTABLISHED here to allow redirection of connected vsock dgrams.
	 *
	 * This doesn't seem to be an abnormal state for datagram sockets, as
	 * the same approach can be seen in other datagram socket types as well
	 * (such as unix sockets).
	 */
	sk->sk_state = TCP_ESTABLISHED;

out:
	release_sock(sk);
	return err;
}

/* recvmsg handler for vsock datagram sockets (see vsock_dgram_ops).
 *
 * With CONFIG_BPF_SYSCALL, if the socket's sk_prot is no longer &vsock_proto
 * the call is handed to that proto's recvmsg(); otherwise the datagram is
 * dequeued by the assigned transport.
 *
 * Returns whatever the selected recvmsg()/dgram_dequeue() callback returns.
 */
static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			size_t len, int flags)
{
	struct vsock_sock *vsk = vsock_sk(sock->sk);
#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot;
#endif
	struct vsock_sock *vsk;
	struct sock *sk;

	sk = sock->sk;
	vsk = vsock_sk(sk);

#ifdef CONFIG_BPF_SYSCALL
	/* Snapshot sk_prot once so the comparison and the call below use the
	 * same pointer; presumably it can be swapped concurrently -- TODO
	 * confirm.
	 */
	prot = READ_ONCE(sk->sk_prot);
	if (prot != &vsock_proto)
		return prot->recvmsg(sk, msg, len, flags, NULL);
#endif

	/* NOTE(review): vsk->transport is dereferenced without a NULL check. */
	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
}
EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);

static const struct proto_ops vsock_dgram_ops = {
	.family = PF_VSOCK,
@@ -1272,6 +1307,7 @@ static const struct proto_ops vsock_dgram_ops = {
	.recvmsg = vsock_dgram_recvmsg,
	.mmap = sock_no_mmap,
	.sendpage = sock_no_sendpage,
	.read_skb = vsock_read_skb,
};

static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
@@ -2086,13 +2122,16 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
	return err;
}

static int
int
vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
			  int flags)
{
	struct sock *sk;
	struct vsock_sock *vsk;
	const struct vsock_transport *transport;
#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot;
#endif
	int err;

	sk = sock->sk;
@@ -2139,6 +2178,14 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		goto out;
	}

#ifdef CONFIG_BPF_SYSCALL
	prot = READ_ONCE(sk->sk_prot);
	if (prot != &vsock_proto) {
		release_sock(sk);
		return prot->recvmsg(sk, msg, len, flags, NULL);
	}
#endif

	if (sk->sk_type == SOCK_STREAM)
		err = __vsock_stream_recvmsg(sk, msg, len, flags);
	else
@@ -2148,6 +2195,7 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
	release_sock(sk);
	return err;
}
EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg);

static int vsock_set_rcvlowat(struct sock *sk, int val)
{
@@ -2188,6 +2236,7 @@ static const struct proto_ops vsock_stream_ops = {
	.mmap = sock_no_mmap,
	.sendpage = sock_no_sendpage,
	.set_rcvlowat = vsock_set_rcvlowat,
	.read_skb = vsock_read_skb,
};

static const struct proto_ops vsock_seqpacket_ops = {
@@ -2209,6 +2258,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
	.recvmsg = vsock_connectible_recvmsg,
	.mmap = sock_no_mmap,
	.sendpage = sock_no_sendpage,
	.read_skb = vsock_read_skb,
};

static int vsock_create(struct net *net, struct socket *sock,
@@ -2348,6 +2398,8 @@ static int __init vsock_init(void)
		goto err_unregister_proto;
	}

	vsock_bpf_build_proto();

	return 0;

err_unregister_proto:
Loading