Commit 965b57b4 authored by Cong Wang, committed by Daniel Borkmann
Browse files

net: Introduce a new proto_ops ->read_skb()



Currently both splice() and sockmap use ->read_sock() to
read skb from receive queue, but for sockmap we only read
one entire skb at a time, so ->read_sock() is too conservative
to use. Introduce a new proto_ops ->read_skb() which supports
these semantics; with this we can finally pass the ownership of
the skb to recv actors.

For non-TCP protocols, all ->read_sock() can be simply
converted to ->read_skb().

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220615162014.89193-3-xiyou.wangcong@gmail.com
parent 04919bed
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -152,6 +152,8 @@ struct module;
struct sk_buff;
typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
			       unsigned int, size_t);
typedef int (*skb_read_actor_t)(struct sock *, struct sk_buff *);


struct proto_ops {
	int		family;
@@ -214,6 +216,8 @@ struct proto_ops {
	 */
	int		(*read_sock)(struct sock *sk, read_descriptor_t *desc,
				     sk_read_actor_t recv_actor);
	/* This is different from read_sock(), it reads an entire skb at a time. */
	int		(*read_skb)(struct sock *sk, skb_read_actor_t recv_actor);
	int		(*sendpage_locked)(struct sock *sk, struct page *page,
					   int offset, size_t size, int flags);
	int		(*sendmsg_locked)(struct sock *sk, struct msghdr *msg,
+1 −2
Original line number Diff line number Diff line
@@ -672,8 +672,7 @@ void tcp_get_info(struct sock *, struct tcp_info *);
/* Read 'sendfile()'-style from a TCP socket */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor);
int tcp_read_skb(struct sock *sk, read_descriptor_t *desc,
		 sk_read_actor_t recv_actor);
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);

void tcp_initialize_rcv_mss(struct sock *sk);

+1 −2
Original line number Diff line number Diff line
@@ -306,8 +306,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
			       struct sk_buff *skb);
struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
				 __be16 sport, __be16 dport);
int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor);
int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);

/* UDP uses skb->dev_scratch to cache as much information as possible and avoid
 * possibly multiple cache miss on dequeue()
+5 −15
Original line number Diff line number Diff line
@@ -1160,21 +1160,17 @@ static void sk_psock_done_strp(struct sk_psock *psock)
}
#endif /* CONFIG_BPF_STREAM_PARSER */

static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
				 unsigned int offset, size_t orig_len)
static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *sk = (struct sock *)desc->arg.data;
	struct sk_psock *psock;
	struct bpf_prog *prog;
	int ret = __SK_DROP;
	int len = orig_len;
	int len = skb->len;

	/* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */
	skb = skb_clone(skb, GFP_ATOMIC);
	if (!skb) {
		desc->error = -ENOMEM;
	if (!skb)
		return 0;
	}

	rcu_read_lock();
	psock = sk_psock(sk);
@@ -1204,16 +1200,10 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
static void sk_psock_verdict_data_ready(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;
	read_descriptor_t desc;

	if (unlikely(!sock || !sock->ops || !sock->ops->read_sock))
	if (unlikely(!sock || !sock->ops || !sock->ops->read_skb))
		return;

	desc.arg.data = sk;
	desc.error = 0;
	desc.count = 1;

	sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
	sock->ops->read_skb(sk, sk_psock_verdict_recv);
}

void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
+2 −1
Original line number Diff line number Diff line
@@ -1040,6 +1040,7 @@ const struct proto_ops inet_stream_ops = {
	.sendpage	   = inet_sendpage,
	.splice_read	   = tcp_splice_read,
	.read_sock	   = tcp_read_sock,
	.read_skb	   = tcp_read_skb,
	.sendmsg_locked    = tcp_sendmsg_locked,
	.sendpage_locked   = tcp_sendpage_locked,
	.peek_len	   = tcp_peek_len,
@@ -1067,7 +1068,7 @@ const struct proto_ops inet_dgram_ops = {
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.read_sock	   = udp_read_sock,
	.read_skb	   = udp_read_skb,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
Loading