Commit 9cacf81f authored by Stanislav Fomichev's avatar Stanislav Fomichev Committed by Alexei Starovoitov
Browse files

bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE



Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.
We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom
call in do_tcp_getsockopt using the on-stack data. This removes
3% overhead for locking/unlocking the socket.

Without this patch:
     3.38%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
            |
             --3.30%--__cgroup_bpf_run_filter_getsockopt
                       |
                        --0.81%--__kmalloc

With the patch applied:
     0.52%     0.12%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt_kern

Note, exporting uapi/tcp.h requires removing netinet/tcp.h
from test_progs.h because those headers have confliciting
definitions.

Signed-off-by: default avatarStanislav Fomichev <sdf@google.com>
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
Acked-by: default avatarMartin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210115163501.805133-2-sdf@google.com
parent 13ca51d5
Loading
Loading
Loading
Loading
+23 −4
Original line number Diff line number Diff line
@@ -147,6 +147,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
				       int __user *optlen, int max_optlen,
				       int retval);

int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
					    int optname, void *optval,
					    int *optlen, int retval);

static inline enum bpf_cgroup_storage_type cgroup_storage_type(
	struct bpf_map *map)
{
@@ -364,10 +368,23 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
({									       \
	int __ret = retval;						       \
	if (cgroup_bpf_enabled)						       \
		__ret = __cgroup_bpf_run_filter_getsockopt(sock, level,	       \
							   optname, optval,    \
							   optlen, max_optlen, \
							   retval);	       \
		if (!(sock)->sk_prot->bpf_bypass_getsockopt ||		       \
		    !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \
					tcp_bpf_bypass_getsockopt,	       \
					level, optname))		       \
			__ret = __cgroup_bpf_run_filter_getsockopt(	       \
				sock, level, optname, optval, optlen,	       \
				max_optlen, retval);			       \
	__ret;								       \
})

#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval,      \
					    optlen, retval)		       \
({									       \
	int __ret = retval;						       \
	if (cgroup_bpf_enabled)						       \
		__ret = __cgroup_bpf_run_filter_getsockopt_kern(	       \
			sock, level, optname, optval, optlen, retval);	       \
	__ret;								       \
})

@@ -452,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
				       optlen, max_optlen, retval) ({ retval; })
#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \
					    optlen, retval) ({ retval; })
#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
				       kernel_optval) ({ 0; })

+6 −0
Original line number Diff line number Diff line
@@ -60,4 +60,10 @@
#define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__)
#endif

#if IS_ENABLED(CONFIG_INET)
#define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
#else
#define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__)
#endif

#endif
+2 −0
Original line number Diff line number Diff line
@@ -1174,6 +1174,8 @@ struct proto {

	int			(*backlog_rcv) (struct sock *sk,
						struct sk_buff *skb);
	bool			(*bpf_bypass_getsockopt)(int level,
							 int optname);

	void		(*release_cb)(struct sock *sk);

+1 −0
Original line number Diff line number Diff line
@@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock,
		      struct poll_table_struct *wait);
int tcp_getsockopt(struct sock *sk, int level, int optname,
		   char __user *optval, int __user *optlen);
bool tcp_bpf_bypass_getsockopt(int level, int optname);
int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
		   unsigned int optlen);
void tcp_set_keepalive(struct sock *sk, int val);
+46 −0
Original line number Diff line number Diff line
@@ -1486,6 +1486,52 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
	sockopt_free_buf(&ctx);
	return ret;
}

int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
					    int optname, void *optval,
					    int *optlen, int retval)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_sockopt_kern ctx = {
		.sk = sk,
		.level = level,
		.optname = optname,
		.retval = retval,
		.optlen = *optlen,
		.optval = optval,
		.optval_end = optval + *optlen,
	};
	int ret;

	/* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
	 * user data back into BPF buffer when reval != 0. This is
	 * done as an optimization to avoid extra copy, assuming
	 * kernel won't populate the data in case of an error.
	 * Here we always pass the data and memset() should
	 * be called if that data shouldn't be "exported".
	 */

	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
				 &ctx, BPF_PROG_RUN);
	if (!ret)
		return -EPERM;

	if (ctx.optlen > *optlen)
		return -EFAULT;

	/* BPF programs only allowed to set retval to 0, not some
	 * arbitrary value.
	 */
	if (ctx.retval != 0 && ctx.retval != retval)
		return -EFAULT;

	/* BPF programs can shrink the buffer, export the modifications.
	 */
	if (ctx.optlen != 0)
		*optlen = ctx.optlen;

	return ctx.retval;
}
#endif

static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
Loading