Commit 2bab48c5 authored by Alexei Starovoitov's avatar Alexei Starovoitov
Browse files

Merge branch 'improve-bpf-tcp-cc-init'



Neal Cardwell says:

====================
This patch series reorganizes TCP congestion control initialization so that if
EBPF code called by tcp_init_transfer() sets the congestion control algorithm
by calling setsockopt(TCP_CONGESTION) then the TCP stack initializes the
congestion control module immediately, instead of having tcp_init_transfer()
later initialize the congestion control module.

This increases flexibility for the EBPF code that runs at connection
establishment time, and simplifies the code.

This has the following benefits:

(1) This allows CC module customizations made by the EBPF called in
    tcp_init_transfer() to persist, and not be wiped out by a later
    call to tcp_init_congestion_control() in tcp_init_transfer().

(2) Does not flip the order of EBPF and CC init, to avoid causing bugs
    for existing code upstream that depends on the current order.

(3) Does not cause 2 initializations for for CC in the case where the
    EBPF called in tcp_init_transfer() wants to set the CC to a new CC
    algorithm.

(4) Allows follow-on simplifications to the code in net/core/filter.c
    and net/ipv4/tcp_cong.c, which currently both have some complexity
    to special-case CC initialization to avoid double CC
    initialization if EBPF sets the CC.

changes in v2:

o rebase onto bpf-next

o add another follow-on simplification suggested by Martin KaFai Lau:
   "tcp: simplify tcp_set_congestion_control() load=false case"

changes in v3:

o no change in commits

o resent patch series from @gmail.com, since mail from ncardwell@google.com
  stopped being accepted at netdev@vger.kernel.org mid-way through processing
  the v2 patch series (between patches 2 and 3), confusing patchwork about
  which patches belonged to the v2 patch series
====================

Acked-by: default avatarMartin KaFai Lau <kafai@fb.com>
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 18841da9 5050bef8
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -96,7 +96,8 @@ struct inet_connection_sock {
	void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
	struct hlist_node         icsk_listen_portaddr_node;
	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
	__u8			  icsk_ca_state:6,
	__u8			  icsk_ca_state:5,
				  icsk_ca_initialized:1,
				  icsk_ca_setsockopt:1,
				  icsk_ca_dst_locked:1;
	__u8			  icsk_retransmits;
+1 −1
Original line number Diff line number Diff line
@@ -1104,7 +1104,7 @@ void tcp_get_available_congestion_control(char *buf, size_t len);
void tcp_get_allowed_congestion_control(char *buf, size_t len);
int tcp_set_allowed_congestion_control(char *allowed);
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
			       bool reinit, bool cap_net_admin);
			       bool cap_net_admin);
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);

+4 −14
Original line number Diff line number Diff line
@@ -4313,10 +4313,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
	.arg1_type      = ARG_PTR_TO_CTX,
};

#define SOCKOPT_CC_REINIT (1 << 0)

static int _bpf_setsockopt(struct sock *sk, int level, int optname,
			   char *optval, int optlen, u32 flags)
			   char *optval, int optlen)
{
	char devname[IFNAMSIZ];
	int val, valbool;
@@ -4449,13 +4447,11 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
		   sk->sk_prot->setsockopt == tcp_setsockopt) {
		if (optname == TCP_CONGESTION) {
			char name[TCP_CA_NAME_MAX];
			bool reinit = flags & SOCKOPT_CC_REINIT;

			strncpy(name, optval, min_t(long, optlen,
						    TCP_CA_NAME_MAX-1));
			name[TCP_CA_NAME_MAX-1] = 0;
			ret = tcp_set_congestion_control(sk, name, false,
							 reinit, true);
			ret = tcp_set_congestion_control(sk, name, false, true);
		} else {
			struct inet_connection_sock *icsk = inet_csk(sk);
			struct tcp_sock *tp = tcp_sk(sk);
@@ -4615,9 +4611,7 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
	   int, level, int, optname, char *, optval, int, optlen)
{
	u32 flags = 0;
	return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen,
			       flags);
	return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen);
}

static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
@@ -4651,11 +4645,7 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
	   int, level, int, optname, char *, optval, int, optlen)
{
	u32 flags = 0;
	if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
		flags |= SOCKOPT_CC_REINIT;
	return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen,
			       flags);
	return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
}

static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
+2 −1
Original line number Diff line number Diff line
@@ -2698,6 +2698,7 @@ int tcp_disconnect(struct sock *sk, int flags)
	if (icsk->icsk_ca_ops->release)
		icsk->icsk_ca_ops->release(sk);
	memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
	icsk->icsk_ca_initialized = 0;
	tcp_set_ca_state(sk, TCP_CA_Open);
	tp->is_sack_reneg = 0;
	tcp_clear_retrans(tp);
@@ -3049,7 +3050,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
		name[val] = 0;

		lock_sock(sk);
		err = tcp_set_congestion_control(sk, name, true, true,
		err = tcp_set_congestion_control(sk, name, true,
						 ns_capable(sock_net(sk)->user_ns,
							    CAP_NET_ADMIN));
		release_sock(sk);
+7 −20
Original line number Diff line number Diff line
@@ -176,7 +176,7 @@ void tcp_assign_congestion_control(struct sock *sk)

void tcp_init_congestion_control(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_sk(sk)->prior_ssthresh = 0;
	if (icsk->icsk_ca_ops->init)
@@ -185,6 +185,7 @@ void tcp_init_congestion_control(struct sock *sk)
		INET_ECN_xmit(sk);
	else
		INET_ECN_dontxmit(sk);
	icsk->icsk_ca_initialized = 1;
}

static void tcp_reinit_congestion_control(struct sock *sk,
@@ -340,7 +341,7 @@ int tcp_set_allowed_congestion_control(char *val)
 * already initialized.
 */
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
			       bool reinit, bool cap_net_admin)
			       bool cap_net_admin)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_congestion_ops *ca;
@@ -361,28 +362,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
		goto out;
	}

	if (!ca) {
	if (!ca)
		err = -ENOENT;
	} else if (!load) {
		const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops;

		if (bpf_try_module_get(ca, ca->owner)) {
			if (reinit) {
				tcp_reinit_congestion_control(sk, ca);
			} else {
				icsk->icsk_ca_ops = ca;
				bpf_module_put(old_ca, old_ca->owner);
			}
		} else {
			err = -EBUSY;
		}
	} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) {
	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin))
		err = -EPERM;
	} else if (!bpf_try_module_get(ca, ca->owner)) {
	else if (!bpf_try_module_get(ca, ca->owner))
		err = -EBUSY;
	} else {
	else
		tcp_reinit_congestion_control(sk, ca);
	}
 out:
	rcu_read_unlock();
	return err;
Loading