Commit 5ee35abb authored by Alexei Starovoitov

Merge branch 'bpf: Remove recursion check for struct_ops prog'

Martin KaFai Lau says:

====================

From: Martin KaFai Lau <martin.lau@kernel.org>

The struct_ops progs share the tracing trampoline's enter/exit
functions, which track prog->active to avoid recursion.  It turns
out a struct_ops bpf prog can trip this prog->active check and be
unnecessarily skipped.  e.g. '.ssthresh' may run in_task() and then
be interrupted by a softirq that runs the same '.ssthresh'.

The kernel does not call the tcp-cc's ops recursively, so this set
removes the recursion check for struct_ops progs.
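
For reference, the shared enter handler that causes the skip looks
roughly like this (a simplified sketch of the tracing trampoline's
enter path, not the exact kernel source):

	u64 notrace __bpf_prog_enter(struct bpf_prog *prog,
				     struct bpf_tramp_run_ctx *run_ctx)
	{
		rcu_read_lock();
		migrate_disable();

		run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);

		/* A struct_ops prog interrupted by a softirq running the
		 * same prog trips this per-CPU counter and gets skipped,
		 * even though the kernel never calls the ops recursively.
		 */
		if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
			bpf_prog_inc_misses_counter(prog);
			return 0;	/* 0 tells the trampoline to skip the prog */
		}
		return bpf_prog_start_time();
	}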

v3:
- Clear bpf_chg_cc_inprogress in the newly cloned tcp_sock
  in tcp_create_openreq_child() because the listen sk can
  be cloned without the lock being held. (Eric Dumazet)
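
  Roughly, that v3 change amounts to the following (an abbreviated
  sketch of the tcp_create_openreq_child() hunk in Patch 4, not the
  exact diff):

	struct sock *tcp_create_openreq_child(const struct sock *sk,
					      struct request_sock *req,
					      struct sk_buff *skb)
	{
		struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);

		if (newsk) {
			struct tcp_sock *newtp = tcp_sk(newsk);

			/* ... existing init of the cloned fields ... */

			/* The listen sk can be cloned while another
			 * thread is in the middle of
			 * bpf_setsockopt(TCP_CONGESTION) on it, so the
			 * child must not inherit the bit.
			 */
			newtp->bpf_chg_cc_inprogress = 0;
		}
		return newsk;
	}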

v2:
- v1 [0] turned into a long discussion on a few cases and also
  whether it needs to follow the bpf_run_ctx chain if there is
  tracing bpf_run_ctx (kprobe/trace/trampoline) running in between.

  It is a good signal that the behavior is not obvious enough to
  reason about, so this revision trades it for a more
  straightforward approach.

  This revision uses one bit out of an existing 1-byte hole
  in the tcp_sock.  It is in Patch 4.

  [0]: https://lore.kernel.org/bpf/20220922225616.3054840-1-kafai@fb.com/T/#md98d40ac5ec295fdadef476c227a3401b2b6b911


====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 8526f0d6 3411c5b6
arch/x86/net/bpf_jit_comp.c  +3 −0
@@ -1836,6 +1836,9 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	if (p->aux->sleepable) {
 		enter = __bpf_prog_enter_sleepable;
 		exit = __bpf_prog_exit_sleepable;
+	} else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) {
+		enter = __bpf_prog_enter_struct_ops;
+		exit = __bpf_prog_exit_struct_ops;
 	} else if (p->expected_attach_type == BPF_LSM_CGROUP) {
 		enter = __bpf_prog_enter_lsm_cgroup;
 		exit = __bpf_prog_exit_lsm_cgroup;
include/linux/bpf.h  +4 −0
@@ -864,6 +864,10 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
 					struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
 					struct bpf_tramp_run_ctx *run_ctx);
+u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
+					struct bpf_tramp_run_ctx *run_ctx);
+void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start,
+					struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
 void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
 
include/linux/tcp.h  +6 −0
@@ -388,6 +388,12 @@ struct tcp_sock {
 	u8	bpf_sock_ops_cb_flags;  /* Control calling BPF programs
 					 * values defined in uapi/linux/tcp.h
 					 */
+	u8	bpf_chg_cc_inprogress:1; /* Set while in the middle of
+					  * bpf_setsockopt(TCP_CONGESTION),
+					  * to stop bpf_tcp_cc->init() from
+					  * recurring on itself by calling
+					  * bpf_setsockopt(TCP_CONGESTION, "itself").
+					  */
 #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
 #else
 #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
kernel/bpf/trampoline.c  +23 −0
@@ -964,6 +964,29 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
 	rcu_read_unlock_trace();
 }
 
+u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
+					struct bpf_tramp_run_ctx *run_ctx)
+	__acquires(RCU)
+{
+	rcu_read_lock();
+	migrate_disable();
+
+	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+	return bpf_prog_start_time();
+}
+
+void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start,
+					struct bpf_tramp_run_ctx *run_ctx)
+	__releases(RCU)
+{
+	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+	update_prog_stats(prog, start);
+	migrate_enable();
+	rcu_read_unlock();
+}
+
 void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
 {
 	percpu_ref_get(&tr->pcref);
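
Conceptually, the trampoline emitted by invoke_bpf_prog() (first hunk
above) now wraps a struct_ops prog like this (a C rendering of the
generated code, heavily simplified):

	struct bpf_tramp_run_ctx run_ctx;
	u64 start;

	start = __bpf_prog_enter_struct_ops(prog, &run_ctx);
	if (start)	/* enter returning 0 would skip the prog; the
			 * struct_ops variant never returns 0 because
			 * it has no prog->active check.
			 */
		prog->bpf_func(args, prog->insnsi);
	__bpf_prog_exit_struct_ops(prog, start, &run_ctx);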
net/core/filter.c  +54 −16
@@ -5102,6 +5102,59 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
 	return 0;
 }
 
+static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
+				      int *optlen, bool getopt)
+{
+	struct tcp_sock *tp;
+	int ret;
+
+	if (*optlen < 2)
+		return -EINVAL;
+
+	if (getopt) {
+		if (!inet_csk(sk)->icsk_ca_ops)
+			return -EINVAL;
+		/* BPF expects NULL-terminated tcp-cc string */
+		optval[--(*optlen)] = '\0';
+		return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
+					 KERNEL_SOCKPTR(optval),
+					 KERNEL_SOCKPTR(optlen));
+	}
+
+	/* "cdg" is the only cc that allocates a ptr
+	 * in the inet_csk_ca area.  The bpf-tcp-cc may
+	 * overwrite this ptr after switching to cdg.
+	 */
+	if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
+		return -ENOTSUPP;
+
+	/* This stops the following loop:
+	 *
+	 * .init => bpf_setsockopt(tcp_cc) => .init =>
+	 * bpf_setsockopt(tcp_cc) => .init => ....
+	 *
+	 * The second bpf_setsockopt(tcp_cc) is not allowed,
+	 * in order to break the loop when both .init
+	 * are the same bpf prog.
+	 *
+	 * This applies even if the second bpf_setsockopt(tcp_cc)
+	 * does not cause a loop.  It means only the first
+	 * '.init' can call bpf_setsockopt(TCP_CONGESTION) to
+	 * pick a fallback cc (eg. the peer does not support ECN)
+	 * and the second '.init' cannot fall back to
+	 * another cc.
+	 */
+	tp = tcp_sk(sk);
+	if (tp->bpf_chg_cc_inprogress)
+		return -EBUSY;
+
+	tp->bpf_chg_cc_inprogress = 1;
+	ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+				KERNEL_SOCKPTR(optval), *optlen);
+	tp->bpf_chg_cc_inprogress = 0;
+	return ret;
+}
+
 static int sol_tcp_sockopt(struct sock *sk, int optname,
 			   char *optval, int *optlen,
 			   bool getopt)
@@ -5125,9 +5178,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
 			return -EINVAL;
 		break;
 	case TCP_CONGESTION:
-		if (*optlen < 2)
-			return -EINVAL;
-		break;
+		return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
 	case TCP_SAVED_SYN:
 		if (*optlen < 1)
 			return -EINVAL;
@@ -5152,13 +5203,6 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
 			return 0;
 		}
 
-		if (optname == TCP_CONGESTION) {
-			if (!inet_csk(sk)->icsk_ca_ops)
-				return -EINVAL;
-			/* BPF expects NULL-terminated tcp-cc string */
-			optval[--(*optlen)] = '\0';
-		}
-
 		return do_tcp_getsockopt(sk, SOL_TCP, optname,
 					 KERNEL_SOCKPTR(optval),
 					 KERNEL_SOCKPTR(optlen));
@@ -5285,12 +5329,6 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
 BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
 	   int, optname, char *, optval, int, optlen)
 {
-	if (level == SOL_TCP && optname == TCP_CONGESTION) {
-		if (optlen >= sizeof("cdg") - 1 &&
-		    !strncmp("cdg", optval, optlen))
-			return -ENOTSUPP;
-	}
-
 	return _bpf_setsockopt(sk, level, optname, optval, optlen);
 }
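
With the set applied, the fallback flow described in the comment in
sol_tcp_sockopt_congestion() can be exercised from a bpf-tcp-cc's
'.init'.  A minimal sketch (modeled loosely on the bpf_dctcp selftest;
want_dctcp() and the cc names are illustrative, and the struct_ops map
registration is omitted):

	#include <vmlinux.h>
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	#define SOL_TCP		6
	#define TCP_CONGESTION	13

	char _license[] SEC("license") = "GPL";

	/* hypothetical policy check; always falls back in this sketch */
	static bool want_dctcp(const struct sock *sk)
	{
		return false;
	}

	SEC("struct_ops/dctcp_init")
	void BPF_PROG(dctcp_init, struct sock *sk)
	{
		const char fallback_cc[] = "cubic";

		if (!want_dctcp(sk))
			/* The first .init may pick a fallback cc.  If the
			 * fallback cc's .init tried
			 * bpf_setsockopt(TCP_CONGESTION) again on the same
			 * sk, it would get -EBUSY.
			 */
			bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
				       (void *)fallback_cc,
				       sizeof(fallback_cc));
	}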
