Commit 66dd1014 authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge branch 'mptcp-fixes-for-connect-timeout-access-annotations-and-subflow-init'

Mat Martineau says:

====================
mptcp: Fixes for connect timeout, access annotations, and subflow init

Patch 1 allows the SO_SNDTIMEO sockopt to correctly change the connect
timeout on MPTCP sockets.

Patches 2-5 add READ_ONCE()/WRITE_ONCE() annotations to fix KCSAN issues.

Patch 6 correctly initializes some subflow fields on outgoing connections.
====================

Link: https://lore.kernel.org/r/20230531-send-net-20230531-v1-0-47750c420571@kernel.org


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 3021dbfe 55b47ca7
Loading
Loading
Loading
Loading
+78 −62
Original line number Diff line number Diff line
@@ -90,8 +90,8 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
	if (err)
		return err;

	msk->first = ssock->sk;
	msk->subflow = ssock;
	WRITE_ONCE(msk->first, ssock->sk);
	WRITE_ONCE(msk->subflow, ssock);
	subflow = mptcp_subflow_ctx(ssock->sk);
	list_add(&subflow->node, &msk->conn_list);
	sock_hold(ssock->sk);
@@ -603,7 +603,7 @@ static bool mptcp_check_data_fin(struct sock *sk)
		WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1);
		WRITE_ONCE(msk->rcv_data_fin, 0);

		sk->sk_shutdown |= RCV_SHUTDOWN;
		WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
		smp_mb__before_atomic(); /* SHUTDOWN must be visible first */

		switch (sk->sk_state) {
@@ -825,6 +825,13 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
	mptcp_data_unlock(sk);
}

static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk)
{
	mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq);
	WRITE_ONCE(msk->allow_infinite_fallback, false);
	mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
}

static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
{
	struct sock *sk = (struct sock *)msk;
@@ -839,6 +846,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
		mptcp_sock_graft(ssk, sk->sk_socket);

	mptcp_sockopt_sync_locked(msk, ssk);
	mptcp_subflow_joined(msk, ssk);
	return true;
}

@@ -910,7 +918,7 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk)
		/* hopefully temporary hack: propagate shutdown status
		 * to msk, when all subflows agree on it
		 */
		sk->sk_shutdown |= RCV_SHUTDOWN;
		WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);

		smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
		sk->sk_data_ready(sk);
@@ -1702,7 +1710,6 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,

	lock_sock(ssk);
	msg->msg_flags |= MSG_DONTWAIT;
	msk->connect_flags = O_NONBLOCK;
	msk->fastopening = 1;
	ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL);
	msk->fastopening = 0;
@@ -2283,7 +2290,7 @@ static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk)
{
	if (msk->subflow) {
		iput(SOCK_INODE(msk->subflow));
		msk->subflow = NULL;
		WRITE_ONCE(msk->subflow, NULL);
	}
}

@@ -2420,7 +2427,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
	sock_put(ssk);

	if (ssk == msk->first)
		msk->first = NULL;
		WRITE_ONCE(msk->first, NULL);

out:
	if (ssk == msk->last_snd)
@@ -2527,7 +2534,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
	}

	inet_sk_state_store(sk, TCP_CLOSE);
	sk->sk_shutdown = SHUTDOWN_MASK;
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
	set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);

@@ -2721,7 +2728,7 @@ static int __mptcp_init_sock(struct sock *sk)
	WRITE_ONCE(msk->rmem_released, 0);
	msk->timer_ival = TCP_RTO_MIN;

	msk->first = NULL;
	WRITE_ONCE(msk->first, NULL);
	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
	WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
	WRITE_ONCE(msk->allow_infinite_fallback, true);
@@ -2959,7 +2966,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
	bool do_cancel_work = false;
	int subflows_alive = 0;

	sk->sk_shutdown = SHUTDOWN_MASK;
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
		mptcp_listen_inuse_dec(sk);
@@ -3039,7 +3046,7 @@ static void mptcp_close(struct sock *sk, long timeout)
	sock_put(sk);
}

void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
@@ -3102,7 +3109,7 @@ static int mptcp_disconnect(struct sock *sk, int flags)
	mptcp_pm_data_reset(msk);
	mptcp_ca_reset(sk);

	sk->sk_shutdown = 0;
	WRITE_ONCE(sk->sk_shutdown, 0);
	sk_error_report(sk);
	return 0;
}
@@ -3116,8 +3123,9 @@ static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
}
#endif

struct sock *mptcp_sk_clone(const struct sock *sk,
struct sock *mptcp_sk_clone_init(const struct sock *sk,
				 const struct mptcp_options_received *mp_opt,
				 struct sock *ssk,
				 struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
@@ -3137,7 +3145,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
	msk = mptcp_sk(nsk);
	msk->local_key = subflow_req->local_key;
	msk->token = subflow_req->token;
	msk->subflow = NULL;
	WRITE_ONCE(msk->subflow, NULL);
	msk->in_accept_queue = 1;
	WRITE_ONCE(msk->fully_established, false);
	if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD)
@@ -3150,10 +3158,30 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
	msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;

	sock_reset_flag(nsk, SOCK_RCU_FREE);
	/* will be fully established after successful MPC subflow creation */
	inet_sk_state_store(nsk, TCP_SYN_RECV);

	security_inet_csk_clone(nsk, req);

	/* this can't race with mptcp_close(), as the msk is
	 * not yet exposed to user-space
	 */
	inet_sk_state_store(nsk, TCP_ESTABLISHED);

	/* The msk maintain a ref to each subflow in the connections list */
	WRITE_ONCE(msk->first, ssk);
	list_add(&mptcp_subflow_ctx(ssk)->node, &msk->conn_list);
	sock_hold(ssk);

	/* new mpc subflow takes ownership of the newly
	 * created mptcp socket
	 */
	mptcp_token_accept(subflow_req, msk);

	/* set msk addresses early to ensure mptcp_pm_get_local_id()
	 * uses the correct data
	 */
	mptcp_copy_inaddrs(nsk, ssk);
	mptcp_propagate_sndbuf(nsk, ssk);

	mptcp_rcv_space_init(msk, ssk);
	bh_unlock_sock(nsk);

	/* note: the newly allocated socket refcount is 2 now */
@@ -3185,7 +3213,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
	struct socket *listener;
	struct sock *newsk;

	listener = msk->subflow;
	listener = READ_ONCE(msk->subflow);
	if (WARN_ON_ONCE(!listener)) {
		*err = -EINVAL;
		return NULL;
@@ -3465,14 +3493,16 @@ bool mptcp_finish_join(struct sock *ssk)
		return false;
	}

	if (!list_empty(&subflow->node))
		goto out;
	/* active subflow, already present inside the conn_list */
	if (!list_empty(&subflow->node)) {
		mptcp_subflow_joined(msk, ssk);
		return true;
	}

	if (!mptcp_pm_allow_new_subflow(msk))
		goto err_prohibited;

	/* active connections are already on conn_list.
	 * If we can't acquire msk socket lock here, let the release callback
	/* If we can't acquire msk socket lock here, let the release callback
	 * handle it
	 */
	mptcp_data_lock(parent);
@@ -3495,11 +3525,6 @@ bool mptcp_finish_join(struct sock *ssk)
		return false;
	}

	subflow->map_seq = READ_ONCE(msk->ack_seq);
	WRITE_ONCE(msk->allow_infinite_fallback, false);

out:
	mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
	return true;
}

@@ -3617,9 +3642,9 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
	 * acquired the subflow socket lock, too.
	 */
	if (msk->fastopening)
		err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1);
		err = __inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK, 1);
	else
		err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags);
		err = inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK);
	inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect;

	/* on successful connect, the msk state will be moved to established by
@@ -3632,12 +3657,10 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)

	mptcp_copy_inaddrs(sk, ssock->sk);

	/* unblocking connect, mptcp-level inet_stream_connect will error out
	 * without changing the socket state, update it here.
	/* silence EINPROGRESS and let the caller inet_stream_connect
	 * handle the connection in progress
	 */
	if (err == -EINPROGRESS)
		sk->sk_socket->state = ssock->state;
	return err;
	return 0;
}

static struct proto mptcp_prot = {
@@ -3696,18 +3719,6 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
	return err;
}

static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
				int addr_len, int flags)
{
	int ret;

	lock_sock(sock->sk);
	mptcp_sk(sock->sk)->connect_flags = flags;
	ret = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
	release_sock(sock->sk);
	return ret;
}

static int mptcp_listen(struct socket *sock, int backlog)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
@@ -3751,10 +3762,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,

	pr_debug("msk=%p", msk);

	/* buggy applications can call accept on socket states other than LISTEN
	/* Buggy applications can call accept on socket states other than LISTEN
	 * but no need to allocate the first subflow just to error out.
	 */
	ssock = msk->subflow;
	ssock = READ_ONCE(msk->subflow);
	if (!ssock)
		return -EINVAL;

@@ -3800,9 +3811,6 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;

	if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
		return EPOLLOUT | EPOLLWRNORM;

	if (sk_stream_is_writeable(sk))
		return EPOLLOUT | EPOLLWRNORM;

@@ -3820,6 +3828,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
	struct sock *sk = sock->sk;
	struct mptcp_sock *msk;
	__poll_t mask = 0;
	u8 shutdown;
	int state;

	msk = mptcp_sk(sk);
@@ -3828,23 +3837,30 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
	state = inet_sk_state_load(sk);
	pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
	if (state == TCP_LISTEN) {
		if (WARN_ON_ONCE(!msk->subflow || !msk->subflow->sk))
		struct socket *ssock = READ_ONCE(msk->subflow);

		if (WARN_ON_ONCE(!ssock || !ssock->sk))
			return 0;

		return inet_csk_listen_poll(msk->subflow->sk);
		return inet_csk_listen_poll(ssock->sk);
	}

	shutdown = READ_ONCE(sk->sk_shutdown);
	if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
		mask |= mptcp_check_readable(msk);
		if (shutdown & SEND_SHUTDOWN)
			mask |= EPOLLOUT | EPOLLWRNORM;
		else
			mask |= mptcp_check_writeable(msk);
	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
		/* cf tcp_poll() note about TFO */
		mask |= EPOLLOUT | EPOLLWRNORM;
	}
	if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	/* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
	smp_rmb();
@@ -3859,7 +3875,7 @@ static const struct proto_ops mptcp_stream_ops = {
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = mptcp_bind,
	.connect	   = mptcp_stream_connect,
	.connect	   = inet_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = mptcp_stream_accept,
	.getname	   = inet_getname,
@@ -3954,7 +3970,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
	.owner		   = THIS_MODULE,
	.release	   = inet6_release,
	.bind		   = mptcp_bind,
	.connect	   = mptcp_stream_connect,
	.connect	   = inet_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = mptcp_stream_accept,
	.getname	   = inet6_getname,
+9 −6
Original line number Diff line number Diff line
@@ -297,7 +297,6 @@ struct mptcp_sock {
			nodelay:1,
			fastopening:1,
			in_accept_queue:1;
	int		connect_flags;
	struct work_struct work;
	struct sk_buff  *ooo_last_skb;
	struct rb_root  out_of_order_queue;
@@ -306,7 +305,11 @@ struct mptcp_sock {
	struct list_head rtx_queue;
	struct mptcp_data_frag *first_pending;
	struct list_head join_list;
	struct socket	*subflow; /* outgoing connect/listener/!mp_capable */
	struct socket	*subflow; /* outgoing connect/listener/!mp_capable
				   * The mptcp ops can safely dereference, using suitable
				   * ONCE annotation, the subflow outside the socket
				   * lock as such sock is freed after close().
				   */
	struct sock	*first;
	struct mptcp_pm_data	pm;
	struct {
@@ -613,7 +616,6 @@ int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
int mptcp_get_pm_type(const struct net *net);
void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
				     const struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
@@ -683,8 +685,9 @@ void __init mptcp_proto_init(void);
int __init mptcp_proto_v6_init(void);
#endif

struct sock *mptcp_sk_clone(const struct sock *sk,
struct sock *mptcp_sk_clone_init(const struct sock *sk,
				 const struct mptcp_options_received *mp_opt,
				 struct sock *ssk,
				 struct request_sock *req);
void mptcp_get_options(const struct sk_buff *skb,
		       struct mptcp_options_received *mp_opt);
+1 −27
Original line number Diff line number Diff line
@@ -815,38 +815,12 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
		ctx->setsockopt_seq = listener->setsockopt_seq;

		if (ctx->mp_capable) {
			ctx->conn = mptcp_sk_clone(listener->conn, &mp_opt, req);
			ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req);
			if (!ctx->conn)
				goto fallback;

			owner = mptcp_sk(ctx->conn);

			/* this can't race with mptcp_close(), as the msk is
			 * not yet exposed to user-space
			 */
			inet_sk_state_store(ctx->conn, TCP_ESTABLISHED);

			/* record the newly created socket as the first msk
			 * subflow, but don't link it yet into conn_list
			 */
			WRITE_ONCE(owner->first, child);

			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			owner->setsockopt_seq = ctx->setsockopt_seq;
			mptcp_pm_new_connection(owner, child, 1);
			mptcp_token_accept(subflow_req, owner);

			/* set msk addresses early to ensure mptcp_pm_get_local_id()
			 * uses the correct data
			 */
			mptcp_copy_inaddrs(ctx->conn, child);
			mptcp_propagate_sndbuf(ctx->conn, child);

			mptcp_rcv_space_init(owner, child);
			list_add(&ctx->node, &owner->conn_list);
			sock_hold(child);

			/* with OoO packets we can reach here without ingress
			 * mpc option