Commit 533aa0ba authored by Jakub Kicinski

Merge branch 'mptcp-fixes-for-6-4'

Matthieu Baerts says:

====================
mptcp: fixes for 6.4

Patch 1 correctly handles disconnect() failures that can happen in some
specific cases: the socket state is now set to unconnected, as expected.
This fixes an issue introduced in v6.2.

Patch 2 fixes a divide-by-zero bug in mptcp_recvmsg(), with a fix similar
to a recent one from Eric Dumazet for TCP that introduced the
sk_wait_pending flag. It should address an issue that has been present in
MPTCP almost from the beginning, since v5.9.
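
As a rough user-space model of that sk_wait_pending idea (a minimal
sketch: fake_sock, fake_disconnect and window_div are all made-up names,
not kernel code), disconnect refuses to tear the state down while a
reader is still parked in a wait primitive, so the reader can never wake
up and divide by a field that disconnect zeroed:

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    /* Illustrative model only: wait_pending plays the role of
     * sk->sk_wait_pending, window_div the divisor that a concurrent
     * disconnect() used to zero under the sleeping reader.
     */
    struct fake_sock {
        pthread_mutex_t lock;
        int wait_pending;
        unsigned int window_div;
    };

    static int fake_disconnect(struct fake_sock *fs)
    {
        int ret = 0;

        pthread_mutex_lock(&fs->lock);
        if (fs->wait_pending)
            ret = -EBUSY;       /* deny: a reader still sleeps on this sock */
        else
            fs->window_div = 0; /* safe: nobody can consume it anymore */
        pthread_mutex_unlock(&fs->lock);
        return ret;
    }

    int main(void)
    {
        struct fake_sock fs = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .wait_pending = 1,  /* pretend a reader is blocked */
            .window_div = 1400,
        };

        printf("disconnect -> %d (expect %d)\n", fake_disconnect(&fs), -EBUSY);
        fs.wait_pending = 0;    /* the reader woke up and left */
        printf("disconnect -> %d (expect 0)\n", fake_disconnect(&fs));
        return 0;
    }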

Patch 3 fixes a possible list corruption on passive MPJ (MP_JOIN)
handling. Even if the race seems very unlikely, better safe than sorry.
The issue is present since v5.17.
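
The shape of that fix can be modeled in a few lines of stand-alone C (the
hand-rolled singly-linked list and every name below are illustrative; the
kernel uses list_head and list_splice_init()): the walker detaches the
whole shared list in O(1) while the lock is notionally held, then
traverses its private copy, so concurrent additions can no longer race
with the traversal:

    #include <stdio.h>
    #include <stddef.h>

    /* Minimal stand-in for the kernel list primitives; illustrative only. */
    struct node {
        int id;
        struct node *next;
    };

    /* Detach the whole shared list in O(1), mirroring what
     * list_splice_init() does for msk->join_list in the real fix.
     */
    static struct node *splice_init(struct node **shared)
    {
        struct node *head = *shared;

        *shared = NULL; /* producers now see an empty list again */
        return head;
    }

    int main(void)
    {
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        struct node *join_list = &a;

        /* spin_lock(); */
        struct node *local = splice_init(&join_list);
        /* spin_unlock(); new passive subflows may be appended again */

        for (struct node *n = local; n; n = n->next)
            printf("processing subflow %d\n", n->id);
        return 0;
    }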

Patch 4 consolidates the fallback and non-fallback state machines to
avoid leaking some MPTCP sockets. The fix is likely needed for versions
since v5.11.
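
In toy form (a self-contained sketch; toy_msk, common_worker and
fallback_shutdown are invented names, the real flow is in
mptcp_subflow_shutdown() and the worker below): since a fallback socket
will never receive a real DATA_FIN ack from the peer, the shutdown path
pretends everything sent was already acked and then kicks the one shared
state machine, instead of keeping a second, fallback-only machine around:

    #include <stdio.h>

    enum state { ESTABLISHED, FIN_WAIT1, FIN_WAIT2 };

    struct toy_msk {
        enum state st;
        unsigned long long snd_nxt, snd_una;
    };

    /* The single, shared state machine: advance once all data is acked. */
    static void common_worker(struct toy_msk *msk)
    {
        if (msk->st == FIN_WAIT1 && msk->snd_una == msk->snd_nxt)
            msk->st = FIN_WAIT2;
    }

    static void fallback_shutdown(struct toy_msk *msk)
    {
        msk->st = FIN_WAIT1;
        msk->snd_una = msk->snd_nxt; /* fake the DATA_FIN ack reception */
        common_worker(msk);          /* stands in for mptcp_schedule_work() */
    }

    int main(void)
    {
        struct toy_msk msk = { ESTABLISHED, 100, 90 };

        fallback_shutdown(&msk);
        printf("state after fallback shutdown: %s\n",
               msk.st == FIN_WAIT2 ? "FIN_WAIT2" : "not advanced");
        return 0;
    }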

Patch 5 drops code that is no longer used after the introduction of
patch 4/6. This is not really a fix, but the patch can land in the -net
tree as well so as not to leave unused code behind.

Patch 6 ensures listeners are unhashed before updating their sk status,
to avoid possible deadlocks when diag info is retrieved under a lock.
Even if this should not be visible with the way diag info is currently
gathered, the issue is present since v5.17.
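
A toy model of that ordering (illustrative names only; the real problem
is also about lock ordering, which this sketch deliberately leaves out):
the listener is removed from the diag-visible hash before its state
changes, so a dumper walking the hash can never pick up a socket that is
no longer really listening:

    #include <stdio.h>

    struct toy_listener {
        int listening; /* 1 while in TCP_LISTEN */
        int hashed;    /* visible to diag dumpers while set */
    };

    /* Patch 6 ordering: unhash first, only then update the state. */
    static void close_listener(struct toy_listener *l)
    {
        l->hashed = 0;    /* step 1: hide from diag walkers */
        l->listening = 0; /* step 2: now the state may change */
    }

    static void diag_dump(const struct toy_listener *l)
    {
        if (l->hashed)
            printf("diag sees a %s listener\n",
                   l->listening ? "LISTEN" : "closed (inconsistent!)");
    }

    int main(void)
    {
        struct toy_listener l = { 1, 1 };

        diag_dump(&l);     /* consistent: hashed and listening */
        close_listener(&l);
        diag_dump(&l);     /* prints nothing: already unhashed */
        return 0;
    }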
====================

Link: https://lore.kernel.org/r/20230620-upstream-net-20230620-misc-fixes-for-v6-4-v1-0-f36aa5eae8b9@tessares.net


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 59bb14bd 57fc0f1c
net/mptcp/pm_netlink.c (+1 −0)
@@ -1047,6 +1047,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
	if (err)
		return err;

+	inet_sk_state_store(newsk, TCP_LISTEN);
	err = kernel_listen(ssock, backlog);
	if (err)
		return err;
net/mptcp/protocol.c (+64 −96)
@@ -44,7 +44,7 @@ enum {
static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;

static void __mptcp_destroy_sock(struct sock *sk);
-static void __mptcp_check_send_data_fin(struct sock *sk);
+static void mptcp_check_send_data_fin(struct sock *sk);

DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
static struct net_device mptcp_napi_dev;
@@ -424,8 +424,7 @@ static bool mptcp_pending_data_fin_ack(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

-	return !__mptcp_check_fallback(msk) &&
-	       ((1 << sk->sk_state) &
+	return ((1 << sk->sk_state) &
		(TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
	       msk->write_seq == READ_ONCE(msk->snd_una);
}
@@ -583,9 +582,6 @@ static bool mptcp_check_data_fin(struct sock *sk)
	u64 rcv_data_fin_seq;
	bool ret = false;

-	if (__mptcp_check_fallback(msk))
-		return ret;
-
	/* Need to ack a DATA_FIN received from a peer while this side
	 * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
	 * msk->rcv_data_fin was set when parsing the incoming options
@@ -623,6 +619,7 @@ static bool mptcp_check_data_fin(struct sock *sk)
		}

		ret = true;
-		mptcp_send_ack(msk);
+		if (!__mptcp_check_fallback(msk))
+			mptcp_send_ack(msk);
		mptcp_close_wake_up(sk);
	}
@@ -850,12 +847,12 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
	return true;
}

-static void __mptcp_flush_join_list(struct sock *sk)
+static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list)
{
	struct mptcp_subflow_context *tmp, *subflow;
	struct mptcp_sock *msk = mptcp_sk(sk);

-	list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) {
+	list_for_each_entry_safe(subflow, tmp, join_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		bool slow = lock_sock_fast(ssk);

@@ -897,49 +894,6 @@ bool mptcp_schedule_work(struct sock *sk)
	return false;
}

-void mptcp_subflow_eof(struct sock *sk)
-{
-	if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags))
-		mptcp_schedule_work(sk);
-}
-
-static void mptcp_check_for_eof(struct mptcp_sock *msk)
-{
-	struct mptcp_subflow_context *subflow;
-	struct sock *sk = (struct sock *)msk;
-	int receivers = 0;
-
-	mptcp_for_each_subflow(msk, subflow)
-		receivers += !subflow->rx_eof;
-	if (receivers)
-		return;
-
-	if (!(sk->sk_shutdown & RCV_SHUTDOWN)) {
-		/* hopefully temporary hack: propagate shutdown status
-		 * to msk, when all subflows agree on it
-		 */
-		WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
-
-		smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
-		sk->sk_data_ready(sk);
-	}
-
-	switch (sk->sk_state) {
-	case TCP_ESTABLISHED:
-		inet_sk_state_store(sk, TCP_CLOSE_WAIT);
-		break;
-	case TCP_FIN_WAIT1:
-		inet_sk_state_store(sk, TCP_CLOSING);
-		break;
-	case TCP_FIN_WAIT2:
-		inet_sk_state_store(sk, TCP_CLOSE);
-		break;
-	default:
-		return;
-	}
-	mptcp_close_wake_up(sk);
-}
-
static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
@@ -1609,7 +1563,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
	if (!mptcp_timer_pending(sk))
		mptcp_reset_timer(sk);
	if (do_check_data_fin)
-		__mptcp_check_send_data_fin(sk);
+		mptcp_check_send_data_fin(sk);
}

static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first)
@@ -1727,7 +1681,13 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
		if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR)
			*copied_syn = 0;
	} else if (ret && ret != -EINPROGRESS) {
-		mptcp_disconnect(sk, 0);
+		/* The disconnect() op called by tcp_sendmsg_fastopen()/
+		 * __inet_stream_connect() can fail, due to a locking check,
+		 * see mptcp_disconnect().
+		 * Attempt it again outside the problematic scope.
+		 */
+		if (!mptcp_disconnect(sk, 0))
+			sk->sk_socket->state = SS_UNCONNECTED;
	}
	inet_sk(sk)->defer_connect = 0;

@@ -2158,9 +2118,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
				break;
			}

-			if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
-				mptcp_check_for_eof(msk);
-
			if (sk->sk_shutdown & RCV_SHUTDOWN) {
				/* race breaker: the shutdown could be after the
				 * previous receive queue check
@@ -2389,7 +2346,10 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,

	need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
	if (!dispose_it) {
-		tcp_disconnect(ssk, 0);
+		/* The MPTCP code never waits on the subflow sockets; a
+		 * TCP-level disconnect should never fail.
+		 */
+		WARN_ON_ONCE(tcp_disconnect(ssk, 0));
		msk->subflow->state = SS_UNCONNECTED;
		mptcp_subflow_ctx_reset(subflow);
		release_sock(ssk);
@@ -2408,13 +2368,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
		kfree_rcu(subflow, rcu);
	} else {
		/* otherwise tcp will dispose of the ssk and subflow ctx */
-		if (ssk->sk_state == TCP_LISTEN) {
-			tcp_set_state(ssk, TCP_CLOSE);
-			mptcp_subflow_queue_clean(sk, ssk);
-			inet_csk_listen_stop(ssk);
-			mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
-		}
-
		__tcp_close(ssk, 0);

		/* close acquired an extra ref */
@@ -2671,16 +2624,12 @@ static void mptcp_worker(struct work_struct *work)
	if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN)))
		goto unlock;

-	mptcp_check_data_fin_ack(sk);
-
	mptcp_check_fastclose(msk);

	mptcp_pm_nl_work(msk);

-	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
-		mptcp_check_for_eof(msk);
-
-	__mptcp_check_send_data_fin(sk);
+	mptcp_check_send_data_fin(sk);
+	mptcp_check_data_fin_ack(sk);
	mptcp_check_data_fin(sk);

	if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
@@ -2812,13 +2761,19 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
			break;
		fallthrough;
	case TCP_SYN_SENT:
-		tcp_disconnect(ssk, O_NONBLOCK);
+		WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK));
		break;
	default:
		if (__mptcp_check_fallback(mptcp_sk(sk))) {
			pr_debug("Fallback");
			ssk->sk_shutdown |= how;
			tcp_shutdown(ssk, how);

+			/* simulate the data_fin ack reception to let the state
+			 * machine move forward
+			 */
+			WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt);
+			mptcp_schedule_work(sk);
		} else {
			pr_debug("Sending DATA_FIN on subflow %p", ssk);
			tcp_send_ack(ssk);
@@ -2858,7 +2813,7 @@ static int mptcp_close_state(struct sock *sk)
	return next & TCP_ACTION_FIN;
}

-static void __mptcp_check_send_data_fin(struct sock *sk)
+static void mptcp_check_send_data_fin(struct sock *sk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -2876,19 +2831,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk)

	WRITE_ONCE(msk->snd_nxt, msk->write_seq);

-	/* fallback socket will not get data_fin/ack, can move to the next
-	 * state now
-	 */
-	if (__mptcp_check_fallback(msk)) {
-		WRITE_ONCE(msk->snd_una, msk->write_seq);
-		if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
-			inet_sk_state_store(sk, TCP_CLOSE);
-			mptcp_close_wake_up(sk);
-		} else if (sk->sk_state == TCP_FIN_WAIT1) {
-			inet_sk_state_store(sk, TCP_FIN_WAIT2);
-		}
-	}
-
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

@@ -2908,7 +2850,7 @@ static void __mptcp_wr_shutdown(struct sock *sk)
	WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
	WRITE_ONCE(msk->snd_data_fin_enable, 1);

-	__mptcp_check_send_data_fin(sk);
+	mptcp_check_send_data_fin(sk);
}

static void __mptcp_destroy_sock(struct sock *sk)
@@ -2953,10 +2895,24 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
	return EPOLLIN | EPOLLRDNORM;
}

-static void mptcp_listen_inuse_dec(struct sock *sk)
+static void mptcp_check_listen_stop(struct sock *sk)
{
-	if (inet_sk_state_load(sk) == TCP_LISTEN)
-		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	struct sock *ssk;
+
+	if (inet_sk_state_load(sk) != TCP_LISTEN)
+		return;
+
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	ssk = mptcp_sk(sk)->first;
+	if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN))
+		return;
+
+	lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
+	mptcp_subflow_queue_clean(sk, ssk);
+	inet_csk_listen_stop(ssk);
+	mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
+	tcp_set_state(ssk, TCP_CLOSE);
+	release_sock(ssk);
}

bool __mptcp_close(struct sock *sk, long timeout)
@@ -2969,7 +2925,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
-		mptcp_listen_inuse_dec(sk);
+		mptcp_check_listen_stop(sk);
		inet_sk_state_store(sk, TCP_CLOSE);
		goto cleanup;
	}
@@ -3073,15 +3029,20 @@ static int mptcp_disconnect(struct sock *sk, int flags)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

+	/* Deny disconnect if other threads are blocked in sk_wait_event()
+	 * or inet_wait_for_connect().
+	 */
+	if (sk->sk_wait_pending)
+		return -EBUSY;

	/* We are on the fastopen error path. We can't call straight into the
	 * subflows cleanup code due to lock nesting (we are already under
-	 * msk->firstsocket lock). Do nothing and leave the cleanup to the
-	 * caller.
+	 * msk->firstsocket lock).
	 */
	if (msk->fastopening)
-		return 0;
+		return -EBUSY;

-	mptcp_listen_inuse_dec(sk);
+	mptcp_check_listen_stop(sk);
	inet_sk_state_store(sk, TCP_CLOSE);

	mptcp_stop_timer(sk);
@@ -3140,6 +3101,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
		inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif

+	nsk->sk_wait_pending = 0;
	__mptcp_init_sock(nsk);

	msk = mptcp_sk(nsk);
@@ -3327,9 +3289,14 @@ static void mptcp_release_cb(struct sock *sk)
	for (;;) {
		unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) |
				      msk->push_pending;
+		struct list_head join_list;

		if (!flags)
			break;

+		INIT_LIST_HEAD(&join_list);
+		list_splice_init(&msk->join_list, &join_list);

		/* the following actions acquire the subflow socket lock
		 *
		 * 1) can't be invoked in atomic scope
@@ -3340,8 +3307,9 @@ static void mptcp_release_cb(struct sock *sk)
		msk->push_pending = 0;
		msk->cb_flags &= ~flags;
		spin_unlock_bh(&sk->sk_lock.slock);

		if (flags & BIT(MPTCP_FLUSH_JOIN_LIST))
-			__mptcp_flush_join_list(sk);
+			__mptcp_flush_join_list(sk, &join_list);
		if (flags & BIT(MPTCP_PUSH_PENDING))
			__mptcp_push_pending(sk, 0);
		if (flags & BIT(MPTCP_RETRANSMIT))
net/mptcp/protocol.h (+1 −4)
@@ -113,7 +113,6 @@
/* MPTCP socket atomic flags */
#define MPTCP_NOSPACE		1
#define MPTCP_WORK_RTX		2
-#define MPTCP_WORK_EOF		3
#define MPTCP_FALLBACK_DONE	4
#define MPTCP_WORK_CLOSE_SUBFLOW 5

@@ -476,14 +475,13 @@ struct mptcp_subflow_context {
		send_mp_fail : 1,
		send_fastclose : 1,
		send_infinite_map : 1,
-		rx_eof : 1,
		remote_key_valid : 1,        /* received the peer key */
		disposable : 1,	    /* ctx can be free at ulp release time */
		stale : 1,	    /* unable to snd/rcv data, do not use for xmit */
		local_id_valid : 1, /* local_id is correctly initialized */
		valid_csum_seen : 1,        /* at least one csum validated */
		is_mptfo : 1,	    /* subflow is doing TFO */
-		__unused : 8;
+		__unused : 9;
	enum mptcp_data_avail data_avail;
	u32	remote_nonce;
	u64	thmac;
@@ -720,7 +718,6 @@ static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit)
void __mptcp_check_push(struct sock *sk, struct sock *ssk);
void __mptcp_data_acked(struct sock *sk);
void __mptcp_error_report(struct sock *sk);
-void mptcp_subflow_eof(struct sock *sk);
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
{
net/mptcp/subflow.c (+10 −7)
@@ -1749,14 +1749,16 @@ static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;
+	struct mptcp_sock *msk;

	__subflow_state_change(sk);

+	msk = mptcp_sk(parent);
	if (subflow_simultaneous_connect(sk)) {
		mptcp_propagate_sndbuf(parent, sk);
		mptcp_do_fallback(sk);
-		mptcp_rcv_space_init(mptcp_sk(parent), sk);
-		pr_fallback(mptcp_sk(parent));
+		mptcp_rcv_space_init(msk, sk);
+		pr_fallback(msk);
		subflow->conn_finished = 1;
		mptcp_set_connected(parent);
	}
@@ -1772,11 +1774,12 @@ static void subflow_state_change(struct sock *sk)

	subflow_sched_work_if_closed(mptcp_sk(parent), sk);

-	if (__mptcp_check_fallback(mptcp_sk(parent)) &&
-	    !subflow->rx_eof && subflow_is_done(sk)) {
-		subflow->rx_eof = 1;
-		mptcp_subflow_eof(parent);
-	}
+	/* when the fallback subflow closes the rx side, trigger a 'dummy'
+	 * ingress data fin, so that the msk state will follow along
+	 */
+	if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk &&
+	    mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true))
+		mptcp_schedule_work(parent);
}

void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)