Commit 972983fc authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'mptcp-stalled-connections-fix'



Matthieu Baerts says:

====================
mptcp: fix stalled connections

Daire reported a few issues with MPTCP where some connections were
stalled in different states. Paolo did a great job fixing them.

Patch 1 fixes bogus receive window shrinkage with multiple subflows. Due
to a race condition and unlucky circumstances, that may lead to
TCP-level window shrinkage, and the connection being stalled on the
sender end.

Patch 2 is a preparation for patch 3 which processes pending subflow
errors on close. Without that and under specific circumstances, the
MPTCP-level socket might not switch to the CLOSE state and stall.

Patch 4 is also a preparation patch for the next one. Patch 5 fixes
MPTCP connections not switching to the CLOSE state when all subflows
have been closed but no DATA_FIN have been exchanged to explicitly close
the MPTCP connection. Now connections in such state will switch to the
CLOSE state after a timeout, still allowing the "make-after-break"
feature but making sure connections don't stall forever. It will be
possible to modify this timeout -- currently matching TCP TIMEWAIT value
(60 seconds) -- in a future version.
====================

Signed-off-by: default avatarMatthieu Baerts <matthieu.baerts@tessares.net>
parents 8a47558a 27e5ccc2
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -1269,12 +1269,13 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)

			if (rcv_wnd == rcv_wnd_old)
				break;
			if (before64(rcv_wnd_new, rcv_wnd)) {

			rcv_wnd_old = rcv_wnd;
			if (before64(rcv_wnd_new, rcv_wnd_old)) {
				MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE);
				goto raise_win;
			}
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT);
			rcv_wnd_old = rcv_wnd;
		}
		return;
	}
+102 −63
Original line number Diff line number Diff line
@@ -405,7 +405,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
	return false;
}

static void mptcp_stop_timer(struct sock *sk)
static void mptcp_stop_rtx_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

@@ -770,6 +770,46 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
	return moved;
}

static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
{
	int err = sock_error(ssk);
	int ssk_state;

	if (!err)
		return false;

	/* only propagate errors on fallen-back sockets or
	 * on MPC connect
	 */
	if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk)))
		return false;

	/* We need to propagate only transition to CLOSE state.
	 * Orphaned socket will see such state change via
	 * subflow_sched_work_if_closed() and that path will properly
	 * destroy the msk as needed.
	 */
	ssk_state = inet_sk_state_load(ssk);
	if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
		inet_sk_state_store(sk, ssk_state);
	WRITE_ONCE(sk->sk_err, -err);

	/* This barrier is coupled with smp_rmb() in mptcp_poll() */
	smp_wmb();
	sk_error_report(sk);
	return true;
}

void __mptcp_error_report(struct sock *sk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk = mptcp_sk(sk);

	mptcp_for_each_subflow(msk, subflow)
		if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow)))
			break;
}

/* In most cases we will be able to lock the mptcp socket.  If its already
 * owned, we need to defer to the work queue to avoid ABBA deadlock.
 */
@@ -852,6 +892,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
	mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++;
	mptcp_sockopt_sync_locked(msk, ssk);
	mptcp_subflow_joined(msk, ssk);
	mptcp_stop_tout_timer(sk);
	return true;
}

@@ -871,12 +912,12 @@ static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list
	}
}

static bool mptcp_timer_pending(struct sock *sk)
static bool mptcp_rtx_timer_pending(struct sock *sk)
{
	return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
}

static void mptcp_reset_timer(struct sock *sk)
static void mptcp_reset_rtx_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned long tout;
@@ -1010,10 +1051,10 @@ static void __mptcp_clean_una(struct sock *sk)
out:
	if (snd_una == READ_ONCE(msk->snd_nxt) &&
	    snd_una == READ_ONCE(msk->write_seq)) {
		if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
			mptcp_stop_timer(sk);
		if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
			mptcp_stop_rtx_timer(sk);
	} else {
		mptcp_reset_timer(sk);
		mptcp_reset_rtx_timer(sk);
	}
}

@@ -1586,8 +1627,8 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
		mptcp_push_release(ssk, &info);

	/* ensure the rtx timer is running */
	if (!mptcp_timer_pending(sk))
		mptcp_reset_timer(sk);
	if (!mptcp_rtx_timer_pending(sk))
		mptcp_reset_rtx_timer(sk);
	if (do_check_data_fin)
		mptcp_check_send_data_fin(sk);
}
@@ -1650,8 +1691,8 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool
	if (copied) {
		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
			 info.size_goal);
		if (!mptcp_timer_pending(sk))
			mptcp_reset_timer(sk);
		if (!mptcp_rtx_timer_pending(sk))
			mptcp_reset_rtx_timer(sk);

		if (msk->snd_data_fin_enable &&
		    msk->snd_nxt + 1 == msk->write_seq)
@@ -2220,7 +2261,7 @@ static void mptcp_retransmit_timer(struct timer_list *t)
	sock_put(sk);
}

static void mptcp_timeout_timer(struct timer_list *t)
static void mptcp_tout_timer(struct timer_list *t)
{
	struct sock *sk = from_timer(sk, t, sk_timer);

@@ -2329,18 +2370,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
	bool dispose_it, need_push = false;

	/* If the first subflow moved to a close state before accept, e.g. due
	 * to an incoming reset, mptcp either:
	 * - if either the subflow or the msk are dead, destroy the context
	 *   (the subflow socket is deleted by inet_child_forget) and the msk
	 * - otherwise do nothing at the moment and take action at accept and/or
	 *   listener shutdown - user-space must be able to accept() the closed
	 *   socket.
	 */
	if (msk->in_accept_queue && msk->first == ssk) {
		if (!sock_flag(sk, SOCK_DEAD) && !sock_flag(ssk, SOCK_DEAD))
			return;

	 * to an incoming reset or listener shutdown, the subflow socket is
	 * already deleted by inet_child_forget() and the mptcp socket can't
	 * survive too.
	 */
	if (msk->in_accept_queue && msk->first == ssk &&
	    (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) {
		/* ensure later check in mptcp_worker() will dispose the msk */
		mptcp_set_close_tout(sk, tcp_jiffies32 - (TCP_TIMEWAIT_LEN + 1));
		sock_set_flag(sk, SOCK_DEAD);
		lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
		mptcp_subflow_drop_ctx(ssk);
@@ -2392,6 +2429,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
	}

out_release:
	__mptcp_subflow_error_report(sk, ssk);
	release_sock(ssk);

	sock_put(ssk);
@@ -2402,6 +2440,22 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
out:
	if (need_push)
		__mptcp_push_pending(sk, 0);

	/* Catch every 'all subflows closed' scenario, including peers silently
	 * closing them, e.g. due to timeout.
	 * For established sockets, allow an additional timeout before closing,
	 * as the protocol can still create more subflows.
	 */
	if (list_is_singular(&msk->conn_list) && msk->first &&
	    inet_sk_state_load(msk->first) == TCP_CLOSE) {
		if (sk->sk_state != TCP_ESTABLISHED ||
		    msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) {
			inet_sk_state_store(sk, TCP_CLOSE);
			mptcp_close_wake_up(sk);
		} else {
			mptcp_start_tout_timer(sk);
		}
	}
}

void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
@@ -2445,23 +2499,14 @@ static void __mptcp_close_subflow(struct sock *sk)

}

static bool mptcp_should_close(const struct sock *sk)
static bool mptcp_close_tout_expired(const struct sock *sk)
{
	s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp;
	struct mptcp_subflow_context *subflow;

	if (delta >= TCP_TIMEWAIT_LEN || mptcp_sk(sk)->in_accept_queue)
		return true;

	/* if all subflows are in closed status don't bother with additional
	 * timeout
	 */
	mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
		if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) !=
		    TCP_CLOSE)
	if (!inet_csk(sk)->icsk_mtup.probe_timestamp ||
	    sk->sk_state == TCP_CLOSE)
		return false;
	}
	return true;

	return time_after32(tcp_jiffies32,
		  inet_csk(sk)->icsk_mtup.probe_timestamp + TCP_TIMEWAIT_LEN);
}

static void mptcp_check_fastclose(struct mptcp_sock *msk)
@@ -2588,27 +2633,28 @@ static void __mptcp_retrans(struct sock *sk)
reset_timer:
	mptcp_check_and_set_pending(sk);

	if (!mptcp_timer_pending(sk))
		mptcp_reset_timer(sk);
	if (!mptcp_rtx_timer_pending(sk))
		mptcp_reset_rtx_timer(sk);
}

/* schedule the timeout timer for the relevant event: either close timeout
 * or mp_fail timeout. The close timeout takes precedence on the mp_fail one
 */
void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout)
void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
{
	struct sock *sk = (struct sock *)msk;
	unsigned long timeout, close_timeout;

	if (!fail_tout && !sock_flag(sk, SOCK_DEAD))
	if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp)
		return;

	close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN;
	close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies +
			TCP_TIMEWAIT_LEN;

	/* the close timeout takes precedence on the fail one, and here at least one of
	 * them is active
	 */
	timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout;
	timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout;

	sk_reset_timer(sk, &sk->sk_timer, timeout);
}
@@ -2627,8 +2673,6 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
	mptcp_subflow_reset(ssk);
	WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0);
	unlock_sock_fast(ssk, slow);

	mptcp_reset_timeout(msk, 0);
}

static void mptcp_do_fastclose(struct sock *sk)
@@ -2665,19 +2709,15 @@ static void mptcp_worker(struct work_struct *work)
	if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
		__mptcp_close_subflow(sk);

	/* There is no point in keeping around an orphaned sk timedout or
	 * closed, but we need the msk around to reply to incoming DATA_FIN,
	 * even if it is orphaned and in FIN_WAIT2 state
	 */
	if (sock_flag(sk, SOCK_DEAD)) {
		if (mptcp_should_close(sk))
	if (mptcp_close_tout_expired(sk)) {
		mptcp_do_fastclose(sk);
		mptcp_close_wake_up(sk);
	}

		if (sk->sk_state == TCP_CLOSE) {
	if (sock_flag(sk, SOCK_DEAD) && sk->sk_state == TCP_CLOSE) {
		__mptcp_destroy_sock(sk);
		goto unlock;
	}
	}

	if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
		__mptcp_retrans(sk);
@@ -2717,7 +2757,7 @@ static void __mptcp_init_sock(struct sock *sk)

	/* re-use the csk retrans timer for MPTCP-level retrans */
	timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
	timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0);
	timer_setup(&sk->sk_timer, mptcp_tout_timer, 0);
}

static void mptcp_ca_reset(struct sock *sk)
@@ -2808,8 +2848,8 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
		} else {
			pr_debug("Sending DATA_FIN on subflow %p", ssk);
			tcp_send_ack(ssk);
			if (!mptcp_timer_pending(sk))
				mptcp_reset_timer(sk);
			if (!mptcp_rtx_timer_pending(sk))
				mptcp_reset_rtx_timer(sk);
		}
		break;
	}
@@ -2892,7 +2932,7 @@ static void __mptcp_destroy_sock(struct sock *sk)

	might_sleep();

	mptcp_stop_timer(sk);
	mptcp_stop_rtx_timer(sk);
	sk_stop_timer(sk, &sk->sk_timer);
	msk->pm.status = 0;
	mptcp_release_sched(msk);
@@ -2975,7 +3015,6 @@ bool __mptcp_close(struct sock *sk, long timeout)

cleanup:
	/* orphan all the subflows */
	inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		bool slow = lock_sock_fast_nested(ssk);
@@ -3012,7 +3051,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
		__mptcp_destroy_sock(sk);
		do_cancel_work = true;
	} else {
		mptcp_reset_timeout(msk, 0);
		mptcp_start_tout_timer(sk);
	}

	return do_cancel_work;
@@ -3075,8 +3114,8 @@ static int mptcp_disconnect(struct sock *sk, int flags)
	mptcp_check_listen_stop(sk);
	inet_sk_state_store(sk, TCP_CLOSE);

	mptcp_stop_timer(sk);
	sk_stop_timer(sk, &sk->sk_timer);
	mptcp_stop_rtx_timer(sk);
	mptcp_stop_tout_timer(sk);

	if (msk->token)
		mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
+23 −1
Original line number Diff line number Diff line
@@ -718,7 +718,29 @@ void mptcp_get_options(const struct sk_buff *skb,

void mptcp_finish_connect(struct sock *sk);
void __mptcp_set_connected(struct sock *sk);
void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout);
void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout);

static inline void mptcp_stop_tout_timer(struct sock *sk)
{
	if (!inet_csk(sk)->icsk_mtup.probe_timestamp)
		return;

	sk_stop_timer(sk, &sk->sk_timer);
	inet_csk(sk)->icsk_mtup.probe_timestamp = 0;
}

static inline void mptcp_set_close_tout(struct sock *sk, unsigned long tout)
{
	/* avoid 0 timestamp, as that means no close timeout */
	inet_csk(sk)->icsk_mtup.probe_timestamp = tout ? : 1;
}

static inline void mptcp_start_tout_timer(struct sock *sk)
{
	mptcp_set_close_tout(sk, tcp_jiffies32);
	mptcp_reset_tout_timer(mptcp_sk(sk), 0);
}

static inline bool mptcp_is_fully_established(struct sock *sk)
{
	return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
+2 −37
Original line number Diff line number Diff line
@@ -1226,7 +1226,7 @@ static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
	WRITE_ONCE(subflow->fail_tout, fail_tout);
	tcp_send_ack(ssk);

	mptcp_reset_timeout(msk, subflow->fail_tout);
	mptcp_reset_tout_timer(msk, subflow->fail_tout);
}

static bool subflow_check_data_avail(struct sock *ssk)
@@ -1362,42 +1362,6 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
	*full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
}

void __mptcp_error_report(struct sock *sk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk = mptcp_sk(sk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		int err = sock_error(ssk);
		int ssk_state;

		if (!err)
			continue;

		/* only propagate errors on fallen-back sockets or
		 * on MPC connect
		 */
		if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
			continue;

		/* We need to propagate only transition to CLOSE state.
		 * Orphaned socket will see such state change via
		 * subflow_sched_work_if_closed() and that path will properly
		 * destroy the msk as needed.
		 */
		ssk_state = inet_sk_state_load(ssk);
		if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
			inet_sk_state_store(sk, ssk_state);
		WRITE_ONCE(sk->sk_err, -err);

		/* This barrier is coupled with smp_rmb() in mptcp_poll() */
		smp_wmb();
		sk_error_report(sk);
		break;
	}
}

static void subflow_error_report(struct sock *ssk)
{
	struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
@@ -1588,6 +1552,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
	mptcp_sock_graft(ssk, sk->sk_socket);
	iput(SOCK_INODE(sf));
	WRITE_ONCE(msk->allow_infinite_fallback, false);
	mptcp_stop_tout_timer(sk);
	return 0;

failed_unlink: