Commit bce3bb30 authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge branch 'mptcp-fixes-for-5-19'

Mat Martineau says:

====================
mptcp: Fixes for 5.19

Several categories of fixes from the mptcp tree:

Patches 1-3 are fixes related to MP_FAIL and FASTCLOSE, to make sure
MIBs are accurate, and to handle MP_FAIL transmission and responses at
the correct times. sk_timer conflicts are also resolved.

Patches 4 and 6 handle two separate race conditions, one at socket
shutdown and one with unaccepted subflows.

Patch 5 makes sure read operations are not blocked during fallback to
TCP.

Patch 7 improves the diag selftest, which was incorrectly failing on
slow machines (like the VMs used for CI testing).

Patch 8 avoids possible symbol redefinition errors in the userspace
mptcp.h file.

Patch 9 fixes a selftest build issue with gcc 12.
====================

Link: https://lore.kernel.org/r/20220628010243.166605-1-mathew.j.martineau@linux.intel.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents ab84db25 fd37c2ec
Loading
Loading
Loading
Loading
+5 −4
Original line number Original line Diff line number Diff line
@@ -2,16 +2,17 @@
#ifndef _UAPI_MPTCP_H
#ifndef _UAPI_MPTCP_H
#define _UAPI_MPTCP_H
#define _UAPI_MPTCP_H


#ifndef __KERNEL__
#include <netinet/in.h>		/* for sockaddr_in and sockaddr_in6	*/
#include <sys/socket.h>		/* for struct sockaddr			*/
#endif

#include <linux/const.h>
#include <linux/const.h>
#include <linux/types.h>
#include <linux/types.h>
#include <linux/in.h>		/* for sockaddr_in			*/
#include <linux/in.h>		/* for sockaddr_in			*/
#include <linux/in6.h>		/* for sockaddr_in6			*/
#include <linux/in6.h>		/* for sockaddr_in6			*/
#include <linux/socket.h>	/* for sockaddr_storage and sa_family	*/
#include <linux/socket.h>	/* for sockaddr_storage and sa_family	*/


#ifndef __KERNEL__
#include <sys/socket.h>		/* for struct sockaddr			*/
#endif

#define MPTCP_SUBFLOW_FLAG_MCAP_REM		_BITUL(0)
#define MPTCP_SUBFLOW_FLAG_MCAP_REM		_BITUL(0)
#define MPTCP_SUBFLOW_FLAG_MCAP_LOC		_BITUL(1)
#define MPTCP_SUBFLOW_FLAG_MCAP_LOC		_BITUL(1)
#define MPTCP_SUBFLOW_FLAG_JOIN_REM		_BITUL(2)
#define MPTCP_SUBFLOW_FLAG_JOIN_REM		_BITUL(2)
+4 −3
Original line number Original line Diff line number Diff line
@@ -765,6 +765,7 @@ static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_bu
	opts->suboptions |= OPTION_MPTCP_RST;
	opts->suboptions |= OPTION_MPTCP_RST;
	opts->reset_transient = subflow->reset_transient;
	opts->reset_transient = subflow->reset_transient;
	opts->reset_reason = subflow->reset_reason;
	opts->reset_reason = subflow->reset_reason;
	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX);


	return true;
	return true;
}
}
@@ -788,6 +789,7 @@ static bool mptcp_established_options_fastclose(struct sock *sk,
	opts->rcvr_key = msk->remote_key;
	opts->rcvr_key = msk->remote_key;


	pr_debug("FASTCLOSE key=%llu", opts->rcvr_key);
	pr_debug("FASTCLOSE key=%llu", opts->rcvr_key);
	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX);
	return true;
	return true;
}
}


@@ -809,6 +811,7 @@ static bool mptcp_established_options_mp_fail(struct sock *sk,
	opts->fail_seq = subflow->map_seq;
	opts->fail_seq = subflow->map_seq;


	pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq);
	pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq);
	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);


	return true;
	return true;
}
}
@@ -833,13 +836,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
		    mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) {
		    mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) {
			*size += opt_size;
			*size += opt_size;
			remaining -= opt_size;
			remaining -= opt_size;
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX);
		}
		}
		/* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */
		/* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */
		if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) {
		if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) {
			*size += opt_size;
			*size += opt_size;
			remaining -= opt_size;
			remaining -= opt_size;
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX);
		}
		}
		return true;
		return true;
	}
	}
@@ -966,7 +967,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
			goto reset;
			goto reset;
		subflow->mp_capable = 0;
		subflow->mp_capable = 0;
		pr_fallback(msk);
		pr_fallback(msk);
		__mptcp_do_fallback(msk);
		mptcp_do_fallback(ssk);
		return false;
		return false;
	}
	}


+4 −6
Original line number Original line Diff line number Diff line
@@ -299,23 +299,21 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
{
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct sock *s = (struct sock *)msk;


	pr_debug("fail_seq=%llu", fail_seq);
	pr_debug("fail_seq=%llu", fail_seq);


	if (!READ_ONCE(msk->allow_infinite_fallback))
	if (!READ_ONCE(msk->allow_infinite_fallback))
		return;
		return;


	if (!READ_ONCE(subflow->mp_fail_response_expect)) {
	if (!subflow->fail_tout) {
		pr_debug("send MP_FAIL response and infinite map");
		pr_debug("send MP_FAIL response and infinite map");


		subflow->send_mp_fail = 1;
		subflow->send_mp_fail = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);
		subflow->send_infinite_map = 1;
		subflow->send_infinite_map = 1;
	} else if (!sock_flag(sk, SOCK_DEAD)) {
		tcp_send_ack(sk);
	} else {
		pr_debug("MP_FAIL response received");
		pr_debug("MP_FAIL response received");

		WRITE_ONCE(subflow->fail_tout, 0);
		sk_stop_timer(s, &s->sk_timer);
	}
	}
}
}


+53 −31
Original line number Original line Diff line number Diff line
@@ -500,7 +500,7 @@ static void mptcp_set_timeout(struct sock *sk)
	__mptcp_set_timeout(sk, tout);
	__mptcp_set_timeout(sk, tout);
}
}


static bool tcp_can_send_ack(const struct sock *ssk)
static inline bool tcp_can_send_ack(const struct sock *ssk)
{
{
	return !((1 << inet_sk_state_load(ssk)) &
	return !((1 << inet_sk_state_load(ssk)) &
	       (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN));
	       (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN));
@@ -1245,7 +1245,7 @@ static void mptcp_update_infinite_map(struct mptcp_sock *msk,
	MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
	MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
	mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
	mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
	pr_fallback(msk);
	pr_fallback(msk);
	__mptcp_do_fallback(msk);
	mptcp_do_fallback(ssk);
}
}


static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
@@ -2175,21 +2175,6 @@ static void mptcp_retransmit_timer(struct timer_list *t)
	sock_put(sk);
	sock_put(sk);
}
}


static struct mptcp_subflow_context *
mp_fail_response_expect_subflow(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow, *ret = NULL;

	mptcp_for_each_subflow(msk, subflow) {
		if (READ_ONCE(subflow->mp_fail_response_expect)) {
			ret = subflow;
			break;
		}
	}

	return ret;
}

static void mptcp_timeout_timer(struct timer_list *t)
static void mptcp_timeout_timer(struct timer_list *t)
{
{
	struct sock *sk = from_timer(sk, t, sk_timer);
	struct sock *sk = from_timer(sk, t, sk_timer);
@@ -2346,6 +2331,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
		kfree_rcu(subflow, rcu);
		kfree_rcu(subflow, rcu);
	} else {
	} else {
		/* otherwise tcp will dispose of the ssk and subflow ctx */
		/* otherwise tcp will dispose of the ssk and subflow ctx */
		if (ssk->sk_state == TCP_LISTEN) {
			tcp_set_state(ssk, TCP_CLOSE);
			mptcp_subflow_queue_clean(ssk);
			inet_csk_listen_stop(ssk);
		}
		__tcp_close(ssk, 0);
		__tcp_close(ssk, 0);


		/* close acquired an extra ref */
		/* close acquired an extra ref */
@@ -2518,27 +2508,50 @@ static void __mptcp_retrans(struct sock *sk)
		mptcp_reset_timer(sk);
		mptcp_reset_timer(sk);
}
}


/* schedule the timeout timer for the relevant event: either close timeout
 * or mp_fail timeout. The close timeout takes precedence on the mp_fail one
 */
void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout)
{
	struct sock *sk = (struct sock *)msk;
	unsigned long timeout, close_timeout;

	if (!fail_tout && !sock_flag(sk, SOCK_DEAD))
		return;

	close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN;

	/* the close timeout takes precedence on the fail one, and here at least one of
	 * them is active
	 */
	timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout;

	sk_reset_timer(sk, &sk->sk_timer, timeout);
}

static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
{
{
	struct mptcp_subflow_context *subflow;
	struct sock *ssk = msk->first;
	struct sock *ssk;
	bool slow;
	bool slow;


	subflow = mp_fail_response_expect_subflow(msk);
	if (!ssk)
	if (subflow) {
		return;

	pr_debug("MP_FAIL doesn't respond, reset the subflow");
	pr_debug("MP_FAIL doesn't respond, reset the subflow");


		ssk = mptcp_subflow_tcp_sock(subflow);
	slow = lock_sock_fast(ssk);
	slow = lock_sock_fast(ssk);
	mptcp_subflow_reset(ssk);
	mptcp_subflow_reset(ssk);
	WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0);
	unlock_sock_fast(ssk, slow);
	unlock_sock_fast(ssk, slow);
	}

	mptcp_reset_timeout(msk, 0);
}
}


static void mptcp_worker(struct work_struct *work)
static void mptcp_worker(struct work_struct *work)
{
{
	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
	struct sock *sk = &msk->sk.icsk_inet.sk;
	struct sock *sk = &msk->sk.icsk_inet.sk;
	unsigned long fail_tout;
	int state;
	int state;


	lock_sock(sk);
	lock_sock(sk);
@@ -2575,6 +2588,8 @@ static void mptcp_worker(struct work_struct *work)
	if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
	if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
		__mptcp_retrans(sk);
		__mptcp_retrans(sk);


	fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0;
	if (fail_tout && time_after(jiffies, fail_tout))
		mptcp_mp_fail_no_response(msk);
		mptcp_mp_fail_no_response(msk);


unlock:
unlock:
@@ -2822,6 +2837,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
static void mptcp_close(struct sock *sk, long timeout)
static void mptcp_close(struct sock *sk, long timeout)
{
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk = mptcp_sk(sk);
	bool do_cancel_work = false;
	bool do_cancel_work = false;


	lock_sock(sk);
	lock_sock(sk);
@@ -2840,10 +2856,16 @@ static void mptcp_close(struct sock *sk, long timeout)
cleanup:
cleanup:
	/* orphan all the subflows */
	/* orphan all the subflows */
	inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
	inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
	mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		bool slow = lock_sock_fast_nested(ssk);
		bool slow = lock_sock_fast_nested(ssk);


		/* since the close timeout takes precedence on the fail one,
		 * cancel the latter
		 */
		if (ssk == msk->first)
			subflow->fail_tout = 0;

		sock_orphan(ssk);
		sock_orphan(ssk);
		unlock_sock_fast(ssk, slow);
		unlock_sock_fast(ssk, slow);
	}
	}
@@ -2852,13 +2874,13 @@ static void mptcp_close(struct sock *sk, long timeout)
	sock_hold(sk);
	sock_hold(sk);
	pr_debug("msk=%p state=%d", sk, sk->sk_state);
	pr_debug("msk=%p state=%d", sk, sk->sk_state);
	if (mptcp_sk(sk)->token)
	if (mptcp_sk(sk)->token)
		mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
		mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);


	if (sk->sk_state == TCP_CLOSE) {
	if (sk->sk_state == TCP_CLOSE) {
		__mptcp_destroy_sock(sk);
		__mptcp_destroy_sock(sk);
		do_cancel_work = true;
		do_cancel_work = true;
	} else {
	} else {
		sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
		mptcp_reset_timeout(msk, 0);
	}
	}
	release_sock(sk);
	release_sock(sk);
	if (do_cancel_work)
	if (do_cancel_work)
+20 −4
Original line number Original line Diff line number Diff line
@@ -306,6 +306,7 @@ struct mptcp_sock {


	u32 setsockopt_seq;
	u32 setsockopt_seq;
	char		ca_name[TCP_CA_NAME_MAX];
	char		ca_name[TCP_CA_NAME_MAX];
	struct mptcp_sock	*dl_next;
};
};


#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
@@ -468,7 +469,6 @@ struct mptcp_subflow_context {
		local_id_valid : 1, /* local_id is correctly initialized */
		local_id_valid : 1, /* local_id is correctly initialized */
		valid_csum_seen : 1;        /* at least one csum validated */
		valid_csum_seen : 1;        /* at least one csum validated */
	enum mptcp_data_avail data_avail;
	enum mptcp_data_avail data_avail;
	bool	mp_fail_response_expect;
	u32	remote_nonce;
	u32	remote_nonce;
	u64	thmac;
	u64	thmac;
	u32	local_nonce;
	u32	local_nonce;
@@ -482,6 +482,7 @@ struct mptcp_subflow_context {
	u8	stale_count;
	u8	stale_count;


	long	delegated_status;
	long	delegated_status;
	unsigned long	fail_tout;


	);
	);


@@ -608,6 +609,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
		     struct mptcp_subflow_context *subflow);
		     struct mptcp_subflow_context *subflow);
void mptcp_subflow_send_ack(struct sock *ssk);
void mptcp_subflow_send_ack(struct sock *ssk);
void mptcp_subflow_reset(struct sock *ssk);
void mptcp_subflow_reset(struct sock *ssk);
void mptcp_subflow_queue_clean(struct sock *ssk);
void mptcp_sock_graft(struct sock *sk, struct socket *parent);
void mptcp_sock_graft(struct sock *sk, struct socket *parent);
struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);


@@ -662,6 +664,7 @@ void mptcp_get_options(const struct sk_buff *skb,


void mptcp_finish_connect(struct sock *sk);
void mptcp_finish_connect(struct sock *sk);
void __mptcp_set_connected(struct sock *sk);
void __mptcp_set_connected(struct sock *sk);
void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout);
static inline bool mptcp_is_fully_established(struct sock *sk)
static inline bool mptcp_is_fully_established(struct sock *sk)
{
{
	return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
	return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
@@ -926,12 +929,25 @@ static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
	set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
	set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}
}


static inline void mptcp_do_fallback(struct sock *sk)
static inline void mptcp_do_fallback(struct sock *ssk)
{
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct sock *sk = subflow->conn;
	struct mptcp_sock *msk;


	msk = mptcp_sk(sk);
	__mptcp_do_fallback(msk);
	__mptcp_do_fallback(msk);
	if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) {
		gfp_t saved_allocation = ssk->sk_allocation;

		/* we are in a atomic (BH) scope, override ssk default for data
		 * fin allocation
		 */
		ssk->sk_allocation = GFP_ATOMIC;
		ssk->sk_shutdown |= SEND_SHUTDOWN;
		tcp_shutdown(ssk, SEND_SHUTDOWN);
		ssk->sk_allocation = saved_allocation;
	}
}
}


#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a)
#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a)
Loading