Commit 949dfdcf authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge branch 'mptcp-improve-mptcp-level-window-tracking'

Mat Martineau says:

====================
mptcp: Improve MPTCP-level window tracking

This series improves MPTCP receive window compliance with RFC 8684 and
helps increase throughput on high-speed links. Note that patch 3 makes a
change in tcp_output.c.

For the details, Paolo says:

I've been chasing bad/unstable performance with multiple subflows
on very high speed links.

It looks like the root cause is due to the current mptcp-level
congestion window handling. There are apparently a few different
sub-issues:

- the rcv_wnd is not effectively shared on the tx side, as each
  subflow takes into account only the value received by the underlying
  TCP connection. This is addressed in patch 1/5

- The mptcp-level offered wnd right edge is currently allowed to shrink.
  Reading section 3.3.4.:

"""
   The receive window is relative to the DATA_ACK.  As in TCP, a
   receiver MUST NOT shrink the right edge of the receive window (i.e.,
   DATA_ACK + receive window).  The receiver will use the data sequence
   number to tell if a packet should be accepted at the connection
   level.
"""

I read the above as we need to reflect window right-edge tracking
on the wire, see patch 4/5.

- The offered window right edge tracking can happen concurrently on
  multiple subflows, but there is no mutex protection. We need an
  additional atomic operation - still patch 4/5

This series additionally bumps a few new MIBs to track all the above
(ensure/observe that the suspected races actually take place).

I could no longer access the host where the issue was most
noticeable; still, in the current setup the throughput changes from
an unstable [6-18] Gbps to a very stable 19 Gbps.
====================

Link: https://lore.kernel.org/r/20220504215408.349318-1-mathew.j.martineau@linux.intel.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 10b4a11f 38acb626
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -125,7 +125,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
			       struct mptcp_out_options *opts);
bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);

void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
			 struct mptcp_out_options *opts);

void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info);
+8 −6
Original line number Diff line number Diff line
@@ -445,12 +445,13 @@ struct tcp_out_options {
	struct mptcp_out_options mptcp;
};

static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp,
static void mptcp_options_write(struct tcphdr *th, __be32 *ptr,
				struct tcp_sock *tp,
				struct tcp_out_options *opts)
{
#if IS_ENABLED(CONFIG_MPTCP)
	if (unlikely(OPTION_MPTCP & opts->options))
		mptcp_write_options(ptr, tp, &opts->mptcp);
		mptcp_write_options(th, ptr, tp, &opts->mptcp);
#endif
}

@@ -606,9 +607,10 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
 * At least SACK_PERM as the first option is known to lead to a disaster
 * (but it may well be that other scenarios fail similarly).
 */
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
			      struct tcp_out_options *opts)
{
	__be32 *ptr = (__be32 *)(th + 1);
	u16 options = opts->options;	/* mungable copy */

	if (unlikely(OPTION_MD5 & options)) {
@@ -702,7 +704,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,

	smc_options_write(ptr, &options);

	mptcp_options_write(ptr, tp, opts);
	mptcp_options_write(th, ptr, tp, opts);
}

static void smc_set_option(const struct tcp_sock *tp,
@@ -1355,7 +1357,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
		th->window	= htons(min(tp->rcv_wnd, 65535U));
	}

	tcp_options_write((__be32 *)(th + 1), tp, &opts);
	tcp_options_write(th, tp, &opts);

#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
@@ -3591,7 +3593,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
	tcp_options_write(th, NULL, &opts);
	th->doff = (tcp_header_size >> 2);
	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);

+4 −0
Original line number Diff line number Diff line
@@ -56,6 +56,10 @@ static const struct snmp_mib mptcp_snmp_list[] = {
	SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED),
	SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE),
	SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER),
	SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED),
	SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED),
	SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE),
	SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT),
	SNMP_MIB_SENTINEL
};

+6 −0
Original line number Diff line number Diff line
@@ -49,6 +49,12 @@ enum linux_mptcp_mib_field {
	MPTCP_MIB_RCVPRUNED,		/* Incoming packet dropped due to memory limit */
	MPTCP_MIB_SUBFLOWSTALE,		/* Subflows entered 'stale' status */
	MPTCP_MIB_SUBFLOWRECOVER,	/* Subflows returned to active status after being stale */
	MPTCP_MIB_SNDWNDSHARED,		/* Subflow snd wnd is overridden by msk's one */
	MPTCP_MIB_RCVWNDSHARED,		/* Subflow rcv wnd is overridden by msk's one */
	MPTCP_MIB_RCVWNDCONFLICTUPDATE,	/* subflow rcv wnd is overridden by msk's one due to
					 * conflict with another subflow while updating msk rcv wnd
					 */
	MPTCP_MIB_RCVWNDCONFLICT,	/* Conflict with while updating msk rcv wnd */
	__MPTCP_MIB_MAX
};

+50 −8
Original line number Diff line number Diff line
@@ -1224,20 +1224,62 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
	return true;
}

static void mptcp_set_rwin(const struct tcp_sock *tp)
static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
{
	const struct sock *ssk = (const struct sock *)tp;
	const struct mptcp_subflow_context *subflow;
	struct mptcp_subflow_context *subflow;
	u64 ack_seq, rcv_wnd_old, rcv_wnd_new;
	struct mptcp_sock *msk;
	u64 ack_seq;
	u32 new_win;
	u64 win;

	subflow = mptcp_subflow_ctx(ssk);
	msk = mptcp_sk(subflow->conn);

	ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;
	ack_seq = READ_ONCE(msk->ack_seq);
	rcv_wnd_new = ack_seq + tp->rcv_wnd;

	rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent);
	if (after64(rcv_wnd_new, rcv_wnd_old)) {
		u64 rcv_wnd;

		for (;;) {
			rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new);

			if (rcv_wnd == rcv_wnd_old)
				break;
			if (before64(rcv_wnd_new, rcv_wnd)) {
				MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE);
				goto raise_win;
			}
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT);
			rcv_wnd_old = rcv_wnd;
		}
		return;
	}

	if (rcv_wnd_new != rcv_wnd_old) {
raise_win:
		win = rcv_wnd_old - ack_seq;
		tp->rcv_wnd = min_t(u64, win, U32_MAX);
		new_win = tp->rcv_wnd;

	if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
		WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
		/* Make sure we do not exceed the maximum possible
		 * scaled window.
		 */
		if (unlikely(th->syn))
			new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale;
		if (!tp->rx_opt.rcv_wscale &&
		    sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)
			new_win = min(new_win, MAX_TCP_WINDOW);
		else
			new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

		/* RFC1323 scaling applied */
		new_win >>= tp->rx_opt.rcv_wscale;
		th->window = htons(new_win);
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED);
	}
}

u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
@@ -1265,7 +1307,7 @@ static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
				 ~csum_unfold(mpext->csum));
}

void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
			 struct mptcp_out_options *opts)
{
	const struct sock *ssk = (const struct sock *)tp;
@@ -1554,7 +1596,7 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
	}

	if (tp)
		mptcp_set_rwin(tp);
		mptcp_set_rwin(tp, th);
}

__be32 mptcp_get_reset_option(const struct sk_buff *skb)
Loading