Commit cb0f8b03 authored by David S. Miller
Browse files

Merge branch 'mptcp-next'



Mat Martineau says:

====================
mptcp: Optimize output options and add MP_FAIL

This patch set contains two groups of changes that we've been testing in
the MPTCP tree.

The first optimizes the code path and data structure for populating
MPTCP option headers when transmitting.

Patch 1 reorganizes code to reduce the number of conditionals that need
to be evaluated in common cases.

Patch 2 rearranges struct mptcp_out_options to save 80 bytes (on x86_64).

The next five patches add partial support for the MP_FAIL option as
defined in RFC 8684. MP_FAIL is an option header used to cleanly handle
MPTCP checksum failures. When the MPTCP checksum detects an error in the
MPTCP DSS header or the data mapped by that header, the receiver uses a
TCP RST with MP_FAIL to close the subflow that experienced the error and
provide associated MPTCP sequence number information to the peer. RFC
8684 also describes how a single-subflow connection can discard corrupt
data and remain connected under certain conditions using MP_FAIL, but
that feature is not implemented here.

Patches 3-5 implement MP_FAIL transmit and receive, and integrate with
checksum validation.

Patches 6 & 7 add MP_FAIL selftests and the MIBs required for those
tests.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents d484dc2b 6bb3ab49
Loading
Loading
Loading
Loading
+20 −9
Original line number Diff line number Diff line
@@ -58,10 +58,6 @@ struct mptcp_addr_info {
struct mptcp_out_options {
#if IS_ENABLED(CONFIG_MPTCP)
	u16 suboptions;
	u64 sndr_key;
	u64 rcvr_key;
	u64 ahmac;
	struct mptcp_addr_info addr;
	struct mptcp_rm_list rm_list;
	u8 join_id;
	u8 backup;
@@ -69,11 +65,26 @@ struct mptcp_out_options {
	   reset_transient:1,
	   csum_reqd:1,
	   allow_join_id0:1;
	union {
		struct {
			u64 sndr_key;
			u64 rcvr_key;
		};
		struct {
			struct mptcp_addr_info addr;
			u64 ahmac;
		};
		struct {
			struct mptcp_ext ext_copy;
			u64 fail_seq;
		};
		struct {
			u32 nonce;
	u64 thmac;
			u32 token;
			u64 thmac;
			u8 hmac[20];
	struct mptcp_ext ext_copy;
		};
	};
#endif
};

+2 −0
Original line number Diff line number Diff line
@@ -44,6 +44,8 @@ static const struct snmp_mib mptcp_snmp_list[] = {
	SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW),
	SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX),
	SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX),
	SNMP_MIB_ITEM("MPFailTx", MPTCP_MIB_MPFAILTX),
	SNMP_MIB_ITEM("MPFailRx", MPTCP_MIB_MPFAILRX),
	SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED),
	SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE),
	SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER),
+2 −0
Original line number Diff line number Diff line
@@ -37,6 +37,8 @@ enum linux_mptcp_mib_field {
	MPTCP_MIB_RMSUBFLOW,		/* Remove a subflow */
	MPTCP_MIB_MPPRIOTX,		/* Transmit a MP_PRIO */
	MPTCP_MIB_MPPRIORX,		/* Received a MP_PRIO */
	MPTCP_MIB_MPFAILTX,		/* Transmit a MP_FAIL */
	MPTCP_MIB_MPFAILRX,		/* Received a MP_FAIL */
	MPTCP_MIB_RCVPRUNED,		/* Incoming packet dropped due to memory limit */
	MPTCP_MIB_SUBFLOWSTALE,		/* Subflows entered 'stale' status */
	MPTCP_MIB_SUBFLOWRECOVER,	/* Subflows returned to active status after being stale */
+192 −113
Original line number Diff line number Diff line
@@ -336,6 +336,16 @@ static void mptcp_parse_option(const struct sk_buff *skb,
		mp_opt->reset_reason = *ptr;
		break;

	case MPTCPOPT_MP_FAIL:
		if (opsize != TCPOLEN_MPTCP_FAIL)
			break;

		ptr += 2;
		mp_opt->mp_fail = 1;
		mp_opt->fail_seq = get_unaligned_be64(ptr);
		pr_debug("MP_FAIL: data_seq=%llu", mp_opt->fail_seq);
		break;

	default:
		break;
	}
@@ -364,6 +374,7 @@ void mptcp_get_options(const struct sock *sk,
	mp_opt->reset = 0;
	mp_opt->csum_reqd = READ_ONCE(msk->csum_enabled);
	mp_opt->deny_join_id0 = 0;
	mp_opt->mp_fail = 0;

	length = (th->doff * 4) - sizeof(struct tcphdr);
	ptr = (const unsigned char *)(th + 1);
@@ -592,6 +603,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
		dss_size = map_size;
		if (skb && snd_data_fin_enable)
			mptcp_write_data_fin(subflow, skb, &opts->ext_copy);
		opts->suboptions = OPTION_MPTCP_DSS;
		ret = true;
	}

@@ -615,6 +627,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
		opts->ext_copy.ack64 = 0;
	}
	opts->ext_copy.use_ack = 1;
	opts->suboptions = OPTION_MPTCP_DSS;
	WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));

	/* Add kind/length/subtype/flag overhead if mapping is not populated */
@@ -686,8 +699,13 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
	if (drop_other_suboptions) {
		pr_debug("drop other suboptions");
		opts->suboptions = 0;
		opts->ext_copy.use_ack = 0;
		opts->ext_copy.use_map = 0;

		/* note that e.g. DSS could have written into the memory
		 * aliased by ahmac, we must reset the field here
		 * to avoid appending the hmac even for ADD_ADDR echo
		 * options
		 */
		opts->ahmac = 0;
		*size -= opt_size;
	}
	opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
@@ -739,7 +757,12 @@ static bool mptcp_established_options_mp_prio(struct sock *sk,
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	if (!subflow->send_mp_prio)
	/* can't send MP_PRIO with MPC, as they share the same option space:
	 * 'backup'. Also it makes no sense at all
	 */
	if (!subflow->send_mp_prio ||
	    ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
	      OPTION_MPTCP_MPC_ACK) & opts->suboptions))
		return false;

	/* account for the trailing 'nop' option */
@@ -755,7 +778,7 @@ static bool mptcp_established_options_mp_prio(struct sock *sk,
	return true;
}

static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb,
static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb,
						   unsigned int *size,
						   unsigned int remaining,
						   struct mptcp_out_options *opts)
@@ -763,12 +786,36 @@ static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_bu
	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	if (remaining < TCPOLEN_MPTCP_RST)
		return;
		return false;

	*size = TCPOLEN_MPTCP_RST;
	opts->suboptions |= OPTION_MPTCP_RST;
	opts->reset_transient = subflow->reset_transient;
	opts->reset_reason = subflow->reset_reason;

	return true;
}

static bool mptcp_established_options_mp_fail(struct sock *sk,
					      unsigned int *size,
					      unsigned int remaining,
					      struct mptcp_out_options *opts)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	if (likely(!subflow->send_mp_fail))
		return false;

	if (remaining < TCPOLEN_MPTCP_FAIL)
		return false;

	*size = TCPOLEN_MPTCP_FAIL;
	opts->suboptions |= OPTION_MPTCP_FAIL;
	opts->fail_seq = subflow->map_seq;

	pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq);

	return true;
}

bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
@@ -787,15 +834,28 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
		return false;

	if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) {
		mptcp_established_options_rst(sk, skb, size, remaining, opts);
		if (mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) {
			*size += opt_size;
			remaining -= opt_size;
		}
		if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) {
			*size += opt_size;
			remaining -= opt_size;
		}
		return true;
	}

	snd_data_fin = mptcp_data_fin_enabled(msk);
	if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts))
		ret = true;
	else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts))
	else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) {
		ret = true;
		if (mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) {
			*size += opt_size;
			remaining -= opt_size;
			return true;
		}
	}

	/* we reserved enough space for the above options, and exceeding the
	 * TCP option space would be fatal
@@ -1096,6 +1156,12 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
		mp_opt.mp_prio = 0;
	}

	if (mp_opt.mp_fail) {
		mptcp_pm_mp_fail_received(sk, mp_opt.fail_seq);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX);
		mp_opt.mp_fail = 0;
	}

	if (mp_opt.reset) {
		subflow->reset_seen = 1;
		subflow->reset_reason = mp_opt.reset_reason;
@@ -1198,7 +1264,87 @@ static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
			 struct mptcp_out_options *opts)
{
	if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
	if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) {
		const struct sock *ssk = (const struct sock *)tp;
		struct mptcp_subflow_context *subflow;

		subflow = mptcp_subflow_ctx(ssk);
		subflow->send_mp_fail = 0;

		*ptr++ = mptcp_option(MPTCPOPT_MP_FAIL,
				      TCPOLEN_MPTCP_FAIL,
				      0, 0);
		put_unaligned_be64(opts->fail_seq, ptr);
		ptr += 2;
	}

	/* RST is mutually exclusive with everything else */
	if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) {
		*ptr++ = mptcp_option(MPTCPOPT_RST,
				      TCPOLEN_MPTCP_RST,
				      opts->reset_transient,
				      opts->reset_reason);
		return;
	}

	/* DSS, MPC, MPJ and ADD_ADDR are mutually exclusive, see
	 * mptcp_established_options*()
	 */
	if (likely(OPTION_MPTCP_DSS & opts->suboptions)) {
		struct mptcp_ext *mpext = &opts->ext_copy;
		u8 len = TCPOLEN_MPTCP_DSS_BASE;
		u8 flags = 0;

		if (mpext->use_ack) {
			flags = MPTCP_DSS_HAS_ACK;
			if (mpext->ack64) {
				len += TCPOLEN_MPTCP_DSS_ACK64;
				flags |= MPTCP_DSS_ACK64;
			} else {
				len += TCPOLEN_MPTCP_DSS_ACK32;
			}
		}

		if (mpext->use_map) {
			len += TCPOLEN_MPTCP_DSS_MAP64;

			/* Use only 64-bit mapping flags for now, add
			 * support for optional 32-bit mappings later.
			 */
			flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
			if (mpext->data_fin)
				flags |= MPTCP_DSS_DATA_FIN;

			if (opts->csum_reqd)
				len += TCPOLEN_MPTCP_DSS_CHECKSUM;
		}

		*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);

		if (mpext->use_ack) {
			if (mpext->ack64) {
				put_unaligned_be64(mpext->data_ack, ptr);
				ptr += 2;
			} else {
				put_unaligned_be32(mpext->data_ack32, ptr);
				ptr += 1;
			}
		}

		if (mpext->use_map) {
			put_unaligned_be64(mpext->data_seq, ptr);
			ptr += 2;
			put_unaligned_be32(mpext->subflow_seq, ptr);
			ptr += 1;
			if (opts->csum_reqd) {
				put_unaligned_be32(mpext->data_len << 16 |
						   mptcp_make_csum(mpext), ptr);
			} else {
				put_unaligned_be32(mpext->data_len << 16 |
						   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
			}
		}
	} else if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
		    OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
		u8 len, flag = MPTCP_CAP_HMAC_SHA256;

@@ -1246,10 +1392,31 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
					   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
		}
		ptr += 1;
	}

mp_capable_done:
	if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
		/* MPC is additionally mutually exclusive with MP_PRIO */
		goto mp_capable_done;
	} else if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
				      TCPOLEN_MPTCP_MPJ_SYN,
				      opts->backup, opts->join_id);
		put_unaligned_be32(opts->token, ptr);
		ptr += 1;
		put_unaligned_be32(opts->nonce, ptr);
		ptr += 1;
	} else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
				      TCPOLEN_MPTCP_MPJ_SYNACK,
				      opts->backup, opts->join_id);
		put_unaligned_be64(opts->thmac, ptr);
		ptr += 2;
		put_unaligned_be32(opts->nonce, ptr);
		ptr += 1;
	} else if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
				      TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
		memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
		ptr += 5;
	} else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
		u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
		u8 echo = MPTCP_ADDR_ECHO;

@@ -1307,6 +1474,19 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
		}
	}

	if (OPTION_MPTCP_PRIO & opts->suboptions) {
		const struct sock *ssk = (const struct sock *)tp;
		struct mptcp_subflow_context *subflow;

		subflow = mptcp_subflow_ctx(ssk);
		subflow->send_mp_prio = 0;

		*ptr++ = mptcp_option(MPTCPOPT_MP_PRIO,
				      TCPOLEN_MPTCP_PRIO,
				      opts->backup, TCPOPT_NOP);
	}

mp_capable_done:
	if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
		u8 i = 1;

@@ -1327,107 +1507,6 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
		}
	}

	if (OPTION_MPTCP_PRIO & opts->suboptions) {
		const struct sock *ssk = (const struct sock *)tp;
		struct mptcp_subflow_context *subflow;

		subflow = mptcp_subflow_ctx(ssk);
		subflow->send_mp_prio = 0;

		*ptr++ = mptcp_option(MPTCPOPT_MP_PRIO,
				      TCPOLEN_MPTCP_PRIO,
				      opts->backup, TCPOPT_NOP);
	}

	if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
				      TCPOLEN_MPTCP_MPJ_SYN,
				      opts->backup, opts->join_id);
		put_unaligned_be32(opts->token, ptr);
		ptr += 1;
		put_unaligned_be32(opts->nonce, ptr);
		ptr += 1;
	}

	if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
				      TCPOLEN_MPTCP_MPJ_SYNACK,
				      opts->backup, opts->join_id);
		put_unaligned_be64(opts->thmac, ptr);
		ptr += 2;
		put_unaligned_be32(opts->nonce, ptr);
		ptr += 1;
	}

	if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
				      TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
		memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
		ptr += 5;
	}

	if (OPTION_MPTCP_RST & opts->suboptions)
		*ptr++ = mptcp_option(MPTCPOPT_RST,
				      TCPOLEN_MPTCP_RST,
				      opts->reset_transient,
				      opts->reset_reason);

	if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
		struct mptcp_ext *mpext = &opts->ext_copy;
		u8 len = TCPOLEN_MPTCP_DSS_BASE;
		u8 flags = 0;

		if (mpext->use_ack) {
			flags = MPTCP_DSS_HAS_ACK;
			if (mpext->ack64) {
				len += TCPOLEN_MPTCP_DSS_ACK64;
				flags |= MPTCP_DSS_ACK64;
			} else {
				len += TCPOLEN_MPTCP_DSS_ACK32;
			}
		}

		if (mpext->use_map) {
			len += TCPOLEN_MPTCP_DSS_MAP64;

			/* Use only 64-bit mapping flags for now, add
			 * support for optional 32-bit mappings later.
			 */
			flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
			if (mpext->data_fin)
				flags |= MPTCP_DSS_DATA_FIN;

			if (opts->csum_reqd)
				len += TCPOLEN_MPTCP_DSS_CHECKSUM;
		}

		*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);

		if (mpext->use_ack) {
			if (mpext->ack64) {
				put_unaligned_be64(mpext->data_ack, ptr);
				ptr += 2;
			} else {
				put_unaligned_be32(mpext->data_ack32, ptr);
				ptr += 1;
			}
		}

		if (mpext->use_map) {
			put_unaligned_be64(mpext->data_seq, ptr);
			ptr += 2;
			put_unaligned_be32(mpext->subflow_seq, ptr);
			ptr += 1;
			if (opts->csum_reqd) {
				put_unaligned_be32(mpext->data_len << 16 |
						   mptcp_make_csum(mpext), ptr);
			} else {
				put_unaligned_be32(mpext->data_len << 16 |
						   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
			}
		}
	}

	if (tp)
		mptcp_set_rwin(tp);
}
+5 −0
Original line number Diff line number Diff line
@@ -249,6 +249,11 @@ void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup)
	mptcp_event(MPTCP_EVENT_SUB_PRIORITY, mptcp_sk(subflow->conn), sk, GFP_ATOMIC);
}

/* Path-manager hook called when an MP_FAIL option arrives on subflow @sk.
 * @fail_seq: MPTCP data sequence number carried in the peer's MP_FAIL.
 *
 * Currently a stub that only logs the event; no recovery action is taken
 * here yet (per the cover letter, single-subflow recovery via MP_FAIL is
 * not implemented in this series).
 */
void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
{
	pr_debug("fail_seq=%llu", fail_seq);
}

/* path manager helpers */

bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb,
Loading