Commit 232e3683 authored by David S. Miller

Merge branch 'mptcp-fixes'
Mat Martineau says:

====================
mptcp: More v5.13 fixes

Here's another batch of MPTCP fixes for v5.13.

Patch 1 cleans up memory accounting between the MPTCP-level socket and
the subflows to more reliably transfer forward-allocated memory under
pressure.

Patch 2 wakes up socket readers more reliably.

Patch 3 changes a WARN_ONCE to a pr_debug.

Patch 4 changes the selftests to only use syncookies in test cases where
they do not cause spurious failures.

Patch 5 modifies socket error reporting to avoid a possible soft lockup.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 22488e45 499ada50
net/mptcp/protocol.c  +27 −25
@@ -280,11 +280,13 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
 
 	/* try to fetch required memory from subflow */
 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
-		if (ssk->sk_forward_alloc < skb->truesize)
-			goto drop;
-		__sk_mem_reclaim(ssk, skb->truesize);
-		if (!sk_rmem_schedule(sk, skb, skb->truesize))
+		int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT;
+
+		if (ssk->sk_forward_alloc < amount)
 			goto drop;
+
+		ssk->sk_forward_alloc -= amount;
+		sk->sk_forward_alloc += amount;
 	}
 
 	/* the skb map_seq accounts for the skb offset:
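A note on patch 1: forward-allocated memory is transferred between the
subflow and the msk in whole quanta. A minimal userspace sketch of the
rounding the new code performs (not kernel code; assumes 4 KiB pages,
i.e. SK_MEM_QUANTUM == PAGE_SIZE == 4096):

	#include <stdio.h>

	/* stand-ins for the kernel constants, assuming 4 KiB pages */
	#define SK_MEM_QUANTUM		4096
	#define SK_MEM_QUANTUM_SHIFT	12

	/* mirrors the kernel's sk_mem_pages(): bytes rounded up to pages */
	static int sk_mem_pages(int amt)
	{
		return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
	}

	int main(void)
	{
		int truesize = 2304;	/* hypothetical skb->truesize */
		int amount = sk_mem_pages(truesize) << SK_MEM_QUANTUM_SHIFT;

		/* prints 4096: a partial page still costs a full quantum,
		 * so both sockets' forward allocations stay page-aligned
		 */
		printf("truesize=%d -> amount=%d\n", truesize, amount);
		return 0;
	}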
@@ -668,18 +670,22 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
 /* In most cases we will be able to lock the mptcp socket.  If its already
  * owned, we need to defer to the work queue to avoid ABBA deadlock.
  */
-static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
+static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 {
 	struct sock *sk = (struct sock *)msk;
 	unsigned int moved = 0;
 
 	if (inet_sk_state_load(sk) == TCP_CLOSE)
-		return;
-
-	mptcp_data_lock(sk);
+		return false;
 
 	__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
 	__mptcp_ofo_queue(msk);
+	if (unlikely(ssk->sk_err)) {
+		if (!sock_owned_by_user(sk))
+			__mptcp_error_report(sk);
+		else
+			set_bit(MPTCP_ERROR_REPORT,  &msk->flags);
+	}
 
 	/* If the moves have caught up with the DATA_FIN sequence number
 	 * it's time to ack the DATA_FIN and change socket state, but
@@ -688,7 +694,7 @@ static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 	 */
 	if (mptcp_pending_data_fin(sk, NULL))
 		mptcp_schedule_work(sk);
-	mptcp_data_unlock(sk);
+	return moved > 0;
 }
 
 void mptcp_data_ready(struct sock *sk, struct sock *ssk)
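The "ABBA deadlock" the comment above refers to, in miniature: two
contexts that take the same pair of locks in opposite orders can each
end up waiting on the other forever. A hypothetical pthreads sketch,
not kernel code:

	#include <pthread.h>

	static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

	static void *thread_one(void *arg)
	{
		pthread_mutex_lock(&lock_a);	/* holds A ... */
		pthread_mutex_lock(&lock_b);	/* ... then wants B */
		pthread_mutex_unlock(&lock_b);
		pthread_mutex_unlock(&lock_a);
		return arg;
	}

	static void *thread_two(void *arg)
	{
		pthread_mutex_lock(&lock_b);	/* holds B ... */
		pthread_mutex_lock(&lock_a);	/* ... then wants A: deadlock */
		pthread_mutex_unlock(&lock_a);
		pthread_mutex_unlock(&lock_b);
		return arg;
	}

	int main(void)
	{
		pthread_t t1, t2;

		pthread_create(&t1, NULL, thread_one, NULL);
		pthread_create(&t2, NULL, thread_two, NULL);
		pthread_join(t1, NULL);	/* may never return */
		pthread_join(t2, NULL);
		return 0;
	}

MPTCP sidesteps this by deferring the work to its work queue whenever
the msk lock cannot be taken in the required order.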
@@ -696,7 +702,6 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	int sk_rbuf, ssk_rbuf;
-	bool wake;
 
 	/* The peer can send data while we are shutting down this
 	 * subflow at msk destruction time, but we must avoid enqueuing
@@ -705,29 +710,23 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 	if (unlikely(subflow->disposable))
 		return;
 
-	/* move_skbs_to_msk below can legitly clear the data_avail flag,
-	 * but we will need later to properly woke the reader, cache its
-	 * value
-	 */
-	wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
-	if (wake)
-		set_bit(MPTCP_DATA_READY, &msk->flags);
-
 	ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
 	sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
 	if (unlikely(ssk_rbuf > sk_rbuf))
 		sk_rbuf = ssk_rbuf;
 
-	/* over limit? can't append more skbs to msk */
+	/* over limit? can't append more skbs to msk, Also, no need to wake-up*/
 	if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
-		goto wake;
-
-	move_skbs_to_msk(msk, ssk);
-
-wake:
-	if (wake)
+		return;
+
+	/* Wake-up the reader only for in-sequence data */
+	mptcp_data_lock(sk);
+	if (move_skbs_to_msk(msk, ssk)) {
+		set_bit(MPTCP_DATA_READY, &msk->flags);
 		sk->sk_data_ready(sk);
+	}
+	mptcp_data_unlock(sk);
 }
 
 static bool mptcp_do_flush_join_list(struct mptcp_sock *msk)
 {
@@ -858,7 +857,7 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
 	sock_owned_by_me(sk);
 
 	mptcp_for_each_subflow(msk, subflow) {
-		if (subflow->data_avail)
+		if (READ_ONCE(subflow->data_avail))
 			return mptcp_subflow_tcp_sock(subflow);
 	}
 
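Why the plain accesses to data_avail become READ_ONCE()/WRITE_ONCE()
pairs throughout this series: the flag is written from subflow context
and read concurrently from msk context without a common lock. A
simplified sketch of what the kernel macros boil down to (the real
definitions live in include/asm-generic/rwonce.h and add type checks
plus handling for accesses wider than a word):

	/* a single volatile access: the compiler may not tear, fuse,
	 * or silently re-load it; exactly what a lockless flag needs
	 */
	#define READ_ONCE(x)		(*(const volatile typeof(x) *)&(x))
	#define WRITE_ONCE(x, val)	(*(volatile typeof(x) *)&(x) = (val))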
@@ -1955,6 +1954,9 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
 		mptcp_data_unlock(sk);
 		tcp_cleanup_rbuf(ssk, moved);
+
+		if (unlikely(ssk->sk_err))
+			__mptcp_error_report(sk);
 		unlock_sock_fast(ssk, slowpath);
 	} while (!done);
 
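Patch 5's pattern shows up in both error paths above: report the
subflow error immediately only when it is safe to run the callbacks,
otherwise record a flag and let the socket-lock owner replay it.
Roughly how the deferred half has been handled since the
MPTCP_ERROR_REPORT flag was introduced (a simplified sketch of the
release callback, not the full function):

	/* on releasing the msk socket lock, replay deferred events */
	if (test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->flags))
		__mptcp_error_report(sk);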
net/mptcp/protocol.h  +0 −1
@@ -362,7 +362,6 @@ mptcp_subflow_rsk(const struct request_sock *rsk)
 enum mptcp_data_avail {
 	MPTCP_SUBFLOW_NODATA,
 	MPTCP_SUBFLOW_DATA_AVAIL,
-	MPTCP_SUBFLOW_OOO_DATA
 };
 
 struct mptcp_delegated_action {
net/mptcp/subflow.c  +53 −55
@@ -784,9 +784,9 @@ static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
 	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
 }
 
-static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
+static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
 {
-	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
+	pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
 		 ssn, subflow->map_subflow_seq, subflow->map_data_len);
 }
 
@@ -812,13 +812,13 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
 		/* Mapping covers data later in the subflow stream,
 		 * currently unsupported.
 		 */
-		warn_bad_map(subflow, ssn);
+		dbg_bad_map(subflow, ssn);
 		return false;
 	}
 	if (unlikely(!before(ssn, subflow->map_subflow_seq +
 				  subflow->map_data_len))) {
 		/* Mapping does covers past subflow data, invalid */
-		warn_bad_map(subflow, ssn + skb->len);
+		dbg_bad_map(subflow, ssn);
 		return false;
 	}
 	return true;
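The WARN_ONCE()-to-pr_debug() switch matters because a remote peer can
feed this path arbitrary mappings, and a warning splat (fatal under
panic_on_warn) must not be network-triggerable. pr_debug() is compiled
out by default; a userspace sketch of that behaviour, using a
hypothetical stand-in macro:

	#include <stdio.h>

	#ifdef DEBUG
	#define pr_debug(...)	fprintf(stderr, __VA_ARGS__)
	#else
	#define pr_debug(...)	do { } while (0)	/* silent by default */
	#endif

	int main(void)
	{
		/* emits nothing unless built with -DDEBUG (the kernel
		 * analogue: define DEBUG, or enable the callsite via
		 * dynamic debug)
		 */
		pr_debug("Bad mapping: ssn=%d\n", 42);
		return 0;
	}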
@@ -1000,7 +1000,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
 	struct sk_buff *skb;
 
 	if (!skb_peek(&ssk->sk_receive_queue))
-		subflow->data_avail = 0;
+		WRITE_ONCE(subflow->data_avail, 0);
 	if (subflow->data_avail)
 		return true;
 
@@ -1039,18 +1039,13 @@ static bool subflow_check_data_avail(struct sock *ssk)
 		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
 		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
 			 ack_seq);
-		if (ack_seq == old_ack) {
-			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
-			break;
-		} else if (after64(ack_seq, old_ack)) {
-			subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
-			break;
+		if (unlikely(before64(ack_seq, old_ack))) {
+			mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
+			continue;
 		}
 
-		/* only accept in-sequence mapping. Old values are spurious
-		 * retransmission
-		 */
-		mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
+		WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
+		break;
 	}
 	return true;
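The before64()/after64() helpers used above compare 64-bit sequence
numbers safely across wraparound; net/mptcp/protocol.h defines them
essentially as follows (standalone types substituted for the kernel's
__u64/__s64):

	#include <stdbool.h>
	#include <stdint.h>

	/* modular comparison: the sign of the wrapping difference
	 * decides the order, even after the counter wraps
	 */
	static inline bool before64(uint64_t seq1, uint64_t seq2)
	{
		return (int64_t)(seq1 - seq2) < 0;
	}

	#define after64(seq2, seq1)	before64(seq1, seq2)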

@@ -1065,12 +1060,11 @@ static bool subflow_check_data_avail(struct sock *ssk)
 		 * subflow_error_report() will introduce the appropriate barriers
 		 */
 		ssk->sk_err = EBADMSG;
-		ssk->sk_error_report(ssk);
 		tcp_set_state(ssk, TCP_CLOSE);
 		subflow->reset_transient = 0;
 		subflow->reset_reason = MPTCP_RST_EMPTCP;
 		tcp_send_active_reset(ssk, GFP_ATOMIC);
-		subflow->data_avail = 0;
+		WRITE_ONCE(subflow->data_avail, 0);
 		return false;
 	}
 
@@ -1080,7 +1074,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
 	subflow->map_seq = READ_ONCE(msk->ack_seq);
 	subflow->map_data_len = skb->len;
 	subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
-	subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
+	WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
 	return true;
 }
 
@@ -1092,7 +1086,7 @@ bool mptcp_subflow_data_available(struct sock *sk)
 	if (subflow->map_valid &&
 	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
 		subflow->map_valid = 0;
-		subflow->data_avail = 0;
+		WRITE_ONCE(subflow->data_avail, 0);
 
 		pr_debug("Done with mapping: seq=%u data_len=%u",
 			 subflow->map_subflow_seq,
@@ -1120,41 +1114,6 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
 	*full_space = tcp_full_space(sk);
 }
 
-static void subflow_data_ready(struct sock *sk)
-{
-	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
-	u16 state = 1 << inet_sk_state_load(sk);
-	struct sock *parent = subflow->conn;
-	struct mptcp_sock *msk;
-
-	msk = mptcp_sk(parent);
-	if (state & TCPF_LISTEN) {
-		/* MPJ subflow are removed from accept queue before reaching here,
-		 * avoid stray wakeups
-		 */
-		if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue))
-			return;
-
-		set_bit(MPTCP_DATA_READY, &msk->flags);
-		parent->sk_data_ready(parent);
-		return;
-	}
-
-	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
-		     !subflow->mp_join && !(state & TCPF_CLOSE));
-
-	if (mptcp_subflow_data_available(sk))
-		mptcp_data_ready(parent, sk);
-}
-
-static void subflow_write_space(struct sock *ssk)
-{
-	struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
-
-	mptcp_propagate_sndbuf(sk, ssk);
-	mptcp_write_space(sk);
-}
-
 void __mptcp_error_report(struct sock *sk)
 {
 	struct mptcp_subflow_context *subflow;
@@ -1195,6 +1154,43 @@ static void subflow_error_report(struct sock *ssk)
 	mptcp_data_unlock(sk);
 }
 
+static void subflow_data_ready(struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+	u16 state = 1 << inet_sk_state_load(sk);
+	struct sock *parent = subflow->conn;
+	struct mptcp_sock *msk;
+
+	msk = mptcp_sk(parent);
+	if (state & TCPF_LISTEN) {
+		/* MPJ subflow are removed from accept queue before reaching here,
+		 * avoid stray wakeups
+		 */
+		if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue))
+			return;
+
+		set_bit(MPTCP_DATA_READY, &msk->flags);
+		parent->sk_data_ready(parent);
+		return;
+	}
+
+	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
+		     !subflow->mp_join && !(state & TCPF_CLOSE));
+
+	if (mptcp_subflow_data_available(sk))
+		mptcp_data_ready(parent, sk);
+	else if (unlikely(sk->sk_err))
+		subflow_error_report(sk);
+}
+
+static void subflow_write_space(struct sock *ssk)
+{
+	struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
+
+	mptcp_propagate_sndbuf(sk, ssk);
+	mptcp_write_space(sk);
+}
+
 static struct inet_connection_sock_af_ops *
 subflow_default_af_ops(struct sock *sk)
 {
@@ -1505,6 +1501,8 @@ static void subflow_state_change(struct sock *sk)
 	 */
 	if (mptcp_subflow_data_available(sk))
 		mptcp_data_ready(parent, sk);
+	else if (unlikely(sk->sk_err))
+		subflow_error_report(sk);
 
 	subflow_sched_work_if_closed(mptcp_sk(parent), sk);
 
tools/testing/selftests/net/mptcp/mptcp_connect.sh  +8 −3
@@ -197,9 +197,6 @@ ip -net "$ns4" link set ns4eth3 up
 ip -net "$ns4" route add default via 10.0.3.2
 ip -net "$ns4" route add default via dead:beef:3::2
 
-# use TCP syn cookies, even if no flooding was detected.
-ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=2
-
 set_ethtool_flags() {
 	local ns="$1"
 	local dev="$2"
@@ -737,6 +734,14 @@ for sender in $ns1 $ns2 $ns3 $ns4;do
 		exit $ret
 	fi
 
+	# ns1<->ns2 is not subject to reordering/tc delays. Use it to test
+	# mptcp syncookie support.
+	if [ $sender = $ns1 ]; then
+		ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=2
+	else
+		ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=1
+	fi
+
 	run_tests "$ns2" $sender 10.0.1.2
 	run_tests "$ns2" $sender dead:beef:1::2
 	run_tests "$ns2" $sender 10.0.2.1
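For reference, the tcp_syncookies values the test now toggles (see
Documentation/networking/ip-sysctl.rst): 1 sends cookies only when the
SYN backlog overflows (the usual default), while 2 sends them
unconditionally. Only the ns1<->ns2 path is free of tc-induced
reordering, so only there can unconditional cookies run without
spurious failures. To inspect the knob inside the test namespace:

	ip netns exec "$ns2" sysctl net.ipv4.tcp_syncookies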