Unverified commit 9dfdffa2, authored by openeuler-ci-bot, committed by Gitee
Browse files

!9245 MPTCP Upstream part 5

Merge Pull Request from: @geliangtang 
 
mptcp: Features and fixes for v6.7

Patch 1 adds a configurable timeout for the MPTCP connection when all
subflows are closed, to support break-before-make use cases.


Patch 2 is a minor code cleanup.

Patches 3 & 4 add handling of rcvlowat for MPTCP sockets, with a
prerequisite patch to use a common scaling ratio between TCP and MPTCP.

Patch 5 improves efficiency of memory copying in MPTCP transmit code. 
 
Link:https://gitee.com/openeuler/kernel/pulls/9245

 

Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zhang Peng <zhangpeng362@huawei.com>
parents 05b67977 f586a144
Loading
Loading
Loading
Loading
+11 −0
Original line number Diff line number Diff line
@@ -25,6 +25,17 @@ add_addr_timeout - INTEGER (seconds)

	Default: 120

close_timeout - INTEGER (seconds)
	Set the make-after-break timeout: in the absence of any close or
	shutdown syscall, MPTCP sockets will keep their state unchanged
	for this amount of time after the last subflow is removed, before
	moving to TCP_CLOSE.

	The default value matches TCP_TIMEWAIT_LEN. This is a per-namespace
	sysctl.

	Default: 60

checksum_enabled - BOOLEAN
	Control whether DSS checksum can be enabled.

+7 −5
Original line number Diff line number Diff line
@@ -1472,13 +1472,15 @@ static inline int tcp_space_from_win(const struct sock *sk, int win)
	return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win);
}

static inline void tcp_scaling_ratio_init(struct sock *sk)
{
/* Assume a conservative default of 1200 bytes of payload per 4K page.
 * This may be adjusted later in tcp_measure_rcv_mss().
 */
	tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
				    SKB_TRUESIZE(4096);
#define TCP_DEFAULT_SCALING_RATIO ((1200 << TCP_RMEM_TO_WIN_SCALE) / \
				   SKB_TRUESIZE(4096))

static inline void tcp_scaling_ratio_init(struct sock *sk)
{
	tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
}

/* Note: caller must be prepared to deal with negative returns */
+16 −0
Original line number Diff line number Diff line
@@ -27,6 +27,7 @@ struct mptcp_pernet {
#endif

	unsigned int add_addr_timeout;
	unsigned int close_timeout;
	unsigned int stale_loss_cnt;
	u8 mptcp_enabled;
	u8 checksum_enabled;
@@ -65,6 +66,13 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net)
	return mptcp_get_pernet(net)->stale_loss_cnt;
}

unsigned int mptcp_close_timeout(const struct sock *sk)
{
	if (sock_flag(sk, SOCK_DEAD))
		return TCP_TIMEWAIT_LEN;
	return mptcp_get_pernet(sock_net(sk))->close_timeout;
}

int mptcp_get_pm_type(const struct net *net)
{
	return mptcp_get_pernet(net)->pm_type;
@@ -79,6 +87,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
	pernet->mptcp_enabled = 1;
	pernet->add_addr_timeout = TCP_RTO_MAX;
	pernet->close_timeout = TCP_TIMEWAIT_LEN;
	pernet->checksum_enabled = 0;
	pernet->allow_join_initial_addr_port = 1;
	pernet->stale_loss_cnt = 4;
@@ -178,6 +187,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
		.mode = 0644,
		.proc_handler = proc_scheduler,
	},
	{
		.procname = "close_timeout",
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{}
};

@@ -200,6 +215,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
	table[4].data = &pernet->stale_loss_cnt;
	table[5].data = &pernet->pm_type;
	table[6].data = &pernet->scheduler;
	table[7].data = &pernet->close_timeout;

	hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
				     ARRAY_SIZE(mptcp_sysctl_table));
+29 −20
Original line number Diff line number Diff line
@@ -849,9 +849,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)

	/* Wake-up the reader only for in-sequence data */
	mptcp_data_lock(sk);
	if (move_skbs_to_msk(msk, ssk))
	if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
		sk->sk_data_ready(sk);

	mptcp_data_unlock(sk);
}

@@ -1757,6 +1756,18 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
	return ret;
}

static int do_copy_data_nocache(struct sock *sk, int copy,
				struct iov_iter *from, char *to)
{
	if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
		if (!copy_from_iter_full_nocache(to, copy, from))
			return -EFAULT;
	} else if (!copy_from_iter_full(to, copy, from)) {
		return -EFAULT;
	}
	return 0;
}

static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1830,11 +1841,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
		if (!sk_wmem_schedule(sk, total_ts))
			goto wait_for_memory;

		if (copy_page_from_iter(dfrag->page, offset, psize,
					&msg->msg_iter) != psize) {
			ret = -EFAULT;
		ret = do_copy_data_nocache(sk, psize, &msg->msg_iter,
					   page_address(dfrag->page) + offset);
		if (ret)
			goto do_error;
		}

		/* data successfully copied into the write queue */
		sk_forward_alloc_add(sk, -total_ts);
@@ -1918,6 +1928,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
			if (!(flags & MSG_PEEK)) {
				MPTCP_SKB_CB(skb)->offset += count;
				MPTCP_SKB_CB(skb)->map_seq += count;
				msk->bytes_consumed += count;
			}
			break;
		}
@@ -1928,6 +1939,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
			WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
			__skb_unlink(skb, &msk->receive_queue);
			__kfree_skb(skb);
			msk->bytes_consumed += count;
		}

		if (copied >= len)
@@ -2387,8 +2399,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
	if (msk->in_accept_queue && msk->first == ssk &&
	    (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) {
		/* ensure later check in mptcp_worker() will dispose the msk */
		mptcp_set_close_tout(sk, tcp_jiffies32 - (TCP_TIMEWAIT_LEN + 1));
		sock_set_flag(sk, SOCK_DEAD);
		mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1));
		lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
		mptcp_subflow_drop_ctx(ssk);
		goto out_release;
@@ -2513,7 +2525,7 @@ static bool mptcp_close_tout_expired(const struct sock *sk)
		return false;

	return time_after32(tcp_jiffies32,
		  inet_csk(sk)->icsk_mtup.probe_timestamp + TCP_TIMEWAIT_LEN);
		  inet_csk(sk)->icsk_mtup.probe_timestamp + mptcp_close_timeout(sk));
}

static void mptcp_check_fastclose(struct mptcp_sock *msk)
@@ -2656,7 +2668,7 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
		return;

	close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies +
			TCP_TIMEWAIT_LEN;
			mptcp_close_timeout(sk);

	/* the close timeout takes precedence on the fail one, and here at least one of
	 * them is active
@@ -2752,6 +2764,7 @@ static void __mptcp_init_sock(struct sock *sk)
	msk->rmem_fwd_alloc = 0;
	WRITE_ONCE(msk->rmem_released, 0);
	msk->timer_ival = TCP_RTO_MIN;
	msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;

	WRITE_ONCE(msk->first, NULL);
	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@@ -2979,16 +2992,9 @@ void __mptcp_unaccepted_force_close(struct sock *sk)
	__mptcp_destroy_sock(sk);
}

static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
static __poll_t mptcp_check_readable(struct sock *sk)
{
	/* Concurrent splices from sk_receive_queue into receive_queue will
	 * always show at least one non-empty queue when checked in this order.
	 */
	if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
	    skb_queue_empty_lockless(&msk->receive_queue))
		return 0;

	return EPOLLIN | EPOLLRDNORM;
	return mptcp_epollin_ready(sk) ? EPOLLIN | EPOLLRDNORM : 0;
}

static void mptcp_check_listen_stop(struct sock *sk)
@@ -3026,7 +3032,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
		goto cleanup;
	}

	if (mptcp_check_readable(msk) || timeout < 0) {
	if (mptcp_data_avail(msk) || timeout < 0) {
		/* If the msk has read data, or the caller explicitly ask it,
		 * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose
		 */
@@ -3152,6 +3158,7 @@ static int mptcp_disconnect(struct sock *sk, int flags)
	msk->snd_data_fin_enable = false;
	msk->rcv_fastclose = false;
	msk->use_64bit_ack = false;
	msk->bytes_consumed = 0;
	WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
	mptcp_pm_data_reset(msk);
	mptcp_ca_reset(sk);
@@ -3977,7 +3984,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
		mask |= mptcp_check_readable(msk);
		mask |= mptcp_check_readable(sk);
		if (shutdown & SEND_SHUTDOWN)
			mask |= EPOLLOUT | EPOLLWRNORM;
		else
@@ -4015,6 +4022,7 @@ static const struct proto_ops mptcp_stream_ops = {
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.set_rcvlowat	   = mptcp_set_rcvlowat,
};

static struct inet_protosw mptcp_protosw = {
@@ -4116,6 +4124,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
#ifdef CONFIG_COMPAT
	.compat_ioctl	   = inet6_compat_ioctl,
#endif
	.set_rcvlowat	   = mptcp_set_rcvlowat,
};

static struct proto mptcp_v6_prot;
+22 −6
Original line number Diff line number Diff line
@@ -270,6 +270,7 @@ struct mptcp_sock {
	atomic64_t	rcv_wnd_sent;
	u64		rcv_data_fin_seq;
	u64		bytes_retrans;
	u64		bytes_consumed;
	int		rmem_fwd_alloc;
	int		snd_burst;
	int		old_wspace;
@@ -438,11 +439,6 @@ mptcp_subflow_rsk(const struct request_sock *rsk)
	return (struct mptcp_subflow_request_sock *)rsk;
}

enum mptcp_data_avail {
	MPTCP_SUBFLOW_NODATA,
	MPTCP_SUBFLOW_DATA_AVAIL,
};

struct mptcp_delegated_action {
	struct napi_struct napi;
	struct list_head head;
@@ -498,7 +494,7 @@ struct mptcp_subflow_context {
		valid_csum_seen : 1,        /* at least one csum validated */
		is_mptfo : 1,	    /* subflow is doing TFO */
		__unused : 10;
	enum mptcp_data_avail data_avail;
	bool	data_avail;
	bool	scheduled;
	u32	remote_nonce;
	u64	thmac;
@@ -623,6 +619,7 @@ unsigned int mptcp_get_add_addr_timeout(const struct net *net);
int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
unsigned int mptcp_close_timeout(const struct sock *sk);
int mptcp_get_pm_type(const struct net *net);
const char *mptcp_get_scheduler(const struct net *net);
void __mptcp_subflow_fully_established(struct mptcp_sock *msk,
@@ -673,6 +670,24 @@ struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
int mptcp_sched_get_send(struct mptcp_sock *msk);
int mptcp_sched_get_retrans(struct mptcp_sock *msk);

/* Number of bytes received on the msk and not yet consumed, computed
 * as the difference between the bytes_received and bytes_consumed
 * counters; both are sampled with READ_ONCE().
 */
static inline u64 mptcp_data_avail(const struct mptcp_sock *msk)
{
	u64 received = READ_ONCE(msk->bytes_received);
	u64 consumed = READ_ONCE(msk->bytes_consumed);

	return received - consumed;
}

static inline bool mptcp_epollin_ready(const struct sock *sk)
{
	/* mptcp doesn't have to deal with small skbs in the receive queue,
	 * at it can always coalesce them
	 */
	return (mptcp_data_avail(mptcp_sk(sk)) >= sk->sk_rcvlowat) ||
	       (mem_cgroup_sockets_enabled && sk->sk_memcg &&
		mem_cgroup_under_socket_pressure(sk->sk_memcg)) ||
	       READ_ONCE(tcp_memory_pressure);
}

int mptcp_set_rcvlowat(struct sock *sk, int val);

static inline bool __tcp_can_send(const struct sock *ssk)
{
	/* only send if our side has not closed yet */
@@ -747,6 +762,7 @@ static inline bool mptcp_is_fully_established(struct sock *sk)
	return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
	       READ_ONCE(mptcp_sk(sk)->fully_established);
}

void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
Loading