Commit c133acf3 authored by David S. Miller
Browse files

Merge branch 'mptcp-socket-options'



Mat Martineau says:

====================
mptcp: Improve socket option handling

MPTCP sockets have previously had limited socket option support. The
architecture of MPTCP sockets (one userspace-facing MPTCP socket that
manages one or more in-kernel TCP subflow sockets) adds complexity for
passing options through to lower levels. This patch set adds MPTCP
support for socket options commonly used with TCP.

Patch 1 reverts an interim socket option fix (a socket option blocklist)
that was merged in the net tree for v5.12.

Patch 2 moves the socket option code to a separate file, with no
functional changes.

Patch 3 adds an allowlist for socket options that are known to function
with MPTCP. Later patches in this set add more allowed options.

Patches 4 and 5 add infrastructure for syncing MPTCP-level options with
the TCP subflows.

Patches 6-12 add support for specific socket options.

Patch 13 adds a socket option self test.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents a1150a04 dc65fe82
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -2,7 +2,7 @@
obj-$(CONFIG_MPTCP) += mptcp.o

# Objects built into mptcp.o; sockopt.o added by this commit.
mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
	   mib.o pm_netlink.o sockopt.o

obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
+45 −174
Original line number Diff line number Diff line
@@ -90,16 +90,6 @@ static bool mptcp_is_tcpsk(struct sock *sk)
	return false;
}

/* Return the single remaining subflow once the connection has fallen
 * back to plain TCP, or NULL while it is still a full MPTCP socket.
 * The msk socket lock must be held (asserted below).
 */
static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
	sock_owned_by_me((const struct sock *)msk);

	if (unlikely(__mptcp_check_fallback(msk)))
		return msk->first;

	return NULL;
}

static int __mptcp_socket_create(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
@@ -740,18 +730,47 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
		sk->sk_data_ready(sk);
}

void __mptcp_flush_join_list(struct mptcp_sock *msk)
static bool mptcp_do_flush_join_list(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	bool ret = false;

	if (likely(list_empty(&msk->join_list)))
		return;
		return false;

	spin_lock_bh(&msk->join_list_lock);
	list_for_each_entry(subflow, &msk->join_list, node)
	list_for_each_entry(subflow, &msk->join_list, node) {
		u32 sseq = READ_ONCE(subflow->setsockopt_seq);

		mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
		if (READ_ONCE(msk->setsockopt_seq) != sseq)
			ret = true;
	}
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);

	return ret;
}

void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
	if (likely(!mptcp_do_flush_join_list(msk)))
		return;

	if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
		mptcp_schedule_work((struct sock *)msk);
}

/* Sleepable variant: flush the join_list and perform any pending
 * socket-option resync inline instead of deferring it to the worker.
 */
static void mptcp_flush_join_list(struct mptcp_sock *msk)
{
	bool deferred = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags);
	bool stale;

	might_sleep();

	stale = mptcp_do_flush_join_list(msk);
	if (stale || deferred)
		mptcp_sockopt_sync_all(msk);
}

static bool mptcp_timer_pending(struct sock *sk)
@@ -1467,7 +1486,7 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags)
			int ret = 0;

			prev_ssk = ssk;
			__mptcp_flush_join_list(msk);
			mptcp_flush_join_list(msk);
			ssk = mptcp_subflow_get_send(msk);

			/* try to keep the subflow socket lock across
@@ -1893,7 +1912,7 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
	unsigned int moved = 0;
	bool ret, done;

	__mptcp_flush_join_list(msk);
	mptcp_flush_join_list(msk);
	do {
		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
		bool slowpath;
@@ -2317,7 +2336,7 @@ static void mptcp_worker(struct work_struct *work)
		goto unlock;

	mptcp_check_data_fin_ack(sk);
	__mptcp_flush_join_list(msk);
	mptcp_flush_join_list(msk);

	mptcp_check_fastclose(msk);

@@ -2380,6 +2399,9 @@ static int __mptcp_init_sock(struct sock *sk)
	/* re-use the csk retrans timer for MPTCP-level retrans */
	timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
	timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0);

	tcp_assign_congestion_control(sk);

	return 0;
}

@@ -2517,7 +2539,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
		}
	}

	__mptcp_flush_join_list(msk);
	mptcp_flush_join_list(msk);
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

@@ -2573,6 +2595,8 @@ static void __mptcp_destroy_sock(struct sock *sk)
	WARN_ON_ONCE(msk->rmem_released);
	sk_stream_kill_queues(sk);
	xfrm_sk_free_policy(sk);

	tcp_cleanup_congestion_control(sk);
	sk_refcnt_debug_release(sk);
	mptcp_dispose_initial_subflow(msk);
	sock_put(sk);
@@ -2654,7 +2678,8 @@ static int mptcp_disconnect(struct sock *sk, int flags)
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk = mptcp_sk(sk);

	__mptcp_flush_join_list(msk);
	mptcp_do_flush_join_list(msk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

@@ -2703,6 +2728,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
	msk->snd_nxt = msk->write_seq;
	msk->snd_una = msk->write_seq;
	msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
	msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;

	if (mp_opt->mp_capable) {
		msk->can_ack = true;
@@ -2811,161 +2837,6 @@ static void mptcp_destroy(struct sock *sk)
	sk_sockets_allocated_dec(sk);
}

/* Handle SOL_SOCKET options on an MPTCP socket.
 *
 * SO_REUSEADDR and SO_REUSEPORT are applied to the first subflow socket
 * and the resulting value is mirrored into the MPTCP-level sock; every
 * other SOL_SOCKET option is forwarded unchanged to the msk's own
 * socket. Returns 0 on success or a negative errno.
 */
static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
				       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int ret;

	switch (optname) {
	case SO_REUSEPORT:
	case SO_REUSEADDR:
		lock_sock(sk);
		ssock = __mptcp_nmpc_socket(msk);
		if (!ssock) {
			/* NOTE(review): presumably the first subflow socket is
			 * no longer available at this point — confirm against
			 * __mptcp_nmpc_socket()
			 */
			release_sock(sk);
			return -EINVAL;
		}

		ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
		if (ret == 0) {
			/* keep the MPTCP-level flags in sync with the subflow */
			if (optname == SO_REUSEPORT)
				sk->sk_reuseport = ssock->sk->sk_reuseport;
			else if (optname == SO_REUSEADDR)
				sk->sk_reuse = ssock->sk->sk_reuse;
		}
		release_sock(sk);
		return ret;
	}

	return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
}

/* Handle SOL_IPV6 options on an MPTCP socket.
 *
 * Only IPV6_V6ONLY is handled: it is applied to the first subflow
 * socket and mirrored into the MPTCP-level sock. Any other option
 * returns -EOPNOTSUPP.
 */
static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
			       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	int ret = -EOPNOTSUPP;
	struct socket *ssock;

	switch (optname) {
	case IPV6_V6ONLY:
		lock_sock(sk);
		ssock = __mptcp_nmpc_socket(msk);
		if (!ssock) {
			/* NOTE(review): presumably the first subflow socket is
			 * no longer available at this point
			 */
			release_sock(sk);
			return -EINVAL;
		}

		ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
		if (ret == 0)
			/* mirror the subflow's resulting flag at MPTCP level */
			sk->sk_ipv6only = ssock->sk->sk_ipv6only;

		release_sock(sk);
		break;
	}

	return ret;
}

/* Return true for IP/IPv6 socket options that MPTCP rejects outright:
 * multicast membership, anycast and source-filtering options, plus
 * IPV6_ADDRFORM.
 */
static bool mptcp_unsupported(int level, int optname)
{
	switch (level) {
	case SOL_IP:
		switch (optname) {
		case IP_ADD_MEMBERSHIP:
		case IP_ADD_SOURCE_MEMBERSHIP:
		case IP_DROP_MEMBERSHIP:
		case IP_DROP_SOURCE_MEMBERSHIP:
		case IP_BLOCK_SOURCE:
		case IP_UNBLOCK_SOURCE:
		case MCAST_JOIN_GROUP:
		case MCAST_LEAVE_GROUP:
		case MCAST_JOIN_SOURCE_GROUP:
		case MCAST_LEAVE_SOURCE_GROUP:
		case MCAST_BLOCK_SOURCE:
		case MCAST_UNBLOCK_SOURCE:
		case MCAST_MSFILTER:
			return true;
		}
		break;
	case SOL_IPV6:
		switch (optname) {
		case IPV6_ADDRFORM:
		case IPV6_ADD_MEMBERSHIP:
		case IPV6_DROP_MEMBERSHIP:
		case IPV6_JOIN_ANYCAST:
		case IPV6_LEAVE_ANYCAST:
		case MCAST_JOIN_GROUP:
		case MCAST_LEAVE_GROUP:
		case MCAST_JOIN_SOURCE_GROUP:
		case MCAST_LEAVE_SOURCE_GROUP:
		case MCAST_BLOCK_SOURCE:
		case MCAST_UNBLOCK_SOURCE:
		case MCAST_MSFILTER:
			return true;
		}
		break;
	}
	return false;
}

/* MPTCP-level setsockopt() entry point.
 *
 * Rejects multicast/membership options with -ENOPROTOOPT, dispatches
 * SOL_SOCKET and SOL_IPV6 to the dedicated helpers, and after TCP
 * fallback forwards the option straight to the single remaining
 * subflow. Anything else returns -EOPNOTSUPP.
 */
static int mptcp_setsockopt(struct sock *sk, int level, int optname,
			    sockptr_t optval, unsigned int optlen)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p", msk);

	if (mptcp_unsupported(level, optname))
		return -ENOPROTOOPT;

	if (level == SOL_SOCKET)
		return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);

	/* @@ the meaning of setsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when TCP socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssk)
		return tcp_setsockopt(ssk, level, optname, optval, optlen);

	if (level == SOL_IPV6)
		return mptcp_setsockopt_v6(msk, optname, optval, optlen);

	return -EOPNOTSUPP;
}

/* MPTCP-level getsockopt() entry point.
 *
 * Only implemented for the TCP-fallback case, where the query is
 * forwarded to the single remaining subflow; otherwise -EOPNOTSUPP.
 */
static int mptcp_getsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int __user *option)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p", msk);

	/* @@ the meaning of getsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssk)
		return tcp_getsockopt(ssk, level, optname, optval, option);

	return -EOPNOTSUPP;
}

void __mptcp_data_acked(struct sock *sk)
{
	if (!sock_owned_by_user(sk))
@@ -3375,7 +3246,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
		/* set ssk->sk_socket of accept()ed flows to mptcp socket.
		 * This is needed so NOSPACE flag can be set from tcp stack.
		 */
		__mptcp_flush_join_list(msk);
		mptcp_flush_join_list(msk);
		mptcp_for_each_subflow(msk, subflow) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

+16 −0
Original line number Diff line number Diff line
@@ -108,6 +108,7 @@
#define MPTCP_CLEAN_UNA		7
#define MPTCP_ERROR_REPORT	8
#define MPTCP_RETRANSMIT	9
#define MPTCP_WORK_SYNC_SETSOCKOPT 10

static inline bool before64(__u64 seq1, __u64 seq2)
{
@@ -255,6 +256,8 @@ struct mptcp_sock {
		u64	time;	/* start time of measurement window */
		u64	rtt_us; /* last maximum rtt of subflows */
	} rcvq_space;

	u32 setsockopt_seq;
};

#define mptcp_lock_sock(___sk, cb) do {					\
@@ -413,6 +416,8 @@ struct mptcp_subflow_context {
	long	delegated_status;
	struct	list_head delegated_node;   /* link into delegated_action, protected by local BH */

	u32 setsockopt_seq;

	struct	sock *tcp_sock;	    /* tcp sk backpointer */
	struct	sock *conn;	    /* parent mptcp_sock */
	const	struct inet_connection_sock_af_ops *icsk_af_ops;
@@ -571,6 +576,11 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
bool mptcp_schedule_work(struct sock *sk);
int mptcp_setsockopt(struct sock *sk, int level, int optname,
		     sockptr_t optval, unsigned int optlen);
int mptcp_getsockopt(struct sock *sk, int level, int optname,
		     char __user *optval, int __user *option);

void __mptcp_check_push(struct sock *sk, struct sock *ssk);
void __mptcp_data_acked(struct sock *sk);
void __mptcp_error_report(struct sock *sk);
@@ -730,6 +740,12 @@ unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk);
unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk);
unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk);

int mptcp_setsockopt(struct sock *sk, int level, int optname,
		     sockptr_t optval, unsigned int optlen);

void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk);
void mptcp_sockopt_sync_all(struct mptcp_sock *msk);

static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb)
{
	return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP);

net/mptcp/sockopt.c

0 → 100644
+756 −0

File added.

Preview size limit exceeded, changes collapsed.

+5 −0
Original line number Diff line number Diff line
@@ -679,6 +679,9 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
			goto out;
		}

		/* ssk inherits options of listener sk */
		ctx->setsockopt_seq = listener->setsockopt_seq;

		if (ctx->mp_capable) {
			/* this can't race with mptcp_close(), as the msk is
			 * not yet exposed to user-space
@@ -694,6 +697,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq;
			mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1);
			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
			ctx->conn = new_msk;
@@ -1317,6 +1321,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
	mptcp_info2sockaddr(remote, &addr, ssk->sk_family);

	mptcp_add_pending_subflow(msk, subflow);
	mptcp_sockopt_sync(msk, ssk);
	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed_unlink;
Loading