Commit 7e1c5d7b authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge branch 'mptcp-miscellaneous-changes-for-5-17'

Mat Martineau says:

====================
mptcp: Miscellaneous changes for 5.17

These are three unrelated patches that we've been testing in the MPTCP
tree.

Patch 1 modifies the packet scheduler that picks which TCP subflow is
used for each chunk of outgoing data. The updated scheduler improves
throughput on multiple-subflow connections.

Patch 2 updates a selftest to verify recent TCP_ULP sockopt changes
on MPTCP fallback sockets.

Patch 3 cleans up some unnecessary comparisons with an 8-bit value.
====================

Link: https://lore.kernel.org/r/20211217233702.299461-1-mathew.j.martineau@linux.intel.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents ab9d0e21 59060a47
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -38,7 +38,8 @@ struct mptcp_pm_add_entry {
	u8			retrans_times;
};

#define MAX_ADDR_ID		255
/* max value of mptcp_addr_info.id */
#define MAX_ADDR_ID		U8_MAX
#define BITMAP_SZ DIV_ROUND_UP(MAX_ADDR_ID + 1, BITS_PER_LONG)

struct pm_nl_pernet {
@@ -825,14 +826,13 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
		entry->addr.id = find_next_zero_bit(pernet->id_bitmap,
						    MAX_ADDR_ID + 1,
						    pernet->next_id);
		if ((!entry->addr.id || entry->addr.id > MAX_ADDR_ID) &&
		    pernet->next_id != 1) {
		if (!entry->addr.id && pernet->next_id != 1) {
			pernet->next_id = 1;
			goto find_next;
		}
	}

	if (!entry->addr.id || entry->addr.id > MAX_ADDR_ID)
	if (!entry->addr.id)
		goto out;

	__set_bit(entry->addr.id, pernet->id_bitmap);
+47 −25
Original line number Diff line number Diff line
@@ -1372,7 +1372,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,

struct subflow_send_info {
	struct sock *ssk;
	u64 ratio;
	u64 linger_time;
};

void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
@@ -1397,20 +1397,24 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
	return __mptcp_subflow_active(subflow);
}

#define SSK_MODE_ACTIVE	0
#define SSK_MODE_BACKUP	1
#define SSK_MODE_MAX	2

/* implement the mptcp packet scheduler;
 * returns the subflow that will transmit the next DSS
 * additionally updates the rtx timeout
 */
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
	struct subflow_send_info send_info[2];
	struct subflow_send_info send_info[SSK_MODE_MAX];
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	u32 pace, burst, wmem;
	int i, nr_active = 0;
	struct sock *ssk;
	u64 linger_time;
	long tout = 0;
	u64 ratio;
	u32 pace;

	sock_owned_by_me(sk);

@@ -1429,10 +1433,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
	}

	/* pick the subflow with the lower wmem/wspace ratio */
	for (i = 0; i < 2; ++i) {
	for (i = 0; i < SSK_MODE_MAX; ++i) {
		send_info[i].ssk = NULL;
		send_info[i].ratio = -1;
		send_info[i].linger_time = -1;
	}

	mptcp_for_each_subflow(msk, subflow) {
		trace_mptcp_subflow_get_send(subflow);
		ssk =  mptcp_subflow_tcp_sock(subflow);
@@ -1441,34 +1446,51 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)

		tout = max(tout, mptcp_timeout_from_subflow(subflow));
		nr_active += !subflow->backup;
		if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
			continue;

		pace = READ_ONCE(ssk->sk_pacing_rate);
		pace = subflow->avg_pacing_rate;
		if (unlikely(!pace)) {
			/* init pacing rate from socket */
			subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate);
			pace = subflow->avg_pacing_rate;
			if (!pace)
				continue;
		}

		ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
				pace);
		if (ratio < send_info[subflow->backup].ratio) {
		linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
		if (linger_time < send_info[subflow->backup].linger_time) {
			send_info[subflow->backup].ssk = ssk;
			send_info[subflow->backup].ratio = ratio;
			send_info[subflow->backup].linger_time = linger_time;
		}
	}
	__mptcp_set_timeout(sk, tout);

	/* pick the best backup if no other subflow is active */
	if (!nr_active)
		send_info[0].ssk = send_info[1].ssk;

	if (send_info[0].ssk) {
		msk->last_snd = send_info[0].ssk;
		msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
				       tcp_sk(msk->last_snd)->snd_wnd);
		return msk->last_snd;
	}

		send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;

	/* According to the blest algorithm, to avoid HoL blocking for the
	 * faster flow, we need to:
	 * - estimate the faster flow linger time
	 * - use the above to estimate the amount of byte transferred
	 *   by the faster flow
	 * - check that the amount of queued data is greter than the above,
	 *   otherwise do not use the picked, slower, subflow
	 * We select the subflow with the shorter estimated time to flush
	 * the queued mem, which basically ensure the above. We just need
	 * to check that subflow has a non empty cwin.
	 */
	ssk = send_info[SSK_MODE_ACTIVE].ssk;
	if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd)
		return NULL;

	burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd);
	wmem = READ_ONCE(ssk->sk_wmem_queued);
	subflow = mptcp_subflow_ctx(ssk);
	subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
					   READ_ONCE(ssk->sk_pacing_rate) * burst,
					   burst + wmem);
	msk->last_snd = ssk;
	msk->snd_burst = burst;
	return ssk;
}

static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
+1 −0
Original line number Diff line number Diff line
@@ -395,6 +395,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
/* MPTCP subflow context */
struct mptcp_subflow_context {
	struct	list_head node;/* conn_list of subflows */
	unsigned long avg_pacing_rate; /* protected by msk socket lock */
	u64	local_key;
	u64	remote_key;
	u64	idsn;
+51 −46
Original line number Diff line number Diff line
@@ -59,7 +59,6 @@ static enum cfg_peek cfg_peek = CFG_NONE_PEEK;
static const char *cfg_host;
static const char *cfg_port	= "12000";
static int cfg_sock_proto	= IPPROTO_MPTCP;
static bool tcpulp_audit;
static int pf = AF_INET;
static int cfg_sndbuf;
static int cfg_rcvbuf;
@@ -103,7 +102,6 @@ static void die_usage(void)
	fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n");
	fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n");
	fprintf(stderr, "\t-M mark -- set socket packet mark\n");
	fprintf(stderr, "\t-u -- check mptcp ulp\n");
	fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n");
	fprintf(stderr, "\t-c cmsg -- test cmsg type <cmsg>\n");
	fprintf(stderr, "\t-o option -- test sockopt <option>\n");
@@ -215,6 +213,42 @@ static void set_transparent(int fd, int pf)
	}
}

static int do_ulp_so(int sock, const char *name)
{
	return setsockopt(sock, IPPROTO_TCP, TCP_ULP, name, strlen(name));
}

#define X(m)	xerror("%s:%u: %s: failed for proto %d at line %u", __FILE__, __LINE__, (m), proto, line)
static void sock_test_tcpulp(int sock, int proto, unsigned int line)
{
	socklen_t buflen = 8;
	char buf[8] = "";
	int ret = getsockopt(sock, IPPROTO_TCP, TCP_ULP, buf, &buflen);

	if (ret != 0)
		X("getsockopt");

	if (buflen > 0) {
		if (strcmp(buf, "mptcp") != 0)
			xerror("unexpected ULP '%s' for proto %d at line %u", buf, proto, line);
		ret = do_ulp_so(sock, "tls");
		if (ret == 0)
			X("setsockopt");
	} else if (proto == IPPROTO_MPTCP) {
		ret = do_ulp_so(sock, "tls");
		if (ret != -1)
			X("setsockopt");
	}

	ret = do_ulp_so(sock, "mptcp");
	if (ret != -1)
		X("setsockopt");

#undef X
}

#define SOCK_TEST_TCPULP(s, p) sock_test_tcpulp((s), (p), __LINE__)

static int sock_listen_mptcp(const char * const listenaddr,
			     const char * const port)
{
@@ -238,6 +272,8 @@ static int sock_listen_mptcp(const char * const listenaddr,
		if (sock < 0)
			continue;

		SOCK_TEST_TCPULP(sock, cfg_sock_proto);

		if (-1 == setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &one,
				     sizeof(one)))
			perror("setsockopt");
@@ -260,50 +296,17 @@ static int sock_listen_mptcp(const char * const listenaddr,
		return sock;
	}

	SOCK_TEST_TCPULP(sock, cfg_sock_proto);

	if (listen(sock, 20)) {
		perror("listen");
		close(sock);
		return -1;
	}

	return sock;
}

static bool sock_test_tcpulp(const char * const remoteaddr,
			     const char * const port)
{
	struct addrinfo hints = {
		.ai_protocol = IPPROTO_TCP,
		.ai_socktype = SOCK_STREAM,
	};
	struct addrinfo *a, *addr;
	int sock = -1, ret = 0;
	bool test_pass = false;

	hints.ai_family = AF_INET;
	SOCK_TEST_TCPULP(sock, cfg_sock_proto);

	xgetaddrinfo(remoteaddr, port, &hints, &addr);
	for (a = addr; a; a = a->ai_next) {
		sock = socket(a->ai_family, a->ai_socktype, IPPROTO_TCP);
		if (sock < 0) {
			perror("socket");
			continue;
		}
		ret = setsockopt(sock, IPPROTO_TCP, TCP_ULP, "mptcp",
				 sizeof("mptcp"));
		if (ret == -1 && errno == EOPNOTSUPP)
			test_pass = true;
		close(sock);

		if (test_pass)
			break;
		if (!ret)
			fprintf(stderr,
				"setsockopt(TCP_ULP) returned 0\n");
		else
			perror("setsockopt(TCP_ULP)");
	}
	return test_pass;
	return sock;
}

static int sock_connect_mptcp(const char * const remoteaddr,
@@ -326,6 +329,8 @@ static int sock_connect_mptcp(const char * const remoteaddr,
			continue;
		}

		SOCK_TEST_TCPULP(sock, proto);

		if (cfg_mark)
			set_mark(sock, cfg_mark);

@@ -338,6 +343,8 @@ static int sock_connect_mptcp(const char * const remoteaddr,
	}

	freeaddrinfo(addr);
	if (sock != -1)
		SOCK_TEST_TCPULP(sock, proto);
	return sock;
}

@@ -954,6 +961,8 @@ int main_loop_s(int listensock)
		check_sockaddr(pf, &ss, salen);
		check_getpeername(remotesock, &ss, salen);

		SOCK_TEST_TCPULP(remotesock, 0);

		return copyfd_io(0, remotesock, 1);
	}

@@ -1059,6 +1068,8 @@ int main_loop(void)

	check_getpeername_connect(fd);

	SOCK_TEST_TCPULP(fd, cfg_sock_proto);

	if (cfg_rcvbuf)
		set_rcvbuf(fd, cfg_rcvbuf);
	if (cfg_sndbuf)
@@ -1151,7 +1162,7 @@ static void parse_opts(int argc, char **argv)
{
	int c;

	while ((c = getopt(argc, argv, "6jr:lp:s:hut:T:m:S:R:w:M:P:c:o:")) != -1) {
	while ((c = getopt(argc, argv, "6jr:lp:s:ht:T:m:S:R:w:M:P:c:o:")) != -1) {
		switch (c) {
		case 'j':
			cfg_join = true;
@@ -1177,9 +1188,6 @@ static void parse_opts(int argc, char **argv)
		case 'h':
			die_usage();
			break;
		case 'u':
			tcpulp_audit = true;
			break;
		case '6':
			pf = AF_INET6;
			break;
@@ -1233,9 +1241,6 @@ int main(int argc, char *argv[])
	signal(SIGUSR1, handle_signal);
	parse_opts(argc, argv);

	if (tcpulp_audit)
		return sock_test_tcpulp(cfg_host, cfg_port) ? 0 : 1;

	if (listen_mode) {
		int fd = sock_listen_mptcp(cfg_host, cfg_port);

+0 −20
Original line number Diff line number Diff line
@@ -296,24 +296,6 @@ check_mptcp_disabled()
	return 0
}

check_mptcp_ulp_setsockopt()
{
	local t retval
	t="ns_ulp-$sech-$(mktemp -u XXXXXX)"

	ip netns add ${t} || exit $ksft_skip
	if ! ip netns exec ${t} ./mptcp_connect -u -p 10000 -s TCP 127.0.0.1 2>&1; then
		printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) allowed\t[ FAIL ]\n"
		retval=1
		ret=$retval
	else
		printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) blocked\t[ OK ]\n"
		retval=0
	fi
	ip netns del ${t}
	return $retval
}

# $1: IP address
is_v6()
{
@@ -780,8 +762,6 @@ make_file "$sin" "server"

check_mptcp_disabled

check_mptcp_ulp_setsockopt

stop_if_error "The kernel configuration is not valid for MPTCP"

echo "INFO: validating network environment with pings"