Commit 978f4175 authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge branch 'mptcp-prepare-mptcp-packet-scheduler-for-bpf-extension'

Mat Martineau says:

====================
mptcp: Prepare MPTCP packet scheduler for BPF extension

The kernel's MPTCP packet scheduler has, to date, been a one-size-fits
all algorithm that is hard-coded. It attempts to balance latency and
throughput when transmitting data across multiple TCP subflows, and has
some limited tunability through sysctls. It has been a long-term goal of
the Linux MPTCP community to support customizable packet schedulers for
use cases that need to make different trade-offs regarding latency,
throughput, redundancy, and other metrics. BPF is well-suited for
configuring customized, per-packet scheduling decisions without having
to modify the kernel or manage out-of-tree kernel modules.

The first steps toward implementing BPF packet schedulers are to update
the existing MPTCP transmit loops to allow more flexible scheduling
decisions, and to add infrastructure for swappable packet schedulers.
The existing scheduling algorithm remains the default. BPF-related
changes will be in a future patch series.

This code has been in the MPTCP development tree for quite a while,
undergoing testing in our CI and community.

Patches 1 and 2 refactor the transmit code and do some related cleanup.

Patches 3-9 add infrastructure for registering and calling multiple
schedulers.

Patch 10 connects the in-kernel default scheduler to the new
infrastructure.
====================

Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-0-0c860fb256a8@kernel.org


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 98173633 ed1ad86b
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -74,3 +74,11 @@ stale_loss_cnt - INTEGER
	This is a per-namespace sysctl.

	Default: 4

scheduler - STRING
	Select the scheduler of your choice.

	Support for selection of different schedulers. This is a per-namespace
	sysctl.

	Default: "default"
+21 −0
Original line number Diff line number Diff line
@@ -96,6 +96,27 @@ struct mptcp_out_options {
#endif
};

#define MPTCP_SCHED_NAME_MAX	16
#define MPTCP_SUBFLOWS_MAX	8

struct mptcp_sched_data {
	bool	reinject;
	u8	subflows;
	struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX];
};

struct mptcp_sched_ops {
	int (*get_subflow)(struct mptcp_sock *msk,
			   struct mptcp_sched_data *data);

	char			name[MPTCP_SCHED_NAME_MAX];
	struct module		*owner;
	struct list_head	list;

	void (*init)(struct mptcp_sock *msk);
	void (*release)(struct mptcp_sock *msk);
} ____cacheline_aligned_in_smp;

#ifdef CONFIG_MPTCP
void mptcp_init(void);

+1 −1
Original line number Diff line number Diff line
@@ -2,7 +2,7 @@
obj-$(CONFIG_MPTCP) += mptcp.o

mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
	   mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o
	   mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o

obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
+14 −0
Original line number Diff line number Diff line
@@ -32,6 +32,7 @@ struct mptcp_pernet {
	u8 checksum_enabled;
	u8 allow_join_initial_addr_port;
	u8 pm_type;
	char scheduler[MPTCP_SCHED_NAME_MAX];
};

static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
@@ -69,6 +70,11 @@ int mptcp_get_pm_type(const struct net *net)
	return mptcp_get_pernet(net)->pm_type;
}

const char *mptcp_get_scheduler(const struct net *net)
{
	return mptcp_get_pernet(net)->scheduler;
}

static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
	pernet->mptcp_enabled = 1;
@@ -77,6 +83,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
	pernet->allow_join_initial_addr_port = 1;
	pernet->stale_loss_cnt = 4;
	pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
	strcpy(pernet->scheduler, "default");
}

#ifdef CONFIG_SYSCTL
@@ -128,6 +135,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
		.extra1       = SYSCTL_ZERO,
		.extra2       = &mptcp_pm_type_max
	},
	{
		.procname = "scheduler",
		.maxlen	= MPTCP_SCHED_NAME_MAX,
		.mode = 0644,
		.proc_handler = proc_dostring,
	},
	{}
};

@@ -149,6 +162,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
	table[3].data = &pernet->allow_join_initial_addr_port;
	table[4].data = &pernet->stale_loss_cnt;
	table[5].data = &pernet->pm_type;
	table[6].data = &pernet->scheduler;

	hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
	if (!hdr)
+1 −8
Original line number Diff line number Diff line
@@ -299,15 +299,8 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)

	pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
	msk = mptcp_sk(sk);
	if (subflow->backup != bkup) {
	if (subflow->backup != bkup)
		subflow->backup = bkup;
		mptcp_data_lock(sk);
		if (!sock_owned_by_user(sk))
			msk->last_snd = NULL;
		else
			__set_bit(MPTCP_RESET_SCHEDULER,  &msk->cb_flags);
		mptcp_data_unlock(sk);
	}

	mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
}
Loading