Commit 957ed5e7 authored by David S. Miller
Browse files

Merge branch 'tcp-plb'

Mubashir Adnan Qureshi says:

====================
net: Add PLB functionality to TCP

This patch series adds PLB (Protective Load Balancing) to TCP and hooks
it up to DCTCP. PLB is disabled by default and can be enabled using
relevant sysctls and support from underlying CC.

PLB (Protective Load Balancing) is a host based mechanism for load
balancing across switch links. It leverages congestion signals (e.g. ECN)
from transport layer to randomly change the path of the connection
experiencing congestion. PLB changes the path of the connection by
changing the outgoing IPv6 flow label for IPv6 connections (implemented
in Linux by calling sk_rethink_txhash()). Because of this implementation
mechanism, PLB can currently only work for IPv6 traffic. For more
information, see the SIGCOMM 2022 paper:
  https://doi.org/10.1145/3544216.3544226


====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 7f86cf50 71fc7047
Loading
Loading
Loading
Loading
+75 −0
Original line number Diff line number Diff line
@@ -1069,6 +1069,81 @@ tcp_child_ehash_entries - INTEGER

	Default: 0

tcp_plb_enabled - BOOLEAN
	If set and the underlying congestion control (e.g. DCTCP) supports
	and enables the PLB feature, TCP PLB (Protective Load Balancing) is
	enabled. PLB is described in the following paper:
	https://doi.org/10.1145/3544216.3544226. Based on PLB parameters,
	upon sensing sustained congestion, TCP triggers a change in the
	flow label field for outgoing IPv6 packets. A change in the flow
	label field potentially changes the path of outgoing packets for
	switches that use ECMP/WCMP for routing.

	PLB changes the socket txhash, which results in a change in the IPv6
	Flow Label field; it is currently a no-op for IPv4 headers. It is
	possible to apply PLB for IPv4 with other network header fields (e.g.
	TCP or IPv4 options) or using encapsulation where the outer header is
	used by switches to determine the next hop. In either case, further
	host and switch side changes will be needed.

	When set, PLB assumes that a congestion signal (e.g. ECN) is made
	available and used by the congestion control module to estimate a
	congestion measure (e.g. ce_ratio). PLB needs a congestion measure to
	make repathing decisions.

	Default: FALSE

tcp_plb_idle_rehash_rounds - INTEGER
	Number of consecutive congested rounds (RTT) seen after which
	a rehash can be performed, given there are no packets in flight.
	This is referred to as M in the PLB paper:
	https://doi.org/10.1145/3544216.3544226.

	Possible Values: 0 - 31

	Default: 3

tcp_plb_rehash_rounds - INTEGER
	Number of consecutive congested rounds (RTT) seen after which
	a forced rehash can be performed. Be careful when setting this
	parameter, as a small value increases the risk of retransmissions.
	This is referred to as N in the PLB paper:
	https://doi.org/10.1145/3544216.3544226.

	Possible Values: 0 - 31

	Default: 12

tcp_plb_suspend_rto_sec - INTEGER
	Time, in seconds, to suspend PLB in the event of an RTO. In order to
	avoid having PLB repath onto a connectivity "black hole", after an RTO
	a TCP connection suspends PLB repathing for a random duration between
	1x and 2x of this parameter. Randomness is added to avoid concurrent
	rehashing of multiple TCP connections. This should be set to
	correspond to the amount of time it takes to repair a failed link.

	Possible Values: 0 - 255

	Default: 60

tcp_plb_cong_thresh - INTEGER
	Fraction of packets marked with congestion over a round (RTT) to
	tag that round as congested. This is referred to as K in the PLB paper:
	https://doi.org/10.1145/3544216.3544226.

	The 0-1 fraction range is mapped to the 0-256 range to avoid floating
	point operations. For example, 128 means that if at least 50% of
	the packets in a round were marked as congested then the round
	will be tagged as congested.

	Setting the threshold to 0 means that PLB repaths every RTT regardless
	of congestion. This is not the intended behavior for PLB and should be
	used only for experimentation purposes.

	Possible Values: 0 - 256

	Default: 128

UDP variables
=============

+1 −0
Original line number Diff line number Diff line
@@ -423,6 +423,7 @@ struct tcp_sock {
		u32		  probe_seq_start;
		u32		  probe_seq_end;
	} mtu_probe;
	u32     plb_rehash;     /* PLB-triggered rehash attempts */
	u32	mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
			   * while socket was owned by user.
			   */
+5 −0
Original line number Diff line number Diff line
@@ -183,6 +183,11 @@ struct netns_ipv4 {
	unsigned long tfo_active_disable_stamp;
	u32 tcp_challenge_timestamp;
	u32 tcp_challenge_count;
	u8 sysctl_tcp_plb_enabled;
	u8 sysctl_tcp_plb_idle_rehash_rounds;
	u8 sysctl_tcp_plb_rehash_rounds;
	u8 sysctl_tcp_plb_suspend_rto_sec;
	int sysctl_tcp_plb_cong_thresh;

	int sysctl_udp_wmem_min;
	int sysctl_udp_rmem_min;
+28 −0
Original line number Diff line number Diff line
@@ -2140,6 +2140,34 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
extern void tcp_rack_reo_timeout(struct sock *sk);
extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs);

/* tcp_plb.c */

/*
 * Scaling factor for fractions in PLB. For example, tcp_plb_update_state
 * expects cong_ratio which represents the fraction of traffic that
 * experienced congestion over a single RTT. In order to avoid floating
 * point operations, this fraction should be scaled by (1 << TCP_PLB_SCALE)
 * before being passed in, i.e. with TCP_PLB_SCALE == 8 the fraction range
 * 0-1 maps to the integer range 0-256.
 */
#define TCP_PLB_SCALE 8

/* State for PLB (Protective Load Balancing) for a single TCP connection. */
struct tcp_plb_state {
	/* The counter is a 5-bit field, so it can hold values 0-31; the
	 * remaining 3 bits of the byte are taken by the "unused" field.
	 */
	u8	consec_cong_rounds:5, /* consecutive congested rounds */
		unused:3;
	u32	pause_until; /* jiffies32 when PLB can resume rerouting */
};

/* Reset the PLB state of a connection: clear the pause deadline and the
 * count of consecutive congested rounds. @sk is not used by this helper.
 */
static inline void tcp_plb_init(const struct sock *sk,
				struct tcp_plb_state *plb)
{
	plb->pause_until = 0;
	plb->consec_cong_rounds = 0;
}
/* Update @plb from one round's congestion measure. @cong_ratio is the
 * fraction of traffic that experienced congestion over a single RTT,
 * scaled by (1 << TCP_PLB_SCALE).
 */
void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb,
			  const int cong_ratio);
/* NOTE(review): presumably decides from the state in @plb whether the
 * connection should be rehashed (repathed) now -- confirm in tcp_plb.c.
 */
void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb);
/* NOTE(review): presumably invoked on an RTO to suspend PLB repathing
 * for a while -- confirm in tcp_plb.c.
 */
void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb);

/* At how many usecs into the future should the RTO fire? */
static inline s64 tcp_rto_delta_us(const struct sock *sk)
{
+1 −0
Original line number Diff line number Diff line
@@ -292,6 +292,7 @@ enum
	LINUX_MIB_TCPDSACKIGNOREDDUBIOUS,	/* TCPDSACKIgnoredDubious */
	LINUX_MIB_TCPMIGRATEREQSUCCESS,		/* TCPMigrateReqSuccess */
	LINUX_MIB_TCPMIGRATEREQFAILURE,		/* TCPMigrateReqFailure */
	LINUX_MIB_TCPPLBREHASH,			/* TCPPLBRehash */
	__LINUX_MIB_MAX
};

Loading