Commit 983f507c authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge branch 'net-support-ipv4-big-tcp'

Xin Long says:

====================
net: support ipv4 big tcp

This is similar to the BIG TCP patchset added by Eric for IPv6:

  https://lwn.net/Articles/895398/

Different from IPv6, IPv4 tot_len is 16-bit long only, and IPv4 header
doesn't have exthdrs(options) for the BIG TCP packets' length. To make
it simple, as David and Paolo suggested, we set IPv4 tot_len to 0 to
indicate this might be a BIG TCP packet and use skb->len as the real
IPv4 total length.

This will work safely, as all BIG TCP packets are GSO/GRO packets and
processed on the same host as they were created; There is no padding
in GSO/GRO packets, and skb->len - network_offset is exactly the IPv4
packet total length; Also, before implementing the feature, all those
places that may get iph tot_len from BIG TCP packets are taken care
with some new APIs:

Patch 1 adds some APIs for iph tot_len setting and getting, which are
used in all these places where IPv4 BIG TCP packets may reach in Patch
2-7, Patch 8 adds a GSO_TCP tp_status for af_packet users, and Patch 9
add new netlink attributes to make IPv4 BIG TCP independent from IPv6
BIG TCP on configuration, and Patch 10 implements this feature.

Note that the similar change as in Patch 2-6 are also needed for IPv6
BIG TCP packets, and will be addressed in another patchset.

The similar performance test is done for IPv4 BIG TCP with 25Gbit NIC
and 1.5K MTU:

No BIG TCP:
for i in {1..10}; do netperf -t TCP_RR -H 192.168.100.1 -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done
168          322          337          3776.49
143          236          277          4654.67
128          258          288          4772.83
171          229          278          4645.77
175          228          243          4678.93
149          239          279          4599.86
164          234          268          4606.94
155          276          289          4235.82
180          255          268          4418.95
168          241          249          4417.82

Enable BIG TCP:
ip link set dev ens1f0np0 gro_ipv4_max_size 128000 gso_ipv4_max_size 128000
for i in {1..10}; do netperf -t TCP_RR -H 192.168.100.1 -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done
161          241          252          4821.73
174          205          217          5098.28
167          208          220          5001.43
164          228          249          4883.98
150          233          249          4914.90
180          233          244          4819.66
154          208          219          5004.92
157          209          247          4999.78
160          218          246          4842.31
174          206          217          5080.99

Thanks for the feedback from Eric and David Ahern.
====================

Link: https://lore.kernel.org/r/cover.1674921359.git.lucien.xin@gmail.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents d8673afb b1a78b9b
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -157,7 +157,7 @@ void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type)
			return NULL;

		ip4h = ip_hdr(skb);
		pktlen = ntohs(ip4h->tot_len);
		pktlen = skb_ip_totlen(skb);
		if (ip4h->ihl < 5 || ip4h->version != 4)
			return NULL;
		if (skb->len < pktlen || pktlen < (ip4h->ihl * 4))
+21 −0
Original line number Diff line number Diff line
@@ -35,4 +35,25 @@ static inline unsigned int ip_transport_len(const struct sk_buff *skb)
{
	return ntohs(ip_hdr(skb)->tot_len) - skb_network_header_len(skb);
}

static inline unsigned int iph_totlen(const struct sk_buff *skb, const struct iphdr *iph)
{
	u32 len = ntohs(iph->tot_len);

	return (len || !skb_is_gso(skb) || !skb_is_gso_tcp(skb)) ?
	       len : skb->len - skb_network_offset(skb);
}

static inline unsigned int skb_ip_totlen(const struct sk_buff *skb)
{
	return iph_totlen(skb, ip_hdr(skb));
}

/* IPv4 datagram length is stored into 16bit field (tot_len) */
#define IP_MAX_MTU	0xFFFFU

static inline void iph_set_totlen(struct iphdr *iph, unsigned int len)
{
	iph->tot_len = len <= IP_MAX_MTU ? htons(len) : 0;
}
#endif	/* _LINUX_IP_H */
+6 −0
Original line number Diff line number Diff line
@@ -1964,6 +1964,8 @@ enum netdev_ml_priv_type {
 *	@gso_max_segs:	Maximum number of segments that can be passed to the
 *			NIC for GSO
 *	@tso_max_segs:	Device (as in HW) limit on the max TSO segment count
 * 	@gso_ipv4_max_size:	Maximum size of generic segmentation offload,
 * 				for IPv4.
 *
 *	@dcbnl_ops:	Data Center Bridging netlink ops
 *	@num_tc:	Number of traffic classes in the net device
@@ -2004,6 +2006,8 @@ enum netdev_ml_priv_type {
 *			keep a list of interfaces to be deleted.
 *	@gro_max_size:	Maximum size of aggregated packet in generic
 *			receive offload (GRO)
 * 	@gro_ipv4_max_size:	Maximum size of aggregated packet in generic
 * 				receive offload (GRO), for IPv4.
 *
 *	@dev_addr_shadow:	Copy of @dev_addr to catch direct writes.
 *	@linkwatch_dev_tracker:	refcount tracker used by linkwatch.
@@ -2207,6 +2211,7 @@ struct net_device {
 */
#define GRO_MAX_SIZE		(8 * 65535u)
	unsigned int		gro_max_size;
	unsigned int		gro_ipv4_max_size;
	rx_handler_func_t __rcu	*rx_handler;
	void __rcu		*rx_handler_data;

@@ -2330,6 +2335,7 @@ struct net_device {
	u16			gso_max_segs;
#define TSO_MAX_SEGS		U16_MAX
	u16			tso_max_segs;
	unsigned int		gso_ipv4_max_size;

#ifdef CONFIG_DCB
	const struct dcbnl_rtnl_ops *dcbnl_ops;
+2 −2
Original line number Diff line number Diff line
@@ -29,7 +29,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt)
	if (iph->ihl < 5 || iph->version != 4)
		return -1;

	len = ntohs(iph->tot_len);
	len = iph_totlen(pkt->skb, iph);
	thoff = iph->ihl * 4;
	if (pkt->skb->len < len)
		return -1;
@@ -64,7 +64,7 @@ static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt)
	if (iph->ihl < 5 || iph->version != 4)
		goto inhdr_error;

	len = ntohs(iph->tot_len);
	len = iph_totlen(pkt->skb, iph);
	thoff = iph->ihl * 4;
	if (pkt->skb->len < len) {
		__IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS);
+0 −3
Original line number Diff line number Diff line
@@ -35,9 +35,6 @@
#include <linux/cache.h>
#include <linux/security.h>

/* IPv4 datagram length is stored into 16bit field (tot_len) */
#define IP_MAX_MTU	0xFFFFU

#define RTO_ONLINK	0x01

#define RT_CONN_FLAGS(sk)   (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
Loading