Commit 7fa2e481 authored by David S. Miller
Browse files

Merge branch 'big-tcp'

Eric Dumazet says:

====================
tcp: BIG TCP implementation

This series implements BIG TCP as presented in netdev 0x15:

https://netdevconf.info/0x15/session.html?BIG-TCP

Jonathan Corbet made a nice summary: https://lwn.net/Articles/884104/



Standard TSO/GRO packet limit is 64KB

With BIG TCP, we allow bigger TSO/GRO packet sizes for IPv6 traffic.

Note that this feature is by default not enabled, because it might
break some eBPF programs assuming TCP header immediately follows IPv6 header.

While tcpdump recognizes the HBH/Jumbo header, standard pcap filters
are unable to skip over IPv6 extension headers.

Reducing number of packets traversing networking stack usually improves
performance, as shown on this experiment using a 100Gbit NIC, and 4K MTU.

'Standard' performance with current (74KB) limits.
for i in {1..10}; do ./netperf -t TCP_RR -H iroa23  -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done
77           138          183          8542.19
79           143          178          8215.28
70           117          164          9543.39
80           144          176          8183.71
78           126          155          9108.47
80           146          184          8115.19
71           113          165          9510.96
74           113          164          9518.74
79           137          178          8575.04
73           111          171          9561.73

Now enable BIG TCP on both hosts.

ip link set dev eth0 gro_max_size 185000 gso_max_size 185000
for i in {1..10}; do ./netperf -t TCP_RR -H iroa23  -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done
57           83           117          13871.38
64           118          155          11432.94
65           116          148          11507.62
60           105          136          12645.15
60           103          135          12760.34
60           102          134          12832.64
62           109          132          10877.68
58           82           115          14052.93
57           83           124          14212.58
57           82           119          14196.01

We see an increase of transactions per second, and lower latencies as well.

v7: adopt unsafe_memcpy() in mlx5 to avoid FORTIFY warnings.

v6: fix a compilation error for CONFIG_IPV6=n in
    "net: allow gso_max_size to exceed 65536", reported by kernel bots.

v5: Replaced two patches (that were adding new attributes) with patches
    from Alexander Duyck. Idea is to reuse existing gso_max_size/gro_max_size

v4: Rebased on top of Jakub series (Merge branch 'tso-gso-limit-split')
    max_tso_size is now family independent.

v3: Fixed a typo in RFC number (Alexander)
    Added Reviewed-by: tags from Tariq on mlx4/mlx5 parts.

v2: Removed the MAX_SKB_FRAGS change, this belongs to a different series.
    Addressed feedback, for Alexander and nvidia folks.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 5cf15ce3 de78960e
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -151,7 +151,8 @@
#define XGBE_TX_MAX_BUF_SIZE	(0x3fff & ~(64 - 1))

/* Descriptors required for maximum contiguous TSO/GSO packet */
#define XGBE_TX_MAX_SPLIT	((GSO_MAX_SIZE / XGBE_TX_MAX_BUF_SIZE) + 1)
#define XGBE_TX_MAX_SPLIT	\
	((GSO_LEGACY_MAX_SIZE / XGBE_TX_MAX_BUF_SIZE) + 1)

/* Maximum possible descriptors needed for an SKB:
 * - Maximum number of SKB frags
+3 −0
Original line number Diff line number Diff line
@@ -3417,6 +3417,9 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = priv->max_mtu;

	/* supports LSOv2 packets. */
	netif_set_tso_max_size(dev, GSO_MAX_SIZE);

	mdev->pndev[port] = dev;
	mdev->upper[port] = NULL;

+38 −9
Original line number Diff line number Diff line
@@ -43,6 +43,7 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/indirect_call_wrapper.h>
#include <net/ipv6.h>

#include "mlx4_en.h"

@@ -634,19 +635,28 @@ static int get_real_size(const struct sk_buff *skb,
			 struct net_device *dev,
			 int *lso_header_size,
			 bool *inline_ok,
			 void **pfrag)
			 void **pfrag,
			 int *hopbyhop)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int real_size;

	if (shinfo->gso_size) {
		*inline_ok = false;
		if (skb->encapsulation)
		*hopbyhop = 0;
		if (skb->encapsulation) {
			*lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb);
		else
		} else {
			/* Detects large IPV6 TCP packets and prepares for removal of
			 * HBH header that has been pushed by ip6_xmit(),
			 * mainly so that tcpdump can dissect them.
			 */
			if (ipv6_has_hopopt_jumbo(skb))
				*hopbyhop = sizeof(struct hop_jumbo_hdr);
			*lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb);
		}
		real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE +
			ALIGN(*lso_header_size + 4, DS_SIZE);
			ALIGN(*lso_header_size - *hopbyhop + 4, DS_SIZE);
		if (unlikely(*lso_header_size != skb_headlen(skb))) {
			/* We add a segment for the skb linear buffer only if
			 * it contains data */
@@ -873,6 +883,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
	int desc_size;
	int real_size;
	u32 index, bf_index;
	struct ipv6hdr *h6;
	__be32 op_own;
	int lso_header_size;
	void *fragptr = NULL;
@@ -881,6 +892,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
	bool stop_queue;
	bool inline_ok;
	u8 data_offset;
	int hopbyhop;
	bool bf_ok;

	tx_ind = skb_get_queue_mapping(skb);
@@ -890,7 +902,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
		goto tx_drop;

	real_size = get_real_size(skb, shinfo, dev, &lso_header_size,
				  &inline_ok, &fragptr);
				  &inline_ok, &fragptr, &hopbyhop);
	if (unlikely(!real_size))
		goto tx_drop_count;

@@ -943,7 +955,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
		data = &tx_desc->data;
		data_offset = offsetof(struct mlx4_en_tx_desc, data);
	} else {
		int lso_align = ALIGN(lso_header_size + 4, DS_SIZE);
		int lso_align = ALIGN(lso_header_size - hopbyhop + 4, DS_SIZE);

		data = (void *)&tx_desc->lso + lso_align;
		data_offset = offsetof(struct mlx4_en_tx_desc, lso) + lso_align;
@@ -1008,14 +1020,31 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
			((ring->prod & ring->size) ?
				cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);

		lso_header_size -= hopbyhop;
		/* Fill in the LSO prefix */
		tx_desc->lso.mss_hdr_size = cpu_to_be32(
			shinfo->gso_size << 16 | lso_header_size);


		if (unlikely(hopbyhop)) {
			/* remove the HBH header.
			 * Layout: [Ethernet header][IPv6 header][HBH][TCP header]
			 */
			memcpy(tx_desc->lso.header, skb->data, ETH_HLEN + sizeof(*h6));
			h6 = (struct ipv6hdr *)((char *)tx_desc->lso.header + ETH_HLEN);
			h6->nexthdr = IPPROTO_TCP;
			/* Copy the TCP header after the IPv6 one */
			memcpy(h6 + 1,
			       skb->data + ETH_HLEN + sizeof(*h6) +
					sizeof(struct hop_jumbo_hdr),
			       tcp_hdrlen(skb));
			/* Leave ipv6 payload_len set to 0, as LSO v2 specs request. */
		} else {
			/* Copy headers;
		 * note that we already verified that it is linear */
			 * note that we already verified that it is linear
			 */
			memcpy(tx_desc->lso.header, skb->data, lso_header_size);

		}
		ring->tso_packets++;

		i = shinfo->gso_segs;
+1 −0
Original line number Diff line number Diff line
@@ -4920,6 +4920,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)

	netdev->priv_flags       |= IFF_UNICAST_FLT;

	netif_set_tso_max_size(netdev, GSO_MAX_SIZE);
	mlx5e_set_netdev_dev_addr(netdev);
	mlx5e_ipsec_build_netdev(priv);
	mlx5e_ktls_build_netdev(priv);
+1 −1
Original line number Diff line number Diff line
@@ -2038,7 +2038,7 @@ mlx5e_hw_gro_skb_has_enough_space(struct sk_buff *skb, u16 data_bcnt)
{
	int nr_frags = skb_shinfo(skb)->nr_frags;

	return PAGE_SIZE * nr_frags + data_bcnt <= GSO_MAX_SIZE;
	return PAGE_SIZE * nr_frags + data_bcnt <= GRO_LEGACY_MAX_SIZE;
}

static void
Loading