Commit b39212d5 authored by Jakub Kicinski
Browse files
Tony Nguyen says:

====================
i40e: support XDP multi-buffer

Tirthendu Sarkar says:

This patchset adds multi-buffer support for XDP. Tx side already has
support for multi-buffer. This patchset focuses on Rx side. The last
patch contains actual multi-buffer changes while the previous ones are
preparatory patches.

On receiving the first buffer of a packet, xdp_buff is built and its
subsequent buffers are added to it as frags. While 'next_to_clean' keeps
pointing to the first descriptor, the newly introduced 'next_to_process'
keeps track of every descriptor for the packet.

On receiving the EOP buffer, the XDP program is called and the appropriate
action is taken (building skb for XDP_PASS, reusing page for XDP_DROP,
adjusting page offsets for XDP_{REDIRECT,TX}).

The patchset also streamlines page offset adjustments for buffer reuse
to make it easier to post process the rx_buffers after running XDP prog.

With this patchset there does not seem to be any performance degradation
for XDP_PASS and some improvement (~1% for XDP_TX, ~5% for XDP_DROP) when
measured using xdp_rxq_info program from samples/bpf/ for 64B packets.

v1: https://lore.kernel.org/netdev/20230306210822.3381942-1-anthony.l.nguyen@intel.com/

* '40GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue:
  i40e: add support for XDP multi-buffer Rx
  i40e: add xdp_buff to i40e_ring struct
  i40e: introduce next_to_process to i40e_ring
  i40e: use frame_sz instead of recalculating truesize for building skb
  i40e: Change size to truesize when using i40e_rx_buffer_flip()
  i40e: add pre-xdp page_count in rx_buffer
  i40e: change Rx buffer size for legacy-rx to support XDP multi-buffer
  i40e: consolidate maximum frame size calculation for vsi
====================

Link: https://lore.kernel.org/r/20230309212819.1198218-1-anthony.l.nguyen@intel.com


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents c66b2111 e213ced1
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -5402,6 +5402,13 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags)
		return -EOPNOTSUPP;
	}

	if ((changed_flags & I40E_FLAG_LEGACY_RX) &&
	    I40E_2K_TOO_SMALL_WITH_PADDING) {
		dev_warn(&pf->pdev->dev,
			 "2k Rx buffer is too small to fit standard MTU and skb_shared_info\n");
		return -EOPNOTSUPP;
	}

	if ((changed_flags & new_flags &
	     I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED) &&
	    (new_flags & I40E_FLAG_MFP_ENABLED))
+51 −37
Original line number Diff line number Diff line
@@ -2896,15 +2896,35 @@ static void i40e_sync_filters_subtask(struct i40e_pf *pf)
}

/**
 * i40e_max_xdp_frame_size - returns the maximum allowed frame size for XDP
 * i40e_calculate_vsi_rx_buf_len - Calculates buffer length
 *
 * @vsi: VSI to calculate rx_buf_len from
 */
static u16 i40e_calculate_vsi_rx_buf_len(struct i40e_vsi *vsi)
{
	/* A VSI without a netdev, or a PF with the legacy-rx flag set, gets
	 * I40E_RXBUFFER_2048 reduced by SKB_WITH_OVERHEAD() (presumably to
	 * leave room for skb_shared_info -- confirm against the macro).
	 */
	if (!vsi->netdev || (vsi->back->flags & I40E_FLAG_LEGACY_RX))
		return SKB_WITH_OVERHEAD(I40E_RXBUFFER_2048);

	/* Otherwise the length depends only on the page size:
	 * I40E_RXBUFFER_3072 when PAGE_SIZE is below 8192, else
	 * I40E_RXBUFFER_2048. The MTU is not consulted here.
	 */
	return PAGE_SIZE < 8192 ? I40E_RXBUFFER_3072 : I40E_RXBUFFER_2048;
}

/**
 * i40e_max_vsi_frame_size - returns the maximum allowed frame size for VSI
 * @vsi: the vsi
 * @xdp_prog: XDP program
 **/
static int i40e_max_xdp_frame_size(struct i40e_vsi *vsi)
static int i40e_max_vsi_frame_size(struct i40e_vsi *vsi,
				   struct bpf_prog *xdp_prog)
{
	if (PAGE_SIZE >= 8192 || (vsi->back->flags & I40E_FLAG_LEGACY_RX))
		return I40E_RXBUFFER_2048;
	u16 rx_buf_len = i40e_calculate_vsi_rx_buf_len(vsi);
	u16 chain_len;

	if (xdp_prog && !xdp_prog->aux->xdp_has_frags)
		chain_len = 1;
	else
		return I40E_RXBUFFER_3072;
		chain_len = I40E_MAX_CHAINED_RX_BUFFERS;

	return min_t(u16, rx_buf_len * chain_len, I40E_MAX_RXBUFFER);
}

/**
@@ -2919,11 +2939,12 @@ static int i40e_change_mtu(struct net_device *netdev, int new_mtu)
	struct i40e_netdev_priv *np = netdev_priv(netdev);
	struct i40e_vsi *vsi = np->vsi;
	struct i40e_pf *pf = vsi->back;
	int frame_size;

	if (i40e_enabled_xdp_vsi(vsi)) {
		int frame_size = new_mtu + I40E_PACKET_HDR_PAD;

		if (frame_size > i40e_max_xdp_frame_size(vsi))
	frame_size = i40e_max_vsi_frame_size(vsi, vsi->xdp_prog);
	if (new_mtu > frame_size - I40E_PACKET_HDR_PAD) {
		netdev_err(netdev, "Error changing mtu to %d, Max is %d\n",
			   new_mtu, frame_size - I40E_PACKET_HDR_PAD);
		return -EINVAL;
	}

@@ -3595,6 +3616,8 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
		}
	}

	xdp_init_buff(&ring->xdp, i40e_rx_pg_size(ring) / 2, &ring->xdp_rxq);

	rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len,
				    BIT_ULL(I40E_RXQ_CTX_DBUFF_SHIFT));

@@ -3640,10 +3663,16 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
	}

	/* configure Rx buffer alignment */
	if (!vsi->netdev || (vsi->back->flags & I40E_FLAG_LEGACY_RX))
	if (!vsi->netdev || (vsi->back->flags & I40E_FLAG_LEGACY_RX)) {
		if (I40E_2K_TOO_SMALL_WITH_PADDING) {
			dev_info(&vsi->back->pdev->dev,
				 "2k Rx buffer is too small to fit standard MTU and skb_shared_info\n");
			return -EOPNOTSUPP;
		}
		clear_ring_build_skb_enabled(ring);
	else
	} else {
		set_ring_build_skb_enabled(ring);
	}

	ring->rx_offset = i40e_rx_offset(ring);

@@ -3693,24 +3722,6 @@ static int i40e_vsi_configure_tx(struct i40e_vsi *vsi)
	return err;
}

/**
 * i40e_calculate_vsi_rx_buf_len - Calculates buffer length
 *
 * @vsi: VSI to calculate rx_buf_len from
 */
static u16 i40e_calculate_vsi_rx_buf_len(struct i40e_vsi *vsi)
{
	/* No netdev, or legacy-rx flag set on the PF: always use the full
	 * I40E_RXBUFFER_2048.
	 */
	if (!vsi->netdev || (vsi->back->flags & I40E_FLAG_LEGACY_RX))
		return I40E_RXBUFFER_2048;

#if (PAGE_SIZE < 8192)
	/* Only compiled for pages below 8192 bytes: when
	 * I40E_2K_TOO_SMALL_WITH_PADDING is false and the MTU does not exceed
	 * ETH_DATA_LEN, use I40E_RXBUFFER_1536 less NET_IP_ALIGN.
	 */
	if (!I40E_2K_TOO_SMALL_WITH_PADDING && vsi->netdev->mtu <= ETH_DATA_LEN)
		return I40E_RXBUFFER_1536 - NET_IP_ALIGN;
#endif

	/* Fallback: I40E_RXBUFFER_3072 when PAGE_SIZE is below 8192, else
	 * I40E_RXBUFFER_2048.
	 */
	return PAGE_SIZE < 8192 ? I40E_RXBUFFER_3072 : I40E_RXBUFFER_2048;
}

/**
 * i40e_vsi_configure_rx - Configure the VSI for Rx
 * @vsi: the VSI being configured
@@ -3722,13 +3733,15 @@ static int i40e_vsi_configure_rx(struct i40e_vsi *vsi)
	int err = 0;
	u16 i;

	vsi->max_frame = I40E_MAX_RXBUFFER;
	vsi->max_frame = i40e_max_vsi_frame_size(vsi, vsi->xdp_prog);
	vsi->rx_buf_len = i40e_calculate_vsi_rx_buf_len(vsi);

#if (PAGE_SIZE < 8192)
	if (vsi->netdev && !I40E_2K_TOO_SMALL_WITH_PADDING &&
	    vsi->netdev->mtu <= ETH_DATA_LEN)
		vsi->max_frame = I40E_RXBUFFER_1536 - NET_IP_ALIGN;
	    vsi->netdev->mtu <= ETH_DATA_LEN) {
		vsi->rx_buf_len = I40E_RXBUFFER_1536 - NET_IP_ALIGN;
		vsi->max_frame = vsi->rx_buf_len;
	}
#endif

	/* set up individual rings */
@@ -13316,15 +13329,15 @@ static netdev_features_t i40e_features_check(struct sk_buff *skb,
static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog,
			  struct netlink_ext_ack *extack)
{
	int frame_size = vsi->netdev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
	int frame_size = i40e_max_vsi_frame_size(vsi, prog);
	struct i40e_pf *pf = vsi->back;
	struct bpf_prog *old_prog;
	bool need_reset;
	int i;

	/* Don't allow frames that span over multiple buffers */
	if (frame_size > i40e_calculate_vsi_rx_buf_len(vsi)) {
		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
	if (vsi->netdev->mtu > frame_size - I40E_PACKET_HDR_PAD) {
		NL_SET_ERR_MSG_MOD(extack, "MTU too large for linear frames and XDP prog does not support frags");
		return -EINVAL;
	}

@@ -13810,7 +13823,8 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)

		netdev->xdp_features = NETDEV_XDP_ACT_BASIC |
				       NETDEV_XDP_ACT_REDIRECT |
				       NETDEV_XDP_ACT_XSK_ZEROCOPY;
				       NETDEV_XDP_ACT_XSK_ZEROCOPY |
				       NETDEV_XDP_ACT_RX_SG;
	} else {
		/* Relate the VSI_VMDQ name to the VSI_MAIN name. Note that we
		 * are still limited by IFNAMSIZ, but we're adding 'v%d\0' to
+10 −10
Original line number Diff line number Diff line
@@ -162,45 +162,45 @@ DECLARE_EVENT_CLASS(

	TP_PROTO(struct i40e_ring *ring,
		 union i40e_16byte_rx_desc *desc,
		 struct sk_buff *skb),
		 struct xdp_buff *xdp),

	TP_ARGS(ring, desc, skb),
	TP_ARGS(ring, desc, xdp),

	TP_STRUCT__entry(
		__field(void*, ring)
		__field(void*, desc)
		__field(void*, skb)
		__field(void*, xdp)
		__string(devname, ring->netdev->name)
	),

	TP_fast_assign(
		__entry->ring = ring;
		__entry->desc = desc;
		__entry->skb = skb;
		__entry->xdp = xdp;
		__assign_str(devname, ring->netdev->name);
	),

	TP_printk(
		"netdev: %s ring: %p desc: %p skb %p",
		"netdev: %s ring: %p desc: %p xdp %p",
		__get_str(devname), __entry->ring,
		__entry->desc, __entry->skb)
		__entry->desc, __entry->xdp)
);

DEFINE_EVENT(
	i40e_rx_template, i40e_clean_rx_irq,
	TP_PROTO(struct i40e_ring *ring,
		 union i40e_16byte_rx_desc *desc,
		 struct sk_buff *skb),
		 struct xdp_buff *xdp),

	TP_ARGS(ring, desc, skb));
	TP_ARGS(ring, desc, xdp));

DEFINE_EVENT(
	i40e_rx_template, i40e_clean_rx_irq_rx,
	TP_PROTO(struct i40e_ring *ring,
		 union i40e_16byte_rx_desc *desc,
		 struct sk_buff *skb),
		 struct xdp_buff *xdp),

	TP_ARGS(ring, desc, skb));
	TP_ARGS(ring, desc, xdp));

DECLARE_EVENT_CLASS(
	i40e_xmit_template,
+241 −179

File changed.

Preview size limit exceeded, changes collapsed.

+12 −8
Original line number Diff line number Diff line
@@ -277,6 +277,7 @@ struct i40e_rx_buffer {
	struct page *page;
	__u32 page_offset;
	__u16 pagecnt_bias;
	__u32 page_count;
};

struct i40e_queue_stats {
@@ -336,6 +337,17 @@ struct i40e_ring {
	u8 dcb_tc;			/* Traffic class of ring */
	u8 __iomem *tail;

	/* Storing xdp_buff on ring helps in saving the state of partially built
	 * packet when i40e_clean_rx_ring_irq() must return before it sees EOP
	 * and to resume packet building for this ring in the next call to
	 * i40e_clean_rx_ring_irq().
	 */
	struct xdp_buff xdp;

	/* Next descriptor to be processed; next_to_clean is updated only on
	 * processing EOP descriptor
	 */
	u16 next_to_process;
	/* high bit set means dynamic, use accessor routines to read/write.
	 * hardware only supports 2us resolution for the ITR registers.
	 * these values always store the USER setting, and must be converted
@@ -380,14 +392,6 @@ struct i40e_ring {

	struct rcu_head rcu;		/* to avoid race on free */
	u16 next_to_alloc;
	struct sk_buff *skb;		/* When i40e_clean_rx_ring_irq() must
					 * return before it sees the EOP for
					 * the current packet, we save that skb
					 * here and resume receiving this
					 * packet the next time
					 * i40e_clean_rx_ring_irq() is called
					 * for this ring.
					 */

	struct i40e_channel *ch;
	u16 rx_offset;