Commit 39c536ac authored by Daniel Borkmann
Browse files

Merge branch 'xdp-ice-mbuf'

Alexander Lobakin says:

====================
The set grew from the poor performance of %BPF_F_TEST_XDP_LIVE_FRAMES
when the ice-backed device is a sender. Initially there were around
3.3 Mpps / thread, while I have 5.5 on skb-based pktgen ...

After fixing 0005 (0004 is a prereq for it) first (strange thing nobody
noticed that earlier), I started catching random OOMs. This is how 0002
(and partially 0001) appeared.

0003 is a suggestion from Maciej to not waste time on refactoring dead
lines. 0006 is a "cherry on top" to get away with the final 6.7 Mpps.
4.5 of 6 are fixes, but only the first three are tagged, since it then
starts being tricky. I may backport them manually later on.

TL;DR for the series is that shortcuts are good, but only as long as
they don't make the driver miss important things. %XDP_TX is purely
driver-local, however .ndo_xdp_xmit() is not, and sometimes assumptions
can be unsafe there.

With that series and also one core code patch[0], "live frames" and
xdp-trafficgen are now safe'n'fast on ice (probably more to come).

  [0] https://lore.kernel.org/all/20230209172827.874728-1-alexandr.lobakin@intel.com


====================

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents 0b075724 ad07f29b
Loading
Loading
Loading
Loading
+43 −24
Original line number Diff line number Diff line
@@ -85,7 +85,7 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc,
	td_cmd = ICE_TXD_LAST_DESC_CMD | ICE_TX_DESC_CMD_DUMMY |
		 ICE_TX_DESC_CMD_RE;

	tx_buf->tx_flags = ICE_TX_FLAGS_DUMMY_PKT;
	tx_buf->type = ICE_TX_BUF_DUMMY;
	tx_buf->raw_buf = raw_packet;

	tx_desc->cmd_type_offset_bsz =
@@ -112,31 +112,29 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc,
static void
ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf)
{
	if (tx_buf->skb) {
		if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) {
			devm_kfree(ring->dev, tx_buf->raw_buf);
		} else if (ice_ring_is_xdp(ring)) {
			if (ring->xsk_pool)
				xsk_buff_free(tx_buf->xdp);
			else
				page_frag_free(tx_buf->raw_buf);
		} else {
			dev_kfree_skb_any(tx_buf->skb);
		}
	if (dma_unmap_len(tx_buf, len))
			dma_unmap_single(ring->dev,
					 dma_unmap_addr(tx_buf, dma),
					 dma_unmap_len(tx_buf, len),
					 DMA_TO_DEVICE);
	} else if (dma_unmap_len(tx_buf, len)) {
		dma_unmap_page(ring->dev,
			       dma_unmap_addr(tx_buf, dma),
			       dma_unmap_len(tx_buf, len),
			       DMA_TO_DEVICE);

	switch (tx_buf->type) {
	case ICE_TX_BUF_DUMMY:
		devm_kfree(ring->dev, tx_buf->raw_buf);
		break;
	case ICE_TX_BUF_SKB:
		dev_kfree_skb_any(tx_buf->skb);
		break;
	case ICE_TX_BUF_XDP_TX:
		page_frag_free(tx_buf->raw_buf);
		break;
	case ICE_TX_BUF_XDP_XMIT:
		xdp_return_frame(tx_buf->xdpf);
		break;
	}

	tx_buf->next_to_watch = NULL;
	tx_buf->skb = NULL;
	tx_buf->type = ICE_TX_BUF_EMPTY;
	dma_unmap_len_set(tx_buf, len, 0);
	/* tx_buf must be completely set up in the transmit path */
}
@@ -269,7 +267,7 @@ static bool ice_clean_tx_irq(struct ice_tx_ring *tx_ring, int napi_budget)
				 DMA_TO_DEVICE);

		/* clear tx_buf data */
		tx_buf->skb = NULL;
		tx_buf->type = ICE_TX_BUF_EMPTY;
		dma_unmap_len_set(tx_buf, len, 0);

		/* unmap remaining buffers */
@@ -580,7 +578,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
	case XDP_TX:
		if (static_branch_unlikely(&ice_xdp_locking_key))
			spin_lock(&xdp_ring->tx_lock);
		ret = __ice_xmit_xdp_ring(xdp, xdp_ring);
		ret = __ice_xmit_xdp_ring(xdp, xdp_ring, false);
		if (static_branch_unlikely(&ice_xdp_locking_key))
			spin_unlock(&xdp_ring->tx_lock);
		if (ret == ICE_XDP_CONSUMED)
@@ -607,6 +605,25 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
		ice_set_rx_bufs_act(xdp, rx_ring, ret);
}

/**
 * ice_xmit_xdp_ring - submit frame to XDP ring for transmission
 * @xdpf: XDP frame that will be converted to XDP buff
 * @xdp_ring: XDP ring for transmission
 */
static int ice_xmit_xdp_ring(const struct xdp_frame *xdpf,
			     struct ice_tx_ring *xdp_ring)
{
	struct xdp_buff xdp;

	xdp.data_hard_start = (void *)xdpf;
	xdp.data = xdpf->data;
	xdp.data_end = xdp.data + xdpf->len;
	xdp.frame_sz = xdpf->frame_sz;
	xdp.flags = xdpf->flags;

	return __ice_xmit_xdp_ring(&xdp, xdp_ring, true);
}

/**
 * ice_xdp_xmit - submit packets to XDP ring for transmission
 * @dev: netdev
@@ -652,7 +669,7 @@ ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,

	tx_buf = &xdp_ring->tx_buf[xdp_ring->next_to_use];
	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];
		const struct xdp_frame *xdpf = frames[i];
		int err;

		err = ice_xmit_xdp_ring(xdpf, xdp_ring);
@@ -1712,6 +1729,7 @@ ice_tx_map(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first,
				       DMA_TO_DEVICE);

		tx_buf = &tx_ring->tx_buf[i];
		tx_buf->type = ICE_TX_BUF_FRAG;
	}

	/* record SW timestamp if HW timestamp is not available */
@@ -2355,6 +2373,7 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring)
	/* record the location of the first descriptor for this packet */
	first = &tx_ring->tx_buf[tx_ring->next_to_use];
	first->skb = skb;
	first->type = ICE_TX_BUF_SKB;
	first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
	first->gso_segs = 1;
	first->tx_flags = 0;
@@ -2527,11 +2546,11 @@ void ice_clean_ctrl_tx_irq(struct ice_tx_ring *tx_ring)
					 dma_unmap_addr(tx_buf, dma),
					 dma_unmap_len(tx_buf, len),
					 DMA_TO_DEVICE);
		if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT)
		if (tx_buf->type == ICE_TX_BUF_DUMMY)
			devm_kfree(tx_ring->dev, tx_buf->raw_buf);

		/* clear next_to_watch to prevent false hangs */
		tx_buf->raw_buf = NULL;
		tx_buf->type = ICE_TX_BUF_EMPTY;
		tx_buf->tx_flags = 0;
		tx_buf->next_to_watch = NULL;
		dma_unmap_len_set(tx_buf, len, 0);
+28 −9
Original line number Diff line number Diff line
@@ -121,10 +121,7 @@ static inline int ice_skb_pad(void)
#define ICE_TX_FLAGS_TSO	BIT(0)
#define ICE_TX_FLAGS_HW_VLAN	BIT(1)
#define ICE_TX_FLAGS_SW_VLAN	BIT(2)
/* ICE_TX_FLAGS_DUMMY_PKT is used to mark dummy packets that should be
 * freed instead of returned like skb packets.
 */
#define ICE_TX_FLAGS_DUMMY_PKT	BIT(3)
/* BIT(3) is free, was ICE_TX_FLAGS_DUMMY_PKT */
#define ICE_TX_FLAGS_TSYN	BIT(4)
#define ICE_TX_FLAGS_IPV4	BIT(5)
#define ICE_TX_FLAGS_IPV6	BIT(6)
@@ -149,14 +146,35 @@ static inline int ice_skb_pad(void)

#define ICE_TXD_LAST_DESC_CMD (ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS)

/**
 * enum ice_tx_buf_type - what a &ice_tx_buf holds, i.e. how to complete it
 * @ICE_TX_BUF_EMPTY: slot is unused or carries an XSk frame; nothing to do
 * @ICE_TX_BUF_DUMMY: dummy Flow Director packet; unmap DMA and kfree() it
 * @ICE_TX_BUF_FRAG: mapped fragment of an skb or of an &xdp_buff; unmap
 *		     DMA only
 * @ICE_TX_BUF_SKB: head of an &sk_buff; unmap DMA, free the skb and
 *		    account it in the stats
 * @ICE_TX_BUF_XDP_TX: head of an &xdp_buff; unmap DMA, page_frag_free()
 *		       and account it in the stats
 * @ICE_TX_BUF_XDP_XMIT: head of an &xdp_frame; unmap DMA,
 *			 xdp_return_frame() and account it in the stats
 * @ICE_TX_BUF_XSK_TX: &xdp_buff on an XSk queue; xsk_buff_free() and
 *		       account it in the stats
 *
 * %ICE_TX_BUF_EMPTY is zero, so a zeroed &ice_tx_buf reads as empty.
 */
enum ice_tx_buf_type {
	ICE_TX_BUF_EMPTY	= 0U,
	ICE_TX_BUF_DUMMY	= 1U,
	ICE_TX_BUF_FRAG		= 2U,
	ICE_TX_BUF_SKB		= 3U,
	ICE_TX_BUF_XDP_TX	= 4U,
	ICE_TX_BUF_XDP_XMIT	= 5U,
	ICE_TX_BUF_XSK_TX	= 6U,
};

struct ice_tx_buf {
	union {
		struct ice_tx_desc *next_to_watch;
		u32 rs_idx;
	};
	union {
		struct sk_buff *skb;
		void *raw_buf; /* used for XDP */
		void *raw_buf;		/* used for XDP_TX and FDir rules */
		struct sk_buff *skb;	/* used for .ndo_start_xmit() */
		struct xdp_frame *xdpf;	/* used for .ndo_xdp_xmit() */
		struct xdp_buff *xdp;	/* used for XDP_TX ZC */
	};
	unsigned int bytecount;
@@ -164,7 +182,8 @@ struct ice_tx_buf {
		unsigned int gso_segs;
		unsigned int nr_frags;	/* used for mbuf XDP */
	};
	u32 tx_flags;
	u32 type:16;			/* &ice_tx_buf_type */
	u32 tx_flags:16;
	DEFINE_DMA_UNMAP_LEN(len);
	DEFINE_DMA_UNMAP_ADDR(dma);
};
+56 −32
Original line number Diff line number Diff line
@@ -222,18 +222,28 @@ ice_receive_skb(struct ice_rx_ring *rx_ring, struct sk_buff *skb, u16 vlan_tag)

/**
 * ice_clean_xdp_tx_buf - Free and unmap XDP Tx buffer
 * @xdp_ring: XDP Tx ring
 * @dev: device for DMA mapping
 * @tx_buf: Tx buffer to clean
 * @bq: XDP bulk flush struct
 */
static void
ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf)
ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf *tx_buf,
		     struct xdp_frame_bulk *bq)
{
	dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma),
	dma_unmap_single(dev, dma_unmap_addr(tx_buf, dma),
			 dma_unmap_len(tx_buf, len), DMA_TO_DEVICE);
	dma_unmap_len_set(tx_buf, len, 0);
	xdp_ring->xdp_tx_active--;

	switch (tx_buf->type) {
	case ICE_TX_BUF_XDP_TX:
		page_frag_free(tx_buf->raw_buf);
	tx_buf->raw_buf = NULL;
		break;
	case ICE_TX_BUF_XDP_XMIT:
		xdp_return_frame_bulk(tx_buf->xdpf, bq);
		break;
	}

	tx_buf->type = ICE_TX_BUF_EMPTY;
}

/**
@@ -243,11 +253,13 @@ ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf)
static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
{
	int total_bytes = 0, total_pkts = 0;
	struct device *dev = xdp_ring->dev;
	u32 ntc = xdp_ring->next_to_clean;
	struct ice_tx_desc *tx_desc;
	u32 cnt = xdp_ring->count;
	struct xdp_frame_bulk bq;
	u32 frags, xdp_tx = 0;
	u32 ready_frames = 0;
	u32 frags;
	u32 idx;
	u32 ret;

@@ -261,12 +273,16 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
			ready_frames = idx + cnt - ntc + 1;
	}

	if (!ready_frames)
	if (unlikely(!ready_frames))
		return 0;
	ret = ready_frames;

	xdp_frame_bulk_init(&bq);
	rcu_read_lock(); /* xdp_return_frame_bulk() */

	while (ready_frames) {
		struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc];
		struct ice_tx_buf *head = tx_buf;

		/* bytecount holds size of head + frags */
		total_bytes += tx_buf->bytecount;
@@ -274,11 +290,8 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
		total_pkts++;
		/* count head + frags */
		ready_frames -= frags + 1;
		xdp_tx++;

		if (xdp_ring->xsk_pool)
			xsk_buff_free(tx_buf->xdp);
		else
			ice_clean_xdp_tx_buf(xdp_ring, tx_buf);
		ntc++;
		if (ntc == cnt)
			ntc = 0;
@@ -286,15 +299,21 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
		for (int i = 0; i < frags; i++) {
			tx_buf = &xdp_ring->tx_buf[ntc];

			ice_clean_xdp_tx_buf(xdp_ring, tx_buf);
			ice_clean_xdp_tx_buf(dev, tx_buf, &bq);
			ntc++;
			if (ntc == cnt)
				ntc = 0;
		}

		ice_clean_xdp_tx_buf(dev, head, &bq);
	}

	xdp_flush_frame_bulk(&bq);
	rcu_read_unlock();

	tx_desc->cmd_type_offset_bsz = 0;
	xdp_ring->next_to_clean = ntc;
	xdp_ring->xdp_tx_active -= xdp_tx;
	ice_update_tx_ring_stats(xdp_ring, total_pkts, total_bytes);

	return ret;
@@ -304,8 +323,10 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
 * __ice_xmit_xdp_ring - submit frame to XDP ring for transmission
 * @xdp: XDP buffer to be placed onto Tx descriptors
 * @xdp_ring: XDP ring for transmission
 * @frame: whether this comes from .ndo_xdp_xmit()
 */
int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring)
int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring,
			bool frame)
{
	struct skb_shared_info *sinfo = NULL;
	u32 size = xdp->data_end - xdp->data;
@@ -321,17 +342,17 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring)
	u32 frag = 0;

	free_space = ICE_DESC_UNUSED(xdp_ring);

	if (ICE_DESC_UNUSED(xdp_ring) < ICE_RING_QUARTER(xdp_ring))
	if (free_space < ICE_RING_QUARTER(xdp_ring))
		free_space += ice_clean_xdp_irq(xdp_ring);

	if (unlikely(!free_space))
		goto busy;

	if (unlikely(xdp_buff_has_frags(xdp))) {
		sinfo = xdp_get_shared_info_from_buff(xdp);
		nr_frags = sinfo->nr_frags;
		if (free_space < nr_frags + 1) {
			xdp_ring->ring_stats->tx_stats.tx_busy++;
			return ICE_XDP_CONSUMED;
		}
		if (free_space < nr_frags + 1)
			goto busy;
	}

	tx_desc = ICE_TX_DESC(xdp_ring, ntu);
@@ -349,9 +370,15 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring)
		dma_unmap_len_set(tx_buf, len, size);
		dma_unmap_addr_set(tx_buf, dma, dma);

		if (frame) {
			tx_buf->type = ICE_TX_BUF_FRAG;
		} else {
			tx_buf->type = ICE_TX_BUF_XDP_TX;
			tx_buf->raw_buf = data;
		}

		tx_desc->buf_addr = cpu_to_le64(dma);
		tx_desc->cmd_type_offset_bsz = ice_build_ctob(0, 0, size, 0);
		tx_buf->raw_buf = data;

		ntu++;
		if (ntu == cnt)
@@ -372,6 +399,11 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring)
	tx_head->bytecount = xdp_get_buff_len(xdp);
	tx_head->nr_frags = nr_frags;

	if (frame) {
		tx_head->type = ICE_TX_BUF_XDP_XMIT;
		tx_head->xdpf = xdp->data_hard_start;
	}

	/* update last descriptor from a frame with EOP */
	tx_desc->cmd_type_offset_bsz |=
		cpu_to_le64(ICE_TX_DESC_CMD_EOP << ICE_TXD_QW1_CMD_S);
@@ -395,19 +427,11 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring)
		ntu--;
	}
	return ICE_XDP_CONSUMED;
}

/**
 * ice_xmit_xdp_ring - submit frame to XDP ring for transmission
 * @xdpf: XDP frame that will be converted to XDP buff
 * @xdp_ring: XDP ring for transmission
 */
int ice_xmit_xdp_ring(struct xdp_frame *xdpf, struct ice_tx_ring *xdp_ring)
{
	struct xdp_buff xdp;
busy:
	xdp_ring->ring_stats->tx_stats.tx_busy++;

	xdp_convert_frame_to_buff(xdpf, &xdp);
	return __ice_xmit_xdp_ring(&xdp, xdp_ring);
	return ICE_XDP_CONSUMED;
}

/**
+2 −2
Original line number Diff line number Diff line
@@ -142,8 +142,8 @@ static inline u32 ice_set_rs_bit(const struct ice_tx_ring *xdp_ring)

void ice_finalize_xdp_rx(struct ice_tx_ring *xdp_ring, unsigned int xdp_res, u32 first_idx);
int ice_xmit_xdp_buff(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring);
int ice_xmit_xdp_ring(struct xdp_frame *xdpf, struct ice_tx_ring *xdp_ring);
int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring);
int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring,
			bool frame);
void ice_release_rx_desc(struct ice_rx_ring *rx_ring, u16 val);
void
ice_process_skb_fields(struct ice_rx_ring *rx_ring,
+7 −5
Original line number Diff line number Diff line
@@ -631,7 +631,8 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring)
	for (i = 0; i < xsk_frames; i++) {
		tx_buf = &xdp_ring->tx_buf[ntc];

		if (tx_buf->xdp) {
		if (tx_buf->type == ICE_TX_BUF_XSK_TX) {
			tx_buf->type = ICE_TX_BUF_EMPTY;
			xsk_buff_free(tx_buf->xdp);
			xdp_ring->xdp_tx_active--;
		} else {
@@ -685,6 +686,7 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp,

	tx_buf = &xdp_ring->tx_buf[ntu];
	tx_buf->xdp = xdp;
	tx_buf->type = ICE_TX_BUF_XSK_TX;
	tx_desc = ICE_TX_DESC(xdp_ring, ntu);
	tx_desc->buf_addr = cpu_to_le64(dma);
	tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP,
@@ -1083,12 +1085,12 @@ void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring)
	while (ntc != ntu) {
		struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc];

		if (tx_buf->xdp)
		if (tx_buf->type == ICE_TX_BUF_XSK_TX) {
			tx_buf->type = ICE_TX_BUF_EMPTY;
			xsk_buff_free(tx_buf->xdp);
		else
		} else {
			xsk_frames++;

		tx_buf->raw_buf = NULL;
		}

		ntc++;
		if (ntc >= xdp_ring->count)