Commit 7079d5e6 authored by Jakub Kicinski

Merge tag 'mlx5-updates-2023-03-28' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux

Saeed Mahameed says:

====================
mlx5-updates-2023-03-28

Dragos Tatulea says:
====================

net/mlx5e: RX, Drop page_cache and fully use page_pool

For page allocation on the rx path, the mlx5e driver has been using an
internal page cache in tandem with the page pool. The internal page
cache uses a queue for page recycling, which suffers from head-of-queue
blocking.
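
As a rough illustration only (not the mlx5e code; the struct and helper
names below are made up), a FIFO page cache can only reuse the page at
the head of the queue, so a single in-flight page blocks every page
queued behind it:

  /* Illustrative sketch of head-of-queue blocking in a FIFO page cache. */
  #include <linux/mm.h>
  #include <linux/page_ref.h>

  #define RX_CACHE_SIZE 128

  struct rx_page_cache {
          u32 head;
          u32 tail;
          struct page *pages[RX_CACHE_SIZE];
  };

  static struct page *rx_cache_get(struct rx_page_cache *cache)
  {
          struct page *page;

          if (cache->head == cache->tail)
                  return NULL;    /* cache empty */

          page = cache->pages[cache->head % RX_CACHE_SIZE];
          if (page_ref_count(page) != 1)
                  return NULL;    /* head still in use: everything behind it is blocked */

          cache->head++;
          return page;
  }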

This patch series drops the internal page_cache altogether and uses the
page_pool to implement everything that was done by the page_cache
before (see the sketch after this list):
* Let the page_pool handle dma mapping and unmapping.
* Use fragmented pages with fragment counter instead of tracking via
  page ref.
* Enable skb recycling.
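
A minimal sketch (not the mlx5e implementation; rx_frag and
rq_create_page_pool() are hypothetical names) of the three page_pool
features listed above: pool-owned DMA mapping/sync, the fragment
counter, and skb recycling:

  /* Illustrative only: the page_pool features the series relies on. */
  #include <linux/device.h>
  #include <linux/dma-direction.h>
  #include <linux/skbuff.h>
  #include <net/page_pool.h>

  struct rx_frag {
          struct page *page;
          u16 frags;      /* outstanding fragments on this page */
  };

  static struct page_pool *rq_create_page_pool(struct device *dev, int node)
  {
          struct page_pool_params pp = {
                  /* Let the pool own DMA mapping and device sync. */
                  .flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
                  .order     = 0,
                  .pool_size = 256,
                  .nid       = node,
                  .dev       = dev,
                  .dma_dir   = DMA_FROM_DEVICE,
                  .max_len   = PAGE_SIZE,
          };

          return page_pool_create(&pp);
  }

  static int rx_frag_alloc(struct page_pool *pool, struct rx_frag *frag)
  {
          frag->page = page_pool_dev_alloc_pages(pool);
          if (!frag->page)
                  return -ENOMEM;

          /* Track users via the pool's fragment counter, not page refcounts. */
          page_pool_fragment_page(frag->page, 1);
          frag->frags = 1;
          return 0;
  }

  static void rx_finish_skb(struct sk_buff *skb)
  {
          /* Let the stack return the pages to the pool when the skb is freed. */
          skb_mark_for_recycle(skb);
  }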

The patch series has the following effects on the rx path:

* Improved performance for cases where page recycling was low due to
  head-of-queue blocking in the internal page_cache. The test for this
  was a single iperf TCP stream to an rx queue bound to the same CPU as
  the application.

  |-------------+--------+--------+------+---------|
  | rq type     | before | after  | unit |   diff  |
  |-------------+--------+--------+------+---------|
  | striding rq |  30.1  |  31.4  | Gbps |  4.14 % |
  | legacy rq   |  30.2  |  33.0  | Gbps |  8.48 % |
  |-------------+--------+--------+------+---------|

* Small XDP performance degradation. The test was an XDP drop program
  running on a single rx queue with small incoming packets; the results
  look like this:

  |-------------+----------+----------+------+---------|
  | rq type     | before   | after    | unit |   diff  |
  |-------------+----------+----------+------+---------|
  | striding rq | 19725449 | 18544617 | pps  | -6.37 % |
  | legacy rq   | 19879931 | 18631841 | pps  | -6.70 % |
  |-------------+----------+----------+------+---------|

  This will be handled in a different patch series by adding support for
  multi-packet per page.

* For other cases the performance is roughly the same.

The above numbers were obtained on the following system:
  24 core Intel(R) Xeon(R) Platinum 8380 CPU @ 2.30GHz
  32 GB RAM
  ConnectX-7 single port

The breakdown of the patch series is the following:
* Preparations for introducing the mlx5e_frag_page struct.
* Delete the mlx5e_page_cache struct.
* Enable dma mapping from page_pool.
* Enable skb recycling and fragment counting.
* Do deferred release of pages (just before alloc) to ensure better
  page_pool cache utilization (sketched just below).
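
A rough sketch of the deferred-release idea (hypothetical names, not the
actual mlx5e code): the page held by a WQE slot is released only when
that slot is about to be refilled, so it lands in the page_pool cache
right before the allocation that can reuse it:

  /* Illustrative only: deferred page release just before refill. */
  #include <linux/errno.h>
  #include <net/page_pool.h>

  struct wqe_slot {
          struct page *page;      /* page still owned by the previous WQE */
  };

  static int wqe_slot_refill(struct page_pool *pool, struct wqe_slot *slot)
  {
          /* Deferred release: return the old page right before allocating,
           * so it sits in the pool cache for the allocation below.
           */
          if (slot->page)
                  page_pool_put_full_page(pool, slot->page, true);

          slot->page = page_pool_dev_alloc_pages(pool);
          return slot->page ? 0 : -ENOMEM;
  }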

====================

* tag 'mlx5-updates-2023-03-28' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux:
  net/mlx5e: RX, Remove unnecessary recycle parameter and page_cache stats
  net/mlx5e: RX, Break the wqe bulk refill in smaller chunks
  net/mlx5e: RX, Increase WQE bulk size for legacy rq
  net/mlx5e: RX, Split off release path for xsk buffers for legacy rq
  net/mlx5e: RX, Defer page release in legacy rq for better recycling
  net/mlx5e: RX, Change wqe last_in_page field from bool to bit flags
  net/mlx5e: RX, Defer page release in striding rq for better recycling
  net/mlx5e: RX, Rename xdp_xmit_bitmap to a more generic name
  net/mlx5e: RX, Enable skb page recycling through the page_pool
  net/mlx5e: RX, Enable dma map and sync from page_pool allocator
  net/mlx5e: RX, Remove internal page_cache
  net/mlx5e: RX, Store SHAMPO header pages in array
  net/mlx5e: RX, Remove alloc unit layout constraint for striding rq
  net/mlx5e: RX, Remove alloc unit layout constraint for legacy rq
  net/mlx5e: RX, Remove mlx5e_alloc_unit argument in page allocation
====================

Link: https://lore.kernel.org/r/20230328205623.142075-1-saeed@kernel.org


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents c5370374 3905f8d6
+0 −26
@@ -346,32 +346,6 @@ the software port.
     - The number of receive packets with CQE compression on ring i [#accel]_.
     - Acceleration

   * - `rx[i]_cache_reuse`
     - The number of events of successful reuse of a page from a driver's
       internal page cache.
     - Acceleration

   * - `rx[i]_cache_full`
     - The number of events of full internal page cache where driver can't put a
       page back to the cache for recycling (page will be freed).
     - Acceleration

   * - `rx[i]_cache_empty`
     - The number of events where cache was empty - no page to give. Driver
       shall allocate new page.
     - Acceleration

   * - `rx[i]_cache_busy`
     - The number of events where cache head was busy and cannot be recycled.
       Driver allocated new page.
     - Acceleration

   * - `rx[i]_cache_waive`
     - The number of cache evacuation. This can occur due to page move to
       another NUMA node or page was pfmemalloc-ed and should be freed as soon
       as possible.
     - Acceleration

   * - `rx[i]_arfs_err`
     - Number of flow rules that failed to be added to the flow table.
     - Error
+32 −19
@@ -475,11 +475,6 @@ struct mlx5e_txqsq {
	cqe_ts_to_ns               ptp_cyc2time;
} ____cacheline_aligned_in_smp;

union mlx5e_alloc_unit {
	struct page *page;
	struct xdp_buff *xsk;
};

/* XDP packets can be transmitted in different ways. On completion, we need to
 * distinguish between them to clean up things in a proper way.
 */
@@ -605,16 +600,35 @@ struct mlx5e_icosq {
	struct work_struct         recover_work;
} ____cacheline_aligned_in_smp;

struct mlx5e_frag_page {
	struct page *page;
	u16 frags;
};

enum mlx5e_wqe_frag_flag {
	MLX5E_WQE_FRAG_LAST_IN_PAGE,
	MLX5E_WQE_FRAG_SKIP_RELEASE,
};

struct mlx5e_wqe_frag_info {
	union mlx5e_alloc_unit *au;
	union {
		struct mlx5e_frag_page *frag_page;
		struct xdp_buff **xskp;
	};
	u32 offset;
	bool last_in_page;
	u8 flags;
};

union mlx5e_alloc_units {
	DECLARE_FLEX_ARRAY(struct mlx5e_frag_page, frag_pages);
	DECLARE_FLEX_ARRAY(struct page *, pages);
	DECLARE_FLEX_ARRAY(struct xdp_buff *, xsk_buffs);
};

struct mlx5e_mpw_info {
	u16 consumed_strides;
	DECLARE_BITMAP(xdp_xmit_bitmap, MLX5_MPWRQ_MAX_PAGES_PER_WQE);
	union mlx5e_alloc_unit alloc_units[];
	DECLARE_BITMAP(skip_release_bitmap, MLX5_MPWRQ_MAX_PAGES_PER_WQE);
	union mlx5e_alloc_units alloc_units;
};

#define MLX5E_MAX_RX_FRAGS 4
@@ -625,11 +639,6 @@ struct mlx5e_mpw_info {
#define MLX5E_CACHE_UNIT (MLX5_MPWRQ_MAX_PAGES_PER_WQE > NAPI_POLL_WEIGHT ? \
			  MLX5_MPWRQ_MAX_PAGES_PER_WQE : NAPI_POLL_WEIGHT)
#define MLX5E_CACHE_SIZE	(4 * roundup_pow_of_two(MLX5E_CACHE_UNIT))
struct mlx5e_page_cache {
	u32 head;
	u32 tail;
	struct page *page_cache[MLX5E_CACHE_SIZE];
};

struct mlx5e_rq;
typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*);
@@ -661,19 +670,24 @@ struct mlx5e_rq_frags_info {
	struct mlx5e_rq_frag_info arr[MLX5E_MAX_RX_FRAGS];
	u8 num_frags;
	u8 log_num_frags;
	u8 wqe_bulk;
	u16 wqe_bulk;
	u16 refill_unit;
	u8 wqe_index_mask;
};

struct mlx5e_dma_info {
	dma_addr_t addr;
	union {
		struct mlx5e_frag_page *frag_page;
		struct page *page;
	};
};

struct mlx5e_shampo_hd {
	u32 mkey;
	struct mlx5e_dma_info *info;
	struct page *last_page;
	struct mlx5e_frag_page *pages;
	u16 curr_page_index;
	u16 hd_per_wq;
	u16 hd_per_wqe;
	unsigned long *bitmap;
@@ -702,7 +716,7 @@ struct mlx5e_rq {
		struct {
			struct mlx5_wq_cyc          wq;
			struct mlx5e_wqe_frag_info *frags;
			union mlx5e_alloc_unit     *alloc_units;
			union mlx5e_alloc_units    *alloc_units;
			struct mlx5e_rq_frags_info  info;
			mlx5e_fp_skb_from_cqe       skb_from_cqe;
		} wqe;
@@ -738,7 +752,6 @@ struct mlx5e_rq {
	struct mlx5e_rq_stats *stats;
	struct mlx5e_cq        cq;
	struct mlx5e_cq_decomp cqd;
	struct mlx5e_page_cache page_cache;
	struct hwtstamp_config *tstamp;
	struct mlx5_clock      *clock;
	struct mlx5e_icosq    *icosq;
+49 −4
@@ -667,6 +667,48 @@ static int mlx5e_max_nonlinear_mtu(int first_frag_size, int frag_size, bool xdp)
	return first_frag_size + (MLX5E_MAX_RX_FRAGS - 2) * frag_size + PAGE_SIZE;
}

static void mlx5e_rx_compute_wqe_bulk_params(struct mlx5e_params *params,
					     struct mlx5e_rq_frags_info *info)
{
	u16 bulk_bound_rq_size = (1 << params->log_rq_mtu_frames) / 4;
	u32 bulk_bound_rq_size_in_bytes;
	u32 sum_frag_strides = 0;
	u32 wqe_bulk_in_bytes;
	u16 split_factor;
	u32 wqe_bulk;
	int i;

	for (i = 0; i < info->num_frags; i++)
		sum_frag_strides += info->arr[i].frag_stride;

	/* For MTUs larger than PAGE_SIZE, align to PAGE_SIZE to reflect
	 * amount of consumed pages per wqe in bytes.
	 */
	if (sum_frag_strides > PAGE_SIZE)
		sum_frag_strides = ALIGN(sum_frag_strides, PAGE_SIZE);

	bulk_bound_rq_size_in_bytes = bulk_bound_rq_size * sum_frag_strides;

#define MAX_WQE_BULK_BYTES(xdp) ((xdp ? 256 : 512) * 1024)

	/* A WQE bulk should not exceed min(512KB, 1/4 of rq size). For XDP
	 * keep bulk size smaller to avoid filling the page_pool cache on
	 * every bulk refill.
	 */
	wqe_bulk_in_bytes = min_t(u32, MAX_WQE_BULK_BYTES(params->xdp_prog),
				  bulk_bound_rq_size_in_bytes);
	wqe_bulk = DIV_ROUND_UP(wqe_bulk_in_bytes, sum_frag_strides);

	/* Make sure that allocations don't start when the page is still used
	 * by older WQEs.
	 */
	info->wqe_bulk = max_t(u16, info->wqe_index_mask + 1, wqe_bulk);

	split_factor = DIV_ROUND_UP(MAX_WQE_BULK_BYTES(params->xdp_prog),
				    PP_ALLOC_CACHE_REFILL * PAGE_SIZE);
	info->refill_unit = DIV_ROUND_UP(info->wqe_bulk, split_factor);
}

#define DEFAULT_FRAG_SIZE (2048)

static int mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
@@ -774,11 +816,14 @@ static int mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
	}

out:
	/* Bulking optimization to skip allocation until at least 8 WQEs can be
	 * allocated in a row. At the same time, never start allocation when
	 * the page is still used by older WQEs.
	/* Bulking optimization to skip allocation until a large enough number
	 * of WQEs can be allocated in a row. Bulking also influences how well
	 * deferred page release works.
	 */
	info->wqe_bulk = max_t(u8, info->wqe_index_mask + 1, 8);
	mlx5e_rx_compute_wqe_bulk_params(params, info);

	mlx5_core_dbg(mdev, "%s: wqe_bulk = %u, wqe_bulk_refill_unit = %u\n",
		      __func__, info->wqe_bulk, info->refill_unit);

	info->log_num_frags = order_base_2(info->num_frags);

+2 −2
@@ -121,9 +121,9 @@ static int mlx5e_rx_reporter_err_icosq_cqe_recover(void *ctx)

	mlx5e_reset_icosq_cc_pc(icosq);

	mlx5e_free_rx_in_progress_descs(rq);
	mlx5e_free_rx_missing_descs(rq);
	if (xskrq)
		mlx5e_free_rx_in_progress_descs(xskrq);
		mlx5e_free_rx_missing_descs(xskrq);

	clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state);
	mlx5e_activate_icosq(icosq);
+2 −4
@@ -65,13 +65,11 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget);
int mlx5e_poll_ico_cq(struct mlx5e_cq *cq);

/* RX */
void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct page *page);
void mlx5e_page_release_dynamic(struct mlx5e_rq *rq, struct page *page, bool recycle);
INDIRECT_CALLABLE_DECLARE(bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq));
INDIRECT_CALLABLE_DECLARE(bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq));
int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget);
void mlx5e_free_rx_descs(struct mlx5e_rq *rq);
void mlx5e_free_rx_in_progress_descs(struct mlx5e_rq *rq);
void mlx5e_free_rx_missing_descs(struct mlx5e_rq *rq);

static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config)
{
@@ -489,7 +487,7 @@ static inline bool mlx5e_icosq_can_post_wqe(struct mlx5e_icosq *sq, u16 wqe_size

static inline struct mlx5e_mpw_info *mlx5e_get_mpw_info(struct mlx5e_rq *rq, int i)
{
	size_t isz = struct_size(rq->mpwqe.info, alloc_units, rq->mpwqe.pages_per_wqe);
	size_t isz = struct_size(rq->mpwqe.info, alloc_units.frag_pages, rq->mpwqe.pages_per_wqe);

	return (struct mlx5e_mpw_info *)((char *)rq->mpwqe.info + array_size(i, isz));
}