Commit bc37b24e authored by Jakub Kicinski

Merge branch 'mlx5-xsk-updates-part3-2022-09-30'

Saeed Mahameed says:

====================
mlx5 xsk updates part3 2022-09-30

The gist of this 4-part series is in this patchset's last patch.

This series contains performance optimizations. XSK starts using the
batching allocator, and the XSK data path is separated from the regular
RX path, which allows dropping branches that are not relevant to
non-XSK use cases. Some minor optimizations for indirect calls and
need_wakeup are also included.

Other than that, this series adds a few features to the mlx5e
implementation of XSK:

1. XDP metadata support on XSK RQs.

2. RSS contexts support for XSK RQs.

3. Some other optimizations.

4. Last but not least, a change of the queuing scheme: XSK RQs no longer
use higher indices, but replace the regular RQs.

Maxim Says:
==========

In the initial implementation of XSK in mlx5e, XSK RQs coexisted with
regular RQs in the same channel. The main idea was to allow RSS to work
the same way for regular traffic, without the need to reconfigure RSS
to exclude XSK queues.

However, this scheme didn't prove to be beneficial, mainly because of
incompatibility with other vendors. Some tools don't properly support
using higher indices for XSK queues, and some tools get confused by the
doubled number of RQs exposed in sysfs. Some use cases are purely XSK,
and allocating the same number of unused regular RQs is a waste of
resources.

This commit changes the queuing scheme to the standard one, where XSK
RQs replace regular RQs on the channels where XSK sockets are open. Two
RQs still exist in the channel to allow a failsafe disable of XSK, but
only one is exposed at a time. The next commit will achieve the desired
memory savings by flushing the buffers when the regular RQ is unused.

As a result of this transition:

1. It's possible to use RSS contexts over XSK RQs.

2. It's possible to dedicate all queues to XSK.

3. When XSK RQs coexist with regular RQs, the admin should make sure no
unwanted traffic goes into the XSK RQs, either by excluding them from RSS
or by setting up the XDP program to return XDP_PASS for non-XSK traffic
(see the sketch after this list).

4. When using a mixed fleet of mlx5e devices and other netdevs, the same
configuration can be applied. If the application supports the fallback
to copy mode on unsupported drivers, it will work too.

==========

Part 4 will include some final XSK optimizations and minor improvements.

part 1: https://lore.kernel.org/netdev/20220927203611.244301-1-saeed@kernel.org/
part 2: https://lore.kernel.org/netdev/20220929072156.93299-1-saeed@kernel.org/
====================

Link: https://lore.kernel.org/r/20220930162903.62262-1-saeed@kernel.org


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 5fcc2cfc 3db4c85c
+2 −8
@@ -181,12 +181,6 @@ do { \
 #define mlx5e_state_dereference(priv, p) \
 	rcu_dereference_protected((p), lockdep_is_held(&(priv)->state_lock))
 
-enum mlx5e_rq_group {
-	MLX5E_RQ_GROUP_REGULAR,
-	MLX5E_RQ_GROUP_XSK,
-#define MLX5E_NUM_RQ_GROUPS(g) (1 + MLX5E_RQ_GROUP_##g)
-};
-
 static inline u8 mlx5e_get_num_lag_ports(struct mlx5_core_dev *mdev)
 {
 	if (mlx5_lag_is_lacp_owner(mdev))
@@ -660,6 +654,7 @@ struct mlx5e_rq_frags_info {
 	u8 num_frags;
 	u8 log_num_frags;
 	u8 wqe_bulk;
+	u8 wqe_index_mask;
 };
 
 struct mlx5e_dma_info {
@@ -1004,7 +999,6 @@ struct mlx5e_profile {
 	mlx5e_stats_grp_t *stats_grps;
 	const struct mlx5e_rx_handlers *rx_handlers;
 	int	max_tc;
-	u8	rq_groups;
 	u32     features;
 };
 
@@ -1096,7 +1090,7 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv);
 void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv);
 int mlx5e_ptp_rx_manage_fs_ctx(struct mlx5e_priv *priv, void *ctx);
 
-int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state);
+int mlx5e_flush_rq(struct mlx5e_rq *rq, int curr_state);
 void mlx5e_activate_rq(struct mlx5e_rq *rq);
 void mlx5e_deactivate_rq(struct mlx5e_rq *rq);
 void mlx5e_activate_icosq(struct mlx5e_icosq *icosq);
+17 −12
@@ -10,28 +10,33 @@ unsigned int mlx5e_channels_get_num(struct mlx5e_channels *chs)
 	return chs->num;
 }
 
-void mlx5e_channels_get_regular_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn)
+static struct mlx5e_channel *mlx5e_channels_get(struct mlx5e_channels *chs, unsigned int ix)
 {
-	struct mlx5e_channel *c;
+	WARN_ON_ONCE(ix >= mlx5e_channels_get_num(chs));
+	return chs->c[ix];
+}
 
-	WARN_ON(ix >= mlx5e_channels_get_num(chs));
-	c = chs->c[ix];
+bool mlx5e_channels_is_xsk(struct mlx5e_channels *chs, unsigned int ix)
+{
+	struct mlx5e_channel *c = mlx5e_channels_get(chs, ix);
 
-	*rqn = c->rq.rqn;
+	return test_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
 }
 
-bool mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn)
+void mlx5e_channels_get_regular_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn)
 {
-	struct mlx5e_channel *c;
+	struct mlx5e_channel *c = mlx5e_channels_get(chs, ix);
 
-	WARN_ON(ix >= mlx5e_channels_get_num(chs));
-	c = chs->c[ix];
+	*rqn = c->rq.rqn;
+}
 
-	if (!test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
-		return false;
+void mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn)
+{
+	struct mlx5e_channel *c = mlx5e_channels_get(chs, ix);
+
+	WARN_ON_ONCE(!test_bit(MLX5E_CHANNEL_STATE_XSK, c->state));
 
 	*rqn = c->xskrq.rqn;
-	return true;
 }
 
 bool mlx5e_channels_get_ptp_rqn(struct mlx5e_channels *chs, u32 *rqn)
+2 −1
@@ -9,8 +9,9 @@
 struct mlx5e_channels;
 
 unsigned int mlx5e_channels_get_num(struct mlx5e_channels *chs);
+bool mlx5e_channels_is_xsk(struct mlx5e_channels *chs, unsigned int ix);
 void mlx5e_channels_get_regular_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn);
-bool mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn);
+void mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn);
 bool mlx5e_channels_get_ptp_rqn(struct mlx5e_channels *chs, u32 *rqn);
 
 #endif /* __MLX5_EN_CHANNELS_H__ */
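For illustration, a hypothetical caller of the reworked API above would
first ask whether the channel is in XSK mode and then fetch the matching
RQN; the variable names here are made up for this sketch:

	u32 rqn;

	if (mlx5e_channels_is_xsk(chs, ix))
		mlx5e_channels_get_xsk_rqn(chs, ix, &rqn);
	else
		mlx5e_channels_get_regular_rqn(chs, ix, &rqn);

Note the getters now return void: the caller is expected to check
mlx5e_channels_is_xsk() first, and mlx5e_channels_get_xsk_rqn() warns if
it is called on a non-XSK channel.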
+40 −4
@@ -586,7 +586,14 @@ static int mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
 		info->arr[0].frag_size = byte_count;
 		info->arr[0].frag_stride = frag_stride;
 		info->num_frags = 1;
-		info->wqe_bulk = PAGE_SIZE / frag_stride;
+
+		/* N WQEs share the same page, N = PAGE_SIZE / frag_stride. The
+		 * first WQE in the page is responsible for allocation of this
+		 * page, this WQE's index is k*N. If WQEs [k*N+1; k*N+N-1] are
+		 * still not completed, the allocation must stop before k*N.
+		 */
+		info->wqe_index_mask = (PAGE_SIZE / frag_stride) - 1;
+
 		goto out;
 	}
 
@@ -635,11 +642,40 @@ static int mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
 		i++;
 	}
 	info->num_frags = i;
-	/* number of different wqes sharing a page */
-	info->wqe_bulk = 1 + (info->num_frags % 2);
+
+	/* The last fragment of WQE with index 2*N may share the page with the
+	 * first fragment of WQE with index 2*N+1 in certain cases. If WQE 2*N+1
+	 * is not completed yet, WQE 2*N must not be allocated, as it's
+	 * responsible for allocating a new page.
+	 */
+	if (frag_size_max == PAGE_SIZE) {
+		/* No WQE can start in the middle of a page. */
+		info->wqe_index_mask = 0;
+	} else {
+		/* PAGE_SIZEs starting from 8192 don't use 2K-sized fragments,
+		 * because there would be more than MLX5E_MAX_RX_FRAGS of them.
+		 */
+		WARN_ON(PAGE_SIZE != 2 * DEFAULT_FRAG_SIZE);
+
+		/* Odd number of fragments allows to pack the last fragment of
+		 * the previous WQE and the first fragment of the next WQE into
+		 * the same page.
+		 * As long as DEFAULT_FRAG_SIZE is 2048, and MLX5E_MAX_RX_FRAGS
+		 * is 4, the last fragment can be bigger than the rest only if
+		 * it's the fourth one, so WQEs consisting of 3 fragments will
+		 * always share a page.
+		 * When a page is shared, WQE bulk size is 2, otherwise just 1.
+		 */
+		info->wqe_index_mask = info->num_frags % 2;
+	}
+
 out:
-	info->wqe_bulk = max_t(u8, info->wqe_bulk, 8);
+	/* Bulking optimization to skip allocation until at least 8 WQEs can be
+	 * allocated in a row. At the same time, never start allocation when
+	 * the page is still used by older WQEs.
+	 */
+	info->wqe_bulk = max_t(u8, info->wqe_index_mask + 1, 8);
+
 	info->log_num_frags = order_base_2(info->num_frags);
 
 	return 0;
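To make the bulking arithmetic above concrete, here is a worked example
under the assumption of 4096-byte pages and a 2048-byte frag_stride in
the single-fragment branch:

	N = PAGE_SIZE / frag_stride = 4096 / 2048 = 2 WQEs per page
	wqe_index_mask = N - 1 = 1
	wqe_bulk = max(wqe_index_mask + 1, 8) = 8

Since 8 is a multiple of wqe_index_mask + 1, a bulk never starts in the
middle of a page that an older, still-outstanding WQE is responsible for
allocating.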
+0 −32
@@ -53,38 +53,6 @@ struct mlx5e_create_sq_param {
 	u8                          min_inline_mode;
 };
 
-static inline bool mlx5e_qid_get_ch_if_in_group(struct mlx5e_params *params,
-						u16 qid,
-						enum mlx5e_rq_group group,
-						u16 *ix)
-{
-	int nch = params->num_channels;
-	int ch = qid - nch * group;
-
-	if (ch < 0 || ch >= nch)
-		return false;
-
-	*ix = ch;
-	return true;
-}
-
-static inline void mlx5e_qid_get_ch_and_group(struct mlx5e_params *params,
-					      u16 qid,
-					      u16 *ix,
-					      enum mlx5e_rq_group *group)
-{
-	u16 nch = params->num_channels;
-
-	*ix = qid % nch;
-	*group = qid / nch;
-}
-
-static inline bool mlx5e_qid_validate(const struct mlx5e_profile *profile,
-				      struct mlx5e_params *params, u64 qid)
-{
-	return qid < params->num_channels * profile->rq_groups;
-}
-
 /* Striding RQ dynamic parameters */
 
 u8 mlx5e_mpwrq_page_shift(struct mlx5_core_dev *mdev, struct mlx5e_xsk_param *xsk);