Commit 9b3e446c authored by David S. Miller
Browse files

Merge tag 'mlx5-updates-2022-02-14' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux



Saeed Mahameed says:

====================
mlx5-updates-2022-02-14

mlx5 TX routines improvements

1) From Aya and Tariq, first 3 patches: use the maximum size of the TX
descriptor as advertised by the device instead of the fixed value of 16 that
the driver always assumed. This is not a bug fix, as all existing devices have
a max value larger than 16, but the series is necessary for future-proofing
the driver.

2) TX Synchronization improvements from Maxim, last 12 patches

Maxim Mikityanskiy Says:
=======================
mlx5e: Synchronize ndo_select_queue with configuration changes

The kernel can call ndo_select_queue at any time, and there is no direct
way to block it. The implementation of ndo_select_queue in mlx5e expects
the parameters to be consistent and may crash (invalid pointer, division
by zero) if they aren't.

There were attempts to partially fix some of the most frequent crashes,
see commit 846d6da1 ("net/mlx5e: Fix division by 0 in
mlx5e_select_queue") and commit 84c8a874 ("net/mlx5e: Fix division
by 0 in mlx5e_select_queue for representors"). However, they don't
address the issue completely.

This series introduces the proper synchronization mechanism between
mlx5e configuration and TX data path:

1. txq2sq updates are synchronized properly with ndo_start_xmit
   (mlx5e_xmit). The TX queue is stopped when its configuration is being
   updated, and memory barriers ensure the changes are visible before
   restarting.

2. The set of parameters needed for mlx5e_select_queue is reduced, and
   synchronization using RCU is implemented. This way, changes are
   atomic, and the state in mlx5e_select_queue is always consistent.

3. A few optimizations are applied to the new implementation of
   mlx5e_select_queue.

=======================

====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents d0b78ab1 71753b8e
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -28,7 +28,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en/rqt.o en/tir.o en/rss.o en/rx_res.o \
		en_selftest.o en/port.o en/monitor_stats.o en/health.o \
		en/reporter_tx.o en/reporter_rx.o en/params.o en/xsk/pool.o \
		en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o \
		en/qos.o en/trap.o en/fs_tt_redirect.o
		en/qos.o en/trap.o en/fs_tt_redirect.o en/selq.o

#
# Netdev extra
+37 −6
Original line number Diff line number Diff line
@@ -59,6 +59,7 @@
#include "lib/hv_vhca.h"
#include "lib/clock.h"
#include "en/rx_res.h"
#include "en/selq.h"

extern const struct net_device_ops mlx5e_netdev_ops;
struct page_pool;
@@ -172,8 +173,9 @@ struct page_pool;
#define MLX5E_KLM_ENTRIES_PER_WQE(wqe_size)\
	ALIGN_DOWN(MLX5E_KLM_MAX_ENTRIES_PER_WQE(wqe_size), MLX5_UMR_KLM_ALIGNMENT)

#define MLX5E_MAX_KLM_PER_WQE \
	MLX5E_KLM_ENTRIES_PER_WQE(MLX5E_TX_MPW_MAX_NUM_DS << MLX5_MKEY_BSF_OCTO_SIZE)
#define MLX5E_MAX_KLM_PER_WQE(mdev) \
	MLX5E_KLM_ENTRIES_PER_WQE(mlx5e_get_sw_max_sq_mpw_wqebbs(mlx5e_get_max_sq_wqebbs(mdev)) \
				   << MLX5_MKEY_BSF_OCTO_SIZE)

#define MLX5E_MSG_LEVEL			NETIF_MSG_LINK

@@ -221,6 +223,32 @@ static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
		min_t(int, mlx5_comp_vectors_count(mdev), MLX5E_MAX_NUM_CHANNELS);
}

/* Upper bound, in WQEBBs, on the size of a single send WQE.
 *
 * The device reports its limit in bytes through the max_wqe_sz_sq
 * capability. The driver never exceeds 1KB (16 WQEBBs,
 * MLX5_SEND_WQE_MAX_WQEBBS) and uses the firmware value only when it
 * is the stricter of the two.
 */
static inline u16 mlx5e_get_max_sq_wqebbs(struct mlx5_core_dev *mdev)
{
	/* Firmware limit converted from bytes to WQEBBs. */
	u16 fw_max_wqebbs = MLX5_CAP_GEN(mdev, max_wqe_sz_sq) / MLX5_SEND_WQE_BB;

	return min_t(u16, MLX5_SEND_WQE_MAX_WQEBBS, fw_max_wqebbs);
}

/* Software cap, in WQEBBs, on the size of a multi-packet WQE session.
 *
 * @max_sq_wqebbs: maximum send WQE size in WQEBBs, see
 *                 mlx5e_get_max_sq_wqebbs().
 *
 * Return: the largest session size the driver will use, never above
 * @max_sq_wqebbs.
 */
static inline u16 mlx5e_get_sw_max_sq_mpw_wqebbs(u16 max_sq_wqebbs)
{
/* The return value will be multiplied by MLX5_SEND_WQEBB_NUM_DS.
 * Since max_sq_wqebbs may be up to MLX5_SEND_WQE_MAX_WQEBBS == 16,
 * see mlx5e_get_max_sq_wqebbs(), the multiplication (16 * 4 == 64)
 * overflows the 6-bit DS field of Ctrl Segment. Use a bound lower
 * than MLX5_SEND_WQE_MAX_WQEBBS.
 */
	u16 wqebbs = min_t(u16, max_sq_wqebbs, MLX5_SEND_WQE_MAX_WQEBBS - 1);

/* With cache lines of 128 bytes or more, round down to an even number
 * of WQEBBs to let a full-session WQE be cache-aligned. Clamping to a
 * constant alone is not sufficient: an odd device limit below that
 * constant would be returned as is.
 */
#if L1_CACHE_BYTES >= 128
	wqebbs = ALIGN_DOWN(wqebbs, 2);
#endif
	return wqebbs;
}

struct mlx5e_tx_wqe {
	struct mlx5_wqe_ctrl_seg ctrl;
	struct mlx5_wqe_eth_seg  eth;
@@ -427,12 +455,12 @@ struct mlx5e_txqsq {
	struct netdev_queue       *txq;
	u32                        sqn;
	u16                        stop_room;
	u16                        max_sq_mpw_wqebbs;
	u8                         min_inline_mode;
	struct device             *pdev;
	__be32                     mkey_be;
	unsigned long              state;
	unsigned int               hw_mtu;
	struct hwtstamp_config    *tstamp;
	struct mlx5_clock         *clock;
	struct net_device         *netdev;
	struct mlx5_core_dev      *mdev;
@@ -446,6 +474,7 @@ struct mlx5e_txqsq {
	struct work_struct         recover_work;
	struct mlx5e_ptpsq        *ptpsq;
	cqe_ts_to_ns               ptp_cyc2time;
	u16                        max_sq_wqebbs;
} ____cacheline_aligned_in_smp;

struct mlx5e_dma_info {
@@ -540,6 +569,8 @@ struct mlx5e_xdpsq {
	u32                        sqn;
	struct device             *pdev;
	__be32                     mkey_be;
	u16                        stop_room;
	u16                        max_sq_mpw_wqebbs;
	u8                         min_inline_mode;
	unsigned long              state;
	unsigned int               hw_mtu;
@@ -547,6 +578,7 @@ struct mlx5e_xdpsq {
	/* control path */
	struct mlx5_wq_ctrl        wq_ctrl;
	struct mlx5e_channel      *channel;
	u16                        max_sq_wqebbs;
} ____cacheline_aligned_in_smp;

struct mlx5e_ktls_resync_resp;
@@ -575,6 +607,7 @@ struct mlx5e_icosq {
	/* control path */
	struct mlx5_wq_ctrl        wq_ctrl;
	struct mlx5e_channel      *channel;
	u16                        max_sq_wqebbs;

	struct work_struct         recover_work;
} ____cacheline_aligned_in_smp;
@@ -876,9 +909,8 @@ struct mlx5e_trap;

struct mlx5e_priv {
	/* priv data path fields - start */
	struct mlx5e_selq selq;
	struct mlx5e_txqsq **txq2sq;
	int **channel_tc2realtxq;
	int port_ptp_tc2realtxq[MLX5E_MAX_NUM_TC];
#ifdef CONFIG_MLX5_CORE_EN_DCB
	struct mlx5e_dcbx_dp       dcbx_dp;
#endif
@@ -921,7 +953,6 @@ struct mlx5e_priv {
	u16                        drop_rq_q_counter;
	struct notifier_block      events_nb;
	struct notifier_block      blocking_events_nb;
	int                        num_tc_x_num_ch;

	struct udp_tunnel_nic_info nic_info;
#ifdef CONFIG_MLX5_CORE_EN_DCB
+5 −5
Original line number Diff line number Diff line
@@ -196,13 +196,13 @@ u16 mlx5e_calc_sq_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *par
	u16 stop_room;

	stop_room  = mlx5e_tls_get_stop_room(mdev, params);
	stop_room += mlx5e_stop_room_for_wqe(MLX5_SEND_WQE_MAX_WQEBBS);
	stop_room += mlx5e_stop_room_for_max_wqe(mdev);
	if (is_mpwqe)
		/* A MPWQE can take up to the maximum-sized WQE + all the normal
		 * stop room can be taken if a new packet breaks the active
		 * MPWQE session and allocates its WQEs right away.
		 */
		stop_room += mlx5e_stop_room_for_wqe(MLX5_SEND_WQE_MAX_WQEBBS);
		stop_room += mlx5e_stop_room_for_max_wqe(mdev);

	return stop_room;
}
@@ -717,7 +717,7 @@ static u32 mlx5e_shampo_icosq_sz(struct mlx5_core_dev *mdev,
	int wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz));
	u32 wqebbs;

	max_klm_per_umr = MLX5E_MAX_KLM_PER_WQE;
	max_klm_per_umr = MLX5E_MAX_KLM_PER_WQE(mdev);
	max_hd_per_wqe = mlx5e_shampo_hd_per_wqe(mdev, params, rq_param);
	max_num_of_umr_per_wqe = max_hd_per_wqe / max_klm_per_umr;
	rest = max_hd_per_wqe % max_klm_per_umr;
@@ -774,10 +774,10 @@ static void mlx5e_build_async_icosq_param(struct mlx5_core_dev *mdev,
	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);

	mlx5e_build_sq_param_common(mdev, param);
	param->stop_room = mlx5e_stop_room_for_wqe(1); /* for XSK NOP */
	param->stop_room = mlx5e_stop_room_for_wqe(mdev, 1); /* for XSK NOP */
	param->is_tls = mlx5e_accel_is_ktls_rx(mdev);
	if (param->is_tls)
		param->stop_room += mlx5e_stop_room_for_wqe(1); /* for TLS RX resync NOP */
		param->stop_room += mlx5e_stop_room_for_wqe(mdev, 1); /* for TLS RX resync NOP */
	MLX5_SET(sqc, sqc, reg_umr, MLX5_CAP_ETH(mdev, reg_umr_sq));
	MLX5_SET(wq, wq, log_wq_sz, log_wq_size);
	mlx5e_build_ico_cq_param(mdev, log_wq_size, &param->cqp);
+1 −2
Original line number Diff line number Diff line
@@ -195,7 +195,6 @@ static int mlx5e_ptp_alloc_txqsq(struct mlx5e_ptp *c, int txq_ix,
	int node;

	sq->pdev      = c->pdev;
	sq->tstamp    = c->tstamp;
	sq->clock     = &mdev->clock;
	sq->mkey_be   = c->mkey_be;
	sq->netdev    = c->netdev;
@@ -449,7 +448,7 @@ static void mlx5e_ptp_build_sq_param(struct mlx5_core_dev *mdev,

	wq = MLX5_ADDR_OF(sqc, sqc, wq);
	MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size);
	param->stop_room = mlx5e_stop_room_for_wqe(MLX5_SEND_WQE_MAX_WQEBBS);
	param->stop_room = mlx5e_stop_room_for_max_wqe(mdev);
	mlx5e_build_tx_cq_param(mdev, params, &param->cqp);
}

+36 −6
Original line number Diff line number Diff line
@@ -50,7 +50,6 @@ static int mlx5e_find_unused_qos_qid(struct mlx5e_priv *priv)

struct mlx5e_qos_node {
	struct hlist_node hnode;
	struct rcu_head rcu;
	struct mlx5e_qos_node *parent;
	u64 rate;
	u32 bw_share;
@@ -132,7 +131,11 @@ static void mlx5e_sw_node_delete(struct mlx5e_priv *priv, struct mlx5e_qos_node
		__clear_bit(node->qid, priv->htb.qos_used_qids);
		mlx5e_update_tx_netdev_queues(priv);
	}
	kfree_rcu(node, rcu);
	/* Make sure this qid is no longer selected by mlx5e_select_queue, so
	 * that mlx5e_reactivate_qos_sq can safely restart the netdev TX queue.
	 */
	synchronize_net();
	kfree(node);
}

/* TX datapath API */
@@ -273,10 +276,18 @@ static int mlx5e_open_qos_sq(struct mlx5e_priv *priv, struct mlx5e_channels *chs
static void mlx5e_activate_qos_sq(struct mlx5e_priv *priv, struct mlx5e_qos_node *node)
{
	struct mlx5e_txqsq *sq;
	u16 qid;

	sq = mlx5e_get_qos_sq(priv, node->qid);

	WRITE_ONCE(priv->txq2sq[mlx5e_qid_from_qos(&priv->channels, node->qid)], sq);
	qid = mlx5e_qid_from_qos(&priv->channels, node->qid);

	/* If it's a new queue, it will be marked as started at this point.
	 * Stop it before updating txq2sq.
	 */
	mlx5e_tx_disable_queue(netdev_get_tx_queue(priv->netdev, qid));

	priv->txq2sq[qid] = sq;

	/* Make the change to txq2sq visible before the queue is started.
	 * As mlx5e_xmit runs under a spinlock, there is an implicit ACQUIRE,
@@ -299,8 +310,13 @@ static void mlx5e_deactivate_qos_sq(struct mlx5e_priv *priv, u16 qid)
	qos_dbg(priv->mdev, "Deactivate QoS SQ qid %u\n", qid);
	mlx5e_deactivate_txqsq(sq);

	/* The queue is disabled, no synchronization with datapath is needed. */
	priv->txq2sq[mlx5e_qid_from_qos(&priv->channels, qid)] = NULL;

	/* Make the change to txq2sq visible before the queue is started again.
	 * As mlx5e_xmit runs under a spinlock, there is an implicit ACQUIRE,
	 * which pairs with this barrier.
	 */
	smp_wmb();
}

static void mlx5e_close_qos_sq(struct mlx5e_priv *priv, u16 qid)
@@ -485,9 +501,11 @@ int mlx5e_htb_root_add(struct mlx5e_priv *priv, u16 htb_maj_id, u16 htb_defcls,

	opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
	if (opened) {
		mlx5e_selq_prepare(&priv->selq, &priv->channels.params, true);

		err = mlx5e_qos_alloc_queues(priv, &priv->channels);
		if (err)
			return err;
			goto err_cancel_selq;
	}

	root = mlx5e_sw_node_create_root(priv);
@@ -508,6 +526,9 @@ int mlx5e_htb_root_add(struct mlx5e_priv *priv, u16 htb_maj_id, u16 htb_defcls,
	 */
	smp_store_release(&priv->htb.maj_id, htb_maj_id);

	if (opened)
		mlx5e_selq_apply(&priv->selq);

	return 0;

err_sw_node_delete:
@@ -516,6 +537,8 @@ int mlx5e_htb_root_add(struct mlx5e_priv *priv, u16 htb_maj_id, u16 htb_defcls,
err_free_queues:
	if (opened)
		mlx5e_qos_close_all_queues(&priv->channels);
err_cancel_selq:
	mlx5e_selq_cancel(&priv->selq);
	return err;
}

@@ -526,8 +549,15 @@ int mlx5e_htb_root_del(struct mlx5e_priv *priv)

	qos_dbg(priv->mdev, "TC_HTB_DESTROY\n");

	/* Wait until real_num_tx_queues is updated for mlx5e_select_queue,
	 * so that we can safely switch to its non-HTB non-PTP fastpath.
	 */
	synchronize_net();

	mlx5e_selq_prepare(&priv->selq, &priv->channels.params, false);
	mlx5e_selq_apply(&priv->selq);

	WRITE_ONCE(priv->htb.maj_id, 0);
	synchronize_rcu(); /* Sync with mlx5e_select_htb_queue and TX data path. */

	root = mlx5e_sw_node_find(priv, MLX5E_HTB_CLASSID_ROOT);
	if (!root) {
Loading