Commit f84ad5cf authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge tag 'mlx5-updates-2023-06-06' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux

Saeed Mahameed says:

====================
mlx5-updates-2023-06-06

1) Support 4 ports VF LAG, part 2/2
2) Few extra trivial cleanup patches

Shay Drory Says:
================

Support 4 ports VF LAG, part 2/2

This series continues the series[1] "Support 4 ports VF LAG, part1/2".
This series adds support for 4 ports VF LAG (single FDB E-Switch).

This series of patches refactors LAG code that makes assumptions
about VF LAG supporting only two ports, and then enables 4 ports VF LAG.

Patch 1:
- Fix for ib rep code
Patches 2-5:
- Refactors LAG layer.
Patches 6-7:
- Block LAG types which don't support 4 ports.
Patch 8:
- Enable 4 ports VF LAG.

This series specifically allows HCAs with 4 ports to create a VF LAG
with only 4 ports. It is not possible to create a VF LAG with 2 or 3
ports using HCAs that have 4 ports.

Currently, the Merged E-Switch feature only supports HCAs with 2 ports.
However, upcoming patches will introduce support for HCAs with 4 ports.

In order to activate VF LAG a user can execute:

devlink dev eswitch set pci/0000:08:00.0 mode switchdev
devlink dev eswitch set pci/0000:08:00.1 mode switchdev
devlink dev eswitch set pci/0000:08:00.2 mode switchdev
devlink dev eswitch set pci/0000:08:00.3 mode switchdev
ip link add name bond0 type bond
ip link set dev bond0 type bond mode 802.3ad
ip link set dev eth2 master bond0
ip link set dev eth3 master bond0
ip link set dev eth4 master bond0
ip link set dev eth5 master bond0

Where eth2, eth3, eth4 and eth5 are net-interfaces of pci/0000:08:00.0
pci/0000:08:00.1 pci/0000:08:00.2 pci/0000:08:00.3 respectively.

User can verify LAG state and type via debugfs:
/sys/kernel/debug/mlx5/0000\:08\:00.0/lag/state
/sys/kernel/debug/mlx5/0000\:08\:00.0/lag/type

[1]
https://lore.kernel.org/netdev/20230601060118.154015-1-saeed@kernel.org/T/#mf1d2083780970ba277bfe721554d4925f03f36d1

================

* tag 'mlx5-updates-2023-06-06' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux:
  net/mlx5e: simplify condition after napi budget handling change
  mlx5/core: E-Switch, Allocate ECPF vport if it's an eswitch manager
  net/mlx5: Skip inline mode check after mlx5_eswitch_enable_locked() failure
  net/mlx5e: TC, refactor access to hash key
  net/mlx5e: Remove RX page cache leftovers
  net/mlx5e: Expose catastrophic steering error counters
  net/mlx5: Enable 4 ports VF LAG
  net/mlx5: LAG, block multiport eswitch LAG in case ldev have more than 2 ports
  net/mlx5: LAG, block multipath LAG in case ldev have more than 2 ports
  net/mlx5: LAG, change mlx5_shared_fdb_supported() to static
  net/mlx5: LAG, generalize handling of shared FDB
  net/mlx5: LAG, check if all eswitches are paired for shared FDB
  {net/RDMA}/mlx5: introduce lag_for_each_peer
  RDMA/mlx5: Free second uplink ib port
====================

Link: https://lore.kernel.org/r/20230607210410.88209-1-saeed@kernel.org


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 55b24334 803ea346
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -290,6 +290,13 @@ Description of the vnic counters:
- nic_receive_steering_discard
        number of packets that completed RX flow
        steering but were discarded due to a mismatch in flow table.
- generated_pkt_steering_fail
	number of packets generated by the VNIC experiencing unexpected steering
	failure (at any point in steering flow).
- handled_pkt_steering_fail
	number of packets handled by the VNIC experiencing unexpected steering
	failure (at any point in steering flow owned by the VNIC, including the FDB
	for the eswitch owner).

User commands examples:

+66 −37
Original line number Diff line number Diff line
@@ -30,45 +30,65 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev,

static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev);

static void mlx5_ib_num_ports_update(struct mlx5_core_dev *dev, u32 *num_ports)
{
	struct mlx5_core_dev *peer_dev;
	int i;

	mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) {
		u32 peer_num_ports = mlx5_eswitch_get_total_vports(peer_dev);

		if (mlx5_lag_is_mpesw(peer_dev))
			*num_ports += peer_num_ports;
		else
			/* Only 1 ib port is the representor for all uplinks */
			*num_ports += peer_num_ports - 1;
	}
}

static int
mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
{
	u32 num_ports = mlx5_eswitch_get_total_vports(dev);
	struct mlx5_core_dev *lag_master = dev;
	const struct mlx5_ib_profile *profile;
	struct mlx5_core_dev *peer_dev;
	struct mlx5_ib_dev *ibdev;
	int second_uplink = false;
	u32 peer_num_ports;
	int new_uplink = false;
	int vport_index;
	int ret;
	int i;

	vport_index = rep->vport_index;

	if (mlx5_lag_is_shared_fdb(dev)) {
		peer_dev = mlx5_lag_get_peer_mdev(dev);
		peer_num_ports = mlx5_eswitch_get_total_vports(peer_dev);
		if (mlx5_lag_is_master(dev)) {
			if (mlx5_lag_is_mpesw(dev))
				num_ports += peer_num_ports;
			else
				num_ports += peer_num_ports - 1;

			mlx5_ib_num_ports_update(dev, &num_ports);
		} else {
			if (rep->vport == MLX5_VPORT_UPLINK) {
				if (!mlx5_lag_is_mpesw(dev))
					return 0;
				second_uplink = true;
				new_uplink = true;
			}
			mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) {
				u32 peer_n_ports = mlx5_eswitch_get_total_vports(peer_dev);

			vport_index += peer_num_ports;
			dev = peer_dev;
				if (mlx5_lag_is_master(peer_dev))
					lag_master = peer_dev;
				else if (!mlx5_lag_is_mpesw(dev))
				/* Only 1 ib port is the representor for all uplinks */
					peer_n_ports--;

				if (mlx5_get_dev_index(peer_dev) < mlx5_get_dev_index(dev))
					vport_index += peer_n_ports;
			}
		}
	}

	if (rep->vport == MLX5_VPORT_UPLINK && !second_uplink)
	if (rep->vport == MLX5_VPORT_UPLINK && !new_uplink)
		profile = &raw_eth_profile;
	else
		return mlx5_ib_set_vport_rep(dev, rep, vport_index);
		return mlx5_ib_set_vport_rep(lag_master, rep, vport_index);

	ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev);
	if (!ibdev)
@@ -85,8 +105,8 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
	vport_index = rep->vport_index;
	ibdev->port[vport_index].rep = rep;
	ibdev->port[vport_index].roce.netdev =
		mlx5_ib_get_rep_netdev(dev->priv.eswitch, rep->vport);
	ibdev->mdev = dev;
		mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, rep->vport);
	ibdev->mdev = lag_master;
	ibdev->num_ports = num_ports;

	ret = __mlx5_ib_add(ibdev, profile);
@@ -94,8 +114,8 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
		goto fail_add;

	rep->rep_data[REP_IB].priv = ibdev;
	if (mlx5_lag_is_shared_fdb(dev))
		mlx5_ib_register_peer_vport_reps(dev);
	if (mlx5_lag_is_shared_fdb(lag_master))
		mlx5_ib_register_peer_vport_reps(lag_master);

	return 0;

@@ -118,22 +138,26 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
	struct mlx5_ib_dev *dev = mlx5_ib_rep_to_dev(rep);
	int vport_index = rep->vport_index;
	struct mlx5_ib_port *port;
	int i;

	if (WARN_ON(!mdev))
		return;

	if (!dev)
		return;

	if (mlx5_lag_is_shared_fdb(mdev) &&
	    !mlx5_lag_is_master(mdev)) {
		struct mlx5_core_dev *peer_mdev;

		if (rep->vport == MLX5_VPORT_UPLINK)
		if (rep->vport == MLX5_VPORT_UPLINK && !mlx5_lag_is_mpesw(mdev))
			return;
		peer_mdev = mlx5_lag_get_peer_mdev(mdev);
		vport_index += mlx5_eswitch_get_total_vports(peer_mdev);
		for (i = 0; i < dev->num_ports; i++) {
			if (dev->port[i].rep == rep)
				break;
		}

	if (!dev)
		if (WARN_ON(i == dev->num_ports))
			return;
		vport_index = i;
	}

	port = &dev->port[vport_index];
	write_lock(&port->roce.netdev_lock);
@@ -143,14 +167,19 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
	port->rep = NULL;

	if (rep->vport == MLX5_VPORT_UPLINK) {

		if (mlx5_lag_is_shared_fdb(mdev) && !mlx5_lag_is_master(mdev))
			return;

		if (mlx5_lag_is_shared_fdb(mdev)) {
			struct mlx5_core_dev *peer_mdev;
			struct mlx5_eswitch *esw;

		if (mlx5_lag_is_shared_fdb(mdev)) {
			peer_mdev = mlx5_lag_get_peer_mdev(mdev);
			mlx5_lag_for_each_peer_mdev(mdev, peer_mdev, i) {
				esw = peer_mdev->priv.eswitch;
				mlx5_eswitch_unregister_vport_reps(esw, REP_IB);
			}
		}
		__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
	}
}
@@ -163,15 +192,15 @@ static const struct mlx5_eswitch_rep_ops rep_ops = {

static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev)
{
	struct mlx5_core_dev *peer_mdev = mlx5_lag_get_peer_mdev(mdev);
	struct mlx5_core_dev *peer_mdev;
	struct mlx5_eswitch *esw;
	int i;

	if (!peer_mdev)
		return;

	mlx5_lag_for_each_peer_mdev(mdev, peer_mdev, i) {
		esw = peer_mdev->priv.eswitch;
		mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB);
	}
}

struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
					  u16 vport_num)
+10 −0
Original line number Diff line number Diff line
@@ -76,6 +76,16 @@ int mlx5_reporter_vnic_diagnose_counters(struct mlx5_core_dev *dev,
	if (err)
		return err;

	err = devlink_fmsg_u64_pair_put(fmsg, "generated_pkt_steering_fail",
					VNIC_ENV_GET64(&vnic, generated_pkt_steering_fail));
	if (err)
		return err;

	err = devlink_fmsg_u64_pair_put(fmsg, "handled_pkt_steering_fail",
					VNIC_ENV_GET64(&vnic, handled_pkt_steering_fail));
	if (err)
		return err;

	err = devlink_fmsg_obj_nest_end(fmsg);
	if (err)
		return err;
+0 −7
Original line number Diff line number Diff line
@@ -594,13 +594,6 @@ struct mlx5e_mpw_info {

#define MLX5E_MAX_RX_FRAGS 4

/* a single cache unit is capable to serve one napi call (for non-striding rq)
 * or a MPWQE (for striding rq).
 */
#define MLX5E_CACHE_UNIT (MLX5_MPWRQ_MAX_PAGES_PER_WQE > NAPI_POLL_WEIGHT ? \
			  MLX5_MPWRQ_MAX_PAGES_PER_WQE : NAPI_POLL_WEIGHT)
#define MLX5E_CACHE_SIZE	(4 * roundup_pow_of_two(MLX5E_CACHE_UNIT))

struct mlx5e_rq;
typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*);
typedef struct sk_buff *
+3 −6
Original line number Diff line number Diff line
@@ -25,8 +25,8 @@ struct mlx5e_tc_act_stats {

static const struct rhashtable_params act_counters_ht_params = {
	.head_offset = offsetof(struct mlx5e_tc_act_stats, hash),
	.key_offset = 0,
	.key_len = offsetof(struct mlx5e_tc_act_stats, counter),
	.key_offset = offsetof(struct mlx5e_tc_act_stats, tc_act_cookie),
	.key_len = sizeof_field(struct mlx5e_tc_act_stats, tc_act_cookie),
	.automatic_shrinking = true,
};

@@ -169,14 +169,11 @@ mlx5e_tc_act_stats_fill_stats(struct mlx5e_tc_act_stats_handle *handle,
{
	struct rhashtable *ht = &handle->ht;
	struct mlx5e_tc_act_stats *item;
	struct mlx5e_tc_act_stats key;
	u64 pkts, bytes, lastused;
	int err = 0;

	key.tc_act_cookie = fl_act->cookie;

	rcu_read_lock();
	item = rhashtable_lookup(ht, &key, act_counters_ht_params);
	item = rhashtable_lookup(ht, &fl_act->cookie, act_counters_ht_params);
	if (!item) {
		rcu_read_unlock();
		err = -ENOENT;
Loading