Commit 0f19f514 authored by Jakub Kicinski
Browse files

Merge tag 'mlx5-updates-2023-02-10' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux

Saeed Mahameed says:

====================
mlx5-updates-2023-02-10

1) From Roi and Mark: MultiPort eswitch support

MultiPort E-Switch builds on newer hardware's capabilities and introduces
a mode where a single E-Switch is used and all the vports and physical
ports on the NIC are connected to it.

The new mode will allow in the future a decrease in the memory used by the
driver and advanced features that aren't possible today.

This represents a big change in the current E-Switch implementation in mlx5.
Currently, by default, each E-Switch manager manages its E-Switch.
Steering rules in each E-Switch can only forward traffic to the native
physical port associated with that E-Switch. While there are ways to target
non-native physical ports, for example using a bond or via special TC
rules, none of them allows a user to configure the driver
to operate by default in such a mode, nor can the driver decide
to move to this mode by default, as it's user configuration-driven right now.

While MultiPort E-Switch single FDB mode is the preferred mode, older
generations of ConnectX hardware couldn't support this mode so it was never
implemented. Now that there is capable hardware present, start the
transition to having this mode by default.

Introduce a devlink parameter to control MultiPort Eswitch single FDB mode.
This will allow users to select this mode on their system right now
and in the future will allow the driver to move to this mode by default.

2) From Jiri: Improvements and fixes for mlx5 netdev's devlink logic
 2.1) Cleanups related to mlx5's devlink port logic
 2.2) Move devlink port registration to be done before netdev alloc
 2.3) Create auxdev devlink instance in the same ns as parent devlink
 2.4) Suspend auxiliary devices only in case of PCI device suspend

* tag 'mlx5-updates-2023-02-10' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux:
  net/mlx5: Suspend auxiliary devices only in case of PCI device suspend
  net/mlx5: Remove "recovery" arg from mlx5_load_one() function
  net/mlx5e: Create auxdev devlink instance in the same ns as parent devlink
  net/mlx5e: Move devlink port registration to be done before netdev alloc
  net/mlx5e: Move dl_port to struct mlx5e_dev
  net/mlx5e: Replace usage of mlx5e_devlink_get_dl_port() by netdev->devlink_port
  net/mlx5e: Pass mdev to mlx5e_devlink_port_register()
  net/mlx5: Remove outdated comment
  net/mlx5e: TC, Remove redundant parse_attr argument
  net/mlx5e: Use a simpler comparison for uplink rep
  net/mlx5: Lag, Add single RDMA device in multiport mode
  net/mlx5: Lag, set different uplink vport metadata in multiport eswitch mode
  net/mlx5: E-Switch, rename bond update function to be reused
  net/mlx5e: TC, Add peer flow in mpesw mode
  net/mlx5: Lag, Control MultiPort E-Switch single FDB mode
====================

Link: https://lore.kernel.org/r/20230214221239.159033-1-saeed@kernel.org


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 388a9c90 72ed5d56
Loading
Loading
Loading
Loading
+18 −0
Original line number Diff line number Diff line
@@ -54,6 +54,24 @@ parameters.
     - Control the number of large groups (size > 1) in the FDB table.

       * The default value is 15, and the range is between 1 and 1024.
   * - ``esw_multiport``
     - Boolean
     - runtime
     - Control MultiPort E-Switch shared fdb mode.

       An experimental mode where a single E-Switch is used and all the vports
       and physical ports on the NIC are connected to it.

       An example is to send traffic from a VF that is created on PF0 to the
       uplink that is natively associated with PF1.

       Note: Future devices, ConnectX-8 and onward, will eventually have this
       as the default to allow forwarding between all NIC ports in a single
       E-switch environment and the dual E-switch mode will likely get
       deprecated.

       Default: disabled


The ``mlx5`` driver supports reloading via ``DEVLINK_CMD_RELOAD``

+13 −5
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
	const struct mlx5_ib_profile *profile;
	struct mlx5_core_dev *peer_dev;
	struct mlx5_ib_dev *ibdev;
	int second_uplink = false;
	u32 peer_num_ports;
	int vport_index;
	int ret;
@@ -47,17 +48,24 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
		peer_dev = mlx5_lag_get_peer_mdev(dev);
		peer_num_ports = mlx5_eswitch_get_total_vports(peer_dev);
		if (mlx5_lag_is_master(dev)) {
			/* Only 1 ib port is the representor for both uplinks */
			if (mlx5_lag_is_mpesw(dev))
				num_ports += peer_num_ports;
			else
				num_ports += peer_num_ports - 1;

		} else {
			if (rep->vport == MLX5_VPORT_UPLINK)
			if (rep->vport == MLX5_VPORT_UPLINK) {
				if (!mlx5_lag_is_mpesw(dev))
					return 0;
				second_uplink = true;
			}

			vport_index += peer_num_ports;
			dev = peer_dev;
		}
	}

	if (rep->vport == MLX5_VPORT_UPLINK)
	if (rep->vport == MLX5_VPORT_UPLINK && !second_uplink)
		profile = &raw_eth_profile;
	else
		return mlx5_ib_set_vport_rep(dev, rep, vport_index);
+2 −6
Original line number Diff line number Diff line
@@ -377,10 +377,6 @@ int mlx5_attach_device(struct mlx5_core_dev *dev)

			/* Pay attention that this is not PCI driver that
			 * mlx5_core_dev is connected, but auxiliary driver.
			 *
			 * Here we can race of module unload with devlink
			 * reload, but we don't need to take extra lock because
			 * we are holding global mlx5_intf_mutex.
			 */
			if (!adev->dev.driver)
				continue;
@@ -400,7 +396,7 @@ int mlx5_attach_device(struct mlx5_core_dev *dev)
	return ret;
}

void mlx5_detach_device(struct mlx5_core_dev *dev)
void mlx5_detach_device(struct mlx5_core_dev *dev, bool suspend)
{
	struct mlx5_priv *priv = &dev->priv;
	struct auxiliary_device *adev;
@@ -429,7 +425,7 @@ void mlx5_detach_device(struct mlx5_core_dev *dev)

		adrv = to_auxiliary_drv(adev->dev.driver);

		if (adrv->suspend) {
		if (adrv->suspend && suspend) {
			adrv->suspend(adev, pm);
			continue;
		}
+56 −2
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@
#include "fw_reset.h"
#include "fs_core.h"
#include "eswitch.h"
#include "lag/lag.h"
#include "esw/qos.h"
#include "sf/dev/dev.h"
#include "sf/sf.h"
@@ -104,7 +105,7 @@ static int mlx5_devlink_reload_fw_activate(struct devlink *devlink, struct netli
	if (err)
		return err;

	mlx5_unload_one_devl_locked(dev);
	mlx5_unload_one_devl_locked(dev, true);
	err = mlx5_health_wait_pci_up(dev);
	if (err)
		NL_SET_ERR_MSG_MOD(extack, "FW activate aborted, PCI reads fail after reset");
@@ -167,7 +168,7 @@ static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change,

	switch (action) {
	case DEVLINK_RELOAD_ACTION_DRIVER_REINIT:
		mlx5_unload_one_devl_locked(dev);
		mlx5_unload_one_devl_locked(dev, false);
		break;
	case DEVLINK_RELOAD_ACTION_FW_ACTIVATE:
		if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET)
@@ -437,6 +438,53 @@ static int mlx5_devlink_large_group_num_validate(struct devlink *devlink, u32 id
	return 0;
}

/*
 * Setter for the "esw_multiport" devlink parameter.
 *
 * Returns -EOPNOTSUPP when the device is not an E-Switch manager.
 * When the requested value is true, the result of mlx5_lag_mpesw_enable()
 * is returned; when it is false, mlx5_lag_mpesw_disable() is called and
 * 0 is returned. The @id argument is not used.
 */
static int mlx5_devlink_esw_multiport_set(struct devlink *devlink, u32 id,
					  struct devlink_param_gset_ctx *ctx)
{
	struct mlx5_core_dev *mdev = devlink_priv(devlink);
	int err = 0;

	if (!MLX5_ESWITCH_MANAGER(mdev))
		return -EOPNOTSUPP;

	if (!ctx->val.vbool)
		mlx5_lag_mpesw_disable(mdev);
	else
		err = mlx5_lag_mpesw_enable(mdev);

	return err;
}

/*
 * Getter for the "esw_multiport" devlink parameter.
 *
 * Stores the result of mlx5_lag_is_mpesw() in ctx->val.vbool and returns 0.
 * Returns -EOPNOTSUPP, leaving @ctx untouched, when the device is not an
 * E-Switch manager. The @id argument is not used.
 */
static int mlx5_devlink_esw_multiport_get(struct devlink *devlink, u32 id,
					  struct devlink_param_gset_ctx *ctx)
{
	struct mlx5_core_dev *mdev = devlink_priv(devlink);

	if (MLX5_ESWITCH_MANAGER(mdev)) {
		ctx->val.vbool = mlx5_lag_is_mpesw(mdev);
		return 0;
	}

	return -EOPNOTSUPP;
}

/*
 * Validation callback for the "esw_multiport" devlink parameter.
 *
 * The checks run in this order and the first failure is reported via
 * @extack:
 *   - the device must be an E-Switch manager, otherwise -EOPNOTSUPP;
 *   - mlx5_eswitch_mode() must report MLX5_ESWITCH_OFFLOADS, otherwise -EBUSY.
 *
 * Returns 0 when both checks pass. The @id and @val arguments are not
 * inspected.
 */
static int mlx5_devlink_esw_multiport_validate(struct devlink *devlink, u32 id,
					       union devlink_param_value val,
					       struct netlink_ext_ack *extack)
{
	struct mlx5_core_dev *mdev = devlink_priv(devlink);
	int err = 0;

	if (!MLX5_ESWITCH_MANAGER(mdev)) {
		NL_SET_ERR_MSG_MOD(extack, "E-Switch is unsupported");
		err = -EOPNOTSUPP;
	} else if (mlx5_eswitch_mode(mdev) != MLX5_ESWITCH_OFFLOADS) {
		NL_SET_ERR_MSG_MOD(extack,
				   "E-Switch must be in switchdev mode");
		err = -EBUSY;
	}

	return err;
}

#endif

static int mlx5_devlink_eq_depth_validate(struct devlink *devlink, u32 id,
@@ -455,6 +503,12 @@ static const struct devlink_param mlx5_devlink_params[] = {
			     BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
			     NULL, NULL,
			     mlx5_devlink_large_group_num_validate),
	DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_ESW_MULTIPORT,
			     "esw_multiport", DEVLINK_PARAM_TYPE_BOOL,
			     BIT(DEVLINK_PARAM_CMODE_RUNTIME),
			     mlx5_devlink_esw_multiport_get,
			     mlx5_devlink_esw_multiport_set,
			     mlx5_devlink_esw_multiport_validate),
#endif
	DEVLINK_PARAM_GENERIC(IO_EQ_SIZE, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
			      NULL, NULL, mlx5_devlink_eq_depth_validate),
+1 −0
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@ enum mlx5_devlink_param_id {
	MLX5_DEVLINK_PARAM_ID_FLOW_STEERING_MODE,
	MLX5_DEVLINK_PARAM_ID_ESW_LARGE_GROUP_NUM,
	MLX5_DEVLINK_PARAM_ID_ESW_PORT_METADATA,
	MLX5_DEVLINK_PARAM_ID_ESW_MULTIPORT,
};

struct mlx5_trap_ctx {
Loading