Commit c446d9da authored by Mark Bloch, committed by Saeed Mahameed
Browse files

RDMA/mlx5: Add shared FDB support



Shared FDB allows creating a single RDMA device that holds representors
from both eswitches. As shared FDB is only active when both uplink
representors are enslaved, there is a single RDMA port that represents
both uplinks.

The number of ports is the number of vports on both eswitches minus one
as we only need 1 port for both uplinks.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
parent 979bf468
Loading
Loading
Loading
Loading
+69 −6
Original line number Diff line number Diff line
@@ -8,13 +8,15 @@
#include "srq.h"

static int
mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev,
		      struct mlx5_eswitch_rep *rep,
		      int vport_index)
{
	struct mlx5_ib_dev *ibdev;
	int vport_index;

	ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB);
	vport_index = rep->vport_index;
	if (!ibdev)
		return -EINVAL;

	ibdev->port[vport_index].rep = rep;
	rep->rep_data[REP_IB].priv = ibdev;
@@ -26,19 +28,39 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
	return 0;
}

static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev);

static int
mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
{
	u32 num_ports = mlx5_eswitch_get_total_vports(dev);
	const struct mlx5_ib_profile *profile;
	struct mlx5_core_dev *peer_dev;
	struct mlx5_ib_dev *ibdev;
	u32 peer_num_ports;
	int vport_index;
	int ret;

	vport_index = rep->vport_index;

	if (mlx5_lag_is_shared_fdb(dev)) {
		peer_dev = mlx5_lag_get_peer_mdev(dev);
		peer_num_ports = mlx5_eswitch_get_total_vports(peer_dev);
		if (mlx5_lag_is_master(dev)) {
			/* Only 1 ib port is the representor for both uplinks */
			num_ports += peer_num_ports - 1;
		} else {
			if (rep->vport == MLX5_VPORT_UPLINK)
				return 0;
			vport_index += peer_num_ports;
			dev = peer_dev;
		}
	}

	if (rep->vport == MLX5_VPORT_UPLINK)
		profile = &raw_eth_profile;
	else
		return mlx5_ib_set_vport_rep(dev, rep);
		return mlx5_ib_set_vport_rep(dev, rep, vport_index);

	ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev);
	if (!ibdev)
@@ -64,6 +86,8 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
		goto fail_add;

	rep->rep_data[REP_IB].priv = ibdev;
	if (mlx5_lag_is_shared_fdb(dev))
		mlx5_ib_register_peer_vport_reps(dev);

	return 0;

@@ -82,19 +106,46 @@ static void *mlx5_ib_rep_to_dev(struct mlx5_eswitch_rep *rep)
/*
 * Eswitch rep .unload callback: detach the IB state attached to @rep.
 *
 * Clears the port's RoCE netdev pointer (under roce.netdev_lock), the rep's
 * REP_IB private pointer and the port's rep pointer. When @rep is the uplink
 * rep, the peer eswitch's REP_IB reps are unregistered first (shared FDB
 * only) and the IB device is then removed via __mlx5_ib_remove().
 *
 * In shared FDB mode on the non-master device, the uplink rep is skipped and
 * every other rep is looked up at rep->vport_index offset by the peer's total
 * vport count, the same offset mlx5_ib_vport_rep_load() applies.
 */
static void
mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
{
	struct mlx5_core_dev *mdev = mlx5_eswitch_get_core_dev(rep->esw);
	struct mlx5_ib_dev *dev = mlx5_ib_rep_to_dev(rep);
	int vport_index = rep->vport_index;
	struct mlx5_ib_port *port;

	if (WARN_ON(!mdev))
		return;

	if (mlx5_lag_is_shared_fdb(mdev) &&
	    !mlx5_lag_is_master(mdev)) {
		struct mlx5_core_dev *peer_mdev;

		/* Nothing to undo: load returns early for this rep as well. */
		if (rep->vport == MLX5_VPORT_UPLINK)
			return;
		peer_mdev = mlx5_lag_get_peer_mdev(mdev);
		vport_index += mlx5_eswitch_get_total_vports(peer_mdev);
	}

	if (!dev)
		return;

	/* Index the port array only once dev is known to be non-NULL. */
	port = &dev->port[vport_index];
	write_lock(&port->roce.netdev_lock);
	port->roce.netdev = NULL;
	write_unlock(&port->roce.netdev_lock);
	rep->rep_data[REP_IB].priv = NULL;
	port->rep = NULL;

	if (rep->vport == MLX5_VPORT_UPLINK) {
		struct mlx5_core_dev *peer_mdev;
		struct mlx5_eswitch *esw;

		/* Drop the peer's IB reps before removing the IB device. */
		if (mlx5_lag_is_shared_fdb(mdev)) {
			peer_mdev = mlx5_lag_get_peer_mdev(mdev);
			esw = peer_mdev->priv.eswitch;
			mlx5_eswitch_unregister_vport_reps(esw, REP_IB);
		}
		__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
	}
}

static const struct mlx5_eswitch_rep_ops rep_ops = {
	.load = mlx5_ib_vport_rep_load,
@@ -102,6 +153,18 @@ static const struct mlx5_eswitch_rep_ops rep_ops = {
	.get_proto_dev = mlx5_ib_rep_to_dev,
};

/*
 * Register rep_ops for REP_IB on the eswitch of @mdev's LAG peer.
 * Does nothing when no peer device is returned.
 */
static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev)
{
	struct mlx5_core_dev *peer_mdev;

	peer_mdev = mlx5_lag_get_peer_mdev(mdev);
	if (peer_mdev)
		mlx5_eswitch_register_vport_reps(peer_mdev->priv.eswitch,
						 &rep_ops, REP_IB);
}

struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
					  u16 vport_num)
{
+26 −18
Original line number Diff line number Diff line
@@ -126,6 +126,7 @@ static int get_port_state(struct ib_device *ibdev,

static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
					   struct net_device *ndev,
					   struct net_device *upper,
					   u32 *port_num)
{
	struct net_device *rep_ndev;
@@ -137,6 +138,14 @@ static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
		if (!port->rep)
			continue;

		if (upper == ndev && port->rep->vport == MLX5_VPORT_UPLINK) {
			*port_num = i + 1;
			return &port->roce;
		}

		if (upper && port->rep->vport == MLX5_VPORT_UPLINK)
			continue;

		read_lock(&port->roce.netdev_lock);
		rep_ndev = mlx5_ib_get_rep_netdev(port->rep->esw,
						  port->rep->vport);
@@ -196,11 +205,12 @@ static int mlx5_netdev_event(struct notifier_block *this,
		}

		if (ibdev->is_rep)
			roce = mlx5_get_rep_roce(ibdev, ndev, &port_num);
			roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num);
		if (!roce)
			return NOTIFY_DONE;
		if ((upper == ndev || (!upper && ndev == roce->netdev))
		    && ibdev->ib_active) {
		if ((upper == ndev ||
		     ((!upper || ibdev->is_rep) && ndev == roce->netdev)) &&
		    ibdev->ib_active) {
			struct ib_event ibev = { };
			enum ib_port_state port_state;

@@ -3012,7 +3022,7 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
	struct mlx5_flow_table *ft;
	int err;

	if (!ns || !mlx5_lag_is_roce(mdev))
	if (!ns || !mlx5_lag_is_active(mdev))
		return 0;

	err = mlx5_cmd_create_vport_lag(mdev);
@@ -3074,9 +3084,11 @@ static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
{
	int err;

	if (!dev->is_rep && dev->profile != &raw_eth_profile) {
		err = mlx5_nic_vport_enable_roce(dev->mdev);
		if (err)
			return err;
	}

	err = mlx5_eth_lag_init(dev);
	if (err)
@@ -3085,6 +3097,7 @@ static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
	return 0;

err_disable_roce:
	if (!dev->is_rep && dev->profile != &raw_eth_profile)
		mlx5_nic_vport_disable_roce(dev->mdev);

	return err;
@@ -3093,6 +3106,7 @@ static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
/*
 * Clean up the eth LAG state for @dev, then disable RoCE on the NIC vport.
 * The RoCE disable is skipped for representor devices and for devices using
 * raw_eth_profile.
 */
static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
{
	mlx5_eth_lag_cleanup(dev);

	if (dev->is_rep || dev->profile == &raw_eth_profile)
		return;

	mlx5_nic_vport_disable_roce(dev->mdev);
}

@@ -3950,12 +3964,7 @@ static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev)

		/* Register only for native ports */
		err = mlx5_add_netdev_notifier(dev, port_num);
		if (err || dev->is_rep || !mlx5_is_roce_init_enabled(mdev))
			/*
			 * We don't enable ETH interface for
			 * 1. IB representors
			 * 2. User disabled ROCE through devlink interface
			 */
		if (err)
			return err;

		err = mlx5_enable_eth(dev);
@@ -3980,7 +3989,6 @@ static void mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev)
	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);

	if (ll == IB_LINK_LAYER_ETHERNET) {
		if (!dev->is_rep)
		mlx5_disable_eth(dev);

		port_num = mlx5_core_native_port_num(dev->mdev) - 1;
@@ -4037,7 +4045,7 @@ static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
{
	const char *name;

	if (!mlx5_lag_is_roce(dev->mdev))
	if (!mlx5_lag_is_active(dev->mdev))
		name = "mlx5_%d";
	else
		name = "mlx5_bond_%d";