Commit f0c227c7 authored by David S. Miller

Merge tag 'mlx5-updates-2021-06-14' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux



Saeed Mahameed says:

====================
mlx5-updates-2021-06-14

1) Trivial Lag refactoring in preparation for the upcoming Single FDB lag feature
 - First 3 patches

2) Scalable IRQ distribution for Sub-functions

A subfunction (SF) is a lightweight function that has a parent PCI
function (PF) on which it is deployed.

Currently, mlx5 subfunctions share the IRQs (MSI-X) of their
parent PCI function.

Before this series, the PF allocates enough IRQs to cover all the
cores in the system, and newly created SFs re-use the IRQs that the PF
has allocated for itself.
Hence, the more SFs are created, the more EQs share each IRQ. Therefore,
whenever we handle an interrupt, we need to poll all SF EQs in addition
to the PF EQs, instead of only the PF EQs as on a system without SFs.
This has a hard impact on the performance of both the SFs and the PF.

For example, on a machine with:
Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz with 56 cores.
PCI Express 3 with BW of 126 Gb/s.
ConnectX-5 Ex; EDR IB (100Gb/s) and 100GbE; dual-port QSFP28; PCIe4.0 x16.

test case: iperf TX BW on a single CPU; the affinity of the app and the IRQ is the same.
PF only: no SFs on the system, 56 IRQs.
SF (before): 250 SFs sharing the same 56 IRQs.
SF (now):    250 SFs + 255 available IRQs for the NIC (please see the IRQ spread scheme below).

	    application SF-IRQ  channel   BW(Gb/sec)         interrupts/sec
            iperf TX            affinity
PF only     cpu={0}     cpu={0} cpu={0}   79                 8200
SF (before) cpu={0}     cpu={0} cpu={0}   51.3 (-35%)        9500
SF (now)    cpu={0}     cpu={0} cpu={0}   78 (-2%)           8200

command:
$ taskset -c 0 iperf -c 11.1.1.1 -P 3 -i 6 -t 30 | grep SUM

The difference between the SF examples is that before this series we
allocated num_cpus (56) IRQs, all of which were shared among the PF
and the SFs. After this series, we allocate 255 IRQs and spread the
SFs among them. This has significantly decreased the load on each IRQ,
and the number of EQs per IRQ is down by 95% (251->11).

The solution proposed in this patchset is a dedicated IRQ pool for
SFs to use. The pool allocates a large number of IRQs for SFs to grab
from, in order to minimize IRQ sharing between the different SFs.
IRQs are not requested from the OS until they are first requested by
an SF consumer, and they are eventually released when the last SF
consumer releases them.
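
A minimal sketch of this lazy request/release idea (the struct and
function names below are purely illustrative, not the driver's API):
a per-entry refcount decides when request_irq()/free_irq() actually
happen.

	struct sf_pool_irq {
		int index;     /* MSI-X vector index inside the SF pool     */
		int refcount;  /* number of SF EQs currently using this IRQ */
		int requested; /* non-zero once request_irq() has been done */
	};

	/* First SF consumer triggers the OS-level request_irq(). */
	int sf_pool_irq_get(struct sf_pool_irq *irq)
	{
		if (irq->refcount++ == 0) {
			/* request_irq(...) would be called here */
			irq->requested = 1;
		}
		return irq->index;
	}

	/* Last SF consumer returns the vector to the OS via free_irq(). */
	void sf_pool_irq_put(struct sf_pool_irq *irq)
	{
		if (--irq->refcount == 0) {
			/* free_irq(...) would be called here */
			irq->requested = 0;
		}
	}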

For the detailed IRQ spread and allocation scheme, please see the last patch:
("net/mlx5: Round-Robin EQs over IRQs")
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 08ab4d74 c36326d3
+6 −2
@@ -1560,11 +1560,15 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)

	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
	param = (struct mlx5_eq_param) {
		.irq_index = 0,
		.nent = MLX5_IB_NUM_PF_EQE,
	};
	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
	if (!zalloc_cpumask_var(&param.affinity, GFP_KERNEL)) {
		err = -ENOMEM;
		goto err_wq;
	}
	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
	free_cpumask_var(param.affinity);
	if (IS_ERR(eq->core)) {
		err = PTR_ERR(eq->core);
		goto err_wq;
+2 −2
@@ -5114,7 +5114,7 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv)
	mlx5e_set_netdev_mtu_boundaries(priv);
	mlx5e_set_dev_port_mtu(priv);

	mlx5_lag_add(mdev, netdev);
	mlx5_lag_add_netdev(mdev, netdev);

	mlx5e_enable_async_events(priv);
	mlx5e_enable_blocking_events(priv);
@@ -5162,7 +5162,7 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv)
		priv->en_trap = NULL;
	}
	mlx5e_disable_async_events(priv);
	mlx5_lag_remove(mdev);
	mlx5_lag_remove_netdev(mdev, priv->netdev);
	mlx5_vxlan_reset_to_default(mdev->vxlan);
}

+2 −2
@@ -976,7 +976,7 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv)
	if (MLX5_CAP_GEN(mdev, uplink_follow))
		mlx5_modify_vport_admin_state(mdev, MLX5_VPORT_STATE_OP_MOD_UPLINK,
					      0, 0, MLX5_VPORT_ADMIN_STATE_AUTO);
	mlx5_lag_add(mdev, netdev);
	mlx5_lag_add_netdev(mdev, netdev);
	priv->events_nb.notifier_call = uplink_rep_async_event;
	mlx5_notifier_register(mdev, &priv->events_nb);
	mlx5e_dcbnl_initialize(priv);
@@ -1009,7 +1009,7 @@ static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv)
	mlx5e_dcbnl_delete_app(priv);
	mlx5_notifier_unregister(mdev, &priv->events_nb);
	mlx5e_rep_tc_disable(priv);
	mlx5_lag_remove(mdev);
	mlx5_lag_remove_netdev(mdev, priv->netdev);
}

static MLX5E_DEFINE_STATS_GRP(sw_rep, 0);
+119 −60
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 * Copyright (c) 2013-2021, Mellanox Technologies inc.  All rights reserved.
 */

#include <linux/interrupt.h>
@@ -45,6 +18,7 @@
#include "eswitch.h"
#include "lib/clock.h"
#include "diag/fw_tracer.h"
#include "mlx5_irq.h"

enum {
	MLX5_EQE_OWNER_INIT_VAL	= 0x1,
@@ -84,6 +58,9 @@ struct mlx5_eq_table {
	struct mutex            lock; /* sync async eqs creations */
	int			num_comp_eqs;
	struct mlx5_irq_table	*irq_table;
#ifdef CONFIG_RFS_ACCEL
	struct cpu_rmap		*rmap;
#endif
};

#define MLX5_ASYNC_EVENT_MASK ((1ull << MLX5_EVENT_TYPE_PATH_MIG)	    | \
@@ -286,7 +263,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
	u8 log_eq_stride = ilog2(MLX5_EQE_SIZE);
	struct mlx5_priv *priv = &dev->priv;
	u8 vecidx = param->irq_index;
	u16 vecidx = param->irq_index;
	__be64 *pas;
	void *eqc;
	int inlen;
@@ -309,13 +286,20 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
	mlx5_init_fbc(eq->frag_buf.frags, log_eq_stride, log_eq_size, &eq->fbc);
	init_eq_buf(eq);

	eq->irq = mlx5_irq_request(dev, vecidx, param->affinity);
	if (IS_ERR(eq->irq)) {
		err = PTR_ERR(eq->irq);
		goto err_buf;
	}

	vecidx = mlx5_irq_get_index(eq->irq);
	inlen = MLX5_ST_SZ_BYTES(create_eq_in) +
		MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->frag_buf.npages;

	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_buf;
		goto err_irq;
	}

	pas = (__be64 *)MLX5_ADDR_OF(create_eq_in, in, pas);
@@ -359,6 +343,8 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
err_in:
	kvfree(in);

err_irq:
	mlx5_irq_release(eq->irq);
err_buf:
	mlx5_frag_buf_free(dev, &eq->frag_buf);
	return err;
@@ -377,10 +363,9 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
int mlx5_eq_enable(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
		   struct notifier_block *nb)
{
	struct mlx5_eq_table *eq_table = dev->priv.eq_table;
	int err;

	err = mlx5_irq_attach_nb(eq_table->irq_table, eq->vecidx, nb);
	err = mlx5_irq_attach_nb(eq->irq, nb);
	if (!err)
		eq_update_ci(eq, 1);

@@ -399,9 +384,7 @@ EXPORT_SYMBOL(mlx5_eq_enable);
void mlx5_eq_disable(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
		     struct notifier_block *nb)
{
	struct mlx5_eq_table *eq_table = dev->priv.eq_table;

	mlx5_irq_detach_nb(eq_table->irq_table, eq->vecidx, nb);
	mlx5_irq_detach_nb(eq->irq, nb);
}
EXPORT_SYMBOL(mlx5_eq_disable);

@@ -415,10 +398,9 @@ static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
	if (err)
		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
			       eq->eqn);
	synchronize_irq(eq->irqn);
	mlx5_irq_release(eq->irq);

	mlx5_frag_buf_free(dev, &eq->frag_buf);

	return err;
}

@@ -490,14 +472,7 @@ static int create_async_eq(struct mlx5_core_dev *dev,
	int err;

	mutex_lock(&eq_table->lock);
	/* Async EQs must share irq index 0 */
	if (param->irq_index != 0) {
		err = -EINVAL;
		goto unlock;
	}

	err = create_map_eq(dev, eq, param);
unlock:
	mutex_unlock(&eq_table->lock);
	return err;
}
@@ -616,8 +591,11 @@ setup_async_eq(struct mlx5_core_dev *dev, struct mlx5_eq_async *eq,

	eq->irq_nb.notifier_call = mlx5_eq_async_int;
	spin_lock_init(&eq->lock);
	if (!zalloc_cpumask_var(&param->affinity, GFP_KERNEL))
		return -ENOMEM;

	err = create_async_eq(dev, &eq->core, param);
	free_cpumask_var(param->affinity);
	if (err) {
		mlx5_core_warn(dev, "failed to create %s EQ %d\n", name, err);
		return err;
@@ -652,7 +630,6 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
	mlx5_eq_notifier_register(dev, &table->cq_err_nb);

	param = (struct mlx5_eq_param) {
		.irq_index = 0,
		.nent = MLX5_NUM_CMD_EQE,
		.mask[0] = 1ull << MLX5_EVENT_TYPE_CMD,
	};
@@ -665,7 +642,6 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
	mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL);

	param = (struct mlx5_eq_param) {
		.irq_index = 0,
		.nent = MLX5_NUM_ASYNC_EQE,
	};

@@ -675,7 +651,6 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
		goto err2;

	param = (struct mlx5_eq_param) {
		.irq_index = 0,
		.nent = /* TODO: sriov max_vf + */ 1,
		.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_REQUEST,
	};
@@ -735,6 +710,9 @@ mlx5_eq_create_generic(struct mlx5_core_dev *dev,
	struct mlx5_eq *eq = kvzalloc(sizeof(*eq), GFP_KERNEL);
	int err;

	if (!param->affinity)
		return ERR_PTR(-EINVAL);

	if (!eq)
		return ERR_PTR(-ENOMEM);

@@ -845,16 +823,21 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
			.irq_index = vecidx,
			.nent = nent,
		};
		err = create_map_eq(dev, &eq->core, &param);
		if (err) {
			kfree(eq);
			goto clean;

		if (!zalloc_cpumask_var(&param.affinity, GFP_KERNEL)) {
			err = -ENOMEM;
			goto clean_eq;
		}
		cpumask_set_cpu(cpumask_local_spread(i, dev->priv.numa_node),
				param.affinity);
		err = create_map_eq(dev, &eq->core, &param);
		free_cpumask_var(param.affinity);
		if (err)
			goto clean_eq;
		err = mlx5_eq_enable(dev, &eq->core, &eq->irq_nb);
		if (err) {
			destroy_unmap_eq(dev, &eq->core);
			kfree(eq);
			goto clean;
			goto clean_eq;
		}

		mlx5_core_dbg(dev, "allocated completion EQN %d\n", eq->core.eqn);
@@ -863,7 +846,8 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
	}

	return 0;

clean_eq:
	kfree(eq);
clean:
	destroy_comp_eqs(dev);
	return err;
@@ -899,17 +883,23 @@ EXPORT_SYMBOL(mlx5_comp_vectors_count);
struct cpumask *
mlx5_comp_irq_get_affinity_mask(struct mlx5_core_dev *dev, int vector)
{
	int vecidx = vector + MLX5_IRQ_VEC_COMP_BASE;
	struct mlx5_eq_table *table = dev->priv.eq_table;
	struct mlx5_eq_comp *eq, *n;
	int i = 0;

	list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) {
		if (i++ == vector)
			break;
	}

	return mlx5_irq_get_affinity_mask(dev->priv.eq_table->irq_table,
					  vecidx);
	return mlx5_irq_get_affinity_mask(eq->core.irq);
}
EXPORT_SYMBOL(mlx5_comp_irq_get_affinity_mask);

#ifdef CONFIG_RFS_ACCEL
struct cpu_rmap *mlx5_eq_table_get_rmap(struct mlx5_core_dev *dev)
{
	return mlx5_irq_get_rmap(dev->priv.eq_table->irq_table);
	return dev->priv.eq_table->rmap;
}
#endif

@@ -926,12 +916,57 @@ struct mlx5_eq_comp *mlx5_eqn2comp_eq(struct mlx5_core_dev *dev, int eqn)
	return ERR_PTR(-ENOENT);
}

static void clear_rmap(struct mlx5_core_dev *dev)
{
#ifdef CONFIG_RFS_ACCEL
	struct mlx5_eq_table *eq_table = dev->priv.eq_table;

	free_irq_cpu_rmap(eq_table->rmap);
#endif
}

static int set_rmap(struct mlx5_core_dev *mdev)
{
	int err = 0;
#ifdef CONFIG_RFS_ACCEL
	struct mlx5_eq_table *eq_table = mdev->priv.eq_table;
	int vecidx;

	eq_table->rmap = alloc_irq_cpu_rmap(eq_table->num_comp_eqs);
	if (!eq_table->rmap) {
		err = -ENOMEM;
		mlx5_core_err(mdev, "Failed to allocate cpu_rmap. err %d", err);
		goto err_out;
	}

	vecidx = MLX5_IRQ_VEC_COMP_BASE;
	for (; vecidx < eq_table->num_comp_eqs + MLX5_IRQ_VEC_COMP_BASE;
	     vecidx++) {
		err = irq_cpu_rmap_add(eq_table->rmap,
				       pci_irq_vector(mdev->pdev, vecidx));
		if (err) {
			mlx5_core_err(mdev, "irq_cpu_rmap_add failed. err %d",
				      err);
			goto err_irq_cpu_rmap_add;
		}
	}
	return 0;

err_irq_cpu_rmap_add:
	clear_rmap(mdev);
err_out:
#endif
	return err;
}

/* This function should only be called after mlx5_cmd_force_teardown_hca */
void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev)
{
	struct mlx5_eq_table *table = dev->priv.eq_table;

	mutex_lock(&table->lock); /* sync with create/destroy_async_eq */
	if (!mlx5_core_is_sf(dev))
		clear_rmap(dev);
	mlx5_irq_table_destroy(dev);
	mutex_unlock(&table->lock);
}
@@ -948,12 +983,19 @@ int mlx5_eq_table_create(struct mlx5_core_dev *dev)
	int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
		      MLX5_CAP_GEN(dev, max_num_eqs) :
		      1 << MLX5_CAP_GEN(dev, log_max_eq);
	int max_eqs_sf;
	int err;

	eq_table->num_comp_eqs =
		min_t(int,
		      mlx5_irq_get_num_comp(eq_table->irq_table),
		      mlx5_irq_table_get_num_comp(eq_table->irq_table),
		      num_eqs - MLX5_MAX_ASYNC_EQS);
	if (mlx5_core_is_sf(dev)) {
		max_eqs_sf = min_t(int, MLX5_COMP_EQS_PER_SF,
				   mlx5_irq_table_get_sfs_vec(eq_table->irq_table));
		eq_table->num_comp_eqs = min_t(int, eq_table->num_comp_eqs,
					       max_eqs_sf);
	}

	err = create_async_eqs(dev);
	if (err) {
@@ -961,6 +1003,18 @@ int mlx5_eq_table_create(struct mlx5_core_dev *dev)
		goto err_async_eqs;
	}

	if (!mlx5_core_is_sf(dev)) {
		/* rmap is a mapping between irq number and queue number.
		 * each irq can be assign only to a single rmap.
		 * since SFs share IRQs, rmap mapping cannot function correctly
		 * for irqs that are shared for different core/netdev RX rings.
		 * Hence we don't allow netdev rmap for SFs
		 */
		err = set_rmap(dev);
		if (err)
			goto err_rmap;
	}

	err = create_comp_eqs(dev);
	if (err) {
		mlx5_core_err(dev, "Failed to create completion EQs\n");
@@ -969,6 +1023,9 @@ int mlx5_eq_table_create(struct mlx5_core_dev *dev)

	return 0;
err_comp_eqs:
	if (!mlx5_core_is_sf(dev))
		clear_rmap(dev);
err_rmap:
	destroy_async_eqs(dev);
err_async_eqs:
	return err;
@@ -976,6 +1033,8 @@ int mlx5_eq_table_create(struct mlx5_core_dev *dev)

void mlx5_eq_table_destroy(struct mlx5_core_dev *dev)
{
	if (!mlx5_core_is_sf(dev))
		clear_rmap(dev);
	destroy_comp_eqs(dev);
	destroy_async_eqs(dev);
}
+172 −95
@@ -93,6 +93,64 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
}
EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);

static int mlx5_lag_netdev_event(struct notifier_block *this,
				 unsigned long event, void *ptr);
static void mlx5_do_bond_work(struct work_struct *work);

static void mlx5_ldev_free(struct kref *ref)
{
	struct mlx5_lag *ldev = container_of(ref, struct mlx5_lag, ref);

	if (ldev->nb.notifier_call)
		unregister_netdevice_notifier_net(&init_net, &ldev->nb);
	mlx5_lag_mp_cleanup(ldev);
	cancel_delayed_work_sync(&ldev->bond_work);
	destroy_workqueue(ldev->wq);
	kfree(ldev);
}

static void mlx5_ldev_put(struct mlx5_lag *ldev)
{
	kref_put(&ldev->ref, mlx5_ldev_free);
}

static void mlx5_ldev_get(struct mlx5_lag *ldev)
{
	kref_get(&ldev->ref);
}

static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	int err;

	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
	if (!ldev)
		return NULL;

	ldev->wq = create_singlethread_workqueue("mlx5_lag");
	if (!ldev->wq) {
		kfree(ldev);
		return NULL;
	}

	kref_init(&ldev->ref);
	INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);

	ldev->nb.notifier_call = mlx5_lag_netdev_event;
	if (register_netdevice_notifier_net(&init_net, &ldev->nb)) {
		ldev->nb.notifier_call = NULL;
		mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
	}

	err = mlx5_lag_mp_init(ldev);
	if (err)
		mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
			      err);

	return ldev;
}

int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
				struct net_device *ndev)
{
@@ -258,6 +316,10 @@ static void mlx5_lag_add_devices(struct mlx5_lag *ldev)
		if (!ldev->pf[i].dev)
			continue;

		if (ldev->pf[i].dev->priv.flags &
		    MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
			continue;

		ldev->pf[i].dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
		mlx5_rescan_drivers_locked(ldev->pf[i].dev);
	}
@@ -276,6 +338,31 @@ static void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
	}
}

static void mlx5_disable_lag(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
	bool roce_lag;
	int err;

	roce_lag = __mlx5_lag_is_roce(ldev);

	if (roce_lag) {
		if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) {
			dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
			mlx5_rescan_drivers_locked(dev0);
		}
		mlx5_nic_vport_disable_roce(dev1);
	}

	err = mlx5_deactivate_lag(ldev);
	if (err)
		return;

	if (roce_lag)
		mlx5_lag_add_devices(ldev);
}

static void mlx5_do_bond(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
@@ -322,20 +409,7 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
	} else if (do_bond && __mlx5_lag_is_active(ldev)) {
		mlx5_modify_lag(ldev, &tracker);
	} else if (!do_bond && __mlx5_lag_is_active(ldev)) {
		roce_lag = __mlx5_lag_is_roce(ldev);

		if (roce_lag) {
			dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
			mlx5_rescan_drivers_locked(dev0);
			mlx5_nic_vport_disable_roce(dev1);
		}

		err = mlx5_deactivate_lag(ldev);
		if (err)
			return;

		if (roce_lag)
			mlx5_lag_add_devices(ldev);
		mlx5_disable_lag(ldev);
	}
}

@@ -495,54 +569,51 @@ static int mlx5_lag_netdev_event(struct notifier_block *this,
	return NOTIFY_DONE;
}

static struct mlx5_lag *mlx5_lag_dev_alloc(void)
static void mlx5_ldev_add_netdev(struct mlx5_lag *ldev,
				 struct mlx5_core_dev *dev,
				 struct net_device *netdev)
{
	struct mlx5_lag *ldev;
	unsigned int fn = PCI_FUNC(dev->pdev->devfn);

	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
	if (!ldev)
		return NULL;
	if (fn >= MLX5_MAX_PORTS)
		return;

	ldev->wq = create_singlethread_workqueue("mlx5_lag");
	if (!ldev->wq) {
		kfree(ldev);
		return NULL;
	spin_lock(&lag_lock);
	ldev->pf[fn].netdev = netdev;
	ldev->tracker.netdev_state[fn].link_up = 0;
	ldev->tracker.netdev_state[fn].tx_enabled = 0;
	spin_unlock(&lag_lock);
}

	INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);
static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev,
				    struct net_device *netdev)
{
	int i;

	return ldev;
	spin_lock(&lag_lock);
	for (i = 0; i < MLX5_MAX_PORTS; i++) {
		if (ldev->pf[i].netdev == netdev) {
			ldev->pf[i].netdev = NULL;
			break;
		}

static void mlx5_lag_dev_free(struct mlx5_lag *ldev)
{
	destroy_workqueue(ldev->wq);
	kfree(ldev);
	}
	spin_unlock(&lag_lock);
}

static int mlx5_lag_dev_add_pf(struct mlx5_lag *ldev,
			       struct mlx5_core_dev *dev,
			       struct net_device *netdev)
static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
			       struct mlx5_core_dev *dev)
{
	unsigned int fn = PCI_FUNC(dev->pdev->devfn);

	if (fn >= MLX5_MAX_PORTS)
		return -EPERM;
		return;

	spin_lock(&lag_lock);
	ldev->pf[fn].dev = dev;
	ldev->pf[fn].netdev = netdev;
	ldev->tracker.netdev_state[fn].link_up = 0;
	ldev->tracker.netdev_state[fn].tx_enabled = 0;

	dev->priv.lag = ldev;

	spin_unlock(&lag_lock);

	return fn;
}

static void mlx5_lag_dev_remove_pf(struct mlx5_lag *ldev,
/* Must be called with intf_mutex held */
static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
				  struct mlx5_core_dev *dev)
{
	int i;
@@ -554,19 +625,15 @@ static void mlx5_lag_dev_remove_pf(struct mlx5_lag *ldev,
	if (i == MLX5_MAX_PORTS)
		return;

	spin_lock(&lag_lock);
	memset(&ldev->pf[i], 0, sizeof(*ldev->pf));

	ldev->pf[i].dev = NULL;
	dev->priv.lag = NULL;
	spin_unlock(&lag_lock);
}

/* Must be called with intf_mutex held */
void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
static void __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev = NULL;
	struct mlx5_core_dev *tmp_dev;
	int i, err;

	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
	    !MLX5_CAP_GEN(dev, lag_master) ||
@@ -578,67 +645,77 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
		ldev = tmp_dev->priv.lag;

	if (!ldev) {
		ldev = mlx5_lag_dev_alloc();
		ldev = mlx5_lag_dev_alloc(dev);
		if (!ldev) {
			mlx5_core_err(dev, "Failed to alloc lag dev\n");
			return;
		}
	} else {
		mlx5_ldev_get(ldev);
	}

	if (mlx5_lag_dev_add_pf(ldev, dev, netdev) < 0)
	mlx5_ldev_add_mdev(ldev, dev);

	return;
}

	for (i = 0; i < MLX5_MAX_PORTS; i++)
		if (!ldev->pf[i].dev)
			break;
void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;

	if (i >= MLX5_MAX_PORTS)
		ldev->flags |= MLX5_LAG_FLAG_READY;
	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return;

	if (!ldev->nb.notifier_call) {
		ldev->nb.notifier_call = mlx5_lag_netdev_event;
		if (register_netdevice_notifier_net(&init_net, &ldev->nb)) {
			ldev->nb.notifier_call = NULL;
			mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
		}
	mlx5_dev_list_lock();
	mlx5_ldev_remove_mdev(ldev, dev);
	mlx5_dev_list_unlock();
	mlx5_ldev_put(ldev);
}

	err = mlx5_lag_mp_init(ldev);
	if (err)
		mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
			      err);
void mlx5_lag_add_mdev(struct mlx5_core_dev *dev)
{
	mlx5_dev_list_lock();
	__mlx5_lag_dev_add_mdev(dev);
	mlx5_dev_list_unlock();
}

/* Must be called with intf_mutex held */
void mlx5_lag_remove(struct mlx5_core_dev *dev)
void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev,
			    struct net_device *netdev)
{
	struct mlx5_lag *ldev;
	int i;

	ldev = mlx5_lag_dev_get(dev);
	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return;

	if (__mlx5_lag_is_active(ldev))
		mlx5_deactivate_lag(ldev);

	mlx5_lag_dev_remove_pf(ldev, dev);
		mlx5_disable_lag(ldev);

	mlx5_ldev_remove_netdev(ldev, netdev);
	ldev->flags &= ~MLX5_LAG_FLAG_READY;
}

/* Must be called with intf_mutex held */
void mlx5_lag_add_netdev(struct mlx5_core_dev *dev,
			 struct net_device *netdev)
{
	struct mlx5_lag *ldev;
	int i;

	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return;

	mlx5_ldev_add_netdev(ldev, dev, netdev);

	for (i = 0; i < MLX5_MAX_PORTS; i++)
		if (ldev->pf[i].dev)
		if (!ldev->pf[i].dev)
			break;

	if (i == MLX5_MAX_PORTS) {
		if (ldev->nb.notifier_call) {
			unregister_netdevice_notifier_net(&init_net, &ldev->nb);
			ldev->nb.notifier_call = NULL;
		}
		mlx5_lag_mp_cleanup(ldev);
		cancel_delayed_work_sync(&ldev->bond_work);
		mlx5_lag_dev_free(ldev);
	}
	if (i >= MLX5_MAX_PORTS)
		ldev->flags |= MLX5_LAG_FLAG_READY;
}

bool mlx5_lag_is_roce(struct mlx5_core_dev *dev)
@@ -647,7 +724,7 @@ bool mlx5_lag_is_roce(struct mlx5_core_dev *dev)
	bool res;

	spin_lock(&lag_lock);
	ldev = mlx5_lag_dev_get(dev);
	ldev = mlx5_lag_dev(dev);
	res  = ldev && __mlx5_lag_is_roce(ldev);
	spin_unlock(&lag_lock);

@@ -661,7 +738,7 @@ bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
	bool res;

	spin_lock(&lag_lock);
	ldev = mlx5_lag_dev_get(dev);
	ldev = mlx5_lag_dev(dev);
	res  = ldev && __mlx5_lag_is_active(ldev);
	spin_unlock(&lag_lock);

@@ -675,7 +752,7 @@ bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
	bool res;

	spin_lock(&lag_lock);
	ldev = mlx5_lag_dev_get(dev);
	ldev = mlx5_lag_dev(dev);
	res  = ldev && __mlx5_lag_is_sriov(ldev);
	spin_unlock(&lag_lock);

@@ -688,7 +765,7 @@ void mlx5_lag_update(struct mlx5_core_dev *dev)
	struct mlx5_lag *ldev;

	mlx5_dev_list_lock();
	ldev = mlx5_lag_dev_get(dev);
	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		goto unlock;

@@ -704,7 +781,7 @@ struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
	struct mlx5_lag *ldev;

	spin_lock(&lag_lock);
	ldev = mlx5_lag_dev_get(dev);
	ldev = mlx5_lag_dev(dev);

	if (!(ldev && __mlx5_lag_is_roce(ldev)))
		goto unlock;
@@ -733,7 +810,7 @@ u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
	u8 port = 0;

	spin_lock(&lag_lock);
	ldev = mlx5_lag_dev_get(dev);
	ldev = mlx5_lag_dev(dev);
	if (!(ldev && __mlx5_lag_is_roce(ldev)))
		goto unlock;

@@ -769,7 +846,7 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
	memset(values, 0, sizeof(*values) * num_counters);

	spin_lock(&lag_lock);
	ldev = mlx5_lag_dev_get(dev);
	ldev = mlx5_lag_dev(dev);
	if (ldev && __mlx5_lag_is_active(ldev)) {
		num_ports = MLX5_MAX_PORTS;
		mdev[MLX5_LAG_P1] = ldev->pf[MLX5_LAG_P1].dev;