Commit 37ca95e6 authored by Gavin Li, committed by Saeed Mahameed
Browse files

net/mlx5: Increase FW pre-init timeout for health recovery



Currently, health recovery reloads the driver to recover it from fatal
errors. During the driver's load process, it waits up to 120 seconds for
FW to set the pre-init bit; beyond this threshold it aborts the load
process. In some cases, such as a FW upgrade on the DPU, this timeout
period is insufficient, and the user has no way to recover the host
device.

To solve this issue, introduce a new FW pre-init timeout for health
recovery, which is set to 2 hours.

Devlink reload and probe keep using the original timeout because they
are user-triggered flows and therefore should not have a significantly
long timeout, during which the user command would hang.

Signed-off-by: Gavin Li <gavinl@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
parent 8324a02c
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -178,13 +178,13 @@ static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_a
	*actions_performed = BIT(action);
	switch (action) {
	case DEVLINK_RELOAD_ACTION_DRIVER_REINIT:
		return mlx5_load_one(dev);
		return mlx5_load_one(dev, false);
	case DEVLINK_RELOAD_ACTION_FW_ACTIVATE:
		if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET)
			break;
		/* On fw_activate action, also driver is reloaded and reinit performed */
		*actions_performed |= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
		return mlx5_load_one(dev);
		return mlx5_load_one(dev, false);
	default:
		/* Unsupported action should not get to this function */
		WARN_ON(1);
+1 −1
Original line number Diff line number Diff line
@@ -148,7 +148,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
	if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
		complete(&fw_reset->done);
	} else {
		mlx5_load_one(dev);
		mlx5_load_one(dev, false);
		devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0,
							BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
							BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
+1 −0
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@ struct mlx5_timeouts {

static const u32 tout_def_sw_val[MAX_TIMEOUT_TYPES] = {
	[MLX5_TO_FW_PRE_INIT_TIMEOUT_MS] = 120000,
	[MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS] = 7200000,
	[MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS] = 20000,
	[MLX5_TO_FW_PRE_INIT_WAIT_MS] = 2,
	[MLX5_TO_FW_INIT_MS] = 2000,
+1 −0
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@
enum mlx5_timeouts_types {
	/* pre init timeouts (not read from FW) */
	MLX5_TO_FW_PRE_INIT_TIMEOUT_MS,
	MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS,
	MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS,
	MLX5_TO_FW_PRE_INIT_WAIT_MS,

+14 −9
Original line number Diff line number Diff line
@@ -1003,7 +1003,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
	mlx5_devcom_unregister_device(dev->priv.devcom);
}

static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
{
	int err;

@@ -1018,11 +1018,11 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)

	/* wait for firmware to accept initialization segments configurations
	 */
	err = wait_fw_init(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT),
	err = wait_fw_init(dev, timeout,
			   mlx5_tout_ms(dev, FW_PRE_INIT_WARN_MESSAGE_INTERVAL));
	if (err) {
		mlx5_core_err(dev, "Firmware over %llu MS in pre-initializing state, aborting\n",
			      mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
			      timeout);
		return err;
	}

@@ -1272,7 +1272,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
	mutex_lock(&dev->intf_state_mutex);
	dev->state = MLX5_DEVICE_STATE_UP;

	err = mlx5_function_setup(dev, true);
	err = mlx5_function_setup(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
	if (err)
		goto err_function;

@@ -1336,9 +1336,10 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev)
	mutex_unlock(&dev->intf_state_mutex);
}

int mlx5_load_one(struct mlx5_core_dev *dev)
int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery)
{
	int err = 0;
	u64 timeout;

	mutex_lock(&dev->intf_state_mutex);
	if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
@@ -1348,7 +1349,11 @@ int mlx5_load_one(struct mlx5_core_dev *dev)
	/* remove any previous indication of internal error */
	dev->state = MLX5_DEVICE_STATE_UP;

	err = mlx5_function_setup(dev, false);
	if (recovery)
		timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT);
	else
		timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT);
	err = mlx5_function_setup(dev, timeout);
	if (err)
		goto err_function;

@@ -1719,7 +1724,7 @@ static void mlx5_pci_resume(struct pci_dev *pdev)

	mlx5_pci_trace(dev, "Enter, loading driver..\n");

	err = mlx5_load_one(dev);
	err = mlx5_load_one(dev, false);

	mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err,
		       !err ? "recovered" : "Failed");
@@ -1807,7 +1812,7 @@ static int mlx5_resume(struct pci_dev *pdev)
{
	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);

	return mlx5_load_one(dev);
	return mlx5_load_one(dev, false);
}

static const struct pci_device_id mlx5_core_pci_table[] = {
@@ -1852,7 +1857,7 @@ int mlx5_recover_device(struct mlx5_core_dev *dev)
			return -EIO;
	}

	return mlx5_load_one(dev);
	return mlx5_load_one(dev, true);
}

static struct pci_driver mlx5_core_driver = {
Loading