Commit fad1783a authored by Saeed Mahameed's avatar Saeed Mahameed
Browse files

net/mlx5: Print more info on pci error handlers



In case mlx5_pci_err_detected was called with state equals to
pci_channel_io_perm_failure, the driver will never come back up.

It is nice to know why the driver went to zombie land, so print some
useful information on pci err handlers.

Signed-off-by: default avatarSaeed Mahameed <saeedm@nvidia.com>
Reviewed-by: default avatarMoshe Shemesh <moshe@nvidia.com>
parent c64d01b3
Loading
Loading
Loading
Loading
+37 −14
Original line number Diff line number Diff line
@@ -1604,12 +1604,28 @@ static void remove_one(struct pci_dev *pdev)
	mlx5_devlink_free(devlink);
}

#define mlx5_pci_trace(dev, fmt, ...) ({ \
	struct mlx5_core_dev *__dev = (dev); \
	mlx5_core_info(__dev, "%s Device state = %d health sensors: %d pci_status: %d. " fmt, \
		       __func__, __dev->state, mlx5_health_check_fatal_sensors(__dev), \
		       __dev->pci_status, ##__VA_ARGS__); \
})

static const char *result2str(enum pci_ers_result result)
{
	return  result == PCI_ERS_RESULT_NEED_RESET ? "need reset" :
		result == PCI_ERS_RESULT_DISCONNECT ? "disconnect" :
		result == PCI_ERS_RESULT_RECOVERED  ? "recovered" :
		"unknown";
}

static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
					      pci_channel_state_t state)
{
	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
	enum pci_ers_result res;

	mlx5_core_info(dev, "%s was called\n", __func__);
	mlx5_pci_trace(dev, "Enter, pci channel state = %d\n", state);

	mlx5_enter_error_state(dev, false);
	mlx5_error_sw_reset(dev);
@@ -1617,8 +1633,11 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
	mlx5_drain_health_wq(dev);
	mlx5_pci_disable_device(dev);

	return state == pci_channel_io_perm_failure ?
	res = state == pci_channel_io_perm_failure ?
		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;

	mlx5_pci_trace(dev, "Exit, result = %d, %s\n",  res, result2str(res));
	return res;
}

/* wait for the device to show vital signs by waiting
@@ -1652,28 +1671,34 @@ static int wait_vital(struct pci_dev *pdev)

static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev)
{
	enum pci_ers_result res = PCI_ERS_RESULT_DISCONNECT;
	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
	int err;

	mlx5_core_info(dev, "%s was called\n", __func__);
	mlx5_pci_trace(dev, "Enter\n");

	err = mlx5_pci_enable_device(dev);
	if (err) {
		mlx5_core_err(dev, "%s: mlx5_pci_enable_device failed with error code: %d\n",
			      __func__, err);
		return PCI_ERS_RESULT_DISCONNECT;
		goto out;
	}

	pci_set_master(pdev);
	pci_restore_state(pdev);
	pci_save_state(pdev);

	if (wait_vital(pdev)) {
		mlx5_core_err(dev, "%s: wait_vital timed out\n", __func__);
		return PCI_ERS_RESULT_DISCONNECT;
	err = wait_vital(pdev);
	if (err) {
		mlx5_core_err(dev, "%s: wait vital failed with error code: %d\n",
			      __func__, err);
		goto out;
	}

	return PCI_ERS_RESULT_RECOVERED;
	res = PCI_ERS_RESULT_RECOVERED;
out:
	mlx5_pci_trace(dev, "Exit, err = %d, result = %d, %s\n", err, res, result2str(res));
	return res;
}

static void mlx5_pci_resume(struct pci_dev *pdev)
@@ -1681,14 +1706,12 @@ static void mlx5_pci_resume(struct pci_dev *pdev)
	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
	int err;

	mlx5_core_info(dev, "%s was called\n", __func__);
	mlx5_pci_trace(dev, "Enter, loading driver..\n");

	err = mlx5_load_one(dev);
	if (err)
		mlx5_core_err(dev, "%s: mlx5_load_one failed with error code: %d\n",
			      __func__, err);
	else
		mlx5_core_info(dev, "%s: device recovered\n", __func__);

	mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err,
		       !err ? "recovered" : "Failed");
}

static const struct pci_error_handlers mlx5_err_handler = {