Commit b042b278 authored by Alex Williamson
Browse files

Merge tag 'mlx5-vfio-v10' of...

Merge tag 'mlx5-vfio-v10' of https://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux into v5.18/vfio/next/mlx5-migration-v10

Add mlx5 live migration driver and v2 migration protocol

This series adds mlx5 live migration driver for VFs that are migration
capable and includes the v2 migration protocol definition and mlx5
implementation.

The mlx5 driver uses the vfio_pci_core split to create a specific VFIO
PCI driver that matches the mlx5 virtual functions. The driver provides
the same experience as normal vfio-pci with the addition of migration
support.

In HW the migration is controlled by the PF function, using its
mlx5_core driver, and the VFIO PCI VF driver co-ordinates with the PF to
execute the migration actions.

The bulk of the v2 migration protocol is semantically the same as v1,
however it has been recast into a FSM for the device_state and the
actual syscall interface uses normal ioctl(), read() and write() instead
of building a syscall interface using the region.

Several bits of infrastructure work are included here:
 - pci_iov_vf_id() to help drivers like mlx5 figure out the VF index from
   a BDF
 - pci_iov_get_pf_drvdata() to clarify the tricky locking protocol when a
   VF reaches into its PF's driver
 - mlx5_core uses the normal SRIOV lifecycle and disables SRIOV before
   driver remove, to be compatible with pci_iov_get_pf_drvdata()
 - Lifting VFIO_DEVICE_FEATURE into core VFIO code

This series comes after a lot of discussion. Some major points:
- v1 ABI compatible migration defined using the same FSM approach:
   https://lore.kernel.org/all/0-v1-a4f7cab64938+3f-vfio_mig_states_jgg@nvidia.com/
- Attempts to clarify how the v1 API works:
   Alex's:
     https://lore.kernel.org/kvm/163909282574.728533.7460416142511440919.stgit@omen/
   Jason's:
     https://lore.kernel.org/all/0-v3-184b374ad0a8+24c-vfio_mig_doc_jgg@nvidia.com/
- Etherpad exploring the scope and questions of general VFIO migration:
     https://lore.kernel.org/kvm/87mtm2loml.fsf@redhat.com/

NOTE: As this series touched mlx5_core parts we need to send this in a
pull request format to VFIO to avoid conflicts.

Matching qemu changes can be previewed here:
 https://github.com/jgunthorpe/qemu/commits/vfio_migration_v2

Link: https://lore.kernel.org/all/20220224142024.147653-1-yishaih@nvidia.com


Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
parents cfb92440 88faa5e8
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -20320,6 +20320,12 @@ L: kvm@vger.kernel.org
S:	Maintained
F:	drivers/vfio/platform/
VFIO MLX5 PCI DRIVER
M:	Yishai Hadas <yishaih@nvidia.com>
L:	kvm@vger.kernel.org
S:	Maintained
F:	drivers/vfio/pci/mlx5/
VGA_SWITCHEROO
R:	Lukas Wunner <lukas@wunner.de>
S:	Maintained
+10 −0
Original line number Diff line number Diff line
@@ -477,6 +477,11 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
	case MLX5_CMD_OP_QUERY_VHCA_STATE:
	case MLX5_CMD_OP_MODIFY_VHCA_STATE:
	case MLX5_CMD_OP_ALLOC_SF:
	case MLX5_CMD_OP_SUSPEND_VHCA:
	case MLX5_CMD_OP_RESUME_VHCA:
	case MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE:
	case MLX5_CMD_OP_SAVE_VHCA_STATE:
	case MLX5_CMD_OP_LOAD_VHCA_STATE:
		*status = MLX5_DRIVER_STATUS_ABORTED;
		*synd = MLX5_DRIVER_SYND;
		return -EIO;
@@ -674,6 +679,11 @@ const char *mlx5_command_str(int command)
	MLX5_COMMAND_STR_CASE(MODIFY_VHCA_STATE);
	MLX5_COMMAND_STR_CASE(ALLOC_SF);
	MLX5_COMMAND_STR_CASE(DEALLOC_SF);
	MLX5_COMMAND_STR_CASE(SUSPEND_VHCA);
	MLX5_COMMAND_STR_CASE(RESUME_VHCA);
	MLX5_COMMAND_STR_CASE(QUERY_VHCA_MIGRATION_STATE);
	MLX5_COMMAND_STR_CASE(SAVE_VHCA_STATE);
	MLX5_COMMAND_STR_CASE(LOAD_VHCA_STATE);
	default: return "unknown command opcode";
	}
}
+45 −0
Original line number Diff line number Diff line
@@ -1620,6 +1620,7 @@ static void remove_one(struct pci_dev *pdev)
	struct devlink *devlink = priv_to_devlink(dev);

	devlink_unregister(devlink);
	mlx5_sriov_disable(pdev);
	mlx5_crdump_disable(dev);
	mlx5_drain_health_wq(dev);
	mlx5_uninit_one(dev);
@@ -1880,6 +1881,50 @@ static struct pci_driver mlx5_core_driver = {
	.sriov_set_msix_vec_count = mlx5_core_sriov_set_msix_vec_count,
};

/**
 * mlx5_vf_get_core_dev - Look up the mlx5 core device behind a VF PCI device
 *                     and hand it out with its interface state lock held.
 * @pdev: The VF PCI device to resolve.
 *
 * The lookup goes through pci_iov_get_pf_drvdata() against mlx5_core_driver,
 * so it only succeeds when mlx5_core is the matching driver.
 *
 * On a non-NULL return, intf_state_mutex of the returned device is held and
 * MLX5_INTERFACE_STATE_UP was observed set under that lock. The caller must
 * keep its use of the device short and release it with mlx5_vf_put_core_dev()
 * as soon as it is done. On a NULL return no lock is held.
 *
 * Return: Pointer to the associated mlx5_core_dev, or NULL when the lookup
 * fails or the interface is not up.
 */
struct mlx5_core_dev *mlx5_vf_get_core_dev(struct pci_dev *pdev)
			__acquires(&mdev->intf_state_mutex)
{
	struct mlx5_core_dev *mdev =
		pci_iov_get_pf_drvdata(pdev, &mlx5_core_driver);

	if (IS_ERR(mdev))
		return NULL;

	mutex_lock(&mdev->intf_state_mutex);

	/* Success path: return with the mutex still held for the caller. */
	if (test_bit(MLX5_INTERFACE_STATE_UP, &mdev->intf_state))
		return mdev;

	/* Interface is not up: drop the lock before reporting failure. */
	mutex_unlock(&mdev->intf_state_mutex);
	return NULL;
}
EXPORT_SYMBOL(mlx5_vf_get_core_dev);

/**
 * mlx5_vf_put_core_dev - Release a device obtained via mlx5_vf_get_core_dev().
 * @mdev: The mlx5 core device to release.
 *
 * Drops the intf_state_mutex of @mdev. Once this returns, the caller must
 * no longer touch @mdev.
 */
void mlx5_vf_put_core_dev(struct mlx5_core_dev *mdev)
			__releases(&mdev->intf_state_mutex)
{
	mutex_unlock(&mdev->intf_state_mutex);
}
EXPORT_SYMBOL(mlx5_vf_put_core_dev);

static void mlx5_core_verify_params(void)
{
	if (prof_sel >= ARRAY_SIZE(profile)) {
+1 −0
Original line number Diff line number Diff line
@@ -164,6 +164,7 @@ void mlx5_sriov_cleanup(struct mlx5_core_dev *dev);
int mlx5_sriov_attach(struct mlx5_core_dev *dev);
void mlx5_sriov_detach(struct mlx5_core_dev *dev);
int mlx5_core_sriov_configure(struct pci_dev *dev, int num_vfs);
void mlx5_sriov_disable(struct pci_dev *pdev);
int mlx5_core_sriov_set_msix_vec_count(struct pci_dev *vf, int msix_vec_count);
int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id);
int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id);
+3 −14
Original line number Diff line number Diff line
@@ -161,7 +161,7 @@ static int mlx5_sriov_enable(struct pci_dev *pdev, int num_vfs)
	return err;
}

static void mlx5_sriov_disable(struct pci_dev *pdev)
void mlx5_sriov_disable(struct pci_dev *pdev)
{
	struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
	int num_vfs = pci_num_vf(dev->pdev);
@@ -205,19 +205,8 @@ int mlx5_core_sriov_set_msix_vec_count(struct pci_dev *vf, int msix_vec_count)
			mlx5_get_default_msix_vec_count(dev, pci_num_vf(pf));

	sriov = &dev->priv.sriov;

	/* Reversed translation of PCI VF function number to the internal
	 * function_id, which exists in the name of virtfn symlink.
	 */
	for (id = 0; id < pci_num_vf(pf); id++) {
		if (!sriov->vfs_ctx[id].enabled)
			continue;

		if (vf->devfn == pci_iov_virtfn_devfn(pf, id))
			break;
	}

	if (id == pci_num_vf(pf) || !sriov->vfs_ctx[id].enabled)
	id = pci_iov_vf_id(vf);
	if (id < 0 || !sriov->vfs_ctx[id].enabled)
		return -EINVAL;

	return mlx5_set_msix_vec_count(dev, id + 1, msix_vec_count);
Loading