Commit 4debf771 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull iommufd updates from Jason Gunthorpe:
 "On top of the vfio updates is built some new iommufd functionality:

   - IOMMU_HWPT_ALLOC allows userspace to directly create the low level
     IO Page table objects and affiliate them with IOAS objects that
     hold the translation mapping. This is the basic functionality for
     the normal IOMMU_DOMAIN_PAGING domains.

   - VFIO_DEVICE_ATTACH_IOMMUFD_PT can be used to replace the current
     translation. This is wired up through all the layers down to the
     driver so the driver has the ability to implement a hitless
     replacement. This is necessary to fully support guest behaviors
     when emulating HW (eg guest atomic change of translation)

   - IOMMU_GET_HW_INFO returns information about the IOMMU driver HW
     that owns a VFIO device. This includes support for the Intel iommu,
     and patches have been posted for all the other server IOMMUs.

  Along the way are a number of internal items:

   - New iommufd kernel APIs: iommufd_ctx_has_group(),
        iommufd_device_to_ictx(), iommufd_device_to_id(),
        iommufd_access_detach(), iommufd_ctx_from_fd(),
        iommufd_device_replace()

   - iommufd now internally tracks iommu_groups as it needs some
     per-group data

   - Reorganize how the internal hwpt allocation flows to have more
     robust locking

   - Improve the access interfaces to support detach and replace of an
     IOAS from an access

   - New selftests and a rework of how the selftests create a mock
     iommu driver to be more like a real iommu driver"

Link: https://lore.kernel.org/lkml/ZO%2FTe6LU1ENf58ZW@nvidia.com/

* tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (34 commits)
  iommufd/selftest: Don't leak the platform device memory when unloading the module
  iommu/vt-d: Implement hw_info for iommu capability query
  iommufd/selftest: Add coverage for IOMMU_GET_HW_INFO ioctl
  iommufd: Add IOMMU_GET_HW_INFO
  iommu: Add new iommu op to get iommu hardware information
  iommu: Move dev_iommu_ops() to private header
  iommufd: Remove iommufd_ref_to_users()
  iommufd/selftest: Make the mock iommu driver into a real driver
  vfio: Support IO page table replacement
  iommufd/selftest: Add IOMMU_TEST_OP_ACCESS_REPLACE_IOAS coverage
  iommufd: Add iommufd_access_replace() API
  iommufd: Use iommufd_access_change_ioas in iommufd_access_destroy_object
  iommufd: Add iommufd_access_change_ioas(_id) helpers
  iommufd: Allow passing in iopt_access_list_id to iopt_remove_access()
  vfio: Do not allow !ops->dma_unmap in vfio_pin/unpin_pages()
  iommufd/selftest: Add a selftest for IOMMU_HWPT_ALLOC
  iommufd/selftest: Return the real idev id from selftest mock_domain
  iommufd: Add IOMMU_HWPT_ALLOC
  iommufd/selftest: Test iommufd_device_replace()
  iommufd: Make destroy_rwsem use a lock class per object type
  ...
parents ec0e2dc8 eb501c2d
Loading
Loading
Loading
Loading
+19 −0
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <uapi/linux/iommufd.h>

#include "iommu.h"
#include "../dma-iommu.h"
@@ -4732,8 +4733,26 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
}

/*
 * intel_iommu_hw_info - iommu_ops->hw_info callback for VT-d
 * @dev: device whose IOMMU is being queried
 * @length: output, set to the size in bytes of the returned structure
 * @type: output, set to IOMMU_HW_INFO_TYPE_INTEL_VTD
 *
 * Allocates a zeroed struct iommu_hw_info_vtd and fills it with the cap and
 * ecap values of the intel_iommu that serves @dev.
 *
 * Return: the newly allocated structure, or ERR_PTR(-ENOMEM) if the
 * allocation fails. The buffer is not freed here, so the caller receives it.
 */
static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;
	struct iommu_hw_info_vtd *hw_info;

	hw_info = kzalloc(sizeof(*hw_info), GFP_KERNEL);
	if (!hw_info)
		return ERR_PTR(-ENOMEM);

	/* The output parameters are only written once allocation succeeded. */
	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
	*length = sizeof(*hw_info);

	hw_info->cap_reg = iommu->cap;
	hw_info->ecap_reg = iommu->ecap;

	return hw_info;
}

const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.hw_info		= intel_iommu_hw_info,
	.domain_alloc		= intel_iommu_domain_alloc,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
+30 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
 */
#ifndef __LINUX_IOMMU_PRIV_H
#define __LINUX_IOMMU_PRIV_H

#include <linux/iommu.h>

static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
{
	/*
	 * Assume that valid ops must be installed if iommu_probe_device()
	 * has succeeded. The device ops are essentially for internal use
	 * within the IOMMU subsystem itself, so we should be able to trust
	 * ourselves not to misuse the helper.
	 */
	return dev->iommu->iommu_dev->ops;
}

/*
 * Replace the domain @group is attached to with @new_domain. The definition
 * is exported in the IOMMUFD_INTERNAL symbol namespace.
 */
int iommu_group_replace_domain(struct iommu_group *group,
			       struct iommu_domain *new_domain);

/*
 * Register/unregister an iommu driver against a single bus, using a
 * caller-provided notifier_block. The definitions are guarded by
 * IS_ENABLED(CONFIG_IOMMUFD_TEST), so these are only usable when the iommufd
 * selftest is enabled.
 */
int iommu_device_register_bus(struct iommu_device *iommu,
			      const struct iommu_ops *ops, struct bus_type *bus,
			      struct notifier_block *nb);
void iommu_device_unregister_bus(struct iommu_device *iommu,
				 struct bus_type *bus,
				 struct notifier_block *nb);

#endif /* __LINUX_IOMMU_PRIV_H */
+80 −1
Original line number Diff line number Diff line
@@ -34,8 +34,10 @@
#include <linux/msi.h>

#include "dma-iommu.h"
#include "iommu-priv.h"

#include "iommu-sva.h"
#include "iommu-priv.h"

static struct kset *iommu_group_kset;
static DEFINE_IDA(iommu_group_ida);
@@ -287,6 +289,48 @@ void iommu_device_unregister(struct iommu_device *iommu)
}
EXPORT_SYMBOL_GPL(iommu_device_unregister);

#if IS_ENABLED(CONFIG_IOMMUFD_TEST)
/*
 * Counterpart of iommu_device_register_bus(): remove the notifier @nb from
 * @bus and then unregister @iommu through iommu_device_unregister().
 */
void iommu_device_unregister_bus(struct iommu_device *iommu,
				 struct bus_type *bus,
				 struct notifier_block *nb)
{
	bus_unregister_notifier(bus, nb);
	iommu_device_unregister(iommu);
}
EXPORT_SYMBOL_GPL(iommu_device_unregister_bus);

/*
 * iommu_device_register_bus - register an iommu driver against one bus
 * @iommu: iommu device to register; its ops pointer is set to @ops
 * @ops: iommu_ops to install on @iommu and on @bus
 * @bus: the single bus this driver should serve
 * @nb: caller-provided storage for the bus notifier
 *
 * Only the iommufd selftest uses this, to create a mock iommu driver.
 *
 * Return: 0 on success, otherwise the error from bus_register_notifier() or
 * bus_iommu_probe(). If the probe fails, the registration is torn down again
 * with iommu_device_unregister_bus() before returning.
 */
int iommu_device_register_bus(struct iommu_device *iommu,
			      const struct iommu_ops *ops, struct bus_type *bus,
			      struct notifier_block *nb)
{
	int rc;

	iommu->ops = ops;
	nb->notifier_call = iommu_bus_notifier;

	rc = bus_register_notifier(bus, nb);
	if (rc)
		return rc;

	/* Publish the device on the global list under its lock. */
	spin_lock(&iommu_device_lock);
	list_add_tail(&iommu->list, &iommu_device_list);
	spin_unlock(&iommu_device_lock);

	bus->iommu_ops = ops;

	rc = bus_iommu_probe(bus);
	if (rc) {
		/* Probe failed: tear down through the paired helper. */
		iommu_device_unregister_bus(iommu, bus, nb);
	}
	return rc;
}
EXPORT_SYMBOL_GPL(iommu_device_register_bus);
#endif

static struct dev_iommu *dev_iommu_get(struct device *dev)
{
	struct dev_iommu *param = dev->iommu;
@@ -2114,6 +2158,32 @@ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group)
}
EXPORT_SYMBOL_GPL(iommu_attach_group);

/**
 * iommu_group_replace_domain - replace the domain that a group is attached to
 * @group: IOMMU group that will be attached to the new domain
 * @new_domain: new IOMMU domain to replace with
 *
 * Lets the group move from its current domain to @new_domain without first
 * being forced through the blocking domain.
 *
 * If the currently attached domain is a core domain (e.g. a default_domain),
 * this behaves just like iommu_attach_group().
 *
 * Return: -EINVAL if @new_domain is NULL, otherwise the result of
 * __iommu_group_set_domain(), which is called with group->mutex held.
 */
int iommu_group_replace_domain(struct iommu_group *group,
			       struct iommu_domain *new_domain)
{
	int rc;

	/* A replacement needs a target; a NULL domain is a caller error. */
	if (!new_domain)
		return -EINVAL;

	mutex_lock(&group->mutex);
	rc = __iommu_group_set_domain(group, new_domain);
	mutex_unlock(&group->mutex);

	return rc;
}
EXPORT_SYMBOL_NS_GPL(iommu_group_replace_domain, IOMMUFD_INTERNAL);

static int __iommu_device_set_domain(struct iommu_group *group,
				     struct device *dev,
				     struct iommu_domain *new_domain,
@@ -2642,6 +2712,14 @@ int iommu_set_pgtable_quirks(struct iommu_domain *domain,
}
EXPORT_SYMBOL_GPL(iommu_set_pgtable_quirks);

/**
 * iommu_get_resv_regions - get reserved regions
 * @dev: device for which to get reserved regions
 * @list: reserved region list for device
 *
 * This returns a list of reserved IOVA regions specific to this device.
 * A domain user should not map IOVA in these ranges.
 */
void iommu_get_resv_regions(struct device *dev, struct list_head *list)
{
	const struct iommu_ops *ops = dev_iommu_ops(dev);
@@ -2649,9 +2727,10 @@ void iommu_get_resv_regions(struct device *dev, struct list_head *list)
	if (ops->get_resv_regions)
		ops->get_resv_regions(dev, list);
}
EXPORT_SYMBOL_GPL(iommu_get_resv_regions);

/**
 * iommu_put_resv_regions - release resered regions
 * iommu_put_resv_regions - release reserved regions
 * @dev: device for which to free reserved regions
 * @list: reserved region list for device
 *
+552 −179

File changed.

Preview size limit exceeded, changes collapsed.

+93 −19
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 */
#include <linux/iommu.h>
#include <uapi/linux/iommufd.h>

#include "iommufd_private.h"

@@ -11,8 +12,6 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
	struct iommufd_hw_pagetable *hwpt =
		container_of(obj, struct iommufd_hw_pagetable, obj);

	WARN_ON(!list_empty(&hwpt->devices));

	if (!list_empty(&hwpt->hwpt_item)) {
		mutex_lock(&hwpt->ioas->mutex);
		list_del(&hwpt->hwpt_item);
@@ -25,7 +24,35 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
		iommu_domain_free(hwpt->domain);

	refcount_dec(&hwpt->ioas->obj.users);
	mutex_destroy(&hwpt->devices_lock);
}

void iommufd_hw_pagetable_abort(struct iommufd_object *obj)
{
	struct iommufd_hw_pagetable *hwpt =
		container_of(obj, struct iommufd_hw_pagetable, obj);

	/* The ioas->mutex must be held until finalize is called. */
	lockdep_assert_held(&hwpt->ioas->mutex);

	if (!list_empty(&hwpt->hwpt_item)) {
		list_del_init(&hwpt->hwpt_item);
		iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain);
	}
	iommufd_hw_pagetable_destroy(obj);
}

int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt)
{
	if (hwpt->enforce_cache_coherency)
		return 0;

	if (hwpt->domain->ops->enforce_cache_coherency)
		hwpt->enforce_cache_coherency =
			hwpt->domain->ops->enforce_cache_coherency(
				hwpt->domain);
	if (!hwpt->enforce_cache_coherency)
		return -EINVAL;
	return 0;
}

/**
@@ -38,6 +65,10 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
 * Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT
 * will be linked to the given ioas and upon return the underlying iommu_domain
 * is fully populated.
 *
 * The caller must hold the ioas->mutex until after
 * iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on
 * the returned hwpt.
 */
struct iommufd_hw_pagetable *
iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
@@ -52,9 +83,7 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
	if (IS_ERR(hwpt))
		return hwpt;

	INIT_LIST_HEAD(&hwpt->devices);
	INIT_LIST_HEAD(&hwpt->hwpt_item);
	mutex_init(&hwpt->devices_lock);
	/* Pairs with iommufd_hw_pagetable_destroy() */
	refcount_inc(&ioas->obj.users);
	hwpt->ioas = ioas;
@@ -65,7 +94,18 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
		goto out_abort;
	}

	mutex_lock(&hwpt->devices_lock);
	/*
	 * Set the coherency mode before we do iopt_table_add_domain() as some
	 * iommus have a per-PTE bit that controls it and need to decide before
	 * doing any maps. It is an iommu driver bug to report
	 * IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on
	 * a new domain.
	 */
	if (idev->enforce_cache_coherency) {
		rc = iommufd_hw_pagetable_enforce_cc(hwpt);
		if (WARN_ON(rc))
			goto out_abort;
	}

	/*
	 * immediate_attach exists only to accommodate iommu drivers that cannot
@@ -76,30 +116,64 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
	if (immediate_attach) {
		rc = iommufd_hw_pagetable_attach(hwpt, idev);
		if (rc)
			goto out_unlock;
			goto out_abort;
	}

	rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain);
	if (rc)
		goto out_detach;
	list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list);

	if (immediate_attach) {
		/* See iommufd_device_do_attach() */
		refcount_inc(&hwpt->obj.users);
		idev->hwpt = hwpt;
		list_add(&idev->devices_item, &hwpt->devices);
	}

	mutex_unlock(&hwpt->devices_lock);
	return hwpt;

out_detach:
	if (immediate_attach)
		iommufd_hw_pagetable_detach(hwpt, idev);
out_unlock:
	mutex_unlock(&hwpt->devices_lock);
		iommufd_hw_pagetable_detach(idev);
out_abort:
	iommufd_object_abort_and_destroy(ictx, &hwpt->obj);
	return ERR_PTR(rc);
}

/*
 * Handle a struct iommu_hwpt_alloc command: create a hw_pagetable for the
 * device named by cmd->dev_id on top of the IOAS named by cmd->pt_id, and
 * report the new object's ID in cmd->out_hwpt_id.
 *
 * The hwpt is allocated with false as the last argument of
 * iommufd_hw_pagetable_alloc() (which appears to be immediate_attach).
 * ioas->mutex is held from allocation until the object has been either
 * finalized or aborted.
 *
 * Return: 0 on success; -EOPNOTSUPP if flags or __reserved are non-zero;
 * otherwise the error from the device/IOAS lookup, the allocation, or
 * iommufd_ucmd_respond().
 */
int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
{
	struct iommu_hwpt_alloc *cmd = ucmd->cmd;
	struct iommufd_hw_pagetable *hwpt;
	struct iommufd_device *idev;
	struct iommufd_ioas *ioas;
	int rc;

	/* No flags are defined, and the reserved field must be zero. */
	if (cmd->flags || cmd->__reserved)
		return -EOPNOTSUPP;

	idev = iommufd_get_device(ucmd, cmd->dev_id);
	if (IS_ERR(idev))
		return PTR_ERR(idev);

	ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id);
	if (IS_ERR(ioas)) {
		rc = PTR_ERR(ioas);
		goto out_put_idev;
	}

	mutex_lock(&ioas->mutex);

	hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, idev, false);
	if (IS_ERR(hwpt)) {
		rc = PTR_ERR(hwpt);
		goto out_unlock;
	}

	cmd->out_hwpt_id = hwpt->obj.id;
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
	if (rc) {
		/* The response could not be delivered: drop the new object. */
		iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj);
	} else {
		iommufd_object_finalize(ucmd->ictx, &hwpt->obj);
	}

out_unlock:
	mutex_unlock(&ioas->mutex);
	iommufd_put_object(&ioas->obj);
out_put_idev:
	iommufd_put_object(&idev->obj);
	return rc;
}
Loading