Commit 76adb20f authored by Alex Williamson

Merge branch 'v5.12/vfio/next-vaddr' into v5.12/vfio/next

parents 7e31d6dc 898b9eae
drivers/vfio/vfio.c  +5 −0
@@ -1220,6 +1220,11 @@ static int vfio_fops_open(struct inode *inode, struct file *filep)
static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (driver && driver->ops->notify)
		driver->ops->notify(container->iommu_data,
				    VFIO_IOMMU_CONTAINER_CLOSE);

	filep->private_data = NULL;

drivers/vfio/vfio_iommu_type1.c  +217 −32
@@ -31,6 +31,7 @@
#include <linux/rbtree.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
@@ -69,11 +70,15 @@ struct vfio_iommu {
	struct rb_root		dma_list;
	struct blocking_notifier_head notifier;
	unsigned int		dma_avail;
	unsigned int		vaddr_invalid_count;
	uint64_t		pgsize_bitmap;
	uint64_t		num_non_pinned_groups;
	wait_queue_head_t	vaddr_wait;
	bool			v2;
	bool			nesting;
	bool			dirty_page_tracking;
	bool			pinned_page_dirty_scope;
	bool			container_open;
};

struct vfio_domain {
@@ -92,6 +97,7 @@ struct vfio_dma {
	int			prot;		/* IOMMU_READ/WRITE */
	bool			iommu_mapped;
	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
	bool			vaddr_invalid;
	struct task_struct	*task;
	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
	unsigned long		*bitmap;
@@ -143,6 +149,8 @@ struct vfio_regions {
#define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)

#define WAITED 1

static int put_pfn(unsigned long pfn, int prot);

static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
@@ -172,6 +180,31 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
	return NULL;
}

static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
						dma_addr_t start, size_t size)
{
	struct rb_node *res = NULL;
	struct rb_node *node = iommu->dma_list.rb_node;
	struct vfio_dma *dma_res = NULL;

	while (node) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);

		if (start < dma->iova + dma->size) {
			res = node;
			dma_res = dma;
			if (start >= dma->iova)
				break;
			node = node->rb_left;
		} else {
			node = node->rb_right;
		}
	}
	if (res && size && dma_res->iova >= start + size)
		res = NULL;
	return res;
}

static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
{
	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
@@ -490,6 +523,61 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
	return ret;
}

static int vfio_wait(struct vfio_iommu *iommu)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
	mutex_unlock(&iommu->lock);
	schedule();
	mutex_lock(&iommu->lock);
	finish_wait(&iommu->vaddr_wait, &wait);
	if (kthread_should_stop() || !iommu->container_open ||
	    fatal_signal_pending(current)) {
		return -EFAULT;
	}
	return WAITED;
}

/*
 * Find dma struct and wait for its vaddr to be valid.  iommu lock is dropped
 * if the task waits, but is re-locked on return.  Return result in *dma_p.
 * Return 0 on success with no waiting, WAITED on success if waited, and -errno
 * on error.
 */
static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start,
			       size_t size, struct vfio_dma **dma_p)
{
	int ret;

	do {
		*dma_p = vfio_find_dma(iommu, start, size);
		if (!*dma_p)
			ret = -EINVAL;
		else if (!(*dma_p)->vaddr_invalid)
			ret = 0;
		else
			ret = vfio_wait(iommu);
	} while (ret > 0);

	return ret;
}

/*
 * Wait for all vaddr in the dma_list to become valid.  iommu lock is dropped
 * if the task waits, but is re-locked on return.  Return 0 on success with no
 * waiting, WAITED on success if waited, and -errno on error.
 */
static int vfio_wait_all_valid(struct vfio_iommu *iommu)
{
	int ret = 0;

	while (iommu->vaddr_invalid_count && ret >= 0)
		ret = vfio_wait(iommu);

	return ret;
}

/*
 * Attempt to pin pages.  We really don't want to track all the pfns and
 * the iommu can only map chunks of consecutive pfns anyway, so get the
@@ -651,6 +739,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
	unsigned long remote_vaddr;
	struct vfio_dma *dma;
	bool do_accounting;
	dma_addr_t iova;

	if (!iommu || !user_pfn || !phys_pfn)
		return -EINVAL;
@@ -661,6 +750,22 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,

	mutex_lock(&iommu->lock);

	/*
	 * Wait for all necessary vaddr's to be valid so they can be used in
	 * the main loop without dropping the lock, to avoid racing vs unmap.
	 */
again:
	if (iommu->vaddr_invalid_count) {
		for (i = 0; i < npage; i++) {
			iova = user_pfn[i] << PAGE_SHIFT;
			ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
			if (ret < 0)
				goto pin_done;
			if (ret == WAITED)
				goto again;
		}
	}

	/* Fail if notifier list is empty */
	if (!iommu->notifier.head) {
		ret = -EINVAL;
@@ -675,7 +780,6 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);

	for (i = 0; i < npage; i++) {
		dma_addr_t iova;
		struct vfio_pfn *vpfn;

		iova = user_pfn[i] << PAGE_SHIFT;
@@ -961,6 +1065,10 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
	vfio_unlink_dma(iommu, dma);
	put_task_struct(dma->task);
	vfio_dma_bitmap_free(dma);
	if (dma->vaddr_invalid) {
		iommu->vaddr_invalid_count--;
		wake_up_all(&iommu->vaddr_wait);
	}
	kfree(dma);
	iommu->dma_avail++;
}
@@ -1086,34 +1194,36 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
{
	struct vfio_dma *dma, *dma_last = NULL;
	size_t unmapped = 0, pgsize;
-	int ret = 0, retries = 0;
+	int ret = -EINVAL, retries = 0;
	unsigned long pgshift;
	dma_addr_t iova = unmap->iova;
	unsigned long size = unmap->size;
	bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
	bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
	struct rb_node *n, *first_n;

	mutex_lock(&iommu->lock);

	pgshift = __ffs(iommu->pgsize_bitmap);
	pgsize = (size_t)1 << pgshift;

-	if (unmap->iova & (pgsize - 1)) {
-		ret = -EINVAL;
+	if (iova & (pgsize - 1))
 		goto unlock;
-	}

-	if (!unmap->size || unmap->size & (pgsize - 1)) {
-		ret = -EINVAL;
+	if (unmap_all) {
+		if (iova || size)
+			goto unlock;
+		size = SIZE_MAX;
+	} else if (!size || size & (pgsize - 1)) {
 		goto unlock;
 	}

-	if (unmap->iova + unmap->size - 1 < unmap->iova ||
-	    unmap->size > SIZE_MAX) {
-		ret = -EINVAL;
+	if (iova + size - 1 < iova || size > SIZE_MAX)
 		goto unlock;
-	}

 	/* When dirty tracking is enabled, allow only min supported pgsize */
 	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
 	    (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
-		ret = -EINVAL;
 		goto unlock;
 	}

@@ -1150,21 +1260,25 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
	 * will only return success and a size of zero if there were no
	 * mappings within the range.
	 */
-	if (iommu->v2) {
-		dma = vfio_find_dma(iommu, unmap->iova, 1);
-		if (dma && dma->iova != unmap->iova) {
-			ret = -EINVAL;
+	if (iommu->v2 && !unmap_all) {
+		dma = vfio_find_dma(iommu, iova, 1);
+		if (dma && dma->iova != iova)
 			goto unlock;
-		}

-		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
-		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
-			ret = -EINVAL;
+		dma = vfio_find_dma(iommu, iova + size - 1, 0);
+		if (dma && dma->iova + dma->size != iova + size)
 			goto unlock;
-		}
 	}

-	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
-		if (!iommu->v2 && unmap->iova > dma->iova)
+	ret = 0;
+	n = first_n = vfio_find_dma_first_node(iommu, iova, size);
+
+	while (n) {
+		dma = rb_entry(n, struct vfio_dma, node);
+		if (dma->iova >= iova + size)
+			break;
+
+		if (!iommu->v2 && iova > dma->iova)
 			break;
		/*
		 * Task with same address space who mapped this iova range is
@@ -1173,6 +1287,27 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
		if (dma->task->mm != current->mm)
			break;

		if (invalidate_vaddr) {
			if (dma->vaddr_invalid) {
				struct rb_node *last_n = n;

				for (n = first_n; n != last_n; n = rb_next(n)) {
					dma = rb_entry(n,
						       struct vfio_dma, node);
					dma->vaddr_invalid = false;
					iommu->vaddr_invalid_count--;
				}
				ret = -EINVAL;
				unmapped = 0;
				break;
			}
			dma->vaddr_invalid = true;
			iommu->vaddr_invalid_count++;
			unmapped += dma->size;
			n = rb_next(n);
			continue;
		}

		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
			struct vfio_iommu_type1_dma_unmap nb_unmap;

@@ -1202,12 +1337,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

		if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
			ret = update_user_bitmap(bitmap->data, iommu, dma,
-						 unmap->iova, pgsize);
+						 iova, pgsize);
			if (ret)
				break;
		}

		unmapped += dma->size;
		n = rb_next(n);
		vfio_remove_dma(iommu, dma);
	}

@@ -1311,6 +1447,7 @@ static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
static int vfio_dma_do_map(struct vfio_iommu *iommu,
			   struct vfio_iommu_type1_dma_map *map)
{
	bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
	dma_addr_t iova = map->iova;
	unsigned long vaddr = map->vaddr;
	size_t size = map->size;
@@ -1328,13 +1465,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
		prot |= IOMMU_READ;

	if ((prot && set_vaddr) || (!prot && !set_vaddr))
		return -EINVAL;

	mutex_lock(&iommu->lock);

	pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);

	WARN_ON((pgsize - 1) & PAGE_MASK);

-	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1)) {
+	if (!size || (size | iova | vaddr) & (pgsize - 1)) {
		ret = -EINVAL;
		goto out_unlock;
	}
@@ -1345,7 +1485,21 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
		goto out_unlock;
	}

-	if (vfio_find_dma(iommu, iova, size)) {
+	dma = vfio_find_dma(iommu, iova, size);
	if (set_vaddr) {
		if (!dma) {
			ret = -ENOENT;
		} else if (!dma->vaddr_invalid || dma->iova != iova ||
			   dma->size != size) {
			ret = -EINVAL;
		} else {
			dma->vaddr = vaddr;
			dma->vaddr_invalid = false;
			iommu->vaddr_invalid_count--;
			wake_up_all(&iommu->vaddr_wait);
		}
		goto out_unlock;
	} else if (dma) {
		ret = -EEXIST;
		goto out_unlock;
	}
@@ -1442,6 +1596,10 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	int ret;

	ret = vfio_wait_all_valid(iommu);
	if (ret < 0)
		return ret;

	/* Arbitrarily pick the first domain in the list for lookups */
	if (!list_empty(&iommu->domain_list))
		d = list_first_entry(&iommu->domain_list,
@@ -2417,8 +2575,10 @@ static void *vfio_iommu_type1_open(unsigned long arg)
	INIT_LIST_HEAD(&iommu->iova_list);
	iommu->dma_list = RB_ROOT;
	iommu->dma_avail = dma_entry_limit;
	iommu->container_open = true;
	mutex_init(&iommu->lock);
	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
	init_waitqueue_head(&iommu->vaddr_wait);

	return iommu;
}
@@ -2487,6 +2647,8 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
	case VFIO_TYPE1_IOMMU:
	case VFIO_TYPE1v2_IOMMU:
	case VFIO_TYPE1_NESTING_IOMMU:
	case VFIO_UNMAP_ALL:
	case VFIO_UPDATE_VADDR:
		return 1;
	case VFIO_DMA_CC_IOMMU:
		if (!iommu)
@@ -2658,7 +2820,8 @@ static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
{
	struct vfio_iommu_type1_dma_map map;
	unsigned long minsz;
-	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
+			VFIO_DMA_MAP_FLAG_VADDR;

	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

@@ -2676,6 +2839,9 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
{
	struct vfio_iommu_type1_dma_unmap unmap;
	struct vfio_bitmap bitmap = { 0 };
	uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
			VFIO_DMA_UNMAP_FLAG_VADDR |
			VFIO_DMA_UNMAP_FLAG_ALL;
	unsigned long minsz;
	int ret;

@@ -2684,8 +2850,12 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
	if (copy_from_user(&unmap, (void __user *)arg, minsz))
		return -EFAULT;

-	if (unmap.argsz < minsz ||
-	    unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
+	if (unmap.argsz < minsz || unmap.flags & ~mask)
		return -EINVAL;

	if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
	    (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
			    VFIO_DMA_UNMAP_FLAG_VADDR)))
		return -EINVAL;

	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
@@ -2876,12 +3046,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
	struct vfio_dma *dma;
	bool kthread = current->mm == NULL;
	size_t offset;
	int ret;

	*copied = 0;

-	dma = vfio_find_dma(iommu, user_iova, 1);
-	if (!dma)
-		return -EINVAL;
+	ret = vfio_find_dma_valid(iommu, user_iova, 1, &dma);
+	if (ret < 0)
+		return ret;

	if ((write && !(dma->prot & IOMMU_WRITE)) ||
			!(dma->prot & IOMMU_READ))
@@ -2973,6 +3144,19 @@ vfio_iommu_type1_group_iommu_domain(void *iommu_data,
	return domain;
}

static void vfio_iommu_type1_notify(void *iommu_data,
				    enum vfio_iommu_notify_type event)
{
	struct vfio_iommu *iommu = iommu_data;

	if (event != VFIO_IOMMU_CONTAINER_CLOSE)
		return;
	mutex_lock(&iommu->lock);
	iommu->container_open = false;
	mutex_unlock(&iommu->lock);
	wake_up_all(&iommu->vaddr_wait);
}

static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
	.name			= "vfio-iommu-type1",
	.owner			= THIS_MODULE,
@@ -2987,6 +3171,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
	.dma_rw			= vfio_iommu_type1_dma_rw,
	.group_iommu_domain	= vfio_iommu_type1_group_iommu_domain,
	.notify			= vfio_iommu_type1_notify,
};

static int __init vfio_iommu_type1_init(void)
include/linux/vfio.h  +7 −0
@@ -57,6 +57,11 @@ extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
extern void vfio_device_put(struct vfio_device *device);
extern void *vfio_device_data(struct vfio_device *device);

/* events for the backend driver notify callback */
enum vfio_iommu_notify_type {
	VFIO_IOMMU_CONTAINER_CLOSE = 0,
};

/**
 * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
 */
@@ -92,6 +97,8 @@ struct vfio_iommu_driver_ops {
				  void *data, size_t count, bool write);
	struct iommu_domain *(*group_iommu_domain)(void *iommu_data,
						   struct iommu_group *group);
	void		(*notify)(void *iommu_data,
				  enum vfio_iommu_notify_type event);
};

extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
include/uapi/linux/vfio.h  +27 −0
@@ -46,6 +46,12 @@
 */
#define VFIO_NOIOMMU_IOMMU		8

/* Supports VFIO_DMA_UNMAP_FLAG_ALL */
#define VFIO_UNMAP_ALL			9

/* Supports the vaddr flag for DMA map and unmap */
#define VFIO_UPDATE_VADDR		10
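
Userspace can probe for these extensions before relying on them. A minimal
sketch, assuming an already-open container fd (the fd name is illustrative;
VFIO_CHECK_EXTENSION is the standard capability probe on a container):

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Returns nonzero if both new extensions are present on this container */
static int have_vaddr_update(int container_fd)
{
	return ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR) == 1 &&
	       ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL) == 1;
}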

/*
 * The IOCTL interface is designed for extensibility by embedding the
 * structure length (argsz) and flags into structures passed between
@@ -1074,12 +1080,22 @@ struct vfio_iommu_type1_info_dma_avail {
 *
 * Map process virtual addresses to IO virtual addresses using the
 * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required.
 *
 * If flags & VFIO_DMA_MAP_FLAG_VADDR, update the base vaddr for iova, and
 * unblock translation of host virtual addresses in the iova range.  The vaddr
 * must have previously been invalidated with VFIO_DMA_UNMAP_FLAG_VADDR.  To
 * maintain memory consistency within the user application, the updated vaddr
 * must address the same memory object as originally mapped.  Failure to do so
 * will result in user memory corruption and/or device misbehavior.  iova and
 * size must match those in the original MAP_DMA call.  Protection is not
 * changed, and the READ & WRITE flags must be 0.
 */
struct vfio_iommu_type1_dma_map {
	__u32	argsz;
	__u32	flags;
#define VFIO_DMA_MAP_FLAG_READ (1 << 0)		/* readable from device */
#define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)	/* writable from device */
#define VFIO_DMA_MAP_FLAG_VADDR (1 << 2)
	__u64	vaddr;				/* Process virtual address */
	__u64	iova;				/* IO virtual address */
	__u64	size;				/* Size of mapping (bytes) */
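
A hedged userspace sketch of the vaddr-update path described in the comment
above (container_fd, new_buf, iova, and size are illustrative; iova and size
must match the original MAP_DMA call, and the READ/WRITE flags must be clear):

	struct vfio_iommu_type1_dma_map update = {
		.argsz = sizeof(update),
		.flags = VFIO_DMA_MAP_FLAG_VADDR,
		.vaddr = (__u64)(uintptr_t)new_buf,	/* same memory object, new address */
		.iova  = iova,				/* must match the original mapping */
		.size  = size,				/* must match the original mapping */
	};

	if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &update))
		perror("VFIO_IOMMU_MAP_DMA (vaddr update)");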
@@ -1102,6 +1118,7 @@ struct vfio_bitmap {
 * field.  No guarantee is made to the user that arbitrary unmaps of iova
 * or size different from those used in the original mapping call will
 * succeed.
 *
 * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap
 * before unmapping IO virtual addresses. When this flag is set, the user must
 * provide a struct vfio_bitmap in data[]. User must provide zero-allocated
@@ -1111,11 +1128,21 @@ struct vfio_bitmap {
 * indicates that the page at that offset from iova is dirty. A Bitmap of the
 * pages in the range of unmapped size is returned in the user-provided
 * vfio_bitmap.data.
 *
 * If flags & VFIO_DMA_UNMAP_FLAG_ALL, unmap all addresses.  iova and size
 * must be 0.  This cannot be combined with the get-dirty-bitmap flag.
 *
 * If flags & VFIO_DMA_UNMAP_FLAG_VADDR, do not unmap, but invalidate host
 * virtual addresses in the iova range.  Tasks that attempt to translate an
 * iova's vaddr will block.  DMA to already-mapped pages continues.  This
 * cannot be combined with the get-dirty-bitmap flag.
 */
struct vfio_iommu_type1_dma_unmap {
	__u32	argsz;
	__u32	flags;
#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
#define VFIO_DMA_UNMAP_FLAG_ALL		     (1 << 1)
#define VFIO_DMA_UNMAP_FLAG_VADDR	     (1 << 2)
	__u64	iova;				/* IO virtual address */
	__u64	size;				/* Size of mapping (bytes) */
	__u8    data[];
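
A hedged sketch of the invalidate side, pairing with the MAP_DMA vaddr update
shown earlier (fd and range values are illustrative), e.g. bracketing an exec
or mremap of the memory backing the mapping:

	struct vfio_iommu_type1_dma_unmap invalidate = {
		.argsz = sizeof(invalidate),
		.flags = VFIO_DMA_UNMAP_FLAG_VADDR,
		.iova  = iova,
		.size  = size,
	};

	/*
	 * Translation of vaddrs in the range blocks after this call;
	 * DMA to already-mapped pages continues unaffected.
	 */
	if (ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &invalidate))
		perror("VFIO_IOMMU_UNMAP_DMA (vaddr invalidate)");

	/* ... replace the backing mapping, then restore with VFIO_DMA_MAP_FLAG_VADDR ... */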