Commit 79a463be authored by Xie Yongji, committed by Michael S. Tsirkin
Browse files

vduse: Support registering userspace memory for IOVA regions



Introduce two ioctls: VDUSE_IOTLB_REG_UMEM and
VDUSE_IOTLB_DEREG_UMEM to support registering
and de-registering userspace memory for IOVA
regions.

Currently, it only supports registering userspace memory
for the bounce buffer region in the virtio-vdpa case.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220803045523.23851-5-xieyongji@bytedance.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
parent 6c77ed22
Loading
Loading
Loading
Loading
+141 −0
Original line number Diff line number Diff line
@@ -21,6 +21,8 @@
#include <linux/uio.h>
#include <linux/vdpa.h>
#include <linux/nospec.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <uapi/linux/vduse.h>
#include <uapi/linux/vdpa.h>
#include <uapi/linux/virtio_config.h>
@@ -64,6 +66,13 @@ struct vduse_vdpa {
	struct vduse_dev *dev;
};

/*
 * Userspace memory registered for an IOVA region; set up by
 * vduse_dev_reg_umem() and torn down by vduse_dev_dereg_umem().
 */
struct vduse_umem {
	unsigned long iova;	/* start of the IOVA region this memory was registered for */
	unsigned long npages;	/* number of pinned pages stored in @pages */
	struct page **pages;	/* vmalloc'ed array of the pinned user pages */
	struct mm_struct *mm;	/* mm whose pinned_vm was charged; held via mmgrab() */
};

struct vduse_dev {
	struct vduse_vdpa *vdev;
	struct device *dev;
@@ -95,6 +104,8 @@ struct vduse_dev {
	u8 status;
	u32 vq_num;
	u32 vq_align;
	struct vduse_umem *umem;
	struct mutex mem_lock;
};

struct vduse_dev_msg {
@@ -917,6 +928,102 @@ static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
	return ret;
}

/*
 * De-register the userspace memory previously registered on @dev.
 *
 * @iova and @size must match the registered region: @iova has to equal the
 * IOVA stored at registration time and @size has to equal the bounce
 * buffer size of the device's domain.
 *
 * Returns 0 on success, -ENOENT if nothing is registered, or -EINVAL if
 * @iova/@size do not match.
 */
static int vduse_dev_dereg_umem(struct vduse_dev *dev,
				u64 iova, u64 size)
{
	struct vduse_umem *umem;
	int ret = -ENOENT;

	mutex_lock(&dev->mem_lock);

	umem = dev->umem;
	if (!umem)
		goto unlock;

	if (umem->iova != iova || size != dev->domain->bounce_size) {
		ret = -EINVAL;
		goto unlock;
	}

	/* Detach the user pages from the domain before releasing them. */
	vduse_domain_remove_user_bounce_pages(dev->domain);
	unpin_user_pages_dirty_lock(umem->pages, umem->npages, true);

	/* Undo the pinned_vm accounting and drop the mm reference. */
	atomic64_sub(umem->npages, &umem->mm->pinned_vm);
	mmdrop(umem->mm);

	vfree(umem->pages);
	kfree(umem);
	dev->umem = NULL;
	ret = 0;
unlock:
	mutex_unlock(&dev->mem_lock);
	return ret;
}

/*
 * Register userspace memory at @uaddr as backing for the bounce buffer
 * region of @dev.
 *
 * Only a single registration covering the whole bounce buffer is accepted:
 * @iova must be 0, @size must equal the domain's bounce size and @uaddr
 * must be page aligned. The pages are pinned long-term and charged to the
 * calling process' pinned_vm, subject to RLIMIT_MEMLOCK.
 *
 * Returns 0 on success, -EINVAL on bad arguments, -EEXIST if memory is
 * already registered, -ENOMEM on allocation failure, when the memlock
 * limit would be exceeded or when not all pages could be pinned, or the
 * error reported by pin_user_pages() /
 * vduse_domain_add_user_bounce_pages().
 */
static int vduse_dev_reg_umem(struct vduse_dev *dev,
			      u64 iova, u64 uaddr, u64 size)
{
	struct vduse_umem *new_umem = NULL;
	struct page **pages = NULL;
	struct mm_struct *mm;
	unsigned long nr_pages, limit;
	long nr_pinned = 0;
	int ret;

	if (!dev->domain->bounce_map)
		return -EINVAL;
	if (size != dev->domain->bounce_size)
		return -EINVAL;
	if (iova != 0 || (uaddr & ~PAGE_MASK))
		return -EINVAL;

	mutex_lock(&dev->mem_lock);

	if (dev->umem) {
		ret = -EEXIST;
		goto unlock;
	}

	nr_pages = size >> PAGE_SHIFT;
	pages = __vmalloc(array_size(nr_pages, sizeof(struct page *)),
			  GFP_KERNEL_ACCOUNT);
	new_umem = kzalloc(sizeof(*new_umem), GFP_KERNEL);
	if (!pages || !new_umem) {
		ret = -ENOMEM;
		goto unlock;
	}

	mm = current->mm;
	mmap_read_lock(mm);

	/* Refuse to pin more than RLIMIT_MEMLOCK allows for this mm. */
	limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
	if (nr_pages + atomic64_read(&mm->pinned_vm) > limit) {
		ret = -ENOMEM;
		goto out;
	}

	nr_pinned = pin_user_pages(uaddr, nr_pages,
				   FOLL_LONGTERM | FOLL_WRITE,
				   pages, NULL);
	if (nr_pinned != nr_pages) {
		/* A partial pin is a failure; the out path unpins it. */
		if (nr_pinned < 0)
			ret = nr_pinned;
		else
			ret = -ENOMEM;
		goto out;
	}

	ret = vduse_domain_add_user_bounce_pages(dev->domain,
						 pages, nr_pinned);
	if (ret)
		goto out;

	atomic64_add(nr_pages, &mm->pinned_vm);

	/* Keep the mm alive so dereg can undo the pinned_vm accounting. */
	mmgrab(mm);
	new_umem->mm = mm;
	new_umem->pages = pages;
	new_umem->npages = nr_pinned;
	new_umem->iova = iova;

	dev->umem = new_umem;
out:
	if (ret && nr_pinned > 0)
		unpin_user_pages(pages, nr_pinned);

	mmap_read_unlock(mm);
unlock:
	if (ret) {
		vfree(pages);
		kfree(new_umem);
	}
	mutex_unlock(&dev->mem_lock);
	return ret;
}

static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg)
{
@@ -1089,6 +1196,38 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
		ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject);
		break;
	}
	case VDUSE_IOTLB_REG_UMEM: {
		struct vduse_iova_umem umem;

		ret = -EFAULT;
		if (copy_from_user(&umem, argp, sizeof(umem)))
			break;

		ret = -EINVAL;
		if (!is_mem_zero((const char *)umem.reserved,
				 sizeof(umem.reserved)))
			break;

		ret = vduse_dev_reg_umem(dev, umem.iova,
					 umem.uaddr, umem.size);
		break;
	}
	case VDUSE_IOTLB_DEREG_UMEM: {
		struct vduse_iova_umem umem;

		ret = -EFAULT;
		if (copy_from_user(&umem, argp, sizeof(umem)))
			break;

		ret = -EINVAL;
		if (!is_mem_zero((const char *)umem.reserved,
				 sizeof(umem.reserved)))
			break;

		ret = vduse_dev_dereg_umem(dev, umem.iova,
					   umem.size);
		break;
	}
	default:
		ret = -ENOIOCTLCMD;
		break;
@@ -1101,6 +1240,7 @@ static int vduse_dev_release(struct inode *inode, struct file *file)
{
	struct vduse_dev *dev = file->private_data;

	vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
	spin_lock(&dev->msg_lock);
	/* Make sure the inflight messages can be processed after reconnection */
	list_splice_init(&dev->recv_list, &dev->send_list);
@@ -1163,6 +1303,7 @@ static struct vduse_dev *vduse_dev_create(void)
		return NULL;

	mutex_init(&dev->lock);
	mutex_init(&dev->mem_lock);
	spin_lock_init(&dev->msg_lock);
	INIT_LIST_HEAD(&dev->send_list);
	INIT_LIST_HEAD(&dev->recv_list);
+23 −0
Original line number Diff line number Diff line
@@ -210,6 +210,29 @@ struct vduse_vq_eventfd {
 */
#define VDUSE_VQ_INJECT_IRQ	_IOW(VDUSE_BASE, 0x17, __u32)

/**
 * struct vduse_iova_umem - userspace memory configuration for one IOVA region
 * @uaddr: start address of userspace memory, it must be aligned to page size
 * @iova: start of the IOVA region
 * @size: size of the IOVA region
 * @reserved: for future use, needs to be initialized to zero
 *
 * Structure used by VDUSE_IOTLB_REG_UMEM and VDUSE_IOTLB_DEREG_UMEM
 * ioctls to register/de-register userspace memory for IOVA regions.
 *
 * The current driver implementation only accepts the bounce buffer
 * region: @iova must be 0 and @size must equal the bounce buffer size.
 * A non-zero @reserved field is rejected with EINVAL.
 */
struct vduse_iova_umem {
	__u64 uaddr;
	__u64 iova;
	__u64 size;
	__u64 reserved[3];
};

/*
 * Register userspace memory for IOVA regions. Caller should set the uaddr,
 * iova and size fields and zero the reserved field.
 */
#define VDUSE_IOTLB_REG_UMEM	_IOW(VDUSE_BASE, 0x18, struct vduse_iova_umem)

/*
 * De-register the userspace memory. Caller should set the iova and size
 * fields (matching the registered region) and zero the reserved field.
 */
#define VDUSE_IOTLB_DEREG_UMEM	_IOW(VDUSE_BASE, 0x19, struct vduse_iova_umem)

/* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */

/**