Commit c471ad0e authored by Jason Wang's avatar Jason Wang Committed by Michael S. Tsirkin
Browse files

vhost_net: device IOTLB support



This patch implements Device IOTLB support for vhost kernel. This is
done through:

1) switch to using dma helpers when mapping/unmapping vrings from vhost code
2) introduce a set of VhostOps to:
   - setting up device IOTLB request callback
   - processing device IOTLB request
   - processing device IOTLB invalidation
3) kernel support for Device IOTLB API:

- allow vhost-net to query the IOMMU IOTLB entry through eventfd
- enable the ability for qemu to update a specified mapping of vhost
  through ioctl.
- enable the ability to invalidate a specified range of iova for the
  device IOTLB of vhost through ioctl. In x86/intel_iommu case this is
  triggered through iommu memory region notifier from device IOTLB
  invalidation descriptor processing routine.

With all the above, kernel vhost_net can co-operate with userspace
IOMMU. For vhost-user, the support could be easily done on top by
implementing the VhostOps.

Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
parent 1448c133
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -52,6 +52,7 @@ static const int kernel_feature_bits[] = {
    VIRTIO_NET_F_MRG_RXBUF,
    VIRTIO_F_VERSION_1,
    VIRTIO_NET_F_MTU,
    VIRTIO_F_IOMMU_PLATFORM,
    VHOST_INVALID_FEATURE_BIT
};

+99 −0
Original line number Diff line number Diff line
@@ -185,6 +185,102 @@ static int vhost_kernel_vsock_set_running(struct vhost_dev *dev, int start)
}
#endif /* CONFIG_VHOST_VSOCK */

/*
 * fd read handler: drain IOTLB messages from the vhost kernel fd.
 *
 * @opaque is the struct vhost_dev whose ->opaque field carries the fd.
 * Messages are read one at a time until read() returns <= 0, or until a
 * short or non-IOTLB message is seen (which is reported and ends the
 * drain).  Only VHOST_IOTLB_MISS is acted upon; UPDATE/INVALIDATE are
 * reported as unexpected, everything else is silently ignored.
 */
static void vhost_kernel_iotlb_read(void *opaque)
{
    struct vhost_dev *dev = opaque;
    struct vhost_msg msg;
    ssize_t nread;

    for (;;) {
        struct vhost_iotlb_msg *imsg;

        nread = read((uintptr_t)dev->opaque, &msg, sizeof msg);
        if (nread <= 0) {
            return;
        }
        if (nread < sizeof msg) {
            error_report("Wrong vhost message len: %d", (int)nread);
            return;
        }
        if (msg.type != VHOST_IOTLB_MSG) {
            error_report("Unknown vhost iotlb message type");
            return;
        }

        imsg = &msg.iotlb;
        if (imsg->type == VHOST_IOTLB_MISS) {
            /* Anything other than a read-only access is treated as a write. */
            vhost_device_iotlb_miss(dev, imsg->iova,
                                    imsg->perm != VHOST_ACCESS_RO);
        } else if (imsg->type == VHOST_IOTLB_UPDATE ||
                   imsg->type == VHOST_IOTLB_INVALIDATE) {
            error_report("Unexpected IOTLB message type");
        }
        /* VHOST_IOTLB_ACCESS_FAIL: FIXME: report device iotlb error */
    }
}

/*
 * Push one IOTLB mapping to the vhost kernel backend.
 *
 * Builds a VHOST_IOTLB_UPDATE message for [@iova, @iova + @len) backed by
 * userspace address @uaddr with access rights @perm, and writes it to the
 * fd stored in dev->opaque.
 *
 * Returns 0 on success, -EFAULT if the message could not be written in
 * full.  @perm must be IOMMU_RO, IOMMU_WO or IOMMU_RW; any other value
 * aborts.
 */
static int vhost_kernel_update_device_iotlb(struct vhost_dev *dev,
                                            uint64_t iova, uint64_t uaddr,
                                            uint64_t len,
                                            IOMMUAccessFlags perm)
{
    /*
     * Zero the whole message first: the full struct is handed to the
     * kernel, so no stale stack bytes may remain in fields this function
     * does not set.
     */
    struct vhost_msg msg = { 0 };

    msg.type = VHOST_IOTLB_MSG;
    msg.iotlb.iova = iova;
    msg.iotlb.uaddr = uaddr;
    msg.iotlb.size = len;
    msg.iotlb.type = VHOST_IOTLB_UPDATE;

    switch (perm) {
    case IOMMU_RO:
        msg.iotlb.perm = VHOST_ACCESS_RO;
        break;
    case IOMMU_WO:
        msg.iotlb.perm = VHOST_ACCESS_WO;
        break;
    case IOMMU_RW:
        msg.iotlb.perm = VHOST_ACCESS_RW;
        break;
    default:
        g_assert_not_reached();
    }

    if (write((uintptr_t)dev->opaque, &msg, sizeof msg) != sizeof msg) {
        error_report("Fail to update device iotlb");
        return -EFAULT;
    }

    return 0;
}

static int vhost_kernel_invalidate_device_iotlb(struct vhost_dev *dev,
                                                uint64_t iova, uint64_t len)
{
    struct vhost_msg msg;

    msg.type = VHOST_IOTLB_MSG;
    msg.iotlb.iova = iova;
    msg.iotlb.size = len;
    msg.iotlb.type = VHOST_IOTLB_INVALIDATE;

    if (write((uintptr_t)dev->opaque, &msg, sizeof msg) != sizeof msg) {
        error_report("Fail to invalidate device iotlb");
        return -EFAULT;
    }

    return 0;
}

/*
 * Install (@enabled != 0) or remove (@enabled == 0) the read handler that
 * services IOTLB messages arriving on the fd stored in dev->opaque.
 */
static void vhost_kernel_set_iotlb_callback(struct vhost_dev *dev,
                                           int enabled)
{
    if (!enabled) {
        qemu_set_fd_handler((uintptr_t)dev->opaque, NULL, NULL, NULL);
        return;
    }

    qemu_set_fd_handler((uintptr_t)dev->opaque,
                        vhost_kernel_iotlb_read, NULL, dev);
}

static const VhostOps kernel_ops = {
        .backend_type = VHOST_BACKEND_TYPE_KERNEL,
        .vhost_backend_init = vhost_kernel_init,
@@ -214,6 +310,9 @@ static const VhostOps kernel_ops = {
        .vhost_vsock_set_guest_cid = vhost_kernel_vsock_set_guest_cid,
        .vhost_vsock_set_running = vhost_kernel_vsock_set_running,
#endif /* CONFIG_VHOST_VSOCK */
        .vhost_set_iotlb_callback = vhost_kernel_set_iotlb_callback,
        .vhost_update_device_iotlb = vhost_kernel_update_device_iotlb,
        .vhost_invalidate_device_iotlb = vhost_kernel_invalidate_device_iotlb,
};

int vhost_set_backend_type(struct vhost_dev *dev, VhostBackendType backend_type)
+144 −22
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#include "migration/migration.h"
#include "sysemu/dma.h"

/* enabled until disconnected backend stabilizes */
#define _VHOST_DEBUG 1
@@ -421,8 +422,36 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
    dev->log_size = size;
}

/*
 * Report whether @dev sits behind an IOMMU that vhost must cooperate with.
 *
 * True only when the virtio device's DMA address space root is an IOMMU
 * memory region and the host offers VIRTIO_F_IOMMU_PLATFORM.
 *
 * dev->vdev is only valid between vhost_dev_start() and vhost_dev_stop()
 * (it is NULL otherwise), so a device with no vdev attached is reported as
 * having no IOMMU instead of dereferencing a NULL pointer.
 */
static int vhost_dev_has_iommu(struct vhost_dev *dev)
{
    VirtIODevice *vdev = dev->vdev;

    if (!vdev) {
        return 0;
    }

    return memory_region_is_iommu(vdev->dma_as->root) &&
           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
}

static int vhost_verify_ring_part_mapping(void *part,
/*
 * Map a vring region for vhost.
 *
 * With an IOMMU in the path the address is not mapped here: @addr itself
 * is returned, cast to a pointer, and *@plen is left untouched.  Otherwise
 * this defers to cpu_physical_memory_map().
 */
static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
                              hwaddr *plen, int is_write)
{
    if (vhost_dev_has_iommu(dev)) {
        return (void *)(uintptr_t)addr;
    }

    return cpu_physical_memory_map(addr, plen, is_write);
}

/*
 * Counterpart of vhost_memory_map(): a no-op when the device is behind an
 * IOMMU, otherwise forwards to cpu_physical_memory_unmap().
 */
static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
                               hwaddr len, int is_write,
                               hwaddr access_len)
{
    if (vhost_dev_has_iommu(dev)) {
        return;
    }

    cpu_physical_memory_unmap(buffer, len, is_write, access_len);
}

static int vhost_verify_ring_part_mapping(struct vhost_dev *dev,
                                          void *part,
                                          uint64_t part_addr,
                                          uint64_t part_size,
                                          uint64_t start_addr,
@@ -436,14 +465,14 @@ static int vhost_verify_ring_part_mapping(void *part,
        return 0;
    }
    l = part_size;
    p = cpu_physical_memory_map(part_addr, &l, 1);
    p = vhost_memory_map(dev, part_addr, &l, 1);
    if (!p || l != part_size) {
        r = -ENOMEM;
    }
    if (p != part) {
        r = -EBUSY;
    }
    cpu_physical_memory_unmap(p, l, 0, 0);
    vhost_memory_unmap(dev, p, l, 0, 0);
    return r;
}

@@ -463,21 +492,21 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev,
        struct vhost_virtqueue *vq = dev->vqs + i;

        j = 0;
        r = vhost_verify_ring_part_mapping(vq->desc, vq->desc_phys,
        r = vhost_verify_ring_part_mapping(dev, vq->desc, vq->desc_phys,
                                           vq->desc_size, start_addr, size);
        if (!r) {
            break;
        }

        j++;
        r = vhost_verify_ring_part_mapping(vq->avail, vq->avail_phys,
        r = vhost_verify_ring_part_mapping(dev, vq->avail, vq->avail_phys,
                                           vq->avail_size, start_addr, size);
        if (!r) {
            break;
        }

        j++;
        r = vhost_verify_ring_part_mapping(vq->used, vq->used_phys,
        r = vhost_verify_ring_part_mapping(dev, vq->used, vq->used_phys,
                                           vq->used_size, start_addr, size);
        if (!r) {
            break;
@@ -715,7 +744,8 @@ static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
    return 0;
}

static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
static int vhost_dev_set_features(struct vhost_dev *dev,
                                  bool enable_log)
{
    uint64_t features = dev->acked_features;
    int r;
@@ -858,6 +888,56 @@ static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
    return -errno;
}

/*
 * Find the vhost memory region containing guest physical address @gpa.
 *
 * On a hit, *@uaddr receives the matching userspace address and *@len the
 * number of bytes from @gpa to the end of that region; returns 0.
 * Returns -EFAULT (outputs untouched) when no region in hdev->mem covers
 * @gpa.
 */
static int vhost_memory_region_lookup(struct vhost_dev *hdev,
                                      uint64_t gpa, uint64_t *uaddr,
                                      uint64_t *len)
{
    int idx;

    for (idx = 0; idx < hdev->mem->nregions; idx++) {
        struct vhost_memory_region *reg = &hdev->mem->regions[idx];
        uint64_t first = reg->guest_phys_addr;
        uint64_t end = reg->guest_phys_addr + reg->memory_size;

        if (gpa < first) {
            continue;
        }
        if (end <= gpa) {
            continue;
        }

        *uaddr = reg->userspace_addr + gpa - first;
        *len = end - gpa;
        return 0;
    }

    return -EFAULT;
}

/*
 * Resolve an IOTLB miss for @iova and push the mapping to the backend.
 *
 * Under the RCU read lock, the IOTLB entry for @iova is fetched from the
 * device's DMA address space (@write selects the access type).  If the
 * entry has a target address space, its translated address is looked up
 * in the vhost memory table and the resulting mapping is sent through the
 * backend's vhost_update_device_iotlb op.  Failures are reported with
 * error_report(); nothing is returned to the caller.
 */
void vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
{
    IOMMUTLBEntry entry;
    uint64_t hva, span;

    rcu_read_lock();

    entry = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                          iova, write);
    if (entry.target_as == NULL) {
        goto unlock;
    }

    if (vhost_memory_region_lookup(dev, entry.translated_addr,
                                   &hva, &span)) {
        error_report("Fail to lookup the translated address "
                     "%"PRIx64, entry.translated_addr);
        goto unlock;
    }

    /* Clamp to the IOTLB entry size and align iova down to its start. */
    span = MIN(entry.addr_mask + 1, span);
    iova &= ~entry.addr_mask;

    if (dev->vhost_ops->vhost_update_device_iotlb(dev, iova, hva,
                                                  span, entry.perm)) {
        error_report("Fail to update device iotlb");
    }

unlock:
    rcu_read_unlock();
}

static int vhost_virtqueue_start(struct vhost_dev *dev,
                                struct VirtIODevice *vdev,
                                struct vhost_virtqueue *vq,
@@ -903,21 +983,21 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,

    vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
    vq->desc_phys = a = virtio_queue_get_desc_addr(vdev, idx);
    vq->desc = cpu_physical_memory_map(a, &l, 0);
    vq->desc = vhost_memory_map(dev, a, &l, 0);
    if (!vq->desc || l != s) {
        r = -ENOMEM;
        goto fail_alloc_desc;
    }
    vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
    vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
    vq->avail = cpu_physical_memory_map(a, &l, 0);
    vq->avail = vhost_memory_map(dev, a, &l, 0);
    if (!vq->avail || l != s) {
        r = -ENOMEM;
        goto fail_alloc_avail;
    }
    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
    vq->used = cpu_physical_memory_map(a, &l, 1);
    vq->used = vhost_memory_map(dev, a, &l, 1);
    if (!vq->used || l != s) {
        r = -ENOMEM;
        goto fail_alloc_used;
@@ -963,13 +1043,13 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,
fail_vector:
fail_kick:
fail_alloc:
    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       0, 0);
fail_alloc_used:
    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, 0);
fail_alloc_avail:
    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, 0);
fail_alloc_desc:
    return r;
@@ -1004,11 +1084,11 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev,
                                                vhost_vq_index);
    }

    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       1, virtio_queue_get_used_size(vdev, idx));
    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, virtio_queue_get_avail_size(vdev, idx));
    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, virtio_queue_get_desc_size(vdev, idx));
}

@@ -1066,6 +1146,9 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
        r = -errno;
        goto fail_call;
    }

    vq->dev = dev;

    return 0;
fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
@@ -1077,12 +1160,24 @@ static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
    event_notifier_cleanup(&vq->masked_notifier);
}

/*
 * IOMMU notifier callback: forward an invalidation of the range described
 * by @iotlb (start iova, addr_mask + 1 bytes) to the vhost backend that
 * embeds notifier @n, reporting an error if the backend rejects it.
 */
static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    struct vhost_dev *hdev = container_of(n, struct vhost_dev, n);
    int rc;

    rc = hdev->vhost_ops->vhost_invalidate_device_iotlb(hdev,
                                                        iotlb->iova,
                                                        iotlb->addr_mask + 1);
    if (rc) {
        error_report("Fail to invalidate device iotlb");
    }
}

int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type, uint32_t busyloop_timeout)
{
    uint64_t features;
    int i, r, n_initialized_vqs = 0;

    hdev->vdev = NULL;
    hdev->migration_blocker = NULL;

    r = vhost_set_backend_type(hdev, backend_type);
@@ -1147,6 +1242,9 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
        .priority = 10
    };

    hdev->n.notify = vhost_iommu_unmap_notify;
    hdev->n.notifier_flags = IOMMU_NOTIFIER_UNMAP;

    if (hdev->migration_blocker == NULL) {
        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
            error_setg(&hdev->migration_blocker,
@@ -1342,11 +1440,18 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
    assert(hdev->vhost_ops);

    hdev->started = true;
    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        goto fail_features;
    }

    if (vhost_dev_has_iommu(hdev)) {
        memory_region_register_iommu_notifier(vdev->dma_as->root,
                                              &hdev->n);
    }

    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_mem_table failed");
@@ -1380,6 +1485,16 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
        }
    }

    if (vhost_dev_has_iommu(hdev)) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);

        /* Update used ring information for IOTLB to work correctly,
         * vhost-kernel code requires for this.*/
        for (i = 0; i < hdev->nvqs; ++i) {
            struct vhost_virtqueue *vq = hdev->vqs + i;
            vhost_device_iotlb_miss(hdev, vq->used_phys, true);
        }
    }
    return 0;
fail_log:
    vhost_log_put(hdev, false);
@@ -1391,6 +1506,7 @@ fail_vq:
                             hdev->vq_index + i);
    }
    i = hdev->nvqs;

fail_mem:
fail_features:

@@ -1413,8 +1529,14 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
                             hdev->vq_index + i);
    }

    if (vhost_dev_has_iommu(hdev)) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
        memory_region_unregister_iommu_notifier(vdev->dma_as->root,
                                                &hdev->n);
    }
    vhost_log_put(hdev, true);
    hdev->started = false;
    hdev->vdev = NULL;
}

int vhost_net_set_backend(struct vhost_dev *hdev,
+13 −0
Original line number Diff line number Diff line
@@ -11,6 +11,8 @@
#ifndef VHOST_BACKEND_H
#define VHOST_BACKEND_H

#include "exec/memory.h"

typedef enum VhostBackendType {
    VHOST_BACKEND_TYPE_NONE = 0,
    VHOST_BACKEND_TYPE_KERNEL = 1,
@@ -77,6 +79,14 @@ typedef bool (*vhost_backend_can_merge_op)(struct vhost_dev *dev,
typedef int (*vhost_vsock_set_guest_cid_op)(struct vhost_dev *dev,
                                            uint64_t guest_cid);
typedef int (*vhost_vsock_set_running_op)(struct vhost_dev *dev, int start);
/* Turn delivery of device IOTLB requests from the backend on (enabled != 0)
 * or off (enabled == 0). */
typedef void (*vhost_set_iotlb_callback_op)(struct vhost_dev *dev,
                                           int enabled);
/* Install a device IOTLB mapping of len bytes from iova to userspace address
 * uaddr with access rights perm; returns 0 on success, non-zero on failure. */
typedef int (*vhost_update_device_iotlb_op)(struct vhost_dev *dev,
                                            uint64_t iova, uint64_t uaddr,
                                            uint64_t len,
                                            IOMMUAccessFlags perm);
/* Invalidate len bytes of device IOTLB starting at iova; returns 0 on
 * success, non-zero on failure. */
typedef int (*vhost_invalidate_device_iotlb_op)(struct vhost_dev *dev,
                                                uint64_t iova, uint64_t len);

typedef struct VhostOps {
    VhostBackendType backend_type;
@@ -109,6 +119,9 @@ typedef struct VhostOps {
    vhost_backend_can_merge_op vhost_backend_can_merge;
    vhost_vsock_set_guest_cid_op vhost_vsock_set_guest_cid;
    vhost_vsock_set_running_op vhost_vsock_set_running;
    vhost_set_iotlb_callback_op vhost_set_iotlb_callback;
    vhost_update_device_iotlb_op vhost_update_device_iotlb;
    vhost_invalidate_device_iotlb_op vhost_invalidate_device_iotlb;
} VhostOps;

extern const VhostOps user_ops;
+4 −0
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@ struct vhost_virtqueue {
    unsigned long long used_phys;
    unsigned used_size;
    EventNotifier masked_notifier;
    struct vhost_dev *dev;
};

typedef unsigned long vhost_log_chunk_t;
@@ -38,6 +39,7 @@ struct vhost_log {

struct vhost_memory;
struct vhost_dev {
    VirtIODevice *vdev;
    MemoryListener memory_listener;
    struct vhost_memory *mem;
    int n_mem_sections;
@@ -62,6 +64,7 @@ struct vhost_dev {
    void *opaque;
    struct vhost_log *log;
    QLIST_ENTRY(vhost_dev) entry;
    IOMMUNotifier n;
};

int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
@@ -91,4 +94,5 @@ bool vhost_has_free_slot(void);
int vhost_net_set_backend(struct vhost_dev *hdev,
                          struct vhost_vring_file *file);

/* Handle a device IOTLB miss at iova (write != 0 for a write access) by
 * looking up the translation and updating the backend's device IOTLB.
 * Errors are reported, not returned. */
void vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write);
#endif
Loading