Commit 1d27b917 authored by Peter Maydell's avatar Peter Maydell
Browse files

Merge remote-tracking branch 'remotes/awilliam/tags/vfio-update-20151007.0' into staging



VFIO updates 2015-10-07

 - Change platform device IRQ setup sequence for compatibility
   with upcoming IRQ forwarding (Eric Auger)
 - Extensions to support vfio-pci devices on spapr-pci-host-bridge
   (David Gibson) [clang problem patch dropped]

# gpg: Signature made Wed 07 Oct 2015 16:30:52 BST using RSA key ID 3BB08B22
# gpg: Good signature from "Alex Williamson <alex.williamson@redhat.com>"
# gpg:                 aka "Alex Williamson <alex@shazbot.org>"
# gpg:                 aka "Alex Williamson <alwillia@redhat.com>"
# gpg:                 aka "Alex Williamson <alex.l.williamson@gmail.com>"

* remotes/awilliam/tags/vfio-update-20151007.0:
  vfio: Allow hotplug of containers onto existing guest IOMMU mappings
  memory: Allow replay of IOMMU mapping notifications
  vfio: Record host IOMMU's available IO page sizes
  vfio: Check guest IOVA ranges against host IOMMU capabilities
  vfio: Generalize vfio_listener_region_add failure path
  vfio: Remove unneeded union from VFIOContainer
  hw/vfio/platform: do not set resamplefd for edge-sensitive IRQS
  hw/vfio/platform: change interrupt/unmask fields into pointer
  hw/vfio/platform: irqfd setup sequence update

Signed-off-by: default avatarPeter Maydell <peter.maydell@linaro.org>
parents 31c9bd16 508ce5eb
Loading
Loading
Loading
Loading
+86 −54
Original line number Diff line number Diff line
@@ -312,11 +312,15 @@ out:
    rcu_read_unlock();
}

static hwaddr vfio_container_granularity(VFIOContainer *container)
{
    return (hwaddr)1 << ctz64(container->iova_pgsizes);
}

static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            iommu_data.type1.listener);
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    hwaddr iova, end;
    Int128 llend;
    void *vaddr;
@@ -344,14 +348,22 @@ static void vfio_listener_region_add(MemoryListener *listener,
    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }
    end = int128_get64(llend);

    if ((iova < container->min_iova) || ((end - 1) > container->max_iova)) {
        error_report("vfio: IOMMU container %p can't map guest IOVA region"
                     " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
                     container, iova, end - 1);
        ret = -EFAULT;
        goto fail;
    }

    memory_region_ref(section->mr);

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        trace_vfio_listener_region_add_iommu(iova,
                    int128_get64(int128_sub(llend, int128_one())));
        trace_vfio_listener_region_add_iommu(iova, end - 1);
        /*
         * FIXME: We should do some checking to see if the
         * capabilities of the host VFIO IOMMU are adequate to model
@@ -362,33 +374,22 @@ static void vfio_listener_region_add(MemoryListener *listener,
         * would be the right place to wire that up (tell the KVM
         * device emulation the VFIO iommu handles to use).
         */
        /*
         * This assumes that the guest IOMMU is empty of
         * mappings at this point.
         *
         * One way of doing this is:
         * 1. Avoid sharing IOMMUs between emulated devices or different
         * IOMMU groups.
         * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
         * there are some mappings in IOMMU.
         *
         * VFIO on SPAPR does that. Other IOMMU models may do that different,
         * they must make sure there are no existing mappings or
         * loop through existing mappings to map them into VFIO.
         */
        giommu = g_malloc0(sizeof(*giommu));
        giommu->iommu = section->mr;
        giommu->container = container;
        giommu->n.notify = vfio_iommu_map_notify;
        QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);

        memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
        memory_region_iommu_replay(giommu->iommu, &giommu->n,
                                   vfio_container_granularity(container),
                                   false);

        return;
    }

    /* Here we assume that memory_region_is_ram(section->mr)==true */

    end = int128_get64(llend);
    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);
@@ -400,27 +401,30 @@ static void vfio_listener_region_add(MemoryListener *listener,
        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
                     container, iova, end - iova, vaddr, ret);
        goto fail;
    }

    return;

fail:
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail.  Runtime, there's not much we can do other
     * than throw a hardware error.
     */
        if (!container->iommu_data.type1.initialized) {
            if (!container->iommu_data.type1.error) {
                container->iommu_data.type1.error = ret;
    if (!container->initialized) {
        if (!container->error) {
            container->error = ret;
        }
    } else {
        hw_error("vfio: DMA mapping failed, unable to continue");
    }
}
}

static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            iommu_data.type1.listener);
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    hwaddr iova, end;
    int ret;

@@ -485,7 +489,7 @@ static const MemoryListener vfio_memory_listener = {

static void vfio_listener_release(VFIOContainer *container)
{
    memory_listener_unregister(&container->iommu_data.type1.listener);
    memory_listener_unregister(&container->listener);
}

int vfio_mmap_region(Object *obj, VFIORegion *region,
@@ -668,6 +672,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) ||
        ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) {
        bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU);
        struct vfio_iommu_type1_info info;

        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
@@ -684,21 +689,27 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
            goto free_container_exit;
        }

        container->iommu_data.type1.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;

        memory_listener_register(&container->iommu_data.type1.listener,
                                 container->space->as);
        /*
         * FIXME: This assumes that a Type1 IOMMU can map any 64-bit
         * IOVA whatsoever.  That's not actually true, but the current
         * kernel interface doesn't tell us what it can map, and the
         * existing Type1 IOMMUs generally support any IOVA we're
         * going to actually try in practice.
         */
        container->min_iova = 0;
        container->max_iova = (hwaddr)-1;

        if (container->iommu_data.type1.error) {
            ret = container->iommu_data.type1.error;
            error_report("vfio: memory listener initialization failed for container");
            goto listener_release_exit;
        /* Assume just 4K IOVA page size */
        container->iova_pgsizes = 0x1000;
        info.argsz = sizeof(info);
        ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info);
        /* Ignore errors */
        if ((ret == 0) && (info.flags & VFIO_IOMMU_INFO_PGSIZES)) {
            container->iova_pgsizes = info.iova_pgsizes;
        }

        container->iommu_data.type1.initialized = true;

    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
        struct vfio_iommu_spapr_tce_info info;

        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
            error_report("vfio: failed to set group container: %m");
@@ -724,18 +735,41 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
            goto free_container_exit;
        }

        container->iommu_data.type1.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;

        memory_listener_register(&container->iommu_data.type1.listener,
                                 container->space->as);
        /*
         * This only considers the host IOMMU's 32-bit window.  At
         * some point we need to add support for the optional 64-bit
         * window and dynamic windows
         */
        info.argsz = sizeof(info);
        ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
        if (ret) {
            error_report("vfio: VFIO_IOMMU_SPAPR_TCE_GET_INFO failed: %m");
            ret = -errno;
            goto free_container_exit;
        }
        container->min_iova = info.dma32_window_start;
        container->max_iova = container->min_iova + info.dma32_window_size - 1;

        /* Assume just 4K IOVA pages for now */
        container->iova_pgsizes = 0x1000;
    } else {
        error_report("vfio: No available IOMMU models");
        ret = -EINVAL;
        goto free_container_exit;
    }

    container->listener = vfio_memory_listener;

    memory_listener_register(&container->listener, container->space->as);

    if (container->error) {
        ret = container->error;
        error_report("vfio: memory listener initialization failed for container");
        goto listener_release_exit;
    }

    container->initialized = true;

    QLIST_INIT(&container->group_list);
    QLIST_INSERT_HEAD(&space->containers, container, next);

@@ -774,9 +808,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
        VFIOAddressSpace *space = container->space;
        VFIOGuestIOMMU *giommu, *tmp;

        if (container->iommu_data.release) {
            container->iommu_data.release(container);
        }
        vfio_listener_release(container);
        QLIST_REMOVE(container, next);

        QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
+70 −46
Original line number Diff line number Diff line
@@ -32,6 +32,11 @@
 * Functions used whatever the injection method
 */

static inline bool vfio_irq_is_automasked(VFIOINTp *intp)
{
    return intp->flags & VFIO_IRQ_INFO_AUTOMASKED;
}

/**
 * vfio_init_intp - allocate, initialize the IRQ struct pointer
 * and add it into the list of IRQs
@@ -57,19 +62,26 @@ static VFIOINTp *vfio_init_intp(VFIODevice *vbasedev,
    sysbus_init_irq(sbdev, &intp->qemuirq);

    /* Get an eventfd for trigger */
    ret = event_notifier_init(&intp->interrupt, 0);
    intp->interrupt = g_malloc0(sizeof(EventNotifier));
    ret = event_notifier_init(intp->interrupt, 0);
    if (ret) {
        g_free(intp->interrupt);
        g_free(intp);
        error_report("vfio: Error: trigger event_notifier_init failed ");
        return NULL;
    }
    if (vfio_irq_is_automasked(intp)) {
        /* Get an eventfd for resample/unmask */
    ret = event_notifier_init(&intp->unmask, 0);
        intp->unmask = g_malloc0(sizeof(EventNotifier));
        ret = event_notifier_init(intp->unmask, 0);
        if (ret) {
            g_free(intp->interrupt);
            g_free(intp->unmask);
            g_free(intp);
            error_report("vfio: Error: resamplefd event_notifier_init failed");
            return NULL;
        }
    }

    QLIST_INSERT_HEAD(&vdev->intp_list, intp, next);
    return intp;
@@ -100,7 +112,7 @@ static int vfio_set_trigger_eventfd(VFIOINTp *intp,
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;
    *pfd = event_notifier_get_fd(&intp->interrupt);
    *pfd = event_notifier_get_fd(intp->interrupt);
    qemu_set_fd_handler(*pfd, (IOHandler *)handler, NULL, intp);
    ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
@@ -182,7 +194,7 @@ static void vfio_intp_mmap_enable(void *opaque)
static void vfio_intp_inject_pending_lockheld(VFIOINTp *intp)
{
    trace_vfio_platform_intp_inject_pending_lockheld(intp->pin,
                              event_notifier_get_fd(&intp->interrupt));
                              event_notifier_get_fd(intp->interrupt));

    intp->state = VFIO_IRQ_ACTIVE;

@@ -224,18 +236,18 @@ static void vfio_intp_interrupt(VFIOINTp *intp)
        trace_vfio_intp_interrupt_set_pending(intp->pin);
        QSIMPLEQ_INSERT_TAIL(&vdev->pending_intp_queue,
                             intp, pqnext);
        ret = event_notifier_test_and_clear(&intp->interrupt);
        ret = event_notifier_test_and_clear(intp->interrupt);
        qemu_mutex_unlock(&vdev->intp_mutex);
        return;
    }

    trace_vfio_platform_intp_interrupt(intp->pin,
                              event_notifier_get_fd(&intp->interrupt));
                              event_notifier_get_fd(intp->interrupt));

    ret = event_notifier_test_and_clear(&intp->interrupt);
    ret = event_notifier_test_and_clear(intp->interrupt);
    if (!ret) {
        error_report("Error when clearing fd=%d (ret = %d)",
                     event_notifier_get_fd(&intp->interrupt), ret);
                     event_notifier_get_fd(intp->interrupt), ret);
    }

    intp->state = VFIO_IRQ_ACTIVE;
@@ -283,13 +295,13 @@ static void vfio_platform_eoi(VFIODevice *vbasedev)
    QLIST_FOREACH(intp, &vdev->intp_list, next) {
        if (intp->state == VFIO_IRQ_ACTIVE) {
            trace_vfio_platform_eoi(intp->pin,
                                event_notifier_get_fd(&intp->interrupt));
                                event_notifier_get_fd(intp->interrupt));
            intp->state = VFIO_IRQ_INACTIVE;

            /* deassert the virtual IRQ */
            qemu_set_irq(intp->qemuirq, 0);

            if (intp->flags & VFIO_IRQ_INFO_AUTOMASKED) {
            if (vfio_irq_is_automasked(intp)) {
                /* unmasks the physical level-sensitive IRQ */
                vfio_unmask_single_irqindex(vbasedev, intp->pin);
            }
@@ -310,18 +322,29 @@ static void vfio_platform_eoi(VFIODevice *vbasedev)
/**
 * vfio_start_eventfd_injection - starts the virtual IRQ injection using
 * user-side handled eventfds
 * @intp: the IRQ struct pointer
 * @sbdev: the sysbus device handle
 * @irq: the qemu irq handle
 */

static int vfio_start_eventfd_injection(VFIOINTp *intp)
static void vfio_start_eventfd_injection(SysBusDevice *sbdev, qemu_irq irq)
{
    int ret;
    VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(sbdev);
    VFIOINTp *intp;

    QLIST_FOREACH(intp, &vdev->intp_list, next) {
        if (intp->qemuirq == irq) {
            break;
        }
    }
    assert(intp);

    ret = vfio_set_trigger_eventfd(intp, vfio_intp_interrupt);
    if (ret) {
        error_report("vfio: Error: Failed to pass IRQ fd to the driver: %m");
        error_report("vfio: failed to start eventfd signaling for IRQ %d: %m",
                     intp->pin);
        abort();
    }
    return ret;
}

/*
@@ -349,7 +372,7 @@ static int vfio_set_resample_eventfd(VFIOINTp *intp)
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;
    *pfd = event_notifier_get_fd(&intp->unmask);
    *pfd = event_notifier_get_fd(intp->unmask);
    qemu_set_fd_handler(*pfd, NULL, NULL, NULL);
    ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
@@ -359,6 +382,15 @@ static int vfio_set_resample_eventfd(VFIOINTp *intp)
    return ret;
}

/**
 * vfio_start_irqfd_injection - starts the virtual IRQ injection using
 * irqfd
 *
 * @sbdev: the sysbus device handle
 * @irq: the qemu irq handle
 *
 * In case the irqfd setup fails, we fallback to userspace handled eventfd
 */
static void vfio_start_irqfd_injection(SysBusDevice *sbdev, qemu_irq irq)
{
    VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(sbdev);
@@ -366,7 +398,7 @@ static void vfio_start_irqfd_injection(SysBusDevice *sbdev, qemu_irq irq)

    if (!kvm_irqfds_enabled() || !kvm_resamplefds_enabled() ||
        !vdev->irqfd_allowed) {
        return;
        goto fail_irqfd;
    }

    QLIST_FOREACH(intp, &vdev->intp_list, next) {
@@ -376,39 +408,36 @@ static void vfio_start_irqfd_injection(SysBusDevice *sbdev, qemu_irq irq)
    }
    assert(intp);

    /* Get to a known interrupt state */
    qemu_set_fd_handler(event_notifier_get_fd(&intp->interrupt),
                        NULL, NULL, vdev);

    vfio_mask_single_irqindex(&vdev->vbasedev, intp->pin);
    qemu_set_irq(intp->qemuirq, 0);

    if (kvm_irqchip_add_irqfd_notifier(kvm_state, &intp->interrupt,
                                   &intp->unmask, irq) < 0) {
    if (kvm_irqchip_add_irqfd_notifier(kvm_state, intp->interrupt,
                                   intp->unmask, irq) < 0) {
        goto fail_irqfd;
    }

    if (vfio_set_trigger_eventfd(intp, NULL) < 0) {
        goto fail_vfio;
    }
    if (vfio_irq_is_automasked(intp)) {
        if (vfio_set_resample_eventfd(intp) < 0) {
            goto fail_vfio;
        }

    /* Let's resume injection with irqfd setup */
    vfio_unmask_single_irqindex(&vdev->vbasedev, intp->pin);
        trace_vfio_platform_start_level_irqfd_injection(intp->pin,
                                    event_notifier_get_fd(intp->interrupt),
                                    event_notifier_get_fd(intp->unmask));
    } else {
        trace_vfio_platform_start_edge_irqfd_injection(intp->pin,
                                    event_notifier_get_fd(intp->interrupt));
    }

    intp->kvm_accel = true;

    trace_vfio_platform_start_irqfd_injection(intp->pin,
                                     event_notifier_get_fd(&intp->interrupt),
                                     event_notifier_get_fd(&intp->unmask));
    return;
fail_vfio:
    kvm_irqchip_remove_irqfd_notifier(kvm_state, &intp->interrupt, irq);
    kvm_irqchip_remove_irqfd_notifier(kvm_state, intp->interrupt, irq);
    error_report("vfio: failed to start eventfd signaling for IRQ %d: %m",
                 intp->pin);
    abort();
fail_irqfd:
    vfio_start_eventfd_injection(intp);
    vfio_unmask_single_irqindex(&vdev->vbasedev, intp->pin);
    vfio_start_eventfd_injection(sbdev, irq);
    return;
}

@@ -646,7 +675,6 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp)
    VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev);
    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
    VFIODevice *vbasedev = &vdev->vbasedev;
    VFIOINTp *intp;
    int i, ret;

    vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
@@ -665,10 +693,6 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp)
        vfio_map_region(vdev, i);
        sysbus_init_mmio(sbdev, &vdev->regions[i]->mem);
    }

    QLIST_FOREACH(intp, &vdev->intp_list, next) {
        vfio_start_eventfd_injection(intp);
    }
}

static const VMStateDescription vfio_platform_vmstate = {
+13 −0
Original line number Diff line number Diff line
@@ -582,6 +582,19 @@ void memory_region_notify_iommu(MemoryRegion *mr,
 */
void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n);

/**
 * memory_region_iommu_replay: replay existing IOMMU translations to
 * a notifier
 *
 * @mr: the memory region to observe
 * @n: the notifier to which to replay iommu mappings
 * @granularity: Minimum page granularity to replay notifications for
 * @is_write: Whether to treat the replay as a translate "write"
 *     through the iommu
 */
void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n,
                                hwaddr granularity, bool is_write);

/**
 * memory_region_unregister_iommu_notifier: unregister a notifier for
 * changes to IOMMU translation entries.
+10 −13
Original line number Diff line number Diff line
@@ -59,22 +59,19 @@ typedef struct VFIOAddressSpace {

struct VFIOGroup;

typedef struct VFIOType1 {
    MemoryListener listener;
    int error;
    bool initialized;
} VFIOType1;

typedef struct VFIOContainer {
    VFIOAddressSpace *space;
    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
    struct {
        /* enable abstraction to support various iommu backends */
        union {
            VFIOType1 type1;
        };
        void (*release)(struct VFIOContainer *);
    } iommu_data;
    MemoryListener listener;
    int error;
    bool initialized;
    /*
     * This assumes the host IOMMU can support only a single
     * contiguous IOVA window.  We may need to generalize that in
     * future
     */
    hwaddr min_iova, max_iova;
    uint64_t iova_pgsizes;
    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
    QLIST_HEAD(, VFIOGroup) group_list;
    QLIST_ENTRY(VFIOContainer) next;
+2 −2
Original line number Diff line number Diff line
@@ -34,8 +34,8 @@ enum {
typedef struct VFIOINTp {
    QLIST_ENTRY(VFIOINTp) next; /* entry for IRQ list */
    QSIMPLEQ_ENTRY(VFIOINTp) pqnext; /* entry for pending IRQ queue */
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
    EventNotifier *interrupt; /* eventfd triggered on interrupt */
    EventNotifier *unmask; /* eventfd for unmask on QEMU bypass */
    qemu_irq qemuirq;
    struct VFIOPlatformDevice *vdev; /* back pointer to device */
    int state; /* inactive, pending, active */
Loading