Commit adcd6e93 authored by Peter Maydell

Merge remote-tracking branch 'remotes/stefanha/tags/block-pull-request' into staging



Pull request

# gpg: Signature made Thu 30 Jan 2020 21:31:02 GMT
# gpg:                using RSA key 8695A8BFD3F97CDAAC35775A9CA4ABB381AB73C8
# gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>" [full]
# gpg:                 aka "Stefan Hajnoczi <stefanha@gmail.com>" [full]
# Primary key fingerprint: 8695 A8BF D3F9 7CDA AC35  775A 9CA4 ABB3 81AB 73C8

* remotes/stefanha/tags/block-pull-request:
  tests/qemu-iotests: use AIOMODE with various tests
  tests/qemu-iotests: enable testing with aio options
  qemu-nbd: adds option for aio engines
  qemu-img: adds option to use aio engine for benchmarking
  qemu-io: adds option to use aio engine
  block/io_uring: adds userspace completion polling
  block: add trace events for io_uring
  block/file-posix.c: extend to use io_uring
  blockdev: adds bdrv_parse_aio to use io_uring
  util/async: add aio interfaces for io_uring
  stubs: add stubs for io_uring interface
  block/io_uring: implements interfaces for io_uring
  block/block: add BDRV flag for io_uring
  qapi/block-core: add option for io_uring
  configure: permit use of io_uring
  block/io: take bs->reqs_lock in bdrv_mark_request_serialising
  block/io: wait for serialising requests when a request becomes serialising
  block: eliminate BDRV_REQ_NO_SERIALISING

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
parents 92817365 8dff69b9
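
For orientation: the user-visible result of this series is a third AIO engine, io_uring, selectable alongside threads and native. Two hedged invocation sketches (disk.img is a placeholder; aio=io_uring requires a build with CONFIG_LINUX_IO_URING and a kernel that supports io_uring):

    qemu-system-x86_64 -drive file=disk.img,format=raw,aio=io_uring ...
    qemu-nbd --aio=io_uring -f raw disk.img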
MAINTAINERS  +9 −0
@@ -2634,6 +2634,15 @@ F: block/file-posix.c
F: block/file-win32.c
F: block/win32-aio.c

Linux io_uring
M: Aarushi Mehta <mehta.aaru20@gmail.com>
M: Julia Suvorova <jusual@redhat.com>
M: Stefan Hajnoczi <stefanha@redhat.com>
L: qemu-block@nongnu.org
S: Maintained
F: block/io_uring.c
F: stubs/io_uring.c

qcow2
M: Kevin Wolf <kwolf@redhat.com>
M: Max Reitz <mreitz@redhat.com>
block.c  +22 −0
@@ -845,6 +845,28 @@ static BlockdevDetectZeroesOptions bdrv_parse_detect_zeroes(QemuOpts *opts,
    return detect_zeroes;
}

/**
 * Set open flags for aio engine
 *
 * Return 0 on success, -1 if the engine specified is invalid
 */
int bdrv_parse_aio(const char *mode, int *flags)
{
    if (!strcmp(mode, "threads")) {
        /* do nothing, default */
    } else if (!strcmp(mode, "native")) {
        *flags |= BDRV_O_NATIVE_AIO;
#ifdef CONFIG_LINUX_IO_URING
    } else if (!strcmp(mode, "io_uring")) {
        *flags |= BDRV_O_IO_URING;
#endif
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given discard mode
 *
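
A usage sketch, not part of the diff: a command-line frontend could apply the new helper roughly as follows (optarg and the error path here are illustrative placeholders, though the qemu-io, qemu-img and qemu-nbd patches in this series follow the same pattern):

    int flags = 0;
    if (bdrv_parse_aio(optarg, &flags) < 0) {
        error_report("Invalid aio option '%s'", optarg);
        exit(EXIT_FAILURE);
    }
    /* flags now carries BDRV_O_NATIVE_AIO or BDRV_O_IO_URING if requested */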
block/Makefile.objs  +3 −0
@@ -18,6 +18,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o
block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o
block-obj-$(CONFIG_POSIX) += file-posix.o
block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
block-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o
block-obj-y += null.o mirror.o commit.o io.o create.o
block-obj-y += throttle-groups.o
block-obj-$(CONFIG_LINUX) += nvme.o
@@ -66,5 +67,7 @@ block-obj-$(if $(CONFIG_LZFSE),m,n) += dmg-lzfse.o
dmg-lzfse.o-libs   := $(LZFSE_LIBS)
qcow.o-libs        := -lz
linux-aio.o-libs   := -laio
io_uring.o-cflags  := $(LINUX_IO_URING_CFLAGS)
io_uring.o-libs    := $(LINUX_IO_URING_LIBS)
parallels.o-cflags := $(LIBXML2_CFLAGS)
parallels.o-libs   := $(LIBXML2_LIBS)
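
For completeness: CONFIG_LINUX_IO_URING and the LINUX_IO_URING_CFLAGS/LIBS variables referenced above come from the "configure: permit use of io_uring" patch in this series, which probes for liburing. Assuming the liburing development package is installed, a build would be configured along the lines of:

    ./configure --enable-linux-io-uring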
block/file-posix.c  +79 −20
@@ -156,6 +156,7 @@ typedef struct BDRVRawState {
    bool has_write_zeroes:1;
    bool discard_zeroes:1;
    bool use_linux_aio:1;
    bool use_linux_io_uring:1;
    bool page_cache_inconsistent:1;
    bool has_fallocate;
    bool needs_alignment;
@@ -444,7 +445,7 @@ static QemuOptsList raw_runtime_opts = {
        {
            .name = "aio",
            .type = QEMU_OPT_STRING,
            .help = "host AIO implementation (threads, native)",
            .help = "host AIO implementation (threads, native, io_uring)",
        },
        {
            .name = "locking",
@@ -503,9 +504,16 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
        goto fail;
    }

-    aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO)
-                  ? BLOCKDEV_AIO_OPTIONS_NATIVE
-                  : BLOCKDEV_AIO_OPTIONS_THREADS;
+    if (bdrv_flags & BDRV_O_NATIVE_AIO) {
+        aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE;
+#ifdef CONFIG_LINUX_IO_URING
+    } else if (bdrv_flags & BDRV_O_IO_URING) {
+        aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING;
+#endif
+    } else {
+        aio_default = BLOCKDEV_AIO_OPTIONS_THREADS;
+    }

    aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
                          qemu_opt_get(opts, "aio"),
                          aio_default, &local_err);
@@ -514,7 +522,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
        ret = -EINVAL;
        goto fail;
    }

    s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
#ifdef CONFIG_LINUX_IO_URING
    s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
#endif

    locking = qapi_enum_parse(&OnOffAuto_lookup,
                              qemu_opt_get(opts, "locking"),
@@ -600,6 +612,22 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
    }
#endif /* !defined(CONFIG_LINUX_AIO) */

#ifdef CONFIG_LINUX_IO_URING
    if (s->use_linux_io_uring) {
        if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) {
            error_prepend(errp, "Unable to use io_uring: ");
            goto fail;
        }
    }
#else
    if (s->use_linux_io_uring) {
        error_setg(errp, "aio=io_uring was specified, but is not supported "
                         "in this build.");
        ret = -EINVAL;
        goto fail;
    }
#endif /* !defined(CONFIG_LINUX_IO_URING) */

    s->has_discard = true;
    s->has_write_zeroes = true;
    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
@@ -1877,14 +1905,19 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
        return -EIO;

    /*
-     * Check if the underlying device requires requests to be aligned,
-     * and if the request we are trying to submit is aligned or not.
-     * If this is the case tell the low-level driver that it needs
-     * to copy the buffer.
+     * When using O_DIRECT, the request must be aligned to be able to use
+     * either libaio or io_uring interface. If not fail back to regular thread
+     * pool read/write code which emulates this for us if we
+     * set QEMU_AIO_MISALIGNED.
     */
-    if (s->needs_alignment) {
-        if (!bdrv_qiov_is_aligned(bs, qiov)) {
+    if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
        type |= QEMU_AIO_MISALIGNED;
#ifdef CONFIG_LINUX_IO_URING
    } else if (s->use_linux_io_uring) {
        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
        assert(qiov->size == bytes);
        return luring_co_submit(bs, aio, s->fd, offset, qiov, type);
#endif
#ifdef CONFIG_LINUX_AIO
    } else if (s->use_linux_aio) {
        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
@@ -1892,7 +1925,6 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
        return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
#endif
-        }
    }

    acb = (RawPosixAIOData) {
        .bs             = bs,
@@ -1927,24 +1959,36 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,

static void raw_aio_plug(BlockDriverState *bs)
{
+    BDRVRawState __attribute__((unused)) *s = bs->opaque;
#ifdef CONFIG_LINUX_AIO
-    BDRVRawState *s = bs->opaque;
    if (s->use_linux_aio) {
        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
        laio_io_plug(bs, aio);
    }
#endif
#ifdef CONFIG_LINUX_IO_URING
    if (s->use_linux_io_uring) {
        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
        luring_io_plug(bs, aio);
    }
#endif
}

static void raw_aio_unplug(BlockDriverState *bs)
{
+    BDRVRawState __attribute__((unused)) *s = bs->opaque;
#ifdef CONFIG_LINUX_AIO
-    BDRVRawState *s = bs->opaque;
    if (s->use_linux_aio) {
        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
        laio_io_unplug(bs, aio);
    }
#endif
#ifdef CONFIG_LINUX_IO_URING
    if (s->use_linux_io_uring) {
        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
        luring_io_unplug(bs, aio);
    }
#endif
}

static int raw_co_flush_to_disk(BlockDriverState *bs)
@@ -1964,14 +2008,20 @@ static int raw_co_flush_to_disk(BlockDriverState *bs)
        .aio_type       = QEMU_AIO_FLUSH,
    };

#ifdef CONFIG_LINUX_IO_URING
    if (s->use_linux_io_uring) {
        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
        return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH);
    }
#endif
    return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
}

static void raw_aio_attach_aio_context(BlockDriverState *bs,
                                       AioContext *new_context)
{
+    BDRVRawState __attribute__((unused)) *s = bs->opaque;
#ifdef CONFIG_LINUX_AIO
-    BDRVRawState *s = bs->opaque;
    if (s->use_linux_aio) {
        Error *local_err = NULL;
        if (!aio_setup_linux_aio(new_context, &local_err)) {
@@ -1981,6 +2031,16 @@ static void raw_aio_attach_aio_context(BlockDriverState *bs,
        }
    }
#endif
#ifdef CONFIG_LINUX_IO_URING
    if (s->use_linux_io_uring) {
        Error *local_err;
        if (!aio_setup_linux_io_uring(new_context, &local_err)) {
            error_reportf_err(local_err, "Unable to use linux io_uring, "
                                         "falling back to thread pool: ");
            s->use_linux_io_uring = false;
        }
    }
#endif
}

static void raw_close(BlockDriverState *bs)
@@ -2753,7 +2813,6 @@ raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes,
        req->overlap_bytes = req->bytes;

        bdrv_mark_request_serialising(req, bs->bl.request_alignment);
-        bdrv_wait_serialising_requests(req);
    }
#endif

block/io.c  +80 −82
@@ -715,12 +715,69 @@ static void tracked_request_begin(BdrvTrackedRequest *req,
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

-void bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, uint64_t bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

static bool coroutine_fn
bdrv_wait_serialising_requests_locked(BlockDriverState *bs,
                                      BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);
    return waited;
}

bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    BlockDriverState *bs = req->bs;
    int64_t overlap_offset = req->offset & ~(align - 1);
    uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;
    bool waited;

    qemu_co_mutex_lock(&bs->reqs_lock);
    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
@@ -728,18 +785,9 @@ void bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
-}
-
-static bool is_request_serialising_and_aligned(BdrvTrackedRequest *req)
-{
-    /*
-     * If the request is serialising, overlap_offset and overlap_bytes are set,
-     * so we can check if the request is aligned. Otherwise, don't care and
-     * return false.
-     */
-
-    return req->serialising && (req->offset == req->overlap_offset) &&
-           (req->bytes == req->overlap_bytes);
+    waited = bdrv_wait_serialising_requests_locked(bs, req);
+    qemu_co_mutex_unlock(&bs->reqs_lock);
+    return waited;
}

/**
@@ -793,20 +841,6 @@ static int bdrv_get_cluster_size(BlockDriverState *bs)
    }
}

-static bool tracked_request_overlaps(BdrvTrackedRequest *req,
-                                     int64_t offset, uint64_t bytes)
-{
-    /*        aaaa   bbbb */
-    if (offset >= req->overlap_offset + req->overlap_bytes) {
-        return false;
-    }
-    /* bbbb   aaaa        */
-    if (req->overlap_offset >= offset + bytes) {
-        return false;
-    }
-    return true;
-}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
@@ -823,48 +857,18 @@ void bdrv_dec_in_flight(BlockDriverState *bs)
    bdrv_wakeup(bs);
}

-bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
+static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
-    BdrvTrackedRequest *req;
-    bool retry;
-    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

-    do {
-        retry = false;
    qemu_co_mutex_lock(&bs->reqs_lock);
-        QLIST_FOREACH(req, &bs->tracked_requests, list) {
-            if (req == self || (!req->serialising && !self->serialising)) {
-                continue;
-            }
-            if (tracked_request_overlaps(req, self->overlap_offset,
-                                         self->overlap_bytes))
-            {
-                /* Hitting this means there was a reentrant request, for
-                 * example, a block driver issuing nested requests.  This must
-                 * never happen since it means deadlock.
-                 */
-                assert(qemu_coroutine_self() != req->co);
-
-                /* If the request is already (indirectly) waiting for us, or
-                 * will wait for us as soon as it wakes up, then just go on
-                 * (instead of producing a deadlock in the former case). */
-                if (!req->waiting_for) {
-                    self->waiting_for = req;
-                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
-                    self->waiting_for = NULL;
-                    retry = true;
-                    waited = true;
-                    break;
-                }
-            }
-        }
+    waited = bdrv_wait_serialising_requests_locked(bs, self);
    qemu_co_mutex_unlock(&bs->reqs_lock);
-    } while (retry);

    return waited;
}
@@ -1445,8 +1449,7 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers.  For now, there aren't any
     * passthrough flags.  */
-    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ |
-                       BDRV_REQ_PREFETCH)));
+    assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
@@ -1456,12 +1459,7 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
-    }
-
-    /* BDRV_REQ_SERIALISING is only for write operation */
-    assert(!(flags & BDRV_REQ_SERIALISING));
-
-    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
+    } else {
        bdrv_wait_serialising_requests(req);
    }

@@ -1711,7 +1709,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before write operation */
-    if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
+    if (atomic_read(&bs->copy_on_read)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

@@ -1852,20 +1850,24 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
        return -EPERM;
    }

-    /* BDRV_REQ_NO_SERIALISING is only for read operation */
-    assert(!(flags & BDRV_REQ_NO_SERIALISING));
    assert(!(bs->open_flags & BDRV_O_INACTIVE));
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));

    if (flags & BDRV_REQ_SERIALISING) {
-        bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
+        waited = bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
        /*
         * For a misaligned request we should have already waited earlier,
         * because we come after bdrv_padding_rmw_read which must be called
         * with the request already marked as serialising.
         */
        assert(!waited ||
               (req->offset == req->overlap_offset &&
                req->bytes == req->overlap_bytes));
    } else {
        bdrv_wait_serialising_requests(req);
    }

-    waited = bdrv_wait_serialising_requests(req);
-
-    assert(!waited || !req->serialising ||
-           is_request_serialising_and_aligned(req));
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
@@ -2027,7 +2029,6 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
    padding = bdrv_init_padding(bs, offset, bytes, &pad);
    if (padding) {
        bdrv_mark_request_serialising(req, align);
-        bdrv_wait_serialising_requests(req);

        bdrv_padding_rmw_read(child, req, &pad, true);

@@ -2130,7 +2131,6 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,

    if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) {
        bdrv_mark_request_serialising(&req, align);
-        bdrv_wait_serialising_requests(&req);
        bdrv_padding_rmw_read(child, &req, &pad, false);
    }

@@ -3222,9 +3222,7 @@ static int coroutine_fn bdrv_co_copy_range_internal(

        /* BDRV_REQ_SERIALISING is only for write operation */
        assert(!(read_flags & BDRV_REQ_SERIALISING));
-        if (!(read_flags & BDRV_REQ_NO_SERIALISING)) {
        bdrv_wait_serialising_requests(&req);
-        }

        ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
                                                    src, src_offset,
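
To make the moved overlap predicate concrete, here is a self-contained sketch (illustrative, not QEMU code) of the same half-open interval test that tracked_request_overlaps() performs:

    #include <stdbool.h>
    #include <stdint.h>

    /* Byte ranges [a, a+alen) and [b, b+blen) overlap unless one
     * ends at or before the point where the other begins. */
    static bool ranges_overlap(int64_t a, uint64_t alen,
                               int64_t b, uint64_t blen)
    {
        if (a >= b + (int64_t)blen) {
            return false;    /*        bbbb   aaaa */
        }
        if (b >= a + (int64_t)alen) {
            return false;    /* aaaa   bbbb        */
        }
        return true;
    }

For example, ranges_overlap(0, 4096, 4096, 512) is false (the ranges are merely adjacent), while ranges_overlap(0, 4096, 2048, 4096) is true.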