Commit 25493dc0 authored by Kevin Wolf
Browse files

Merge remote-tracking branch 'mreitz/tags/pull-block-2016-10-24' into queue-block



Block patches for master

# gpg: Signature made Mon Oct 24 17:56:44 2016 CEST
# gpg:                using RSA key 0xF407DB0061D5CF40
# gpg: Good signature from "Max Reitz <mreitz@redhat.com>"
# Primary key fingerprint: 91BE B60A 30DB 3E88 57D1  1829 F407 DB00 61D5 CF40

* mreitz/tags/pull-block-2016-10-24:
  block/replication: Clarify 'top-id' parameter usage
  block: More operations for meta dirty bitmap
  tests: Add test code for hbitmap serialization
  block: BdrvDirtyBitmap serialization interface
  hbitmap: serialization
  block: Assert that bdrv_release_dirty_bitmap succeeded
  block: Add two dirty bitmap getters
  block: Support meta dirty bitmap
  tests: Add test code for meta bitmap
  HBitmap: Introduce "meta" bitmap to track bit changes
  block: Hide HBitmap in block dirty bitmap interface
  quorum: do not allocate multiple iovecs for FIFO strategy
  quorum: change child_iter to children_read

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
parents 12ac9d9e f4f2539b
Loading
Loading
Loading
Loading
+8 −6
Original line number Diff line number Diff line
@@ -372,14 +372,14 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
    int64_t end;
    int64_t last_cluster = -1;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
    HBitmapIter hbi;
    BdrvDirtyBitmapIter *dbi;

    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
    clusters_per_iter = MAX((granularity / job->cluster_size), 1);
    bdrv_dirty_iter_init(job->sync_bitmap, &hbi);
    dbi = bdrv_dirty_iter_new(job->sync_bitmap, 0);

    /* Find the next dirty sector(s) */
    while ((sector = hbitmap_iter_next(&hbi)) != -1) {
    while ((sector = bdrv_dirty_iter_next(dbi)) != -1) {
        cluster = sector / sectors_per_cluster;

        /* Fake progress updates for any clusters we skipped */
@@ -391,7 +391,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
        for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
            do {
                if (yield_and_check(job)) {
                    return ret;
                    goto out;
                }
                ret = backup_do_cow(job, cluster * sectors_per_cluster,
                                    sectors_per_cluster, &error_is_read,
@@ -399,7 +399,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
                if ((ret < 0) &&
                    backup_error_action(job, error_is_read, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
                    return ret;
                    goto out;
                }
            } while (ret < 0);
        }
@@ -407,7 +407,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
        /* If the bitmap granularity is smaller than the backup granularity,
         * we need to advance the iterator pointer to the next cluster. */
        if (granularity < job->cluster_size) {
            bdrv_set_dirty_iter(&hbi, cluster * sectors_per_cluster);
            bdrv_set_dirty_iter(dbi, cluster * sectors_per_cluster);
        }

        last_cluster = cluster - 1;
@@ -419,6 +419,8 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
        job->common.offset += ((end - last_cluster - 1) * job->cluster_size);
    }

out:
    bdrv_dirty_iter_free(dbi);
    return ret;
}

+154 −6
Original line number Diff line number Diff line
@@ -38,13 +38,20 @@
 */
struct BdrvDirtyBitmap {
    HBitmap *bitmap;            /* Dirty sector bitmap implementation */
    HBitmap *meta;              /* Meta dirty bitmap */
    BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
    char *name;                 /* Optional non-empty unique ID */
    int64_t size;               /* Size of the bitmap (Number of sectors) */
    bool disabled;              /* Bitmap is read-only */
    int active_iterators;       /* How many iterators are active */
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

/* Iterator over the set bits of a BdrvDirtyBitmap (or of its meta bitmap;
 * see bdrv_dirty_iter_new() and bdrv_dirty_meta_iter_new()). */
struct BdrvDirtyBitmapIter {
    HBitmapIter hbi;            /* Underlying hbitmap iterator state */
    BdrvDirtyBitmap *bitmap;    /* Owning bitmap; referenced on free so the
                                 * active_iterators count stays accurate */
};

BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
{
    BdrvDirtyBitmap *bm;
@@ -97,6 +104,66 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
    return bitmap;
}

/* bdrv_create_meta_dirty_bitmap
 *
 * Create a meta dirty bitmap that tracks the changes of bits in @bitmap. I.e.
 * when a dirty status bit in @bitmap is changed (either from reset to set or
 * the other way around), its respective meta dirty bitmap bit will be marked
 * dirty as well.
 *
 * @bitmap: the block dirty bitmap for which to create a meta dirty bitmap.
 * @chunk_size: how many bytes of bitmap data does each bit in the meta bitmap
 * track.
 */
void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap,
                                   int chunk_size)
{
    /* Only one meta bitmap may exist at a time; it must be released with
     * bdrv_release_meta_dirty_bitmap() before creating a new one. */
    assert(!bitmap->meta);
    /* One meta bit covers @chunk_size bytes of bitmap data, i.e.
     * chunk_size * BITS_PER_BYTE bits of the underlying dirty bitmap. */
    bitmap->meta = hbitmap_create_meta(bitmap->bitmap,
                                       chunk_size * BITS_PER_BYTE);
}

/* Free the meta dirty bitmap previously attached to @bitmap by
 * bdrv_create_meta_dirty_bitmap(); one must currently exist. */
void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap)
{
    assert(bitmap->meta);
    /* hbitmap_free_meta() takes the parent bitmap, not the meta bitmap. */
    hbitmap_free_meta(bitmap->bitmap);
    bitmap->meta = NULL;
}

/* Return whether any meta dirty bitmap bit covering the sector range
 * [sector, sector + nb_sectors) is set.
 *
 * Returns true/false despite the int return type (mirrors hbitmap_get()).
 * NOTE(review): @bs is unused here — presumably kept for signature symmetry
 * with the other bdrv_dirty_bitmap_* functions; confirm against the header.
 */
int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs,
                               BdrvDirtyBitmap *bitmap, int64_t sector,
                               int nb_sectors)
{
    uint64_t i;
    /* Number of sectors represented by a single meta bitmap bit; stepping
     * by this amount probes each covering bit exactly once. */
    int sectors_per_bit = 1 << hbitmap_granularity(bitmap->meta);

    /* To optimize: we can make hbitmap to internally check the range in a
     * coarse level, or at least do it word by word. */
    for (i = sector; i < sector + nb_sectors; i += sectors_per_bit) {
        if (hbitmap_get(bitmap->meta, i)) {
            return true;
        }
    }
    return false;
}

/* Clear the meta dirty bitmap bits covering [sector, sector + nb_sectors).
 * NOTE(review): @bs is unused, matching bdrv_dirty_bitmap_get_meta(). */
void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
                                  BdrvDirtyBitmap *bitmap, int64_t sector,
                                  int nb_sectors)
{
    hbitmap_reset(bitmap->meta, sector, nb_sectors);
}

/* Return the size of @bitmap in sectors (see BdrvDirtyBitmap.size). */
int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap)
{
    return bitmap->size;
}

/* Return the bitmap's optional name; per the struct definition this is a
 * non-empty unique ID when present. */
const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap)
{
    return bitmap->name;
}

bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
{
    return bitmap->successor;
@@ -212,6 +279,7 @@ void bdrv_dirty_bitmap_truncate(BlockDriverState *bs)

    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        assert(!bdrv_dirty_bitmap_frozen(bitmap));
        assert(!bitmap->active_iterators);
        hbitmap_truncate(bitmap->bitmap, size);
        bitmap->size = size;
    }
@@ -224,7 +292,9 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) {
            assert(!bm->active_iterators);
            assert(!bdrv_dirty_bitmap_frozen(bm));
            assert(!bm->meta);
            QLIST_REMOVE(bm, list);
            hbitmap_free(bm->bitmap);
            g_free(bm->name);
@@ -235,6 +305,9 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
            }
        }
    }
    if (bitmap) {
        abort();
    }
}

void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
@@ -320,9 +393,43 @@ uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
    return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
}

void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
/* Like bdrv_dirty_bitmap_granularity(), but for the meta dirty bitmap:
 * granularity in bytes derived from the meta hbitmap's granularity. */
uint32_t bdrv_dirty_bitmap_meta_granularity(BdrvDirtyBitmap *bitmap)
{
    return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->meta);
}

BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap,
                                         uint64_t first_sector)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
    BdrvDirtyBitmapIter *iter = g_new(BdrvDirtyBitmapIter, 1);
    hbitmap_iter_init(&iter->hbi, bitmap->bitmap, first_sector);
    iter->bitmap = bitmap;
    bitmap->active_iterators++;
    return iter;
}

/* Create an iterator over @bitmap's meta dirty bitmap, starting at bit 0.
 * Increments the bitmap's active_iterators count; the caller must release
 * the iterator with bdrv_dirty_iter_free(). */
BdrvDirtyBitmapIter *bdrv_dirty_meta_iter_new(BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmapIter *iter = g_new(BdrvDirtyBitmapIter, 1);
    hbitmap_iter_init(&iter->hbi, bitmap->meta, 0);
    iter->bitmap = bitmap;
    bitmap->active_iterators++;
    return iter;
}

/* Release @iter and decrement the owning bitmap's active_iterators count.
 * A NULL @iter is a no-op. */
void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter)
{
    if (!iter) {
        return;
    }
    assert(iter->bitmap->active_iterators > 0);
    iter->bitmap->active_iterators--;
    g_free(iter);
}

/* Advance @iter and return the next set bit's position; callers in this
 * series treat a negative result (-1) as "no more dirty bits". */
int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter)
{
    return hbitmap_iter_next(&iter->hbi);
}

void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
@@ -360,6 +467,43 @@ void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in)
    hbitmap_free(tmp);
}

/* BdrvDirtyBitmap serialization interface: thin wrappers that forward to
 * the hbitmap serialization API on the underlying dirty bitmap, keeping
 * HBitmap hidden from callers. */

/* Bytes needed to serialize the [start, start + count) bit range. */
uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap,
                                              uint64_t start, uint64_t count)
{
    return hbitmap_serialization_size(bitmap->bitmap, start, count);
}

/* Alignment that serialization chunk boundaries must respect (forwards to
 * hbitmap_serialization_granularity()). */
uint64_t bdrv_dirty_bitmap_serialization_align(const BdrvDirtyBitmap *bitmap)
{
    return hbitmap_serialization_granularity(bitmap->bitmap);
}

/* Serialize the [start, start + count) bit range into @buf. */
void bdrv_dirty_bitmap_serialize_part(const BdrvDirtyBitmap *bitmap,
                                      uint8_t *buf, uint64_t start,
                                      uint64_t count)
{
    hbitmap_serialize_part(bitmap->bitmap, buf, start, count);
}

/* Deserialize the [start, start + count) bit range from @buf; @finish is
 * passed through to hbitmap_deserialize_part(). */
void bdrv_dirty_bitmap_deserialize_part(BdrvDirtyBitmap *bitmap,
                                        uint8_t *buf, uint64_t start,
                                        uint64_t count, bool finish)
{
    hbitmap_deserialize_part(bitmap->bitmap, buf, start, count, finish);
}

/* Deserialize the [start, start + count) range as all-zero bits. */
void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap,
                                          uint64_t start, uint64_t count,
                                          bool finish)
{
    hbitmap_deserialize_zeroes(bitmap->bitmap, start, count, finish);
}

/* Complete a deserialization sequence (forwards to
 * hbitmap_deserialize_finish()). */
void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap)
{
    hbitmap_deserialize_finish(bitmap->bitmap);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int64_t nr_sectors)
{
@@ -373,15 +517,19 @@ void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
}

/**
 * Advance an HBitmapIter to an arbitrary offset.
 * Advance a BdrvDirtyBitmapIter to an arbitrary offset.
 */
void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset)
void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *iter, int64_t sector_num)
{
    assert(hbi->hb);
    hbitmap_iter_init(hbi, hbi->hb, offset);
    hbitmap_iter_init(&iter->hbi, iter->hbi.hb, sector_num);
}

/* Number of set bits in the dirty bitmap (forwards to hbitmap_count()). */
int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}

/* Number of set bits in the meta dirty bitmap; a meta bitmap must have
 * been created (bitmap->meta is dereferenced unconditionally). */
int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->meta);
}
+13 −11
Original line number Diff line number Diff line
@@ -55,7 +55,7 @@ typedef struct MirrorBlockJob {
    int64_t bdev_length;
    unsigned long *cow_bitmap;
    BdrvDirtyBitmap *dirty_bitmap;
    HBitmapIter hbi;
    BdrvDirtyBitmapIter *dbi;
    uint8_t *buf;
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;
@@ -330,10 +330,10 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
    int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT,
                             MAX_IO_SECTORS);

    sector_num = hbitmap_iter_next(&s->hbi);
    sector_num = bdrv_dirty_iter_next(s->dbi);
    if (sector_num < 0) {
        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
        sector_num = hbitmap_iter_next(&s->hbi);
        bdrv_set_dirty_iter(s->dbi, 0);
        sector_num = bdrv_dirty_iter_next(s->dbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(sector_num >= 0);
    }
@@ -349,7 +349,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
    /* Find the number of consective dirty chunks following the first dirty
     * one, and wait for in flight requests in them. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
        int64_t hbitmap_next;
        int64_t next_dirty;
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        int64_t next_chunk = next_sector / sectors_per_chunk;
        if (next_sector >= end ||
@@ -360,13 +360,13 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
            break;
        }

        hbitmap_next = hbitmap_iter_next(&s->hbi);
        if (hbitmap_next > next_sector || hbitmap_next < 0) {
        next_dirty = bdrv_dirty_iter_next(s->dbi);
        if (next_dirty > next_sector || next_dirty < 0) {
            /* The bitmap iterator's cache is stale, refresh it */
            bdrv_set_dirty_iter(&s->hbi, next_sector);
            hbitmap_next = hbitmap_iter_next(&s->hbi);
            bdrv_set_dirty_iter(s->dbi, next_sector);
            next_dirty = bdrv_dirty_iter_next(s->dbi);
        }
        assert(hbitmap_next == next_sector);
        assert(next_dirty == next_sector);
        nb_chunks++;
    }

@@ -679,7 +679,8 @@ static void coroutine_fn mirror_run(void *opaque)
        }
    }

    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
    assert(!s->dbi);
    s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap, 0);
    for (;;) {
        uint64_t delay_ns = 0;
        int64_t cnt, delta;
@@ -793,6 +794,7 @@ immediate_exit:
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_dirty_iter_free(s->dbi);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);

    data = g_malloc(sizeof(*data));
+45 −48
Original line number Diff line number Diff line
@@ -130,7 +130,7 @@ struct QuorumAIOCB {

    bool is_read;
    int vote_ret;
    int child_iter;             /* which child to read in fifo pattern */
    int children_read;          /* how many children have been read from */
};

static bool quorum_vote(QuorumAIOCB *acb);
@@ -156,22 +156,7 @@ static AIOCBInfo quorum_aiocb_info = {

static void quorum_aio_finalize(QuorumAIOCB *acb)
{
    int i, ret = 0;

    if (acb->vote_ret) {
        ret = acb->vote_ret;
    }

    acb->common.cb(acb->common.opaque, ret);

    if (acb->is_read) {
        /* on the quorum case acb->child_iter == s->num_children - 1 */
        for (i = 0; i <= acb->child_iter; i++) {
            qemu_vfree(acb->qcrs[i].buf);
            qemu_iovec_destroy(&acb->qcrs[i].qiov);
        }
    }

    acb->common.cb(acb->common.opaque, acb->vote_ret);
    g_free(acb->qcrs);
    qemu_aio_unref(acb);
}
@@ -283,39 +268,52 @@ static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
    }
}

static void quorum_aio_cb(void *opaque, int ret)
static void quorum_report_bad_acb(QuorumChildRequest *sacb, int ret)
{
    QuorumChildRequest *sacb = opaque;
    QuorumAIOCB *acb = sacb->parent;
    BDRVQuorumState *s = acb->common.bs->opaque;
    bool rewrite = false;

    if (ret == 0) {
        acb->success_count++;
    } else {
        QuorumOpType type;
        type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE;
    QuorumOpType type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE;
    quorum_report_bad(type, acb->sector_num, acb->nb_sectors,
                      sacb->aiocb->bs->node_name, ret);
}

    if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
static void quorum_fifo_aio_cb(void *opaque, int ret)
{
    QuorumChildRequest *sacb = opaque;
    QuorumAIOCB *acb = sacb->parent;
    BDRVQuorumState *s = acb->common.bs->opaque;

    assert(acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO);

    if (ret < 0) {
        quorum_report_bad_acb(sacb, ret);

        /* We try to read next child in FIFO order if we fail to read */
        if (ret < 0 && (acb->child_iter + 1) < s->num_children) {
            acb->child_iter++;
        if (acb->children_read < s->num_children) {
            read_fifo_child(acb);
            return;
        }

        if (ret == 0) {
            quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qiov);
    }

    acb->vote_ret = ret;

    /* FIXME: rewrite failed children if acb->children_read > 1? */
    quorum_aio_finalize(acb);
        return;
}

static void quorum_aio_cb(void *opaque, int ret)
{
    QuorumChildRequest *sacb = opaque;
    QuorumAIOCB *acb = sacb->parent;
    BDRVQuorumState *s = acb->common.bs->opaque;
    bool rewrite = false;
    int i;

    sacb->ret = ret;
    if (ret == 0) {
        acb->success_count++;
    } else {
        quorum_report_bad_acb(sacb, ret);
    }
    acb->count++;
    assert(acb->count <= s->num_children);
    assert(acb->success_count <= s->num_children);
@@ -326,6 +324,10 @@ static void quorum_aio_cb(void *opaque, int ret)
    /* Do the vote on read */
    if (acb->is_read) {
        rewrite = quorum_vote(acb);
        for (i = 0; i < s->num_children; i++) {
            qemu_vfree(acb->qcrs[i].buf);
            qemu_iovec_destroy(&acb->qcrs[i].qiov);
        }
    } else {
        quorum_has_too_much_io_failed(acb);
    }
@@ -653,6 +655,7 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb)
    BDRVQuorumState *s = acb->common.bs->opaque;
    int i;

    acb->children_read = s->num_children;
    for (i = 0; i < s->num_children; i++) {
        acb->qcrs[i].buf = qemu_blockalign(s->children[i]->bs, acb->qiov->size);
        qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov);
@@ -671,16 +674,11 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb)
static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb)
{
    BDRVQuorumState *s = acb->common.bs->opaque;
    int n = acb->children_read++;

    acb->qcrs[acb->child_iter].buf =
        qemu_blockalign(s->children[acb->child_iter]->bs, acb->qiov->size);
    qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov);
    qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov,
                     acb->qcrs[acb->child_iter].buf);
    acb->qcrs[acb->child_iter].aiocb =
        bdrv_aio_readv(s->children[acb->child_iter], acb->sector_num,
                       &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors,
                       quorum_aio_cb, &acb->qcrs[acb->child_iter]);
    acb->qcrs[n].aiocb = bdrv_aio_readv(s->children[n], acb->sector_num,
                                        acb->qiov, acb->nb_sectors,
                                        quorum_fifo_aio_cb, &acb->qcrs[n]);

    return &acb->common;
}
@@ -696,13 +694,12 @@ static BlockAIOCB *quorum_aio_readv(BlockDriverState *bs,
    QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
                                      nb_sectors, cb, opaque);
    acb->is_read = true;
    acb->children_read = 0;

    if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
        acb->child_iter = s->num_children - 1;
        return read_quorum_children(acb);
    }

    acb->child_iter = 0;
    return read_fifo_child(acb);
}

+5 −0
Original line number Diff line number Diff line
@@ -101,6 +101,11 @@ static int replication_open(BlockDriverState *bs, QDict *options,

    if (!strcmp(mode, "primary")) {
        s->mode = REPLICATION_MODE_PRIMARY;
        top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
        if (top_id) {
            error_setg(&local_err, "The primary side does not support option top-id");
            goto fail;
        }
    } else if (!strcmp(mode, "secondary")) {
        s->mode = REPLICATION_MODE_SECONDARY;
        top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
Loading