Commit c6ac36e1 authored by Fam Zheng's avatar Fam Zheng Committed by Kevin Wolf
Browse files

vmdk: Optimize cluster allocation



This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.

Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.

This is not efficient, so it's now rewritten as:

  - Save the extent file length when opening.

  - When allocating cluster, use the saved length as cluster offset.

  - Don't truncate image, because we'll anyway write data there: just
    write any data at the EOF position, in descending priority:

    * New user data (cluster allocation happens in a write request).

    * Filling data in the beginning and/or ending of the new cluster, if
      not covered by user data: either backing file content (COW), or
      zero for standalone images.

One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:

    $ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk

    Before:
        real    0m21.796s
        user    0m0.130s
        sys     0m0.483s

    After:
        real    0m2.017s
        user    0m0.047s
        sys     0m0.190s

We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.

Tested that this passes qemu-iotests for all VMDK subformats.

Signed-off-by: default avatarFam Zheng <famz@redhat.com>
Signed-off-by: default avatarStefan Hajnoczi <stefanha@redhat.com>
parent a8d8a1a0
Loading
Loading
Loading
Loading
+140 −82
Original line number Diff line number Diff line
@@ -106,6 +106,7 @@ typedef struct VmdkExtent {
    uint32_t l2_cache_counts[L2_CACHE_SIZE];

    int64_t cluster_sectors;
    int64_t next_cluster_sector;
    char *type;
} VmdkExtent;

@@ -124,7 +125,6 @@ typedef struct BDRVVmdkState {
} BDRVVmdkState;

typedef struct VmdkMetaData {
    uint32_t offset;
    unsigned int l1_index;
    unsigned int l2_index;
    unsigned int l2_offset;
@@ -397,6 +397,7 @@ static int vmdk_add_extent(BlockDriverState *bs,
{
    VmdkExtent *extent;
    BDRVVmdkState *s = bs->opaque;
    int64_t length;

    if (cluster_sectors > 0x200000) {
        /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
@@ -412,6 +413,11 @@ static int vmdk_add_extent(BlockDriverState *bs,
        return -EFBIG;
    }

    length = bdrv_getlength(file);
    if (length < 0) {
        return length;
    }

    s->extents = g_realloc(s->extents,
                              (s->num_extents + 1) * sizeof(VmdkExtent));
    extent = &s->extents[s->num_extents];
@@ -427,6 +433,8 @@ static int vmdk_add_extent(BlockDriverState *bs,
    extent->l1_entry_sectors = l2_size * cluster_sectors;
    extent->l2_size = l2_size;
    extent->cluster_sectors = flat ? sectors : cluster_sectors;
    extent->next_cluster_sector =
        ROUND_UP(DIV_ROUND_UP(length, BDRV_SECTOR_SIZE), cluster_sectors);

    if (s->num_extents > 1) {
        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
@@ -951,57 +959,97 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
    }
}

/**
 * get_whole_cluster
 *
 * Copy backing file's cluster that covers @sector_num, otherwise write zero,
 * to the cluster at @cluster_sector_num.
 *
 * If @skip_start_sector < @skip_end_sector, the relative range
 * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave
 * it for call to write user data in the request.
 */
static int get_whole_cluster(BlockDriverState *bs,
                             VmdkExtent *extent,
                uint64_t cluster_offset,
                uint64_t offset,
                bool allocate)
                             uint64_t cluster_sector_num,
                             uint64_t sector_num,
                             uint64_t skip_start_sector,
                             uint64_t skip_end_sector)
{
    int ret = VMDK_OK;
    uint8_t *whole_grain = NULL;
    int64_t cluster_bytes;
    uint8_t *whole_grain;

    /* For COW, align request sector_num to cluster start */
    sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors);
    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
    whole_grain = qemu_blockalign(bs, cluster_bytes);

    if (!bs->backing_hd) {
        memset(whole_grain, 0,  skip_start_sector << BDRV_SECTOR_BITS);
        memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0,
               cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS));
    }

    assert(skip_end_sector <= extent->cluster_sectors);
    /* we will be here if it's first write on non-exist grain(cluster).
     * try to read from parent image, if exist */
    if (bs->backing_hd) {
        whole_grain =
            qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS);
        if (!vmdk_is_cid_valid(bs)) {
    if (bs->backing_hd && !vmdk_is_cid_valid(bs)) {
        ret = VMDK_ERROR;
        goto exit;
    }

        /* floor offset to cluster */
        offset -= offset % (extent->cluster_sectors * 512);
        ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
                extent->cluster_sectors);
    /* Read backing data before skip range */
    if (skip_start_sector > 0) {
        if (bs->backing_hd) {
            ret = bdrv_read(bs->backing_hd, sector_num,
                            whole_grain, skip_start_sector);
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }

        /* Write grain only into the active image */
        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
                extent->cluster_sectors);
        }
        ret = bdrv_write(extent->file, cluster_sector_num, whole_grain,
                         skip_start_sector);
        if (ret < 0) {
            ret = VMDK_ERROR;
            goto exit;
        }
    }
    /* Read backing data after skip range */
    if (skip_end_sector < extent->cluster_sectors) {
        if (bs->backing_hd) {
            ret = bdrv_read(bs->backing_hd, sector_num + skip_end_sector,
                            whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
                            extent->cluster_sectors - skip_end_sector);
            if (ret < 0) {
                ret = VMDK_ERROR;
                goto exit;
            }
        }
        ret = bdrv_write(extent->file, cluster_sector_num + skip_end_sector,
                         whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
                         extent->cluster_sectors - skip_end_sector);
        if (ret < 0) {
            ret = VMDK_ERROR;
            goto exit;
        }
    }

exit:
    qemu_vfree(whole_grain);
    return ret;
}

static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
                         uint32_t offset)
{
    uint32_t offset;
    QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
    offset = cpu_to_le32(m_data->offset);
    offset = cpu_to_le32(offset);
    /* update L2 table */
    if (bdrv_pwrite_sync(
                extent->file,
                ((int64_t)m_data->l2_offset * 512)
                    + (m_data->l2_index * sizeof(m_data->offset)),
                    + (m_data->l2_index * sizeof(offset)),
                &offset, sizeof(offset)) < 0) {
        return VMDK_ERROR;
    }
@@ -1011,7 +1059,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
        if (bdrv_pwrite_sync(
                    extent->file,
                    ((int64_t)m_data->l2_offset * 512)
                        + (m_data->l2_index * sizeof(m_data->offset)),
                        + (m_data->l2_index * sizeof(offset)),
                    &offset, sizeof(offset)) < 0) {
            return VMDK_ERROR;
        }
@@ -1023,17 +1071,41 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
    return VMDK_OK;
}

/**
 * get_cluster_offset
 *
 * Look up cluster offset in extent file by sector number, and store in
 * @cluster_offset.
 *
 * For flat extents, the start offset as parsed from the description file is
 * returned.
 *
 * For sparse extents, look up in L1, L2 table. If allocate is true, return an
 * offset for a new cluster and update L2 cache. If there is a backing file,
 * COW is done before returning; otherwise, zeroes are written to the allocated
 * cluster. Both COW and zero writing skips the sector range
 * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
 * has new data to write there.
 *
 * Returns: VMDK_OK if cluster exists and mapped in the image.
 *          VMDK_UNALLOC if cluster is not mapped and @allocate is false.
 *          VMDK_ERROR if failed.
 */
static int get_cluster_offset(BlockDriverState *bs,
                              VmdkExtent *extent,
                              VmdkMetaData *m_data,
                              uint64_t offset,
                                    int allocate,
                                    uint64_t *cluster_offset)
                              bool allocate,
                              uint64_t *cluster_offset,
                              uint64_t skip_start_sector,
                              uint64_t skip_end_sector)
{
    unsigned int l1_index, l2_offset, l2_index;
    int min_index, i, j;
    uint32_t min_count, *l2_table;
    bool zeroed = false;
    int64_t ret;
    int32_t cluster_sector;

    if (m_data) {
        m_data->valid = 0;
@@ -1087,52 +1159,41 @@ static int get_cluster_offset(BlockDriverState *bs,
    extent->l2_cache_counts[min_index] = 1;
 found:
    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
    *cluster_offset = le32_to_cpu(l2_table[l2_index]);
    cluster_sector = le32_to_cpu(l2_table[l2_index]);

    if (m_data) {
        m_data->valid = 1;
        m_data->l1_index = l1_index;
        m_data->l2_index = l2_index;
        m_data->offset = *cluster_offset;
        m_data->l2_offset = l2_offset;
        m_data->l2_cache_entry = &l2_table[l2_index];
    }
    if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
    if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
        zeroed = true;
    }

    if (!*cluster_offset || zeroed) {
    if (!cluster_sector || zeroed) {
        if (!allocate) {
            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
        }

        /* Avoid the L2 tables update for the images that have snapshots. */
        *cluster_offset = bdrv_getlength(extent->file);
        if (!extent->compressed) {
            bdrv_truncate(
                extent->file,
                *cluster_offset + (extent->cluster_sectors << 9)
            );
        }

        *cluster_offset >>= 9;
        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
        cluster_sector = extent->next_cluster_sector;
        extent->next_cluster_sector += extent->cluster_sectors;

        /* First of all we write grain itself, to avoid race condition
         * that may to corrupt the image.
         * This problem may occur because of insufficient space on host disk
         * or inappropriate VM shutdown.
         */
        if (get_whole_cluster(
                bs, extent, *cluster_offset, offset, allocate) == -1) {
            return VMDK_ERROR;
        }

        if (m_data) {
            m_data->offset = *cluster_offset;
        ret = get_whole_cluster(bs, extent,
                                cluster_sector,
                                offset >> BDRV_SECTOR_BITS,
                                skip_start_sector, skip_end_sector);
        if (ret) {
            return ret;
        }
    }
    *cluster_offset <<= 9;
    *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
    return VMDK_OK;
}

@@ -1167,7 +1228,8 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
    }
    qemu_co_mutex_lock(&s->lock);
    ret = get_cluster_offset(bs, extent, NULL,
                            sector_num * 512, 0, &offset);
                             sector_num * 512, false, &offset,
                             0, 0);
    qemu_co_mutex_unlock(&s->lock);

    switch (ret) {
@@ -1320,9 +1382,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
        if (!extent) {
            return -EIO;
        }
        ret = get_cluster_offset(
                            bs, extent, NULL,
                            sector_num << 9, 0, &cluster_offset);
        ret = get_cluster_offset(bs, extent, NULL,
                                 sector_num << 9, false, &cluster_offset,
                                 0, 0);
        extent_begin_sector = extent->end_sector - extent->sectors;
        extent_relative_sector_num = sector_num - extent_begin_sector;
        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
@@ -1403,12 +1465,17 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
        if (!extent) {
            return -EIO;
        }
        ret = get_cluster_offset(
                                bs,
                                extent,
                                &m_data,
                                sector_num << 9, !extent->compressed,
                                &cluster_offset);
        extent_begin_sector = extent->end_sector - extent->sectors;
        extent_relative_sector_num = sector_num - extent_begin_sector;
        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
        n = extent->cluster_sectors - index_in_cluster;
        if (n > nb_sectors) {
            n = nb_sectors;
        }
        ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
                                 !(extent->compressed || zeroed),
                                 &cluster_offset,
                                 index_in_cluster, index_in_cluster + n);
        if (extent->compressed) {
            if (ret == VMDK_OK) {
                /* Refuse write to allocated cluster for streamOptimized */
@@ -1417,24 +1484,13 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                return -EIO;
            } else {
                /* allocate */
                ret = get_cluster_offset(
                                        bs,
                                        extent,
                                        &m_data,
                                        sector_num << 9, 1,
                                        &cluster_offset);
                ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
                                         true, &cluster_offset, 0, 0);
            }
        }
        if (ret == VMDK_ERROR) {
            return -EINVAL;
        }
        extent_begin_sector = extent->end_sector - extent->sectors;
        extent_relative_sector_num = sector_num - extent_begin_sector;
        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
        n = extent->cluster_sectors - index_in_cluster;
        if (n > nb_sectors) {
            n = nb_sectors;
        }
        if (zeroed) {
            /* Do zeroed write, buf is ignored */
            if (extent->has_zero_grain &&
@@ -1442,9 +1498,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                    n >= extent->cluster_sectors) {
                n = extent->cluster_sectors;
                if (!zero_dry_run) {
                    m_data.offset = VMDK_GTE_ZEROED;
                    /* update L2 tables */
                    if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
                            != VMDK_OK) {
                        return -EIO;
                    }
                }
@@ -1460,7 +1516,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
            }
            if (m_data.valid) {
                /* update L2 tables */
                if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
                if (vmdk_L2update(extent, &m_data,
                                  cluster_offset >> BDRV_SECTOR_BITS)
                        != VMDK_OK) {
                    return -EIO;
                }
            }
@@ -2019,7 +2077,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
        }
        ret = get_cluster_offset(bs, extent, NULL,
                                 sector_num << BDRV_SECTOR_BITS,
                                 0, &cluster_offset);
                                 false, &cluster_offset, 0, 0);
        if (ret == VMDK_ERROR) {
            fprintf(stderr,
                    "ERROR: could not get cluster_offset for sector %"