Commit 32b6444d authored by Max Reitz's avatar Max Reitz Committed by Kevin Wolf
Browse files

qcow2-cluster: Expand zero clusters



Add functionality for expanding zero clusters. This is necessary for
downgrading the image version to one without zero cluster support.

For non-backed images, this function may also just discard zero clusters
instead of truly expanding them.

Signed-off-by: default avatarMax Reitz <mreitz@redhat.com>
Signed-off-by: default avatarKevin Wolf <kwolf@redhat.com>
parent e7108fea
Loading
Loading
Loading
Loading
+233 −0
Original line number Diff line number Diff line
@@ -1497,3 +1497,236 @@ fail:

    return ret;
}

/*
 * Expands all zero clusters in a specific L1 table (or deallocates them, for
 * non-backed non-pre-allocated zero clusters).
 *
 * expanded_clusters is a bitmap where every bit corresponds to one cluster in
 * the image file; a bit gets set if the corresponding cluster has been used for
 * zero expansion (i.e., has been filled with zeroes and is referenced from an
 * L2 table). nb_clusters contains the total cluster count of the image file,
 * i.e., the number of bits in expanded_clusters.
 */
static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                                      int l1_size, uint8_t *expanded_clusters,
                                      uint64_t nb_clusters)
{
    BDRVQcowState *s = bs->opaque;
    bool is_active_l1 = (l1_table == s->l1_table);
    uint64_t *l2_table = NULL;
    int ret;
    int i, j;

    if (!is_active_l1) {
        /* inactive L2 tables require a buffer to be stored in when loading
         * them from disk */
        l2_table = qemu_blockalign(bs, s->cluster_size);
    }

    for (i = 0; i < l1_size; i++) {
        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
        bool l2_dirty = false;

        if (!l2_offset) {
            /* unallocated */
            continue;
        }

        if (is_active_l1) {
            /* get active L2 tables from cache */
            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                    (void **)&l2_table);
        } else {
            /* load inactive L2 tables from disk */
            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
                    (void *)l2_table, s->cluster_sectors);
        }
        if (ret < 0) {
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index;
            int cluster_type = qcow2_get_cluster_type(l2_entry);

            if (cluster_type == QCOW2_CLUSTER_NORMAL) {
                cluster_index = offset >> s->cluster_bits;
                assert((cluster_index >= 0) && (cluster_index < nb_clusters));
                if (expanded_clusters[cluster_index / 8] &
                    (1 << (cluster_index % 8))) {
                    /* Probably a shared L2 table; this cluster was a zero
                     * cluster which has been expanded, its refcount
                     * therefore most likely requires an update. */
                    ret = qcow2_update_cluster_refcount(bs, cluster_index, 1,
                                                        QCOW2_DISCARD_NEVER);
                    if (ret < 0) {
                        goto fail;
                    }
                    /* Since we just increased the refcount, the COPIED flag may
                     * no longer be set. */
                    l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED);
                    l2_dirty = true;
                }
                continue;
            }
            else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) {
                continue;
            }

            if (!offset) {
                /* not preallocated */
                if (!bs->backing_hd) {
                    /* not backed; therefore we can simply deallocate the
                     * cluster */
                    l2_table[j] = 0;
                    l2_dirty = true;
                    continue;
                }

                offset = qcow2_alloc_clusters(bs, s->cluster_size);
                if (offset < 0) {
                    ret = offset;
                    goto fail;
                }
            }

            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
                                                offset, s->cluster_size);
            if (ret < 0) {
                qcow2_free_clusters(bs, offset, s->cluster_size,
                        QCOW2_DISCARD_ALWAYS);
                goto fail;
            }

            ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE,
                                    s->cluster_sectors);
            if (ret < 0) {
                qcow2_free_clusters(bs, offset, s->cluster_size,
                        QCOW2_DISCARD_ALWAYS);
                goto fail;
            }

            l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
            l2_dirty = true;

            cluster_index = offset >> s->cluster_bits;
            assert((cluster_index >= 0) && (cluster_index < nb_clusters));
            expanded_clusters[cluster_index / 8] |= 1 << (cluster_index % 8);
        }

        if (is_active_l1) {
            if (l2_dirty) {
                qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                qcow2_cache_depends_on_flush(s->l2_table_cache);
            }
            ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
            if (ret < 0) {
                l2_table = NULL;
                goto fail;
            }
        } else {
            if (l2_dirty) {
                ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT &
                        ~(QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2), l2_offset,
                        s->cluster_size);
                if (ret < 0) {
                    goto fail;
                }

                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
                        (void *)l2_table, s->cluster_sectors);
                if (ret < 0) {
                    goto fail;
                }
            }
        }
    }

    ret = 0;

fail:
    if (l2_table) {
        if (!is_active_l1) {
            qemu_vfree(l2_table);
        } else {
            if (ret < 0) {
                qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
            } else {
                ret = qcow2_cache_put(bs, s->l2_table_cache,
                        (void **)&l2_table);
            }
        }
    }
    return ret;
}

/*
 * For backed images, expands all zero clusters on the image. For non-backed
 * images, deallocates all non-pre-allocated zero clusters (and claims the
 * allocation for pre-allocated ones). This is important for downgrading to a
 * qcow2 version which doesn't yet support metadata zero clusters.
 */
int qcow2_expand_zero_clusters(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table = NULL;
    int cluster_to_sector_bits = s->cluster_bits - BDRV_SECTOR_BITS;
    uint64_t nb_clusters;
    uint8_t *expanded_clusters;
    int ret;
    int i, j;

    nb_clusters = (bs->total_sectors + (1 << cluster_to_sector_bits) - 1)
            >> cluster_to_sector_bits;
    expanded_clusters = g_malloc0((nb_clusters + 7) / 8);

    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
                                     expanded_clusters, nb_clusters);
    if (ret < 0) {
        goto fail;
    }

    /* Inactive L1 tables may point to active L2 tables - therefore it is
     * necessary to flush the L2 table cache before trying to access the L2
     * tables pointed to by inactive L1 entries (else we might try to expand
     * zero clusters that have already been expanded); furthermore, it is also
     * necessary to empty the L2 table cache, since it may contain tables which
     * are now going to be modified directly on disk, bypassing the cache.
     * qcow2_cache_empty() does both for us. */
    ret = qcow2_cache_empty(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < s->nb_snapshots; i++) {
        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;

        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);

        ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset /
                BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors);
        if (ret < 0) {
            goto fail;
        }

        for (j = 0; j < s->snapshots[i].l1_size; j++) {
            be64_to_cpus(&l1_table[j]);
        }

        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
                                         expanded_clusters, nb_clusters);
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;

fail:
    g_free(expanded_clusters);
    g_free(l1_table);
    return ret;
}
+15 −14
Original line number Diff line number Diff line
@@ -601,7 +601,7 @@ fail:
 * If the return value is non-negative, it is the new refcount of the cluster.
 * If it is negative, it is -errno and indicates an error.
 */
static int update_cluster_refcount(BlockDriverState *bs,
int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                  int64_t cluster_index,
                                  int addend,
                                  enum qcow2_discard_type type)
@@ -733,7 +733,7 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
        if (free_in_cluster == 0)
            s->free_byte_offset = 0;
        if ((offset & (s->cluster_size - 1)) != 0)
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
                                          QCOW2_DISCARD_NEVER);
    } else {
        offset = qcow2_alloc_clusters(bs, s->cluster_size);
@@ -744,7 +744,7 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
        if ((cluster_offset + s->cluster_size) == offset) {
            /* we are lucky: contiguous data */
            offset = s->free_byte_offset;
            update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
                                          QCOW2_DISCARD_NEVER);
            s->free_byte_offset += size;
        } else {
@@ -754,8 +754,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
    }

    /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
     * or explicitly by update_cluster_refcount().  Refcount blocks must be
     * flushed before the caller's L2 table updates.
     * or explicitly by qcow2_update_cluster_refcount().  Refcount blocks must
     * be flushed before the caller's L2 table updates.
     */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
    return offset;
@@ -896,7 +896,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                            break;
                        }
                        if (addend != 0) {
                            refcount = update_cluster_refcount(bs, cluster_index, addend,
                            refcount = qcow2_update_cluster_refcount(bs,
                                    cluster_index, addend,
                                    QCOW2_DISCARD_SNAPSHOT);
                        } else {
                            refcount = get_refcount(bs, cluster_index);
@@ -936,8 +937,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,


            if (addend != 0) {
                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend,
                                                   QCOW2_DISCARD_SNAPSHOT);
                refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
                        s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
            } else {
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
            }
+5 −0
Original line number Diff line number Diff line
@@ -411,6 +411,9 @@ int qcow2_update_header(BlockDriverState *bs);
int qcow2_refcount_init(BlockDriverState *bs);
void qcow2_refcount_close(BlockDriverState *bs);

int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index,
                                  int addend, enum qcow2_discard_type type);

int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size);
int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
    int nb_clusters);
@@ -458,6 +461,8 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
    int nb_sectors, enum qcow2_discard_type type);
int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);

int qcow2_expand_zero_clusters(BlockDriverState *bs);

/* qcow2-snapshot.c functions */
int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info);
int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id);