Commit ac839ccd authored by Anthony Liguori's avatar Anthony Liguori
Browse files

Merge remote-tracking branch 'quintela/migration-next-20120808' into staging

* quintela/migration-next-20120808:
  Restart optimization on stage3 update version
  Add XBZRLE statistics
  Add migration accounting for normal and duplicate pages
  Change total_time to total-time in MigrationStats
  Add migrate_set_cache_size command
  Add XBZRLE to ram_save_block and ram_save_live
  Add xbzrle_encode_buffer and xbzrle_decode_buffer functions
  Add uleb encoding/decoding functions
  Add cache handling functions
  Add XBZRLE documentation
  Add migrate-set-capabilities
  Add migration capabilities
parents 6a1f9d0c dd051c72
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -77,6 +77,7 @@ common-obj-y += qemu-char.o #aio.o
common-obj-y += block-migration.o iohandler.o
common-obj-y += pflib.o
common-obj-y += bitmap.o bitops.o
common-obj-y += page_cache.o

common-obj-$(CONFIG_POSIX) += migration-exec.o migration-unix.o migration-fd.o
common-obj-$(CONFIG_WIN32) += version.o
+241 −5
Original line number Diff line number Diff line
@@ -43,6 +43,7 @@
#include "hw/smbios.h"
#include "exec-memory.h"
#include "hw/pcspk.h"
#include "qemu/page_cache.h"

#ifdef DEBUG_ARCH_INIT
#define DPRINTF(fmt, ...) \
@@ -106,6 +107,7 @@ const uint32_t arch_type = QEMU_ARCH;
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40

#ifdef __ALTIVEC__
#include <altivec.h>
@@ -173,6 +175,92 @@ static int is_dup_page(uint8_t *page)
    return 1;
}

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
    /* Cache for XBZRLE */
    PageCache *cache;
} XBZRLE = {
    .encoded_buf = NULL,
    .current_buf = NULL,
    .decoded_buf = NULL,
    .cache = NULL,
};


int64_t xbzrle_cache_resize(int64_t new_size)
{
    if (XBZRLE.cache != NULL) {
        return cache_resize(XBZRLE.cache, new_size / TARGET_PAGE_SIZE) *
            TARGET_PAGE_SIZE;
    }
    return pow2floor(new_size);
}

/* accounting for migration statistics */
typedef struct AccountingInfo {
    uint64_t dup_pages;
    uint64_t norm_pages;
    uint64_t iterations;
    uint64_t xbzrle_bytes;
    uint64_t xbzrle_pages;
    uint64_t xbzrle_cache_miss;
    uint64_t xbzrle_overflows;
} AccountingInfo;

static AccountingInfo acct_info;

static void acct_clear(void)
{
    memset(&acct_info, 0, sizeof(acct_info));
}

uint64_t dup_mig_bytes_transferred(void)
{
    return acct_info.dup_pages * TARGET_PAGE_SIZE;
}

uint64_t dup_mig_pages_transferred(void)
{
    return acct_info.dup_pages;
}

uint64_t norm_mig_bytes_transferred(void)
{
    return acct_info.norm_pages * TARGET_PAGE_SIZE;
}

uint64_t norm_mig_pages_transferred(void)
{
    return acct_info.norm_pages;
}

uint64_t xbzrle_mig_bytes_transferred(void)
{
    return acct_info.xbzrle_bytes;
}

uint64_t xbzrle_mig_pages_transferred(void)
{
    return acct_info.xbzrle_pages;
}

uint64_t xbzrle_mig_pages_cache_miss(void)
{
    return acct_info.xbzrle_cache_miss;
}

uint64_t xbzrle_mig_pages_overflow(void)
{
    return acct_info.xbzrle_overflows;
}

static void save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
        int cont, int flag)
{
@@ -185,6 +273,61 @@ static void save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,

}

#define ENCODING_FLAG_XBZRLE 0x1

static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, int cont, bool last_stage)
{
    int encoded_len = 0, bytes_sent = -1;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr)) {
        if (!last_stage) {
            cache_insert(XBZRLE.cache, current_addr,
                         g_memdup(current_data, TARGET_PAGE_SIZE));
        }
        acct_info.xbzrle_cache_miss++;
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        DPRINTF("Skipping unmodified page\n");
        return 0;
    } else if (encoded_len == -1) {
        DPRINTF("Overflow\n");
        acct_info.xbzrle_overflows++;
        /* update data in the cache */
        memcpy(prev_cached_page, current_data, TARGET_PAGE_SIZE);
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(f, encoded_len);
    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
    bytes_sent = encoded_len + 1 + 2;
    acct_info.xbzrle_pages++;
    acct_info.xbzrle_bytes += bytes_sent;

    return bytes_sent;
}

static RAMBlock *last_block;
static ram_addr_t last_offset;

@@ -196,12 +339,13 @@ static ram_addr_t last_offset;
 *           n: the amount of bytes written in other case
 */

static int ram_save_block(QEMUFile *f)
static int ram_save_block(QEMUFile *f, bool last_stage)
{
    RAMBlock *block = last_block;
    ram_addr_t offset = last_offset;
    int bytes_sent = -1;
    MemoryRegion *mr;
    ram_addr_t current_addr;

    if (!block)
        block = QLIST_FIRST(&ram_list.blocks);
@@ -219,17 +363,32 @@ static int ram_save_block(QEMUFile *f)
            p = memory_region_get_ram_ptr(mr) + offset;

            if (is_dup_page(p)) {
                acct_info.dup_pages++;
                save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_COMPRESS);
                qemu_put_byte(f, *p);
                bytes_sent = 1;
            } else {
            } else if (migrate_use_xbzrle()) {
                current_addr = block->offset + offset;
                bytes_sent = save_xbzrle_page(f, p, current_addr, block,
                                              offset, cont, last_stage);
                if (!last_stage) {
                    p = get_cached_data(XBZRLE.cache, current_addr);
                }
            }

            /* either we didn't send yet (we may have had XBZRLE overflow) */
            if (bytes_sent == -1) {
                save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE);
                qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
                bytes_sent = TARGET_PAGE_SIZE;
                acct_info.norm_pages++;
            }

            /* if page is unmodified, continue to the next */
            if (bytes_sent != 0) {
                break;
            }
        }

        offset += TARGET_PAGE_SIZE;
        if (offset >= block->length) {
@@ -306,6 +465,15 @@ static void sort_ram_list(void)
static void migration_end(void)
{
    memory_global_dirty_log_stop();

    if (migrate_use_xbzrle()) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(XBZRLE.decoded_buf);
        XBZRLE.cache = NULL;
    }
}

static void ram_migration_cancel(void *opaque)
@@ -325,6 +493,19 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
    last_offset = 0;
    sort_ram_list();

    if (migrate_use_xbzrle()) {
        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
                                  TARGET_PAGE_SIZE,
                                  TARGET_PAGE_SIZE);
        if (!XBZRLE.cache) {
            DPRINTF("Error creating cache\n");
            return -1;
        }
        XBZRLE.encoded_buf = g_malloc0(TARGET_PAGE_SIZE);
        XBZRLE.current_buf = g_malloc(TARGET_PAGE_SIZE);
        acct_clear();
    }

    /* Make sure all dirty bits are set */
    QLIST_FOREACH(block, &ram_list.blocks, next) {
        for (addr = 0; addr < block->length; addr += TARGET_PAGE_SIZE) {
@@ -365,12 +546,13 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int bytes_sent;

        bytes_sent = ram_save_block(f);
        bytes_sent = ram_save_block(f, false);
        /* no more blocks to sent */
        if (bytes_sent < 0) {
            break;
        }
        bytes_transferred += bytes_sent;
        acct_info.iterations++;
        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_get_clock_ns() is a bit expensive, so we only check each some
@@ -426,7 +608,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
    while (true) {
        int bytes_sent;

        bytes_sent = ram_save_block(f);
        bytes_sent = ram_save_block(f, true);
        /* no more blocks to sent */
        if (bytes_sent < 0) {
            break;
@@ -440,6 +622,47 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
    return 0;
}

static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    int ret, rc = 0;
    unsigned int xh_len;
    int xh_flags;

    if (!XBZRLE.decoded_buf) {
        XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
    }

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        fprintf(stderr, "Failed to load XBZRLE page - wrong compression!\n");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        fprintf(stderr, "Failed to load XBZRLE page - len overflow!\n");
        return -1;
    }
    /* load data and decode */
    qemu_get_buffer(f, XBZRLE.decoded_buf, xh_len);

    /* decode RLE */
    ret = xbzrle_decode_buffer(XBZRLE.decoded_buf, xh_len, host,
                               TARGET_PAGE_SIZE);
    if (ret == -1) {
        fprintf(stderr, "Failed to load XBZRLE page - decode error!\n");
        rc = -1;
    } else  if (ret > TARGET_PAGE_SIZE) {
        fprintf(stderr, "Failed to load XBZRLE page - size %d exceeds %d!\n",
                ret, TARGET_PAGE_SIZE);
        abort();
    }

    return rc;
}

static inline void *host_from_stream_offset(QEMUFile *f,
                                            ram_addr_t offset,
                                            int flags)
@@ -553,6 +776,19 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
            }

            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
        } else if (flags & RAM_SAVE_FLAG_XBZRLE) {
            if (!migrate_use_xbzrle()) {
                return -EINVAL;
            }
            void *host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                return -EINVAL;
            }

            if (load_xbzrle(f, addr, host) < 0) {
                ret = -EINVAL;
                goto done;
            }
        }
        error = qemu_file_get_error(f);
        if (error) {
+42 −0
Original line number Diff line number Diff line
@@ -382,3 +382,45 @@ int qemu_parse_fd(const char *param)
    }
    return fd;
}

/* round down to the nearest power of 2*/
int64_t pow2floor(int64_t value)
{
    if (!is_power_of_2(value)) {
        value = 0x8000000000000000ULL >> clz64(value);
    }
    return value;
}

/*
 * Implementation of  ULEB128 (http://en.wikipedia.org/wiki/LEB128)
 * Input is limited to 14-bit numbers
 */
int uleb128_encode_small(uint8_t *out, uint32_t n)
{
    g_assert(n <= 0x3fff);
    if (n < 0x80) {
        *out++ = n;
        return 1;
    } else {
        *out++ = (n & 0x7f) | 0x80;
        *out++ = n >> 7;
        return 2;
    }
}

int uleb128_decode_small(const uint8_t *in, uint32_t *n)
{
    if (!(*in & 0x80)) {
        *n = *in++;
        return 1;
    } else {
        *n = *in++ & 0x7f;
        /* we exceed 14 bit number */
        if (*in & 0x80) {
            return -1;
        }
        *n |= *in++ << 7;
        return 2;
    }
}

docs/xbzrle.txt

0 → 100644
+128 −0
Original line number Diff line number Diff line
XBZRLE (Xor Based Zero Run Length Encoding)
===========================================

Using XBZRLE (Xor Based Zero Run Length Encoding) allows for the reduction
of VM downtime and the total live-migration time of Virtual machines.
It is particularly useful for virtual machines running memory write intensive
workloads that are typical of large enterprise applications such as SAP ERP
Systems, and generally speaking for any application that uses a sparse memory
update pattern.

Instead of sending the changed guest memory page this solution will send a
compressed version of the updates, thus reducing the amount of data sent during
live migration.
In order to be able to calculate the update, the previous memory pages need to
be stored on the source. Those pages are stored in a dedicated cache
(hash table) and are accessed by their address.
The larger the cache size the better the chances are that the page has already
been stored in the cache.
A small cache size will result in high cache miss rate.
Cache size can be changed before and during migration.

Format
=======

The compression format performs a XOR between the previous and current content
of the page, where zero represents an unchanged value.
The page data delta is represented by zero and non zero runs.
A zero run is represented by its length (in bytes).
A non zero run is represented by its length (in bytes) and the new data.
The run length is encoded using ULEB128 (http://en.wikipedia.org/wiki/LEB128)

There can be more than one valid encoding, the sender may send a longer encoding
for the benefit of reducing computation cost.

page = zrun nzrun
       | zrun nzrun page

zrun = length

nzrun = length byte...

length = uleb128 encoded integer

On the sender side XBZRLE is used as a compact delta encoding of page updates,
retrieving the old page content from the cache (default size of 512 MB). The
receiving side uses the existing page's content and XBZRLE to decode the new
page's content.

This work was originally based on research results published
VEE 2011: Evaluation of Delta Compression Techniques for Efficient Live
Migration of Large Virtual Machines by Benoit, Svard, Tordsson and Elmroth.
Additionally the delta encoder XBRLE was improved further using the XBZRLE
instead.

XBZRLE has a sustained bandwidth of 2-2.5 GB/s for typical workloads making it
ideal for in-line, real-time encoding such as is needed for live-migration.

Example
old buffer:
1001 zeros
05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 68 00 00 6b 00 6d
3074 zeros

new buffer:
1001 zeros
01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 68 00 00 67 00 69
3074 zeros

encoded buffer:

encoded length 24
e9 07 0f 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 03 01 67 01 01 69

Usage
======================
1. Verify the destination QEMU version is able to decode the new format.
    {qemu} info migrate_capabilities
    {qemu} xbzrle: off , ...

2. Activate xbzrle on both source and destination:
   {qemu} migrate_set_capability xbzrle on

3. Set the XBZRLE cache size - the cache size is in MBytes and should be a
power of 2. The cache default value is 64MBytes. (on source only)
    {qemu} migrate_set_cache_size 256m

4. Start outgoing migration
    {qemu} migrate -d tcp:destination.host:4444
    {qemu} info migrate
    capabilities: xbzrle: on
    Migration status: active
    transferred ram: A kbytes
    remaining ram: B kbytes
    total ram: C kbytes
    total time: D milliseconds
    duplicate: E pages
    normal: F pages
    normal bytes: G kbytes
    cache size: H bytes
    xbzrle transferred: I kbytes
    xbzrle pages: J pages
    xbzrle cache miss: K
    xbzrle overflow : L

xbzrle cache-miss: the number of cache misses to date - high cache-miss rate
indicates that the cache size is set too low.
xbzrle overflow: the number of overflows in the decoding which where the delta
could not be compressed. This can happen if the changes in the pages are too
large or there are many short changes; for example, changing every second byte
(half a page).

Testing: Testing indicated that live migration with XBZRLE was completed in 110
seconds, whereas without it would not be able to complete.

A simple synthetic memory r/w load generator:
..    include <stdlib.h>
..    include <stdio.h>
..    int main()
..    {
..        char *buf = (char *) calloc(4096, 4096);
..        while (1) {
..            int i;
..            for (i = 0; i < 4096 * 4; i++) {
..                buf[i * 4096 / 4]++;
..            }
..            printf(".");
..        }
..    }
+38 −0
Original line number Diff line number Diff line
@@ -829,6 +829,26 @@ STEXI
@item migrate_cancel
@findex migrate_cancel
Cancel the current VM migration.

ETEXI

    {
        .name       = "migrate_set_cache_size",
        .args_type  = "value:o",
        .params     = "value",
        .help       = "set cache size (in bytes) for XBZRLE migrations,"
                      "the cache size will be rounded down to the nearest "
                      "power of 2.\n"
                      "The cache size affects the number of cache misses."
                      "In case of a high cache miss ratio you need to increase"
                      " the cache size",
        .mhandler.cmd = hmp_migrate_set_cache_size,
    },

STEXI
@item migrate_set_cache_size @var{value}
@findex migrate_set_cache_size
Set cache size to @var{value} (in bytes) for xbzrle migrations.
ETEXI

    {
@@ -858,6 +878,20 @@ STEXI
@item migrate_set_downtime @var{second}
@findex migrate_set_downtime
Set maximum tolerated downtime (in seconds) for migration.
ETEXI

    {
        .name       = "migrate_set_capability",
        .args_type  = "capability:s,state:b",
        .params     = "capability state",
        .help       = "Enable/Disable the usage of a capability for migration",
        .mhandler.cmd = hmp_migrate_set_capability,
    },

STEXI
@item migrate_set_capability @var{capability} @var{state}
@findex migrate_set_capability
Enable/Disable the usage of a capability @var{capability} for migration.
ETEXI

    {
@@ -1417,6 +1451,10 @@ show CPU statistics
show user network stack connection states
@item info migrate
show migration status
@item info migrate_capabilities
show current migration capabilities
@item info migrate_cache_size
show current migration XBZRLE cache size
@item info balloon
show balloon information
@item info qtree
Loading