Merge remote-tracking branch 'quintela/migration-next-20120808' into staging (ac839ccd) · Commits · SUMMER2020 / students / proj-2021291

Makefile.objs

+1 −0

Original line number	Diff line number	Diff line
		@@ -77,6 +77,7 @@ common-obj-y += qemu-char.o #aio.o
		common-obj-y += block-migration.o iohandler.o
		common-obj-y += pflib.o
		common-obj-y += bitmap.o bitops.o
		common-obj-y += page_cache.o

		common-obj-$(CONFIG_POSIX) += migration-exec.o migration-unix.o migration-fd.o
		common-obj-$(CONFIG_WIN32) += version.o

arch_init.c

+241 −5

Original line number	Diff line number	Diff line
		@@ -43,6 +43,7 @@
		#include "hw/smbios.h"
		#include "exec-memory.h"
		#include "hw/pcspk.h"
		#include "qemu/page_cache.h"

		#ifdef DEBUG_ARCH_INIT
		#define DPRINTF(fmt, ...) \
		@@ -106,6 +107,7 @@ const uint32_t arch_type = QEMU_ARCH;
		#define RAM_SAVE_FLAG_PAGE 0x08
		#define RAM_SAVE_FLAG_EOS 0x10
		#define RAM_SAVE_FLAG_CONTINUE 0x20
		#define RAM_SAVE_FLAG_XBZRLE 0x40

		#ifdef __ALTIVEC__
		#include <altivec.h>
		@@ -173,6 +175,92 @@ static int is_dup_page(uint8_t *page)
		return 1;
		}

		/*
		 * File-local XBZRLE state: the page cache plus the scratch buffers
		 * that the compression and decompression paths work in.
		 */
		static struct {
		    uint8_t *encoded_buf;   /* receives the XBZRLE-encoded output */
		    uint8_t *current_buf;   /* private copy of the page being encoded */
		    uint8_t *decoded_buf;   /* holds encoded data read back for decoding */
		    PageCache *cache;       /* cache of previously seen pages */
		} XBZRLE = {
		    .cache = NULL,
		    .encoded_buf = NULL,
		    .current_buf = NULL,
		    .decoded_buf = NULL,
		};


		/*
		 * Request a new XBZRLE cache size of @new_size bytes.
		 *
		 * Without a live cache the request is only rounded down with pow2floor()
		 * and returned.  With a live cache the size is converted to a page count,
		 * handed to cache_resize(), and the result is scaled back by
		 * TARGET_PAGE_SIZE before being returned.
		 */
		int64_t xbzrle_cache_resize(int64_t new_size)
		{
		    if (XBZRLE.cache == NULL) {
		        return pow2floor(new_size);
		    }

		    return cache_resize(XBZRLE.cache, new_size / TARGET_PAGE_SIZE) *
		           TARGET_PAGE_SIZE;
		}

		/* Counters collected while saving RAM, exposed as migration statistics. */
		typedef struct AccountingInfo {
		    uint64_t dup_pages;          /* pages sent as a single repeated byte */
		    uint64_t norm_pages;         /* pages sent in full */
		    uint64_t iterations;         /* blocks processed by the iterate loop */
		    uint64_t xbzrle_bytes;       /* bytes sent for XBZRLE-encoded pages */
		    uint64_t xbzrle_pages;       /* pages sent XBZRLE-encoded */
		    uint64_t xbzrle_cache_miss;  /* pages not found in the XBZRLE cache */
		    uint64_t xbzrle_overflows;   /* pages whose encoding overflowed */
		} AccountingInfo;

		static AccountingInfo acct_info;

		/* Reset every migration counter back to zero. */
		static void acct_clear(void)
		{
		    acct_info = (AccountingInfo){ 0 };
		}

		/* Duplicate-page count expressed in bytes (count * TARGET_PAGE_SIZE). */
		uint64_t dup_mig_bytes_transferred(void)
		{
		    const uint64_t pages = acct_info.dup_pages;

		    return pages * TARGET_PAGE_SIZE;
		}

		/* Number of pages accounted as duplicate pages so far. */
		uint64_t dup_mig_pages_transferred(void)
		{
		    const uint64_t pages = acct_info.dup_pages;

		    return pages;
		}

		/* Normal-page count expressed in bytes (count * TARGET_PAGE_SIZE). */
		uint64_t norm_mig_bytes_transferred(void)
		{
		    const uint64_t pages = acct_info.norm_pages;

		    return pages * TARGET_PAGE_SIZE;
		}

		/* Number of pages accounted as normal (full) pages so far. */
		uint64_t norm_mig_pages_transferred(void)
		{
		    const uint64_t pages = acct_info.norm_pages;

		    return pages;
		}

		/* Bytes accounted to XBZRLE-encoded pages so far. */
		uint64_t xbzrle_mig_bytes_transferred(void)
		{
		    const uint64_t bytes = acct_info.xbzrle_bytes;

		    return bytes;
		}

		/* Number of pages sent XBZRLE-encoded so far. */
		uint64_t xbzrle_mig_pages_transferred(void)
		{
		    const uint64_t pages = acct_info.xbzrle_pages;

		    return pages;
		}

		/* Number of XBZRLE cache misses recorded so far. */
		uint64_t xbzrle_mig_pages_cache_miss(void)
		{
		    const uint64_t misses = acct_info.xbzrle_cache_miss;

		    return misses;
		}

		/* Number of XBZRLE encoding overflows recorded so far. */
		uint64_t xbzrle_mig_pages_overflow(void)
		{
		    const uint64_t overflows = acct_info.xbzrle_overflows;

		    return overflows;
		}

		static void save_block_hdr(QEMUFile f, RAMBlock block, ram_addr_t offset,
		int cont, int flag)
		{
		@@ -185,6 +273,61 @@ static void save_block_hdr(QEMUFile f, RAMBlock block, ram_addr_t offset,

		}

		#define ENCODING_FLAG_XBZRLE 0x1

		/*
		 * Try to send one page as an XBZRLE delta against its cached copy.
		 *
		 * @f:            migration stream to write to
		 * @current_data: pointer to the current content of the page
		 * @current_addr: key used to look the page up in XBZRLE.cache
		 * @block, @offset, @cont: forwarded unchanged to save_block_hdr()
		 * @last_stage:   when true, the cache is neither populated on a miss nor
		 *                refreshed after a successful encode
		 *
		 * Returns:
		 *   > 0  bytes accounted for the page: encoded length plus the 1-byte
		 *        encoding flag and the 2-byte length field
		 *   0    the encoder produced no output (logged as an unmodified page);
		 *        nothing was written
		 *   -1   nothing was written (cache miss or encoding overflow)
		 */
		static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
		                            ram_addr_t current_addr, RAMBlock *block,
		                            ram_addr_t offset, int cont, bool last_stage)
		{
		    int encoded_len = 0, bytes_sent = -1;
		    uint8_t *prev_cached_page;

		    if (!cache_is_cached(XBZRLE.cache, current_addr)) {
		        if (!last_stage) {
		            /* remember a private copy so a later pass can send a delta */
		            cache_insert(XBZRLE.cache, current_addr,
		                         g_memdup(current_data, TARGET_PAGE_SIZE));
		        }
		        acct_info.xbzrle_cache_miss++;
		        return -1;
		    }

		    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

		    /* save current buffer into memory */
		    memcpy(XBZRLE.current_buf, current_data, TARGET_PAGE_SIZE);

		    /* XBZRLE encoding (if there is no overflow) */
		    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
		                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
		                                       TARGET_PAGE_SIZE);
		    if (encoded_len == 0) {
		        DPRINTF("Skipping unmodified page\n");
		        return 0;
		    } else if (encoded_len == -1) {
		        DPRINTF("Overflow\n");
		        acct_info.xbzrle_overflows++;
		        /* update data in the cache */
		        memcpy(prev_cached_page, current_data, TARGET_PAGE_SIZE);
		        return -1;
		    }

		    /* we need to update the data in the cache, in order to get the same data */
		    if (!last_stage) {
		        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
		    }

		    /* Send XBZRLE based compressed page */
		    save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_XBZRLE);
		    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
		    qemu_put_be16(f, encoded_len);
		    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
		    /* 1 byte for the encoding flag + 2 bytes for the be16 length */
		    bytes_sent = encoded_len + 1 + 2;
		    acct_info.xbzrle_pages++;
		    acct_info.xbzrle_bytes += bytes_sent;

		    return bytes_sent;
		}

		static RAMBlock *last_block;
		static ram_addr_t last_offset;

		@@ -196,12 +339,13 @@ static ram_addr_t last_offset;
		* n: the amount of bytes written in other case
		*/

		static int ram_save_block(QEMUFile *f)
		static int ram_save_block(QEMUFile *f, bool last_stage)
		{
		RAMBlock *block = last_block;
		ram_addr_t offset = last_offset;
		int bytes_sent = -1;
		MemoryRegion *mr;
		ram_addr_t current_addr;

		if (!block)
		block = QLIST_FIRST(&ram_list.blocks);
		@@ -219,17 +363,32 @@ static int ram_save_block(QEMUFile *f)
		p = memory_region_get_ram_ptr(mr) + offset;

		if (is_dup_page(p)) {
		acct_info.dup_pages++;
		save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_COMPRESS);
		qemu_put_byte(f, *p);
		bytes_sent = 1;
		} else {
		} else if (migrate_use_xbzrle()) {
		current_addr = block->offset + offset;
		bytes_sent = save_xbzrle_page(f, p, current_addr, block,
		offset, cont, last_stage);
		if (!last_stage) {
		p = get_cached_data(XBZRLE.cache, current_addr);
		}
		}

		/* either we didn't send yet (we may have had XBZRLE overflow) */
		if (bytes_sent == -1) {
		save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE);
		qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
		bytes_sent = TARGET_PAGE_SIZE;
		acct_info.norm_pages++;
		}

		/* if page is unmodified, continue to the next */
		if (bytes_sent != 0) {
		break;
		}
		}

		offset += TARGET_PAGE_SIZE;
		if (offset >= block->length) {
		@@ -306,6 +465,15 @@ static void sort_ram_list(void)
		/*
		 * Tear down RAM migration state: stop dirty logging and, when XBZRLE is
		 * enabled, release the page cache and the XBZRLE scratch buffers.
		 */
		static void migration_end(void)
		{
		    memory_global_dirty_log_stop();

		    if (migrate_use_xbzrle()) {
		        /* the setup path returns early when cache_init() fails, so the
		         * cache may never have been created */
		        if (XBZRLE.cache) {
		            cache_fini(XBZRLE.cache);
		            g_free(XBZRLE.cache);
		        }
		        g_free(XBZRLE.encoded_buf);
		        g_free(XBZRLE.current_buf);
		        g_free(XBZRLE.decoded_buf);

		        /* Clear every pointer, not just the cache: load_xbzrle() only
		         * allocates decoded_buf when it is NULL, and a second call here
		         * must not free the same buffers again. */
		        XBZRLE.cache = NULL;
		        XBZRLE.encoded_buf = NULL;
		        XBZRLE.current_buf = NULL;
		        XBZRLE.decoded_buf = NULL;
		    }
		}

		static void ram_migration_cancel(void *opaque)
		@@ -325,6 +493,19 @@ static int ram_save_setup(QEMUFile f, void opaque)
		last_offset = 0;
		sort_ram_list();

		if (migrate_use_xbzrle()) {
		XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
		TARGET_PAGE_SIZE,
		TARGET_PAGE_SIZE);
		if (!XBZRLE.cache) {
		DPRINTF("Error creating cache\n");
		return -1;
		}
		XBZRLE.encoded_buf = g_malloc0(TARGET_PAGE_SIZE);
		XBZRLE.current_buf = g_malloc(TARGET_PAGE_SIZE);
		acct_clear();
		}

		/* Make sure all dirty bits are set */
		QLIST_FOREACH(block, &ram_list.blocks, next) {
		for (addr = 0; addr < block->length; addr += TARGET_PAGE_SIZE) {
		@@ -365,12 +546,13 @@ static int ram_save_iterate(QEMUFile f, void opaque)
		while ((ret = qemu_file_rate_limit(f)) == 0) {
		int bytes_sent;

		bytes_sent = ram_save_block(f);
		bytes_sent = ram_save_block(f, false);
		/* no more blocks to sent */
		if (bytes_sent < 0) {
		break;
		}
		bytes_transferred += bytes_sent;
		acct_info.iterations++;
		/* we want to check in the 1st loop, just in case it was the 1st time
		and we had to sync the dirty bitmap.
		qemu_get_clock_ns() is a bit expensive, so we only check each some
		@@ -426,7 +608,7 @@ static int ram_save_complete(QEMUFile f, void opaque)
		while (true) {
		int bytes_sent;

		bytes_sent = ram_save_block(f);
		bytes_sent = ram_save_block(f, true);
		/* no more blocks to sent */
		if (bytes_sent < 0) {
		break;
		@@ -440,6 +622,47 @@ static int ram_save_complete(QEMUFile f, void opaque)
		return 0;
		}

		/*
		 * Read one XBZRLE-encoded page from the stream and decode it into @host.
		 *
		 * @f:    migration stream to read from
		 * @addr: page address (not used by this function)
		 * @host: destination page, passed to xbzrle_decode_buffer() as an
		 *        output buffer of TARGET_PAGE_SIZE bytes
		 *
		 * Returns 0 on success and -1 if the header carries the wrong encoding
		 * flag, the encoded length exceeds TARGET_PAGE_SIZE, or decoding fails.
		 * Aborts if the decoder reports more than TARGET_PAGE_SIZE bytes.
		 */
		static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
		{
		    int ret, rc = 0;
		    unsigned int xh_len;
		    int xh_flags;

		    /* the staging buffer is allocated lazily on first use */
		    if (!XBZRLE.decoded_buf) {
		        XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
		    }

		    /* extract RLE header */
		    xh_flags = qemu_get_byte(f);
		    xh_len = qemu_get_be16(f);

		    if (xh_flags != ENCODING_FLAG_XBZRLE) {
		        fprintf(stderr, "Failed to load XBZRLE page - wrong compression!\n");
		        return -1;
		    }

		    /* never read more than the staging buffer can hold */
		    if (xh_len > TARGET_PAGE_SIZE) {
		        fprintf(stderr, "Failed to load XBZRLE page - len overflow!\n");
		        return -1;
		    }
		    /* load data and decode */
		    qemu_get_buffer(f, XBZRLE.decoded_buf, xh_len);

		    /* decode RLE */
		    ret = xbzrle_decode_buffer(XBZRLE.decoded_buf, xh_len, host,
		                               TARGET_PAGE_SIZE);
		    if (ret == -1) {
		        fprintf(stderr, "Failed to load XBZRLE page - decode error!\n");
		        rc = -1;
		    } else if (ret > TARGET_PAGE_SIZE) {
		        fprintf(stderr, "Failed to load XBZRLE page - size %d exceeds %d!\n",
		                ret, TARGET_PAGE_SIZE);
		        abort();
		    }

		    return rc;
		}

		static inline void host_from_stream_offset(QEMUFile f,
		ram_addr_t offset,
		int flags)
		@@ -553,6 +776,19 @@ static int ram_load(QEMUFile f, void opaque, int version_id)
		}

		qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
		} else if (flags & RAM_SAVE_FLAG_XBZRLE) {
		if (!migrate_use_xbzrle()) {
		return -EINVAL;
		}
		void *host = host_from_stream_offset(f, addr, flags);
		if (!host) {
		return -EINVAL;
		}

		if (load_xbzrle(f, addr, host) < 0) {
		ret = -EINVAL;
		goto done;
		}
		}
		error = qemu_file_get_error(f);
		if (error) {

cutils.c

+42 −0

Original line number	Diff line number	Diff line
		@@ -382,3 +382,45 @@ int qemu_parse_fd(const char *param)
		}
		return fd;
		}

		/*
		 * Round @value down to the nearest power of 2.
		 *
		 * Returns @value unchanged when it already is a power of 2, otherwise the
		 * largest power of 2 below it.  Zero and negative inputs have no power-of-2
		 * floor and yield 0; this also avoids the undefined 64-bit shift that a
		 * count-leading-zeros based implementation performs for an input of 0.
		 */
		int64_t pow2floor(int64_t value)
		{
		    if (value <= 0) {
		        return 0;
		    }
		    /* drop the lowest set bit until only the highest one remains */
		    while (value & (value - 1)) {
		        value &= value - 1;
		    }
		    return value;
		}

		/*
		 * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128)
		 * Input is limited to 14-bit numbers
		 *
		 * Writes the encoding of @n to @out and returns the number of bytes
		 * written: 1 for values below 0x80, 2 otherwise.  @out must have room
		 * for two bytes.  Asserts that @n fits in 14 bits.
		 */
		int uleb128_encode_small(uint8_t *out, uint32_t n)
		{
		    g_assert(n <= 0x3fff);
		    if (n < 0x80) {
		        *out++ = n;
		        return 1;
		    } else {
		        /* low 7 bits first, with the continuation bit set */
		        *out++ = (n & 0x7f) | 0x80;
		        *out++ = n >> 7;
		        return 2;
		    }
		}

		/*
		 * Decode a ULEB128 value of at most 14 bits (one or two bytes).
		 *
		 * @in: encoded input; up to two bytes are read
		 * @n:  receives the decoded value
		 *
		 * Returns the number of input bytes consumed (1 or 2), or -1 if the
		 * second byte has its continuation bit set, i.e. the value does not fit
		 * in 14 bits.  On -1, *@n holds only the low 7 bits.
		 */
		int uleb128_decode_small(const uint8_t *in, uint32_t *n)
		{
		    if (!(*in & 0x80)) {
		        *n = *in++;
		        return 1;
		    } else {
		        *n = *in++ & 0x7f;
		        /* we exceed 14 bit number */
		        if (*in & 0x80) {
		            return -1;
		        }
		        *n |= *in++ << 7;
		        return 2;
		    }
		}

docs/xbzrle.txt

0 → 100644

+128 −0

Original line number	Diff line number	Diff line
		XBZRLE (Xor Based Zero Run Length Encoding)
		===========================================

		Using XBZRLE (Xor Based Zero Run Length Encoding) allows for the reduction
		of VM downtime and the total live-migration time of Virtual machines.
		It is particularly useful for virtual machines running memory write intensive
		workloads that are typical of large enterprise applications such as SAP ERP
		Systems, and generally speaking for any application that uses a sparse memory
		update pattern.

		Instead of sending the changed guest memory page this solution will send a
		compressed version of the updates, thus reducing the amount of data sent during
		live migration.
		In order to be able to calculate the update, the previous memory pages need to
		be stored on the source. Those pages are stored in a dedicated cache
		(hash table) and are accessed by their address.
		The larger the cache size the better the chances are that the page has already
		been stored in the cache.
		A small cache size will result in high cache miss rate.
		Cache size can be changed before and during migration.

		Format
		=======

		The compression format performs a XOR between the previous and current content
		of the page, where zero represents an unchanged value.
		The page data delta is represented by zero and non zero runs.
		A zero run is represented by its length (in bytes).
		A non zero run is represented by its length (in bytes) and the new data.
		The run length is encoded using ULEB128 (http://en.wikipedia.org/wiki/LEB128)

		There can be more than one valid encoding; the sender may send a longer encoding
		for the benefit of reducing computation cost.

		page = zrun nzrun
		| zrun nzrun page

		zrun = length

		nzrun = length byte...

		length = uleb128 encoded integer

		On the sender side XBZRLE is used as a compact delta encoding of page updates,
		retrieving the old page content from the cache (default size of 64 MB). The
		receiving side uses the existing page's content and XBZRLE to decode the new
		page's content.

		This work was originally based on research results published in
		VEE 2011: Evaluation of Delta Compression Techniques for Efficient Live
		Migration of Large Virtual Machines by Benoit, Svard, Tordsson and Elmroth.
		Additionally, the delta encoder XBRLE was further improved by using XBZRLE
		instead.

		XBZRLE has a sustained bandwidth of 2-2.5 GB/s for typical workloads making it
		ideal for in-line, real-time encoding such as is needed for live-migration.

		Example
		old buffer:
		1001 zeros
		05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 68 00 00 6b 00 6d
		3074 zeros

		new buffer:
		1001 zeros
		01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 68 00 00 67 00 69
		3074 zeros

		encoded buffer:

		encoded length 24
		e9 07 0f 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 03 01 67 01 01 69

		Usage
		======================
		1. Verify the destination QEMU version is able to decode the new format.
		{qemu} info migrate_capabilities
		{qemu} xbzrle: off , ...

		2. Activate xbzrle on both source and destination:
		{qemu} migrate_set_capability xbzrle on

		3. Set the XBZRLE cache size - the cache size is in MBytes and should be a
		power of 2. The cache default value is 64MBytes. (on source only)
		{qemu} migrate_set_cache_size 256m

		4. Start outgoing migration
		{qemu} migrate -d tcp:destination.host:4444
		{qemu} info migrate
		capabilities: xbzrle: on
		Migration status: active
		transferred ram: A kbytes
		remaining ram: B kbytes
		total ram: C kbytes
		total time: D milliseconds
		duplicate: E pages
		normal: F pages
		normal bytes: G kbytes
		cache size: H bytes
		xbzrle transferred: I kbytes
		xbzrle pages: J pages
		xbzrle cache miss: K
		xbzrle overflow : L

		xbzrle cache-miss: the number of cache misses to date - high cache-miss rate
		indicates that the cache size is set too low.
		xbzrle overflow: the number of overflows in the encoding, i.e. pages where the
		delta could not be compressed. This can happen if the changes in the pages are
		too large or there are many short changes; for example, changing every second
		byte (half a page).

		Testing: Testing indicated that live migration with XBZRLE was completed in 110
		seconds, whereas without it the migration would not be able to complete.

		A simple synthetic memory r/w load generator:
		.. include <stdlib.h>
		.. include <stdio.h>
		.. int main()
		.. {
		.. char *buf = (char *) calloc(4096, 4096);
		.. while (1) {
		.. int i;
		.. for (i = 0; i < 4096 * 4; i++) {
		.. buf[i * 4096 / 4]++;
		.. }
		.. printf(".");
		.. }
		.. }

hmp-commands.hx

+38 −0

Original line number	Diff line number	Diff line
		@@ -829,6 +829,26 @@ STEXI
		@item migrate_cancel
		@findex migrate_cancel
		Cancel the current VM migration.

		ETEXI

		{
		    .name = "migrate_set_cache_size",
		    .args_type = "value:o",
		    .params = "value",
		    .help = "set cache size (in bytes) for XBZRLE migrations, "
		            "the cache size will be rounded down to the nearest "
		            "power of 2.\n"
		            "The cache size affects the number of cache misses. "
		            "In case of a high cache miss ratio you need to increase"
		            " the cache size",
		    .mhandler.cmd = hmp_migrate_set_cache_size,
		},

		STEXI
		@item migrate_set_cache_size @var{value}
		@findex migrate_set_cache_size
		Set cache size to @var{value} (in bytes) for xbzrle migrations.
		ETEXI

		{
		@@ -858,6 +878,20 @@ STEXI
		@item migrate_set_downtime @var{second}
		@findex migrate_set_downtime
		Set maximum tolerated downtime (in seconds) for migration.
		ETEXI

		{
		.name = "migrate_set_capability",
		.args_type = "capability:s,state:b",
		.params = "capability state",
		.help = "Enable/Disable the usage of a capability for migration",
		.mhandler.cmd = hmp_migrate_set_capability,
		},

		STEXI
		@item migrate_set_capability @var{capability} @var{state}
		@findex migrate_set_capability
		Enable/Disable the usage of a capability @var{capability} for migration.
		ETEXI

		{
		@@ -1417,6 +1451,10 @@ show CPU statistics
		show user network stack connection states
		@item info migrate
		show migration status
		@item info migrate_capabilities
		show current migration capabilities
		@item info migrate_cache_size
		show current migration XBZRLE cache size
		@item info balloon
		show balloon information
		@item info qtree