Commit f03d07d4 authored by Anthony Liguori
Browse files

Merge remote-tracking branch 'quintela/migration.next' into staging



# By Michael R. Hines (8) and others
# Via Juan Quintela
* quintela/migration.next:
  migration: add autoconvergence documentation
  Fix real mode guest segments dpl value in savevm
  Fix real mode guest migration
  rdma: account for the time spent in MIG_STATE_SETUP through QMP
  rdma: introduce MIG_STATE_NONE and change MIG_STATE_SETUP state transition
  rdma: allow state transitions between other states besides ACTIVE
  rdma: send pc.ram
  rdma: core logic
  rdma: introduce ram_handle_compressed()
  rdma: bugfix: ram_control_save_page()
  rdma: update documentation to reflect new unpin support

Message-id: 1374590725-14144-1-git-send-email-quintela@redhat.com
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
parents 3988982c 9781c371
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -51,6 +51,7 @@ common-obj-$(CONFIG_POSIX) += os-posix.o
common-obj-$(CONFIG_LINUX) += fsdev/

common-obj-y += migration.o migration-tcp.o
common-obj-$(CONFIG_RDMA) += migration-rdma.o
common-obj-y += qemu-char.o #aio.o
common-obj-y += block-migration.o
common-obj-y += page_cache.o xbzrle.o
+51 −11
Original line number Diff line number Diff line
@@ -118,6 +118,7 @@ static void check_guest_throttling(void);
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */


static struct defconfig_file {
@@ -475,6 +476,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
                ram_bulk_stage = false;
            }
        } else {
            int ret;
            uint8_t *p;
            int cont = (block == last_sent_block) ?
                RAM_SAVE_FLAG_CONTINUE : 0;
@@ -483,7 +485,18 @@ static int ram_save_block(QEMUFile *f, bool last_stage)

            /* In doubt sent page as normal */
            bytes_sent = -1;
            if (is_zero_page(p)) {
            ret = ram_control_save_page(f, block->offset,
                               offset, TARGET_PAGE_SIZE, &bytes_sent);

            if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
                if (ret != RAM_SAVE_CONTROL_DELAYED) {
                    if (bytes_sent > 0) {
                        acct_info.norm_pages++;
                    } else if (bytes_sent == 0) {
                        acct_info.dup_pages++;
                    }
                }
            } else if (is_zero_page(p)) {
                acct_info.dup_pages++;
                bytes_sent = save_block_hdr(f, block, offset, cont,
                                            RAM_SAVE_FLAG_COMPRESS);
@@ -635,6 +648,10 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
    }

    qemu_mutex_unlock_ramlist();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
@@ -653,6 +670,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
        reset_ram_globals();
    }

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_get_clock_ns(rt_clock);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
@@ -684,6 +703,12 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)

    qemu_mutex_unlock_ramlist();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    if (ret < 0) {
        bytes_transferred += total_sent;
        return ret;
@@ -701,6 +726,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
    qemu_mutex_lock_ramlist();
    migration_bitmap_sync();

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
@@ -714,6 +741,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
        }
        bytes_transferred += bytes_sent;
    }

    ram_control_after_iterate(f, RAM_CONTROL_FINISH);
    migration_end();

    qemu_mutex_unlock_ramlist();
@@ -808,6 +837,24 @@ static inline void *host_from_stream_offset(QEMUFile *f,
    return NULL;
}

/*
 * Return true when every byte of [p, p + size) is zero.
 *
 * is_zero_page() is handed only a pointer and no length, so it cannot
 * cover a range larger than one page on its own.  Whole target pages are
 * therefore checked one at a time through it, and a trailing partial
 * page (if any) is scanned byte by byte so nothing past 'size' is read.
 */
static bool ram_range_is_zero(uint8_t *p, uint64_t size)
{
    while (size >= TARGET_PAGE_SIZE) {
        if (!is_zero_page(p)) {
            return false;
        }
        p += TARGET_PAGE_SIZE;
        size -= TARGET_PAGE_SIZE;
    }

    while (size > 0) {
        if (*p != 0) {
            return false;
        }
        p++;
        size--;
    }

    return true;
}

/*
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: start of the destination memory to fill
 * @ch:   byte value to fill the range with
 * @size: number of bytes to fill, starting at @host
 *
 * The memset() is skipped only when @ch is zero and the *entire* range
 * already reads as zero.  Checking just the first page is not enough for
 * a multi-page chunk: a zero first page followed by non-zero data would
 * wrongly leave that data in place.
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && ram_range_is_zero(host, size)) {
        /* Already all zero: avoid dirtying the memory needlessly. */
        return;
    }

    memset(host, ch, size);
#ifndef _WIN32
    if (ch == 0 &&
        (!kvm_enabled() || kvm_has_sync_mmu()) &&
        getpagesize() <= TARGET_PAGE_SIZE) {
        /*
         * NOTE(review): only the first TARGET_PAGE_SIZE bytes are handed
         * to madvise, even when 'size' spans several pages.  This length
         * is deliberately left unchanged here; confirm whether larger
         * chunks should be released as well (and how unaligned sizes
         * should be treated) before widening it.
         */
        qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
    }
#endif
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    ram_addr_t addr;
@@ -879,16 +926,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
            }

            ch = qemu_get_byte(f);
            if (ch != 0 || !is_zero_page(host)) {
                memset(host, ch, TARGET_PAGE_SIZE);
#ifndef _WIN32
                if (ch == 0 &&
                    (!kvm_enabled() || kvm_has_sync_mmu()) &&
                    getpagesize() <= TARGET_PAGE_SIZE) {
                    qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
                }
#endif
            }
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
        } else if (flags & RAM_SAVE_FLAG_PAGE) {
            void *host;

@@ -908,6 +946,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
                ret = -EINVAL;
                goto done;
            }
        } else if (flags & RAM_SAVE_FLAG_HOOK) {
            ram_control_load_hook(f, flags);
        }
        error = qemu_file_get_error(f);
        if (error) {
+40 −0
Original line number Diff line number Diff line
@@ -180,6 +180,7 @@ xfs=""
vhost_net="no"
vhost_scsi="no"
kvm="no"
rdma=""
gprof="no"
debug_tcg="no"
debug="no"
@@ -937,6 +938,10 @@ for opt do
  ;;
  --enable-gtk) gtk="yes"
  ;;
  --enable-rdma) rdma="yes"
  ;;
  --disable-rdma) rdma="no"
  ;;
  --with-gtkabi=*) gtkabi="$optarg"
  ;;
  --enable-tpm) tpm="yes"
@@ -1095,6 +1100,8 @@ echo " --enable-bluez enable bluez stack connectivity"
echo "  --disable-slirp          disable SLIRP userspace network connectivity"
echo "  --disable-kvm            disable KVM acceleration support"
echo "  --enable-kvm             enable KVM acceleration support"
echo "  --disable-rdma           disable RDMA-based migration support"
echo "  --enable-rdma            enable RDMA-based migration support"
echo "  --enable-tcg-interpreter enable TCG with bytecode interpreter (TCI)"
echo "  --disable-nptl           disable usermode NPTL support"
echo "  --enable-nptl            enable usermode NPTL support"
@@ -1801,6 +1808,30 @@ EOF
  libs_softmmu="$sdl_libs $libs_softmmu"
fi

##########################################
# RDMA needs OpenFabrics libraries
#
# $rdma is tri-state on entry to this probe:
#   "no"          - the probe is skipped entirely
#   "yes"         - a failed probe is a fatal configure error
#   anything else - the probe runs and a failure just sets rdma="no"
# After the probe has run, $rdma is always exactly "yes" or "no".
if test "$rdma" != "no" ; then
  cat > $TMPC <<EOF
#include <rdma/rdma_cma.h>
int main(void) { return 0; }
EOF
  # Libraries handed to the probe; on success they are also appended to
  # libs_softmmu.
  rdma_libs="-lrdmacm -libverbs"
  # compile_prog is defined elsewhere in configure; presumably it builds
  # and links $TMPC with the given flags and libraries -- verify there.
  if compile_prog "" "$rdma_libs" ; then
    rdma="yes"
    libs_softmmu="$libs_softmmu $rdma_libs"
  else
    # Only reaching here with rdma="yes" is an error; otherwise fall
    # through and record that RDMA is unavailable.
    if test "$rdma" = "yes" ; then
        error_exit \
            " OpenFabrics librdmacm/libibverbs not present." \
            " Your options:" \
            "  (1) Fast: Install infiniband packages from your distro." \
            "  (2) Cleanest: Install libraries from www.openfabrics.org" \
            "  (3) Also: Install softiwarp if you don't have RDMA hardware"
    fi
    rdma="no"
  fi
fi

##########################################
# VNC TLS/WS detection
if test "$vnc" = "yes" -a \( "$vnc_tls" != "no" -o "$vnc_ws" != "no" \) ; then
@@ -3558,6 +3589,7 @@ echo "Linux AIO support $linux_aio"
echo "ATTR/XATTR support $attr"
echo "Install blobs     $blobs"
echo "KVM support       $kvm"
echo "RDMA support      $rdma"
echo "TCG interpreter   $tcg_interpreter"
echo "fdt support       $fdt"
echo "preadv support    $preadv"
@@ -4046,6 +4078,10 @@ if test "$trace_default" = "yes"; then
  echo "CONFIG_TRACE_DEFAULT=y" >> $config_host_mak
fi

if test "$rdma" = "yes" ; then
  echo "CONFIG_RDMA=y" >> $config_host_mak
fi

if test "$tcg_interpreter" = "yes"; then
  QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES"
elif test "$ARCH" = "sparc64" ; then
@@ -4485,6 +4521,10 @@ if [ "$pixman" = "internal" ]; then
  echo "config-host.h: subdir-pixman" >> $config_host_mak
fi

if test "$rdma" = "yes" ; then
echo "CONFIG_RDMA=y" >> $config_host_mak
fi

if [ "$dtc_internal" = "yes" ]; then
  echo "config-host.h: subdir-dtc" >> $config_host_mak
fi
+30 −21
Original line number Diff line number Diff line
@@ -35,7 +35,7 @@ memory tracked during each live migration iteration round cannot keep pace
with the rate of dirty memory produced by the workload.

RDMA currently comes in two flavors: both Ethernet based (RoCE, or RDMA
over Convered Ethernet) as well as Infiniband-based. This implementation of
over Converged Ethernet) as well as Infiniband-based. This implementation of
migration using RDMA is capable of using both technologies because of
the use of the OpenFabrics OFED software stack that abstracts out the
programming model irrespective of the underlying hardware.
@@ -202,7 +202,7 @@ The maximum number of repeats is hard-coded to 4096. This is a conservative
limit based on the maximum size of a SEND message along with empirical
observations on the maximum future benefit of simultaneous page registrations.

The 'type' field has 10 different command values:
The 'type' field has 12 different command values:
     1. Unused
     2. Error                      (sent to the source during bad things)
     3. Ready                      (control-channel is available)
@@ -213,6 +213,8 @@ The 'type' field has 10 different command values:
     8. Register request           (dynamic chunk registration)
     9. Register result            ('rkey' to be used by sender)
    10. Register finished          (registration for current iteration finished)
    11. Unregister request         (unpin previously registered memory)
    12. Unregister finished        (confirmation that unpin completed)

A single control message, as hinted above, can contain within the data
portion an array of many commands of the same type. If there is more than
@@ -243,7 +245,7 @@ qemu_rdma_exchange_send(header, data, optional response header & data):
   from the receiver to tell us that the receiver
   is *ready* for us to transmit some new bytes.
2. Optionally: if we are expecting a response from the command
   (that we have no yet transmitted), let's post an RQ
   (that we have not yet transmitted), let's post an RQ
   work request to receive that data a few moments later.
3. When the READY arrives, librdmacm will
   unblock us and we immediately post a RQ work request
@@ -293,8 +295,10 @@ librdmacm provides the user with a 'private data' area to be exchanged
at connection-setup time before any infiniband traffic is generated.

Header:
    * Version (protocol version validated before send/recv occurs), uint32, network byte order
    * Flags   (bitwise OR of each capability), uint32, network byte order
    * Version (protocol version validated before send/recv occurs),
                                               uint32, network byte order
    * Flags   (bitwise OR of each capability),
                                               uint32, network byte order

There is no data portion of this header right now, so there is
no length field. The maximum size of the 'private data' section
@@ -313,7 +317,7 @@ If the version is invalid, we throw an error.
If the version is new, we only negotiate the capabilities that the
requested version is able to perform and ignore the rest.

Currently there is only *one* capability in Version #1: dynamic page registration
Currently there is only one capability in Version #1: dynamic page registration

Finally: Negotiation happens with the Flags field: If the primary-VM
sets a flag, but the destination does not support this capability, it
@@ -413,3 +417,8 @@ TODO:
   the use of KSM and ballooning while using RDMA.
4. Also, some form of balloon-device usage tracking would also
   help alleviate some issues.
5. Move UNREGISTER requests to a separate thread.
6. Use LRU to provide more fine-grained direction of UNREGISTER
   requests for unpinning memory in an overcommitted environment.
7. Expose UNREGISTER support to the user by way of workload-specific
   hints about application behavior.
+4 −0
Original line number Diff line number Diff line
@@ -164,6 +164,10 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
            monitor_printf(mon, "downtime: %" PRIu64 " milliseconds\n",
                           info->downtime);
        }
        if (info->has_setup_time) {
            monitor_printf(mon, "setup: %" PRIu64 " milliseconds\n",
                           info->setup_time);
        }
    }

    if (info->has_ram) {
Loading