Commit 93197380 authored by Peter Maydell's avatar Peter Maydell
Browse files

Merge remote-tracking branch 'remotes/bonzini/tags/for-upstream-replay' into staging



So here it is, let's see what happens.

# gpg: Signature made Fri 06 Nov 2015 09:30:34 GMT using RSA key ID 78C7AE83
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>"
# gpg:                 aka "Paolo Bonzini <pbonzini@redhat.com>"

* remotes/bonzini/tags/for-upstream-replay:
  replay: recording of the user input
  replay: command line options
  replay: replay blockers for devices
  replay: initialization and deinitialization
  replay: ptimer
  bottom halves: introduce bh call function
  replay: checkpoints
  icount: improve counting for record/replay
  replay: shutdown event
  replay: recording and replaying clock ticks
  replay: asynchronous events infrastructure
  replay: interrupts and exceptions
  cpu: replay instructions sequence
  cpu-exec: allow temporary disabling icount
  replay: introduce icount event
  replay: introduce mutex to protect the replay log
  replay: internal functions for replay log
  replay: global variables and function stubs

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
parents 3aa88b31 ee312992
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -54,6 +54,8 @@ common-obj-y += audio/
common-obj-y += hw/
common-obj-y += accel.o

common-obj-y += replay/

common-obj-y += ui/
common-obj-y += bt-host.o bt-vhci.o
bt-host.o-cflags := $(BLUEZ_CFLAGS)
+6 −1
Original line number Diff line number Diff line
@@ -59,6 +59,11 @@ QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
    return bh;
}

/*
 * Run the callback attached to the bottom half @bh, handing it the
 * opaque pointer stored alongside it.
 */
void aio_bh_call(QEMUBH *bh)
{
    QEMUBHFunc *handler = bh->cb;
    void *handler_arg = bh->opaque;

    handler(handler_arg);
}

/* Multiple occurrences of aio_bh_poll cannot be called concurrently */
int aio_bh_poll(AioContext *ctx)
{
@@ -84,7 +89,7 @@ int aio_bh_poll(AioContext *ctx)
                ret = 1;
            }
            bh->idle = 0;
            bh->cb(bh->opaque);
            aio_bh_call(bh);
        }
    }

+42 −13
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@
#if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
#include "hw/i386/apic.h"
#endif
#include "sysemu/replay.h"

/* -icount align implementation. */

@@ -184,7 +185,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, uint8_t *tb_ptr)
/* Execute the code without caching the generated code. An interpreter
   could be used if available. */
static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
                             TranslationBlock *orig_tb)
                             TranslationBlock *orig_tb, bool ignore_icount)
{
    TranslationBlock *tb;

@@ -194,7 +195,8 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
        max_cycles = CF_COUNT_MASK;

    tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
                     max_cycles | CF_NOCACHE);
                     max_cycles | CF_NOCACHE
                         | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
    tb->orig_tb = tcg_ctx.tb_ctx.tb_invalidated_flag ? NULL : orig_tb;
    cpu->current_tb = tb;
    /* execute the generated code */
@@ -345,21 +347,25 @@ int cpu_exec(CPUState *cpu)
    uintptr_t next_tb;
    SyncClocks sc;

    /* replay_interrupt may need current_cpu */
    current_cpu = cpu;

    if (cpu->halted) {
#if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
        if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_POLL)
            && replay_interrupt()) {
            apic_poll_irq(x86_cpu->apic_state);
            cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL);
        }
#endif
        if (!cpu_has_work(cpu)) {
            current_cpu = NULL;
            return EXCP_HALTED;
        }

        cpu->halted = 0;
    }

    current_cpu = cpu;
    atomic_mb_set(&tcg_current_cpu, cpu);
    rcu_read_lock();

@@ -401,10 +407,22 @@ int cpu_exec(CPUState *cpu)
                    cpu->exception_index = -1;
                    break;
#else
                    if (replay_exception()) {
                        cc->do_interrupt(cpu);
                        cpu->exception_index = -1;
                    } else if (!replay_has_interrupt()) {
                        /* give a chance to iothread in replay mode */
                        ret = EXCP_INTERRUPT;
                        break;
                    }
#endif
                }
            } else if (replay_has_exception()
                       && cpu->icount_decr.u16.low + cpu->icount_extra == 0) {
                /* try to cause an exception pending in the log */
                cpu_exec_nocache(cpu, 1, tb_find_fast(cpu), true);
                ret = -1;
                break;
            }

            next_tb = 0; /* force lookup of first TB */
@@ -420,31 +438,41 @@ int cpu_exec(CPUState *cpu)
                        cpu->exception_index = EXCP_DEBUG;
                        cpu_loop_exit(cpu);
                    }
                    if (interrupt_request & CPU_INTERRUPT_HALT) {
                    if (replay_mode == REPLAY_MODE_PLAY
                        && !replay_has_interrupt()) {
                        /* Do nothing */
                    } else if (interrupt_request & CPU_INTERRUPT_HALT) {
                        replay_interrupt();
                        cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
                        cpu->halted = 1;
                        cpu->exception_index = EXCP_HLT;
                        cpu_loop_exit(cpu);
                    }
#if defined(TARGET_I386)
                    if (interrupt_request & CPU_INTERRUPT_INIT) {
                    else if (interrupt_request & CPU_INTERRUPT_INIT) {
                        replay_interrupt();
                        cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0);
                        do_cpu_init(x86_cpu);
                        cpu->exception_index = EXCP_HALTED;
                        cpu_loop_exit(cpu);
                    }
#else
                    if (interrupt_request & CPU_INTERRUPT_RESET) {
                    else if (interrupt_request & CPU_INTERRUPT_RESET) {
                        replay_interrupt();
                        cpu_reset(cpu);
                        cpu_loop_exit(cpu);
                    }
#endif
                    /* The target hook has 3 exit conditions:
                       False when the interrupt isn't processed,
                       True when it is, and we should restart on a new TB,
                       and via longjmp via cpu_loop_exit.  */
                    else {
                        replay_interrupt();
                        if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
                            next_tb = 0;
                        }
                    }
                    /* Don't use the cached interrupt_request value,
                       do_interrupt may have updated the EXITTB flag. */
                    if (cpu->interrupt_request & CPU_INTERRUPT_EXITTB) {
@@ -454,7 +482,8 @@ int cpu_exec(CPUState *cpu)
                        next_tb = 0;
                    }
                }
                if (unlikely(cpu->exit_request)) {
                if (unlikely(cpu->exit_request
                             || replay_has_interrupt())) {
                    cpu->exit_request = 0;
                    cpu->exception_index = EXCP_INTERRUPT;
                    cpu_loop_exit(cpu);
@@ -519,7 +548,7 @@ int cpu_exec(CPUState *cpu)
                            if (insns_left > 0) {
                                /* Execute remaining instructions.  */
                                tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
                                cpu_exec_nocache(cpu, insns_left, tb);
                                cpu_exec_nocache(cpu, insns_left, tb, false);
                                align_clocks(&sc, cpu);
                            }
                            cpu->exception_index = EXCP_INTERRUPT;
+47 −17
Original line number Diff line number Diff line
@@ -42,6 +42,7 @@
#include "qemu/seqlock.h"
#include "qapi-event.h"
#include "hw/nmi.h"
#include "sysemu/replay.h"

#ifndef _WIN32
#include "qemu/compatfd.h"
@@ -334,7 +335,7 @@ static int64_t qemu_icount_round(int64_t count)
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}

static void icount_warp_rt(void *opaque)
static void icount_warp_rt(void)
{
    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
@@ -345,7 +346,8 @@ static void icount_warp_rt(void *opaque)

    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        int64_t clock = cpu_get_clock_locked();
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
@@ -368,6 +370,11 @@ static void icount_warp_rt(void *opaque)
    }
}

/*
 * Timer callback that deliberately does nothing.
 *
 * @opaque: unused callback argument.
 */
static void icount_dummy_timer(void *opaque)
{
    /* Reference the parameter only to avoid an unused-argument warning. */
    (void) opaque;
}

void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
@@ -403,6 +410,18 @@ void qemu_clock_warp(QEMUClockType type)
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP)) {
        return;
    }

    if (icount_sleep) {
        /*
         * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now.
@@ -412,7 +431,7 @@ void qemu_clock_warp(QEMUClockType type)
         * the CPU starts running, in case the CPU is woken by an event other
         * than the earliest QEMU_CLOCK_VIRTUAL timer.
         */
        icount_warp_rt(NULL);
        icount_warp_rt();
        timer_del(icount_warp_timer);
    }
    if (!all_cpu_threads_idle()) {
@@ -605,7 +624,7 @@ void configure_icount(QemuOpts *opts, Error **errp)
    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                         icount_warp_rt, NULL);
                                         icount_dummy_timer, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);
@@ -1402,6 +1421,28 @@ int vm_stop_force_state(RunState state)
    }
}

/*
 * Compute the icount limit for the next round of TCG execution.
 *
 * In replay (play) mode the limit is whatever replay_get_instructions()
 * reports.  In every other mode it is derived from the nearest
 * QEMU_CLOCK_VIRTUAL deadline, converted with qemu_icount_round().
 */
static int64_t tcg_get_icount_limit(void)
{
    int64_t ns_to_deadline;

    if (replay_mode == REPLAY_MODE_PLAY) {
        return replay_get_instructions();
    }

    ns_to_deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

    /* Maintain prior (possibly buggy) behaviour: when no deadline is set
     * (there is no QEMU_CLOCK_VIRTUAL timer) or the deadline lies more
     * than INT32_MAX nanoseconds ahead, fall back to INT32_MAX
     * nanoseconds.
     */
    if (ns_to_deadline < 0 || ns_to_deadline > INT32_MAX) {
        ns_to_deadline = INT32_MAX;
    }

    return qemu_icount_round(ns_to_deadline);
}

static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
@@ -1414,24 +1455,12 @@ static int tcg_cpu_exec(CPUState *cpu)
#endif
    if (use_icount) {
        int64_t count;
        int64_t deadline;
        int decr;
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                    + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        count = qemu_icount_round(deadline);
        count = tcg_get_icount_limit();
        timers_state.qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
@@ -1449,6 +1478,7 @@ static int tcg_cpu_exec(CPUState *cpu)
                        + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
        replay_account_executed_instructions();
    }
    return ret;
}

docs/replay.txt

0 → 100644
+168 −0
Original line number Diff line number Diff line
Copyright (c) 2010-2015 Institute for System Programming
                        of the Russian Academy of Sciences.

This work is licensed under the terms of the GNU GPL, version 2 or later.
See the COPYING file in the top-level directory.

Record/replay
-------------

Record/replay functions are used for the reverse execution and deterministic
replay of qemu execution. This implementation of deterministic replay can
be used for deterministic debugging of guest code through a gdb remote
interface.

Execution recording writes a non-deterministic events log, which can be later
used for replaying the execution anywhere and an unlimited number of times.
It also supports checkpointing for faster rewinding during reverse debugging.
Execution replaying reads the log and replays all non-deterministic events
including external input, hardware clocks, and interrupts.

Deterministic replay has the following features:
 * Deterministically replays whole system execution and all contents of
   the memory, state of the hardware devices, clocks, and screen of the VM.
 * Writes the execution log to a file, which can later be replayed multiple
   times on different machines.
 * Supports i386, x86_64, and ARM hardware platforms.
 * Performs deterministic replay of all operations with keyboard and mouse
   input devices.

Usage of the record/replay:
 * First, record the execution, by adding the following arguments to the command line:
   '-icount shift=7,rr=record,rrfile=replay.bin -net none'.
   Block devices' images are not actually changed in the recording mode,
   because all of the changes are written to the temporary overlay file.
 * Then you can replay it by using another command
   line option: '-icount shift=7,rr=replay,rrfile=replay.bin -net none'
 * '-net none' option should also be specified if network replay patches
   are not applied.

Papers with description of deterministic replay implementation:
http://www.computer.org/csdl/proceedings/csmr/2012/4666/00/4666a553-abs.html
http://dl.acm.org/citation.cfm?id=2786805.2803179

Modifications of qemu include:
 * wrappers for clock and time functions to save their return values in the log
 * saving different asynchronous events (e.g. system shutdown) into the log
 * synchronization of the bottom halves execution
 * synchronization of the threads from thread pool
 * recording/replaying user input (mouse and keyboard)
 * adding internal checkpoints for cpu and io synchronization

Non-deterministic events
------------------------

Our record/replay system is based on saving and replaying non-deterministic
events (e.g. keyboard input) and simulating deterministic ones (e.g. reading
from HDD or memory of the VM). Saving only non-deterministic events makes
log file smaller, simulation faster, and allows using reverse debugging even
for realtime applications.

The following non-deterministic data from peripheral devices is saved into
the log: mouse and keyboard input, network packets, audio controller input,
USB packets, serial port input, and hardware clocks (they are non-deterministic
too, because their values are taken from the host machine). Inputs from
simulated hardware, memory of VM, software interrupts, and execution of
instructions are not saved into the log, because they are deterministic and
can be replayed by simulating the behavior of virtual machine starting from
initial state.

We had to solve three tasks to implement deterministic replay: recording
non-deterministic events, replaying non-deterministic events, and checking
that there is no divergence between record and replay modes.

We changed several parts of QEMU to implement event log recording and
replaying. Device models that have non-deterministic input from external
devices were changed to write every external event into the execution log
immediately. E.g. network packets are written into the log when they arrive
at the virtual network adapter.

All non-deterministic events are coming from these devices. But to
replay them we need to know at which moments they occur. We specify
these moments by counting the number of instructions executed between
every pair of consecutive events.

Instruction counting
--------------------

QEMU should work in icount mode to use record/replay feature. icount was
designed to allow deterministic execution in absence of external inputs
of the virtual machine. We also use icount to control the occurrence of the
non-deterministic events. The number of instructions elapsed from the last event
is written to the log while recording the execution. In replay mode we
can predict when to inject that event using the instruction counter.

Timers
------

Timers are used to execute callbacks from different subsystems of QEMU
at the specified moments of time. There are several kinds of timers:
 * Real time clock. Based on host time and used only for callbacks that
   do not change the virtual machine state. For this reason the real time
   clock and its timers do not affect deterministic replay at all.
 * Virtual clock. These timers run only during the emulation. In icount
   mode virtual clock value is calculated using executed instructions counter.
   That is why it is completely deterministic and does not have to be recorded.
 * Host clock. This clock is used by device models that simulate real time
   sources (e.g. real time clock chip). The host clock is one of the sources
   of non-determinism. Host clock read operations should be logged to
   make the execution deterministic.
 * Real time clock for icount. This clock is similar to the real time clock
   but it is used only for increasing the virtual clock while the virtual
   machine is sleeping. Due to its nature it is also non-deterministic, like
   the host clock, and has to be logged too.

Checkpoints
-----------

Replaying of the execution of virtual machine is bound by sources of
non-determinism. These are inputs from clock and peripheral devices,
and QEMU thread scheduling. Thread scheduling affects the processing of events
from timers, asynchronous input-output, and bottom halves.

Invocations of timers are coupled with clock reads and changing the state
of the virtual machine. Reads produce non-deterministic data taken from
host clock. And VM state changes should preserve their order. Their relative
order in replay mode must replicate the order of callbacks in record mode.
To preserve this order we use checkpoints. When a specific clock is processed
in record mode we save a special "checkpoint" event to the log.
Checkpoints here do not refer to virtual machine snapshots. They are just
record/replay events used for synchronization.

QEMU in replay mode will try to invoke timer processing at arbitrary moments
of time. That is why we do not process a group of timers until the
corresponding checkpoint event is read from the log. Such an event allows
synchronizing CPU execution and timer events.

Another application of checkpoints in record/replay is instruction counting
while the virtual machine is idle. This function (qemu_clock_warp) is called
from the wait loop. It changes the virtual machine state and therefore must be
deterministic. That is why we added a checkpoint to this function to prevent
its operation in replay mode when it does not correspond to record mode.

Bottom halves
-------------

Disk I/O events are completely deterministic in our model, because
in both record and replay modes we start virtual machine from the same
disk state. But callbacks that virtual disk controller uses for reading and
writing the disk may occur at different moments of time in record and replay
modes.

Reading and writing requests are created by CPU thread of QEMU. Later these
requests proceed to block layer which creates "bottom halves". Bottom
halves consist of callback and its parameters. They are processed when
main loop locks the global mutex. These locks are not synchronized with
replaying process because main loop also processes the events that do not
affect the virtual machine state (like user interaction with monitor).

That is why we had to implement saving and replaying bottom halves callbacks
synchronously to the CPU execution. When the callback is about to execute
it is added to the queue in the replay module. This queue is written to the
log when its callbacks are executed. In replay mode callbacks are not processed
until the corresponding event is read from the events log file.

Sometimes the block layer uses asynchronous callbacks for its internal purposes
(like reading or writing VM snapshots or disk image cluster tables). In this
case bottom halves are not marked as "replayable" and are not saved
into the log.
Loading