Commit 33836a73 authored by Peter Maydell

Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20180615' into staging



TCG patch queue:

Workaround macos assembler lossage.
Eliminate tb_lock.
Fix TB code generation overflow.

# gpg: Signature made Fri 15 Jun 2018 20:40:56 BST
# gpg:                using RSA key 64DF38E8AF7E215F
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>"
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A  05C0 64DF 38E8 AF7E 215F

* remotes/rth/tags/pull-tcg-20180615:
  tcg: Reduce max TB opcode count
  tcg: remove tb_lock
  translate-all: remove tb_lock mention from cpu_restore_state_from_tb
  cputlb: remove tb_lock from tlb_flush functions
  translate-all: protect TB jumps with a per-destination-TB lock
  translate-all: discard TB when tb_link_page returns an existing matching TB
  translate-all: introduce assert_no_pages_locked
  translate-all: add page_locked assertions
  translate-all: use per-page locking in !user-mode
  translate-all: move tb_invalidate_phys_page_range up in the file
  translate-all: work page-by-page in tb_invalidate_phys_range_1
  translate-all: remove hole in PageDesc
  translate-all: make l1_map lockless
  translate-all: iterate over TBs in a page with PAGE_FOR_EACH_TB
  tcg: move tb_ctx.tb_phys_invalidate_count to tcg_ctx
  tcg: track TBs with per-region BST's
  qht: return existing entry when qht_insert fails
  qht: require a default comparison function
  tcg/i386: Use byte form of xgetbv instruction

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
parents 46012db6 9f754620
+41 −55
@@ -212,20 +212,20 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
        We only end up here when an existing TB is too long.  */
     cflags |= MIN(max_cycles, CF_COUNT_MASK);
 
-    tb_lock();
+    mmap_lock();
     tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
                      orig_tb->flags, cflags);
     tb->orig_tb = orig_tb;
-    tb_unlock();
+    mmap_unlock();
 
     /* execute the generated code */
     trace_exec_tb_nocache(tb, tb->pc);
     cpu_tb_exec(cpu, tb);
 
-    tb_lock();
+    mmap_lock();
     tb_phys_invalidate(tb, -1);
-    tb_remove(tb);
-    tb_unlock();
+    mmap_unlock();
+    tcg_tb_remove(tb);
 }
 #endif
 
@@ -244,12 +244,7 @@ void cpu_exec_step_atomic(CPUState *cpu)
         tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
         if (tb == NULL) {
             mmap_lock();
-            tb_lock();
-            tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask);
-            if (likely(tb == NULL)) {
-                tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
-            }
-            tb_unlock();
+            tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
             mmap_unlock();
         }
 
@@ -264,15 +259,14 @@
         cpu_tb_exec(cpu, tb);
         cc->cpu_exec_exit(cpu);
     } else {
-        /* We may have exited due to another problem here, so we need
-         * to reset any tb_locks we may have taken but didn't release.
+        /*
+         * The mmap_lock is dropped by tb_gen_code if it runs out of
+         * memory.
          */
 #ifndef CONFIG_SOFTMMU
         tcg_debug_assert(!have_mmap_lock());
 #endif
-        tb_lock_reset();
+        assert_no_pages_locked();
     }
 
     if (in_exclusive_region) {
@@ -295,7 +289,7 @@ struct tb_desc {
     uint32_t trace_vcpu_dstate;
 };
 
-static bool tb_cmp(const void *p, const void *d)
+static bool tb_lookup_cmp(const void *p, const void *d)
 {
     const TranslationBlock *tb = p;
     const struct tb_desc *desc = d;
@@ -340,7 +334,7 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
     phys_pc = get_page_addr_code(desc.env, pc);
     desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
     h = tb_hash_func(phys_pc, pc, flags, cf_mask, *cpu->trace_dstate);
-    return qht_lookup(&tb_ctx.htable, tb_cmp, &desc, h);
+    return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp);
 }
 
 void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
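The qht_lookup_custom() change above pairs with the two QHT patches in this queue ("qht: require a default comparison function" and "qht: return existing entry when qht_insert fails"). A minimal sketch of the resulting calling convention, with prototypes paraphrased from the series rather than quoted (check include/qemu/qht.h for the authoritative declarations):

    /* Sketch only: prototypes paraphrased from this pull's qht patches. */
    qht_init(&tb_ctx.htable, tb_cmp, CODE_GEN_HTABLE_SIZE,
             QHT_MODE_AUTO_RESIZE);   /* a default comparison function
                                         is now required at init time */

    /* A lookup may still supply a one-off comparator, as
     * tb_htable_lookup() does above: */
    tb = qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp);

    /* On collision, qht_insert() now hands back the existing entry, so
     * two vCPUs racing to translate the same block share a single TB: */
    void *existing;
    if (!qht_insert(&tb_ctx.htable, tb, h, &existing)) {
        tb = existing;   /* discard our translation, reuse the winner's */
    }
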
@@ -354,28 +348,43 @@ void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
     }
 }
 
-/* Called with tb_lock held.  */
 static inline void tb_add_jump(TranslationBlock *tb, int n,
                                TranslationBlock *tb_next)
 {
+    uintptr_t old;
+
     assert(n < ARRAY_SIZE(tb->jmp_list_next));
-    if (tb->jmp_list_next[n]) {
-        /* Another thread has already done this while we were
-         * outside of the lock; nothing to do in this case */
-        return;
+    qemu_spin_lock(&tb_next->jmp_lock);
+
+    /* make sure the destination TB is valid */
+    if (tb_next->cflags & CF_INVALID) {
+        goto out_unlock_next;
+    }
+    /* Atomically claim the jump destination slot only if it was NULL */
+    old = atomic_cmpxchg(&tb->jmp_dest[n], (uintptr_t)NULL, (uintptr_t)tb_next);
+    if (old) {
+        goto out_unlock_next;
     }
+
+    /* patch the native jump address */
+    tb_set_jmp_target(tb, n, (uintptr_t)tb_next->tc.ptr);
+
+    /* add in TB jmp list */
+    tb->jmp_list_next[n] = tb_next->jmp_list_head;
+    tb_next->jmp_list_head = (uintptr_t)tb | n;
+
+    qemu_spin_unlock(&tb_next->jmp_lock);
+
     qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc,
                            "Linking TBs %p [" TARGET_FMT_lx
                            "] index %d -> %p [" TARGET_FMT_lx "]\n",
                            tb->tc.ptr, tb->pc, n,
                            tb_next->tc.ptr, tb_next->pc);
+    return;
 
-    /* patch the native jump address */
-    tb_set_jmp_target(tb, n, (uintptr_t)tb_next->tc.ptr);
-
-    /* add in TB jmp circular list */
-    tb->jmp_list_next[n] = tb_next->jmp_list_first;
-    tb_next->jmp_list_first = (uintptr_t)tb | n;
+ out_unlock_next:
+    qemu_spin_unlock(&tb_next->jmp_lock);
+    return;
 }
 
 static inline TranslationBlock *tb_find(CPUState *cpu,
@@ -385,27 +394,11 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
     TranslationBlock *tb;
     target_ulong cs_base, pc;
     uint32_t flags;
-    bool acquired_tb_lock = false;
 
     tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
     if (tb == NULL) {
-        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
-         * taken outside tb_lock. As system emulation is currently
-         * single threaded the locks are NOPs.
-         */
         mmap_lock();
-        tb_lock();
-        acquired_tb_lock = true;
-
-        /* There's a chance that our desired tb has been translated while
-         * taking the locks so we check again inside the lock.
-         */
-        tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask);
-        if (likely(tb == NULL)) {
-            /* if no translated code available, then translate it now */
-            tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
-        }
-
+        tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
         mmap_unlock();
         /* We add the TB in the virtual pc hash table for the fast lookup */
         atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
@@ -421,17 +414,8 @@
 #endif
     /* See if we can patch the calling TB. */
     if (last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
-        if (!acquired_tb_lock) {
-            tb_lock();
-            acquired_tb_lock = true;
-        }
-        if (!(tb->cflags & CF_INVALID)) {
-            tb_add_jump(last_tb, tb_exit, tb);
-        }
-    }
-    if (acquired_tb_lock) {
-        tb_unlock();
+        tb_add_jump(last_tb, tb_exit, tb);
     }
     return tb;
 }
 
@@ -706,7 +690,9 @@
         g_assert(cpu == current_cpu);
         g_assert(cc == CPU_GET_CLASS(cpu));
 #endif /* buggy compiler */
-        tb_lock_reset();
+#ifndef CONFIG_SOFTMMU
+        tcg_debug_assert(!have_mmap_lock());
+#endif
         if (qemu_mutex_iothread_locked()) {
             qemu_mutex_unlock_iothread();
         }
+0 −8
@@ -125,8 +125,6 @@ static void tlb_flush_nocheck(CPUState *cpu)
     atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
     tlb_debug("(count: %zu)\n", tlb_flush_count());
 
-    tb_lock();
-
     memset(env->tlb_table, -1, sizeof(env->tlb_table));
     memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
     cpu_tb_jmp_cache_clear(cpu);
@@ -135,8 +133,6 @@ static void tlb_flush_nocheck(CPUState *cpu)
     env->tlb_flush_addr = -1;
     env->tlb_flush_mask = 0;
 
-    tb_unlock();
-
     atomic_mb_set(&cpu->pending_tlb_flush, 0);
 }
 
@@ -180,8 +176,6 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
 
     assert_cpu_is_self(cpu);
 
-    tb_lock();
-
     tlb_debug("start: mmu_idx:0x%04lx\n", mmu_idx_bitmask);
 
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
@@ -197,8 +191,6 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
     cpu_tb_jmp_cache_clear(cpu);
 
     tlb_debug("done\n");
-
-    tb_unlock();
 }
 
 void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
+720 −321
File changed; diff collapsed (preview size limit exceeded).

+5 −1
@@ -23,7 +23,11 @@
 
 
 /* translate-all.c */
-void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len);
+struct page_collection *page_collection_lock(tb_page_addr_t start,
+                                             tb_page_addr_t end);
+void page_collection_unlock(struct page_collection *set);
+void tb_invalidate_phys_page_fast(struct page_collection *pages,
+                                  tb_page_addr_t start, int len);
 void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
                                    int is_cpu_write_access);
 void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end);
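These declarations move page locking out of tb_invalidate_phys_page_fast() and into its callers. A minimal sketch of the intended calling pattern, assuming a caller invalidating a span of guest code that was just written (the variable names are illustrative, not taken from the diff):

    /* Sketch: pin every page intersecting [start, start + len),
     * invalidate, then release the whole set.  Illustrative only. */
    struct page_collection *pages;

    pages = page_collection_lock(start, start + len);
    tb_invalidate_phys_page_fast(pages, start, len);
    page_collection_unlock(pages);
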
+16 −8
@@ -61,6 +61,7 @@ have their block-to-block jumps patched.
 Global TCG State
 ----------------
 
+### User-mode emulation
 We need to protect the entire code generation cycle including any post
 generation patching of the translated code. This also implies a shared
 translation buffer which contains code running on all cores. Any
@@ -75,9 +76,11 @@
 
 (Current solution)
 
-Mainly as part of the linux-user work all code generation is
-serialised with a tb_lock(). For the SoftMMU tb_lock() also takes the
-place of mmap_lock() in linux-user.
+Code generation is serialised with mmap_lock().
+
+### !User-mode emulation
+Each vCPU has its own TCG context and associated TCG region, thereby
+requiring no locking.
 
 Translation Blocks
 ------------------
@@ -131,15 +134,20 @@ DESIGN REQUIREMENT: Safely handle invalidation of TBs
 
 The direct jump themselves are updated atomically by the TCG
 tb_set_jmp_target() code. Modification to the linked lists that allow
-searching for linked pages are done under the protect of the
-tb_lock().
+searching for linked pages are done under the protection of tb->jmp_lock,
+where tb is the destination block of a jump. Each origin block keeps a
+pointer to its destinations so that the appropriate lock can be acquired before
+iterating over a jump list.
 
-The global page table is protected by the tb_lock() in system-mode and
-mmap_lock() in linux-user mode.
+The global page table is a lockless radix tree; cmpxchg is used
+to atomically insert new elements.
 
 The lookup caches are updated atomically and the lookup hash uses QHT
 which is designed for concurrent safe lookup.
+
+Parallel code generation is supported. QHT is used at insertion time
+as the synchronization point across threads, thereby ensuring that we only
+keep track of a single TranslationBlock for each guest code block.
 
 Memory maps and TLBs
 --------------------
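The "lockless radix tree" above refers to this queue's "translate-all: make l1_map lockless" patch. Below is a standalone C11 analogue of the insertion step, not QEMU's actual code: a new level is allocated, then published with a compare-and-swap, and the loser of a concurrent race frees its copy and adopts the winner's.

    /* Standalone C11 analogue of inserting a level into a lockless
     * radix tree.  Sketch of the technique only -- not QEMU's l1_map. */
    #include <stdatomic.h>
    #include <stdlib.h>

    #define V_L2_SIZE 256   /* entries per level, illustrative */

    typedef struct Level {
        _Atomic(void *) ptr[V_L2_SIZE];
    } Level;

    /* Return the child level at *slot, allocating and publishing it if
     * it does not exist yet. */
    static Level *level_get(_Atomic(void *) *slot)
    {
        void *existing = atomic_load_explicit(slot, memory_order_acquire);
        Level *newl;

        if (existing) {
            return existing;
        }
        newl = calloc(1, sizeof(*newl));
        if (!atomic_compare_exchange_strong_explicit(slot, &existing, newl,
                                                     memory_order_release,
                                                     memory_order_acquire)) {
            /* Another thread inserted first; use its node. */
            free(newl);
            return existing;
        }
        return newl;
    }

Readers never take a lock: a NULL slot simply means the level does not exist yet, and the release/acquire pairing guarantees the zero-initialised node is visible before the pointer to it.
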
@@ -190,7 +198,7 @@ work as "safe work" and exiting the cpu run loop. This ensure by the
 time execution restarts all flush operations have completed.
 
 TLB flag updates are all done atomically and are also protected by the
-tb_lock() which is used by the functions that update the TLB in bulk.
+corresponding page lock.
 
 (Known limitation)
 