Commit 3a183e33 authored by Peter Maydell

Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20190128' into staging



Backend vector enhancements
Dynamic TLB sizing

# gpg: Signature made Mon 28 Jan 2019 15:57:19 GMT
# gpg:                using RSA key 64DF38E8AF7E215F
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full]
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A  05C0 64DF 38E8 AF7E 215F

* remotes/rth/tags/pull-tcg-20190128: (23 commits)
  cputlb: Remove static tlb sizing
  tcg/tci: enable dynamic TLB sizing
  tcg/mips: enable dynamic TLB sizing
  tcg/mips: Fix tcg_out_qemu_ld_slow_path
  tcg/arm: enable dynamic TLB sizing
  tcg/riscv: enable dynamic TLB sizing
  tcg/s390: enable dynamic TLB sizing
  tcg/sparc: enable dynamic TLB sizing
  tcg/ppc: enable dynamic TLB sizing
  tcg/aarch64: enable dynamic TLB sizing
  tcg/i386: enable dynamic TLB sizing
  tcg: introduce dynamic TLB sizing
  cputlb: do not evict empty entries to the vtlb
  tcg/aarch64: Implement vector minmax arithmetic
  tcg/aarch64: Implement vector saturating arithmetic
  tcg/i386: Implement vector minmax arithmetic
  tcg/i386: Implement vector saturating arithmetic
  tcg/i386: Split subroutines out of tcg_expand_vec_op
  tcg: Add opcodes for vector minmax arithmetic
  tcg: Add opcodes for vector saturated arithmetic
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
parents 5f39a91d e77c89fb
accel/tcg/cputlb.c  +186 −6
@@ -74,6 +74,166 @@ QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
#define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)

static inline size_t sizeof_tlb(CPUArchState *env, uintptr_t mmu_idx)
{
    return env->tlb_mask[mmu_idx] + (1 << CPU_TLB_ENTRY_BITS);
}
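
The one-liner works because tlb_mask[i] stores (n_entries - 1) << CPU_TLB_ENTRY_BITS, so adding back the size of a single entry recovers the table size in bytes. A quick numeric check (editor's sketch; the concrete values are illustrative only):

/*
 * Illustration: 256 entries, CPU_TLB_ENTRY_BITS == 5 (32-byte entries):
 *   tlb_mask   = (256 - 1) << 5    = 0x1fe0
 *   sizeof_tlb = 0x1fe0 + (1 << 5) = 0x2000 = 256 * 32 bytes
 */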

static void tlb_window_reset(CPUTLBWindow *window, int64_t ns,
                             size_t max_entries)
{
    window->begin_ns = ns;
    window->max_entries = max_entries;
}

static void tlb_dyn_init(CPUArchState *env)
{
    int i;

    for (i = 0; i < NB_MMU_MODES; i++) {
        CPUTLBDesc *desc = &env->tlb_d[i];
        size_t n_entries = 1 << CPU_TLB_DYN_DEFAULT_BITS;

        tlb_window_reset(&desc->window, get_clock_realtime(), 0);
        desc->n_used_entries = 0;
        env->tlb_mask[i] = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
        env->tlb_table[i] = g_new(CPUTLBEntry, n_entries);
        env->iotlb[i] = g_new(CPUIOTLBEntry, n_entries);
    }
}

/**
 * tlb_mmu_resize_locked() - perform TLB resize bookkeeping; resize if necessary
 * @env: CPU that owns the TLB
 * @mmu_idx: MMU index of the TLB
 *
 * Called with tlb_lock held.
 *
 * We have two main constraints when resizing a TLB: (1) we only resize it
 * on a TLB flush (otherwise we'd have to take a perf hit by either rehashing
 * the array or unnecessarily flushing it), which means we do not control how
 * frequently the resizing can occur; (2) we don't have access to the guest's
 * future scheduling decisions, and therefore have to decide the magnitude of
 * the resize based on past observations.
 *
 * In general, a memory-hungry process can benefit greatly from an appropriately
 * sized TLB, since a guest TLB miss is very expensive. This doesn't mean that
 * we just have to make the TLB as large as possible; while an oversized TLB
 * results in minimal TLB miss rates, it also takes longer to be flushed
 * (flushes can be _very_ frequent), and the reduced locality can also hurt
 * performance.
 *
 * To achieve near-optimal performance for all kinds of workloads, we:
 *
 * 1. Aggressively increase the size of the TLB when the use rate of the
 * TLB being flushed is high, since it is likely that in the near future this
 * memory-hungry process will execute again, and its memory hungriness will
 * probably be similar.
 *
 * 2. Slowly reduce the size of the TLB as the use rate declines over a
 * reasonably large time window. The rationale is that if in such a time window
 * we have not observed a high TLB use rate, it is likely that we won't observe
 * it in the near future. In that case, once a time window expires we downsize
 * the TLB to match the maximum use rate observed in the window.
 *
 * 3. Try to keep the maximum use rate in a time window in the 30-70% range,
 * since in that range performance is likely near-optimal. Recall that the TLB
 * is direct mapped, so we want the use rate to be low (or at least not too
 * high), since otherwise we are likely to have a significant amount of
 * conflict misses.
 */
static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
{
    CPUTLBDesc *desc = &env->tlb_d[mmu_idx];
    size_t old_size = tlb_n_entries(env, mmu_idx);
    size_t rate;
    size_t new_size = old_size;
    int64_t now = get_clock_realtime();
    int64_t window_len_ms = 100;
    int64_t window_len_ns = window_len_ms * 1000 * 1000;
    bool window_expired = now > desc->window.begin_ns + window_len_ns;

    if (desc->n_used_entries > desc->window.max_entries) {
        desc->window.max_entries = desc->n_used_entries;
    }
    rate = desc->window.max_entries * 100 / old_size;

    if (rate > 70) {
        new_size = MIN(old_size << 1, 1 << CPU_TLB_DYN_MAX_BITS);
    } else if (rate < 30 && window_expired) {
        size_t ceil = pow2ceil(desc->window.max_entries);
        size_t expected_rate = desc->window.max_entries * 100 / ceil;

        /*
         * Avoid undersizing when the max number of entries seen is just below
         * a pow2. For instance, if max_entries == 1025, the expected use rate
         * would be 1025/2048==50%. However, if max_entries == 1023, we'd get
         * 1023/1024==99.9% use rate, so we'd likely end up doubling the size
         * later. Thus, make sure that the expected use rate remains below 70%
         * (and since we double the size, the lowest rate we would then expect
         * is 35%, which is still within the 30-70% range where we consider
         * the size appropriate).
         */
        if (expected_rate > 70) {
            ceil *= 2;
        }
        new_size = MAX(ceil, 1 << CPU_TLB_DYN_MIN_BITS);
    }

    if (new_size == old_size) {
        if (window_expired) {
            tlb_window_reset(&desc->window, now, desc->n_used_entries);
        }
        return;
    }

    g_free(env->tlb_table[mmu_idx]);
    g_free(env->iotlb[mmu_idx]);

    tlb_window_reset(&desc->window, now, 0);
    /* desc->n_used_entries is cleared by the caller */
    env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;
    env->tlb_table[mmu_idx] = g_try_new(CPUTLBEntry, new_size);
    env->iotlb[mmu_idx] = g_try_new(CPUIOTLBEntry, new_size);
    /*
     * If the allocations fail, try smaller sizes. We just freed some
     * memory, so going back to half of new_size has a good chance of working.
     * Increased memory pressure elsewhere in the system might cause the
     * allocations to fail though, so we progressively reduce the allocation
     * size, aborting if we cannot even allocate the smallest TLB we support.
     */
    while (env->tlb_table[mmu_idx] == NULL || env->iotlb[mmu_idx] == NULL) {
        if (new_size == (1 << CPU_TLB_DYN_MIN_BITS)) {
            error_report("%s: %s", __func__, strerror(errno));
            abort();
        }
        new_size = MAX(new_size >> 1, 1 << CPU_TLB_DYN_MIN_BITS);
        env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;

        g_free(env->tlb_table[mmu_idx]);
        g_free(env->iotlb[mmu_idx]);
        env->tlb_table[mmu_idx] = g_try_new(CPUTLBEntry, new_size);
        env->iotlb[mmu_idx] = g_try_new(CPUIOTLBEntry, new_size);
    }
}
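
To make the sizing policy above concrete, here is a minimal stand-alone model of the decision rule (editor's illustration, not part of this patch; pick_new_size is a made-up name, and the CPU_TLB_DYN_MIN/MAX_BITS clamps and the window bookkeeping are omitted):

#include <stdio.h>

/* Toy model of tlb_mmu_resize_locked's sizing decision (sketch only). */
static size_t pick_new_size(size_t old_size, size_t max_used, int window_expired)
{
    size_t rate = max_used * 100 / old_size;

    if (rate > 70) {
        return old_size * 2;                /* grow aggressively */
    }
    if (rate < 30 && window_expired) {
        size_t ceil = 1;

        while (ceil < max_used) {           /* pow2ceil(max_used) */
            ceil *= 2;
        }
        if (max_used * 100 / ceil > 70) {   /* keep the expected rate below 70% */
            ceil *= 2;
        }
        return ceil;
    }
    return old_size;                        /* stay within the 30-70% band */
}

int main(void)
{
    printf("%zu\n", pick_new_size(1024, 800, 0)); /* 78% used -> 2048 */
    printf("%zu\n", pick_new_size(1024, 200, 1)); /* 19% used -> 512 (200/256 would be 78%) */
    return 0;
}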

static inline void tlb_table_flush_by_mmuidx(CPUArchState *env, int mmu_idx)
{
    tlb_mmu_resize_locked(env, mmu_idx);
    memset(env->tlb_table[mmu_idx], -1, sizeof_tlb(env, mmu_idx));
    env->tlb_d[mmu_idx].n_used_entries = 0;
}

static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx)
{
    env->tlb_d[mmu_idx].n_used_entries++;
}

static inline void tlb_n_used_entries_dec(CPUArchState *env, uintptr_t mmu_idx)
{
    env->tlb_d[mmu_idx].n_used_entries--;
}

void tlb_init(CPUState *cpu)
{
    CPUArchState *env = cpu->env_ptr;
@@ -82,6 +242,8 @@ void tlb_init(CPUState *cpu)

    /* Ensure that cpu_reset performs a full flush.  */
    env->tlb_c.dirty = ALL_MMUIDX_BITS;

    tlb_dyn_init(env);
}

/* flush_all_helper: run fn across all cpus
@@ -122,7 +284,7 @@ void tlb_flush_counts(size_t *pfull, size_t *ppart, size_t *pelide)

static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
{
-    memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
+    tlb_table_flush_by_mmuidx(env, mmu_idx);
    memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
    env->tlb_d[mmu_idx].large_page_addr = -1;
    env->tlb_d[mmu_idx].large_page_mask = -1;
@@ -224,13 +386,24 @@ static inline bool tlb_hit_page_anyprot(CPUTLBEntry *tlb_entry,
           tlb_hit_page(tlb_entry->addr_code, page);
}

/**
 * tlb_entry_is_empty - return true if the entry is not in use
 * @te: pointer to CPUTLBEntry
 */
static inline bool tlb_entry_is_empty(const CPUTLBEntry *te)
{
    return te->addr_read == -1 && te->addr_write == -1 && te->addr_code == -1;
}

/* Called with tlb_c.lock held */
-static inline void tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
+static inline bool tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
                                          target_ulong page)
{
    if (tlb_hit_page_anyprot(tlb_entry, page)) {
        memset(tlb_entry, -1, sizeof(*tlb_entry));
        return true;
    }
    return false;
}

/* Called with tlb_c.lock held */
@@ -241,7 +414,9 @@ static inline void tlb_flush_vtlb_page_locked(CPUArchState *env, int mmu_idx,

    assert_cpu_is_self(ENV_GET_CPU(env));
    for (k = 0; k < CPU_VTLB_SIZE; k++) {
-        tlb_flush_entry_locked(&env->tlb_v_table[mmu_idx][k], page);
+        if (tlb_flush_entry_locked(&env->tlb_v_table[mmu_idx][k], page)) {
+            tlb_n_used_entries_dec(env, mmu_idx);
+        }
    }
}

@@ -258,7 +433,9 @@ static void tlb_flush_page_locked(CPUArchState *env, int midx,
                  midx, lp_addr, lp_mask);
        tlb_flush_one_mmuidx_locked(env, midx);
    } else {
-        tlb_flush_entry_locked(tlb_entry(env, midx, page), page);
+        if (tlb_flush_entry_locked(tlb_entry(env, midx, page), page)) {
+            tlb_n_used_entries_dec(env, midx);
+        }
        tlb_flush_vtlb_page_locked(env, midx, page);
    }
}
@@ -435,8 +612,9 @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
    qemu_spin_lock(&env->tlb_c.lock);
    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
        unsigned int i;
        unsigned int n = tlb_n_entries(env, mmu_idx);

-        for (i = 0; i < CPU_TLB_SIZE; i++) {
+        for (i = 0; i < n; i++) {
            tlb_reset_dirty_range_locked(&env->tlb_table[mmu_idx][i], start1,
                                         length);
        }
@@ -591,13 +769,14 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     * Only evict the old entry to the victim tlb if it's for a
     * different page; otherwise just overwrite the stale data.
     */
-    if (!tlb_hit_page_anyprot(te, vaddr_page)) {
+    if (!tlb_hit_page_anyprot(te, vaddr_page) && !tlb_entry_is_empty(te)) {
        unsigned vidx = env->tlb_d[mmu_idx].vindex++ % CPU_VTLB_SIZE;
        CPUTLBEntry *tv = &env->tlb_v_table[mmu_idx][vidx];

        /* Evict the old entry into the victim tlb.  */
        copy_tlb_helper_locked(tv, te);
        env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
        tlb_n_used_entries_dec(env, mmu_idx);
    }

    /* refill the tlb */
@@ -649,6 +828,7 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
    }

    copy_tlb_helper_locked(te, &tn);
    tlb_n_used_entries_inc(env, mmu_idx);
    qemu_spin_unlock(&env->tlb_c.lock);
}

accel/tcg/tcg-runtime-gvec.c  +257 −0
@@ -512,6 +512,39 @@ void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_nand)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) & *(vec64 *)(b + i));
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_nor)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) | *(vec64 *)(b + i));
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_eqv)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) ^ *(vec64 *)(b + i));
    }
    clear_high(d, oprsz, desc);
}
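
The three helpers above are just the complements of and, or and xor, applied 64 bits at a time. A stand-alone check of the eqv (xnor) identity (editor's sketch; the test patterns are made up):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t a = 0xff00ff00ff00ff00ull;
    uint64_t b = 0x0f0f0f0f0f0f0f0full;

    /* eqv is the complement of xor: ~(a ^ b) == (a & b) | (~a & ~b) */
    assert(~(a ^ b) == ((a & b) | (~a & ~b)));
    return 0;
}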

void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
@@ -995,3 +1028,227 @@ void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc)
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smin8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        int8_t aa = *(int8_t *)(a + i);
        int8_t bb = *(int8_t *)(b + i);
        int8_t dd = aa < bb ? aa : bb;
        *(int8_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smin16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int16_t aa = *(int16_t *)(a + i);
        int16_t bb = *(int16_t *)(b + i);
        int16_t dd = aa < bb ? aa : bb;
        *(int16_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smin32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int32_t aa = *(int32_t *)(a + i);
        int32_t bb = *(int32_t *)(b + i);
        int32_t dd = aa < bb ? aa : bb;
        *(int32_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smin64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t aa = *(int64_t *)(a + i);
        int64_t bb = *(int64_t *)(b + i);
        int64_t dd = aa < bb ? aa : bb;
        *(int64_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smax8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        int8_t aa = *(int8_t *)(a + i);
        int8_t bb = *(int8_t *)(b + i);
        int8_t dd = aa > bb ? aa : bb;
        *(int8_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smax16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int16_t aa = *(int16_t *)(a + i);
        int16_t bb = *(int16_t *)(b + i);
        int16_t dd = aa > bb ? aa : bb;
        *(int16_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smax32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int32_t aa = *(int32_t *)(a + i);
        int32_t bb = *(int32_t *)(b + i);
        int32_t dd = aa > bb ? aa : bb;
        *(int32_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smax64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t aa = *(int64_t *)(a + i);
        int64_t bb = *(int64_t *)(b + i);
        int64_t dd = aa > bb ? aa : bb;
        *(int64_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umin8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        uint8_t aa = *(uint8_t *)(a + i);
        uint8_t bb = *(uint8_t *)(b + i);
        uint8_t dd = aa < bb ? aa : bb;
        *(uint8_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umin16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        uint16_t aa = *(uint16_t *)(a + i);
        uint16_t bb = *(uint16_t *)(b + i);
        uint16_t dd = aa < bb ? aa : bb;
        *(uint16_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umin32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        uint32_t aa = *(uint32_t *)(a + i);
        uint32_t bb = *(uint32_t *)(b + i);
        uint32_t dd = aa < bb ? aa : bb;
        *(uint32_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umin64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t aa = *(uint64_t *)(a + i);
        uint64_t bb = *(uint64_t *)(b + i);
        uint64_t dd = aa < bb ? aa : bb;
        *(uint64_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umax8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        uint8_t aa = *(uint8_t *)(a + i);
        uint8_t bb = *(uint8_t *)(b + i);
        uint8_t dd = aa > bb ? aa : bb;
        *(uint8_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umax16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        uint16_t aa = *(uint16_t *)(a + i);
        uint16_t bb = *(uint16_t *)(b + i);
        uint16_t dd = aa > bb ? aa : bb;
        *(uint16_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umax32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        uint32_t aa = *(uint32_t *)(a + i);
        uint32_t bb = *(uint32_t *)(b + i);
        uint32_t dd = aa > bb ? aa : bb;
        *(uint32_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umax64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t aa = *(uint64_t *)(a + i);
        uint64_t bb = *(uint64_t *)(b + i);
        uint64_t dd = aa > bb ? aa : bb;
        *(uint64_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}
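
All sixteen min/max helpers follow the same shape: walk the operation size in element-sized steps and select per element. A stand-alone sketch of the gvec_smin8 loop (editor's illustration; the 16-byte buffers and their contents are made up, and no QEMU headers are needed):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Two 16-byte operands, standing in for one 128-bit vector. */
    int8_t a[16] = { -5, 3,  100, -128,   0,   7,  7, -1,
                      2, 2,  -20,   20,  90, -90,  1, -1 };
    int8_t b[16] = {  4, 3, -100,  127,   1,  -7,  8, -2,
                      3, 1,   20,  -20, -90,  90, -1,  1 };
    int8_t d[16];

    for (size_t i = 0; i < sizeof(d); i++) {
        d[i] = a[i] < b[i] ? a[i] : b[i];   /* signed element-wise min */
    }
    for (size_t i = 0; i < sizeof(d); i++) {
        printf("%d ", d[i]);
    }
    printf("\n");
    return 0;
}
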
accel/tcg/tcg-runtime.h  +23 −0
@@ -200,6 +200,26 @@ DEF_HELPER_FLAGS_4(gvec_ussub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_ussub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_ussub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

DEF_HELPER_FLAGS_4(gvec_smin8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_smin16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_smin32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_smin64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

DEF_HELPER_FLAGS_4(gvec_smax8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_smax16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_smax32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_smax64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

DEF_HELPER_FLAGS_4(gvec_umin8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_umin16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_umin32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_umin64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

DEF_HELPER_FLAGS_4(gvec_umax8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_umax16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_umax32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_umax64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
@@ -211,6 +231,9 @@ DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_nand, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_nor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_eqv, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
include/exec/cpu-defs.h  +39 −32
@@ -67,37 +67,23 @@ typedef uint64_t target_ulong;
#define CPU_TLB_ENTRY_BITS 5
#endif

-/* TCG_TARGET_TLB_DISPLACEMENT_BITS is used in CPU_TLB_BITS to ensure that
- * the TLB is not unnecessarily small, but still small enough for the
- * TLB lookup instruction sequence used by the TCG target.
- *
- * TCG will have to generate an operand as large as the distance between
- * env and the tlb_table[NB_MMU_MODES - 1][0].addend.  For simplicity,
- * the TCG targets just round everything up to the next power of two, and
- * count bits.  This works because: 1) the size of each TLB is a largish
- * power of two, 2) the limit of the displacement is really close to a
- * power of two, 3) the offset of tlb_table[0][0] inside env is smaller
- * than the size of a TLB.
- *
- * For example, the maximum displacement is 0xFFF0 on PPC and MIPS, but TCG
- * just says "the displacement is 16 bits".  TCG_TARGET_TLB_DISPLACEMENT_BITS
- * then ensures that tlb_table is at least 0x8000 bytes large ("not
- * unnecessarily small": 2^15).  The operand then will come up smaller than
- * 0xFFF0 without any particular care, because the TLB for a single MMU mode
- * is larger than 0x10000-0xFFF0=16 bytes.  In the end, the maximum value of
- * the operand could be something like 0xC000 (the offset of the last TLB
- * table) plus 0x18 (the offset of the addend field in each TLB entry) plus
- * the offset of tlb_table inside env (which is non-trivial but not huge).
- */
-#define CPU_TLB_BITS                                             \
-    MIN(8,                                                       \
-        TCG_TARGET_TLB_DISPLACEMENT_BITS - CPU_TLB_ENTRY_BITS -  \
-        (NB_MMU_MODES <= 1 ? 0 :                                 \
-         NB_MMU_MODES <= 2 ? 1 :                                 \
-         NB_MMU_MODES <= 4 ? 2 :                                 \
-         NB_MMU_MODES <= 8 ? 3 : 4))
+#define CPU_TLB_DYN_MIN_BITS 6
+#define CPU_TLB_DYN_DEFAULT_BITS 8

-#define CPU_TLB_SIZE (1 << CPU_TLB_BITS)
+# if HOST_LONG_BITS == 32
+/* Make sure we do not require a double-word shift for the TLB load */
+#  define CPU_TLB_DYN_MAX_BITS (32 - TARGET_PAGE_BITS)
+# else /* HOST_LONG_BITS == 64 */
+/*
+ * Assuming TARGET_PAGE_BITS==12, with 2**22 entries we can cover 2**(22+12) ==
+ * 2**34 == 16G of address space. This is roughly what one would expect a
+ * TLB to cover in a modern (as of 2018) x86_64 CPU. For instance, Intel
+ * Skylake's Level-2 STLB has 16 1G entries.
+ * Also, make sure we do not size the TLB past the guest's address space.
+ */
+#  define CPU_TLB_DYN_MAX_BITS                                  \
+    MIN(22, TARGET_VIRT_ADDR_SPACE_BITS - TARGET_PAGE_BITS)
+# endif

typedef struct CPUTLBEntry {
    /* bit TARGET_LONG_BITS to TARGET_PAGE_BITS : virtual address
@@ -141,6 +127,18 @@ typedef struct CPUIOTLBEntry {
    MemTxAttrs attrs;
} CPUIOTLBEntry;

/**
 * struct CPUTLBWindow
 * @begin_ns: host time (in ns) at the beginning of the time window
 * @max_entries: maximum number of entries observed in the window
 *
 * See also: tlb_mmu_resize_locked()
 */
typedef struct CPUTLBWindow {
    int64_t begin_ns;
    size_t max_entries;
} CPUTLBWindow;

typedef struct CPUTLBDesc {
    /*
     * Describe a region covering all of the large pages allocated
@@ -152,6 +150,8 @@ typedef struct CPUTLBDesc {
    target_ulong large_page_mask;
    /* The next index to use in the tlb victim table.  */
    size_t vindex;
    CPUTLBWindow window;
    size_t n_used_entries;
} CPUTLBDesc;

/*
@@ -176,6 +176,13 @@ typedef struct CPUTLBCommon {
    size_t elide_flush_count;
} CPUTLBCommon;

# define CPU_TLB                                                        \
    /* tlb_mask[i] contains (n_entries - 1) << CPU_TLB_ENTRY_BITS */    \
    uintptr_t tlb_mask[NB_MMU_MODES];                                   \
    CPUTLBEntry *tlb_table[NB_MMU_MODES];
# define CPU_IOTLB                              \
    CPUIOTLBEntry *iotlb[NB_MMU_MODES];

/*
 * The meaning of each of the MMU modes is defined in the target code.
 * Note that NB_MMU_MODES is not yet defined; we can only reference it
@@ -184,9 +191,9 @@ typedef struct CPUTLBCommon {
#define CPU_COMMON_TLB \
    CPUTLBCommon tlb_c;                                                 \
    CPUTLBDesc tlb_d[NB_MMU_MODES];                                     \
-    CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                  \
+    CPU_TLB                                                             \
    CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
-    CPUIOTLBEntry iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                    \
+    CPU_IOTLB                                                           \
    CPUIOTLBEntry iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];

#else
include/exec/cpu_ldst.h  +8 −1
@@ -139,7 +139,14 @@ static inline target_ulong tlb_addr_write(const CPUTLBEntry *entry)
static inline uintptr_t tlb_index(CPUArchState *env, uintptr_t mmu_idx,
                                  target_ulong addr)
{
-    return (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    uintptr_t size_mask = env->tlb_mask[mmu_idx] >> CPU_TLB_ENTRY_BITS;
+
+    return (addr >> TARGET_PAGE_BITS) & size_mask;
}

static inline size_t tlb_n_entries(CPUArchState *env, uintptr_t mmu_idx)
{
    return (env->tlb_mask[mmu_idx] >> CPU_TLB_ENTRY_BITS) + 1;
}
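
A worked example of the mask-based lookup (editor's illustration; the values assume TARGET_PAGE_BITS == 12, CPU_TLB_ENTRY_BITS == 5 and a 256-entry TLB):

/*
 * Illustration:
 *   tlb_mask[mmu_idx] = (256 - 1) << 5 = 0x1fe0
 *   size_mask         = 0x1fe0 >> 5    = 0xff
 *   tlb_index(env, mmu_idx, 0x7f001234)
 *                     = (0x7f001234 >> 12) & 0xff = 0x7f001 & 0xff = 0x01
 *   tlb_n_entries(env, mmu_idx) = 0xff + 1 = 256
 */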

/* Find the TLB entry corresponding to the mmu_idx + address pair.  */