Commit 9fd46c83 authored by Richard Henderson's avatar Richard Henderson Committed by Peter Maydell
Browse files

target/arm: Rewrite helper_sve_st[1234]*_r



This fixes the endianness problem for softmmu, and moves the
main loop out of a macro and into an inlined function.

Reviewed-by: default avatarPeter Maydell <peter.maydell@linaro.org>
Tested-by: default avatarLaurent Desnogues <laurent.desnogues@gmail.com>
Signed-off-by: default avatarRichard Henderson <richard.henderson@linaro.org>
Message-id: 20181005175350.30752-10-richard.henderson@linaro.org
Signed-off-by: default avatarPeter Maydell <peter.maydell@linaro.org>
parent f27d4dc2
Loading
Loading
Loading
Loading
+170 −177
Original line number Diff line number Diff line
@@ -3991,6 +3991,7 @@ typedef intptr_t sve_ld1_host_fn(void *vd, void *vg, void *host,
 */
typedef void sve_ld1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
                            target_ulong vaddr, int mmu_idx, uintptr_t ra);
typedef sve_ld1_tlb_fn sve_st1_tlb_fn;

/*
 * Generate the above primitives.
@@ -4668,214 +4669,206 @@ DO_LDFF1_LDNF1_2(dd, 3, 3)
/*
 * Store contiguous data, protected by a governing predicate.
 */
#define DO_ST1(NAME, FN, TYPEE, TYPEM, H)                  \
void HELPER(NAME)(CPUARMState *env, void *vg,              \
                  target_ulong addr, uint32_t desc)        \
{                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                  \
    intptr_t ra = GETPC();                                 \
    unsigned rd = simd_data(desc);                         \
    void *vd = &env->vfp.zregs[rd];                        \
    for (i = 0; i < oprsz; ) {                             \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            if (pg & 1) {                                  \
                TYPEM m = *(TYPEE *)(vd + H(i));           \
                FN(env, addr, m, ra);                      \
            }                                              \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
            addr += sizeof(TYPEM);                         \
        } while (i & 15);                                  \
    }                                                      \
}

#define DO_ST1_D(NAME, FN, TYPEM)                          \
void HELPER(NAME)(CPUARMState *env, void *vg,              \
                  target_ulong addr, uint32_t desc)        \
#ifdef CONFIG_SOFTMMU
#define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
                             target_ulong addr, int mmu_idx, uintptr_t ra)  \
{                                                                           \
    intptr_t i, oprsz = simd_oprsz(desc) / 8;              \
    intptr_t ra = GETPC();                                 \
    unsigned rd = simd_data(desc);                         \
    uint64_t *d = &env->vfp.zregs[rd].d[0];                \
    uint8_t *pg = vg;                                      \
    for (i = 0; i < oprsz; i += 1) {                       \
        if (pg[H1(i)] & 1) {                               \
            FN(env, addr, d[i], ra);                       \
        }                                                  \
        addr += sizeof(TYPEM);                             \
    }                                                      \
    TCGMemOpIdx oi = make_memop_idx(ctz32(sizeof(TYPEM)) | MOEND, mmu_idx); \
    TLB(env, addr, *(TYPEM *)(vd + H(reg_off)), oi, ra);                    \
}

#define DO_ST2(NAME, FN, TYPEE, TYPEM, H)                  \
void HELPER(NAME)(CPUARMState *env, void *vg,              \
                  target_ulong addr, uint32_t desc)        \
#else
#define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
                             target_ulong addr, int mmu_idx, uintptr_t ra)  \
{                                                                           \
    intptr_t i, oprsz = simd_oprsz(desc);                  \
    intptr_t ra = GETPC();                                 \
    unsigned rd = simd_data(desc);                         \
    void *d1 = &env->vfp.zregs[rd];                        \
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];             \
    for (i = 0; i < oprsz; ) {                             \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            if (pg & 1) {                                  \
                TYPEM m1 = *(TYPEE *)(d1 + H(i));          \
                TYPEM m2 = *(TYPEE *)(d2 + H(i));          \
                FN(env, addr, m1, ra);                     \
                FN(env, addr + sizeof(TYPEM), m2, ra);     \
            }                                              \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
            addr += 2 * sizeof(TYPEM);                     \
        } while (i & 15);                                  \
    }                                                      \
    HOST(g2h(addr), *(TYPEM *)(vd + H(reg_off)));                           \
}
#endif

#define DO_ST3(NAME, FN, TYPEE, TYPEM, H)                  \
void HELPER(NAME)(CPUARMState *env, void *vg,              \
                  target_ulong addr, uint32_t desc)        \
{                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                  \
    intptr_t ra = GETPC();                                 \
    unsigned rd = simd_data(desc);                         \
    void *d1 = &env->vfp.zregs[rd];                        \
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];             \
    void *d3 = &env->vfp.zregs[(rd + 2) & 31];             \
    for (i = 0; i < oprsz; ) {                             \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            if (pg & 1) {                                  \
                TYPEM m1 = *(TYPEE *)(d1 + H(i));          \
                TYPEM m2 = *(TYPEE *)(d2 + H(i));          \
                TYPEM m3 = *(TYPEE *)(d3 + H(i));          \
                FN(env, addr, m1, ra);                     \
                FN(env, addr + sizeof(TYPEM), m2, ra);     \
                FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
            }                                              \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
            addr += 3 * sizeof(TYPEM);                     \
        } while (i & 15);                                  \
    }                                                      \
}
DO_ST_TLB(st1bb,   H1,  uint8_t, stb_p, 0, helper_ret_stb_mmu)
DO_ST_TLB(st1bh, H1_2, uint16_t, stb_p, 0, helper_ret_stb_mmu)
DO_ST_TLB(st1bs, H1_4, uint32_t, stb_p, 0, helper_ret_stb_mmu)
DO_ST_TLB(st1bd,     , uint64_t, stb_p, 0, helper_ret_stb_mmu)

#define DO_ST4(NAME, FN, TYPEE, TYPEM, H)                  \
void HELPER(NAME)(CPUARMState *env, void *vg,              \
                  target_ulong addr, uint32_t desc)        \
{                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                  \
    intptr_t ra = GETPC();                                 \
    unsigned rd = simd_data(desc);                         \
    void *d1 = &env->vfp.zregs[rd];                        \
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];             \
    void *d3 = &env->vfp.zregs[(rd + 2) & 31];             \
    void *d4 = &env->vfp.zregs[(rd + 3) & 31];             \
    for (i = 0; i < oprsz; ) {                             \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            if (pg & 1) {                                  \
                TYPEM m1 = *(TYPEE *)(d1 + H(i));          \
                TYPEM m2 = *(TYPEE *)(d2 + H(i));          \
                TYPEM m3 = *(TYPEE *)(d3 + H(i));          \
                TYPEM m4 = *(TYPEE *)(d4 + H(i));          \
                FN(env, addr, m1, ra);                     \
                FN(env, addr + sizeof(TYPEM), m2, ra);     \
                FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
                FN(env, addr + 3 * sizeof(TYPEM), m4, ra); \
            }                                              \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
            addr += 4 * sizeof(TYPEM);                     \
        } while (i & 15);                                  \
    }                                                      \
}
DO_ST_TLB(st1hh_le, H1_2, uint16_t, stw_le_p, MO_LE, helper_le_stw_mmu)
DO_ST_TLB(st1hs_le, H1_4, uint32_t, stw_le_p, MO_LE, helper_le_stw_mmu)
DO_ST_TLB(st1hd_le,     , uint64_t, stw_le_p, MO_LE, helper_le_stw_mmu)

DO_ST1(sve_st1bh_r, cpu_stb_data_ra, uint16_t, uint8_t, H1_2)
DO_ST1(sve_st1bs_r, cpu_stb_data_ra, uint32_t, uint8_t, H1_4)
DO_ST1_D(sve_st1bd_r, cpu_stb_data_ra, uint8_t)
DO_ST_TLB(st1ss_le, H1_4, uint32_t, stl_le_p, MO_LE, helper_le_stl_mmu)
DO_ST_TLB(st1sd_le,     , uint64_t, stl_le_p, MO_LE, helper_le_stl_mmu)

DO_ST1(sve_st1hs_r, cpu_stw_data_ra, uint32_t, uint16_t, H1_4)
DO_ST1_D(sve_st1hd_r, cpu_stw_data_ra, uint16_t)
DO_ST_TLB(st1dd_le,     , uint64_t, stq_le_p, MO_LE, helper_le_stq_mmu)

DO_ST1_D(sve_st1sd_r, cpu_stl_data_ra, uint32_t)
DO_ST_TLB(st1hh_be, H1_2, uint16_t, stw_be_p, MO_BE, helper_be_stw_mmu)
DO_ST_TLB(st1hs_be, H1_4, uint32_t, stw_be_p, MO_BE, helper_be_stw_mmu)
DO_ST_TLB(st1hd_be,     , uint64_t, stw_be_p, MO_BE, helper_be_stw_mmu)

DO_ST1(sve_st1bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
DO_ST2(sve_st2bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
DO_ST3(sve_st3bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
DO_ST4(sve_st4bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
DO_ST_TLB(st1ss_be, H1_4, uint32_t, stl_be_p, MO_BE, helper_be_stl_mmu)
DO_ST_TLB(st1sd_be,     , uint64_t, stl_be_p, MO_BE, helper_be_stl_mmu)

DO_ST1(sve_st1hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
DO_ST2(sve_st2hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
DO_ST3(sve_st3hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
DO_ST4(sve_st4hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
DO_ST_TLB(st1dd_be,     , uint64_t, stq_be_p, MO_BE, helper_be_stq_mmu)

DO_ST1(sve_st1ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
DO_ST2(sve_st2ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
DO_ST3(sve_st3ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
DO_ST4(sve_st4ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
#undef DO_ST_TLB

DO_ST1_D(sve_st1dd_r, cpu_stq_data_ra, uint64_t)
/*
 * Common helpers for all contiguous 1,2,3,4-register predicated stores.
 */
static void sve_st1_r(CPUARMState *env, void *vg, target_ulong addr,
                      uint32_t desc, const uintptr_t ra,
                      const int esize, const int msize,
                      sve_st1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    intptr_t i, oprsz = simd_oprsz(desc);
    unsigned rd = simd_data(desc);
    void *vd = &env->vfp.zregs[rd];

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                tlb_fn(env, vd, i, addr, mmu_idx, ra);
            }
            i += esize, pg >>= esize;
            addr += msize;
        } while (i & 15);
    }
    set_helper_retaddr(0);
}

void HELPER(sve_st2dd_r)(CPUARMState *env, void *vg,
                         target_ulong addr, uint32_t desc)
static void sve_st2_r(CPUARMState *env, void *vg, target_ulong addr,
                      uint32_t desc, const uintptr_t ra,
                      const int esize, const int msize,
                      sve_st1_tlb_fn *tlb_fn)
{
    intptr_t i, oprsz = simd_oprsz(desc) / 8;
    intptr_t ra = GETPC();
    const int mmu_idx = cpu_mmu_index(env, false);
    intptr_t i, oprsz = simd_oprsz(desc);
    unsigned rd = simd_data(desc);
    uint64_t *d1 = &env->vfp.zregs[rd].d[0];
    uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
    uint8_t *pg = vg;
    void *d1 = &env->vfp.zregs[rd];
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];

    for (i = 0; i < oprsz; i += 1) {
        if (pg[H1(i)] & 1) {
            cpu_stq_data_ra(env, addr, d1[i], ra);
            cpu_stq_data_ra(env, addr + 8, d2[i], ra);
    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                tlb_fn(env, d1, i, addr, mmu_idx, ra);
                tlb_fn(env, d2, i, addr + msize, mmu_idx, ra);
            }
        addr += 2 * 8;
            i += esize, pg >>= esize;
            addr += 2 * msize;
        } while (i & 15);
    }
    set_helper_retaddr(0);
}

void HELPER(sve_st3dd_r)(CPUARMState *env, void *vg,
                         target_ulong addr, uint32_t desc)
static void sve_st3_r(CPUARMState *env, void *vg, target_ulong addr,
                      uint32_t desc, const uintptr_t ra,
                      const int esize, const int msize,
                      sve_st1_tlb_fn *tlb_fn)
{
    intptr_t i, oprsz = simd_oprsz(desc) / 8;
    intptr_t ra = GETPC();
    const int mmu_idx = cpu_mmu_index(env, false);
    intptr_t i, oprsz = simd_oprsz(desc);
    unsigned rd = simd_data(desc);
    uint64_t *d1 = &env->vfp.zregs[rd].d[0];
    uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
    uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
    uint8_t *pg = vg;
    void *d1 = &env->vfp.zregs[rd];
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];
    void *d3 = &env->vfp.zregs[(rd + 2) & 31];

    for (i = 0; i < oprsz; i += 1) {
        if (pg[H1(i)] & 1) {
            cpu_stq_data_ra(env, addr, d1[i], ra);
            cpu_stq_data_ra(env, addr + 8, d2[i], ra);
            cpu_stq_data_ra(env, addr + 16, d3[i], ra);
    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                tlb_fn(env, d1, i, addr, mmu_idx, ra);
                tlb_fn(env, d2, i, addr + msize, mmu_idx, ra);
                tlb_fn(env, d3, i, addr + 2 * msize, mmu_idx, ra);
            }
        addr += 3 * 8;
            i += esize, pg >>= esize;
            addr += 3 * msize;
        } while (i & 15);
    }
    set_helper_retaddr(0);
}

void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg,
                         target_ulong addr, uint32_t desc)
static void sve_st4_r(CPUARMState *env, void *vg, target_ulong addr,
                      uint32_t desc, const uintptr_t ra,
                      const int esize, const int msize,
                      sve_st1_tlb_fn *tlb_fn)
{
    intptr_t i, oprsz = simd_oprsz(desc) / 8;
    intptr_t ra = GETPC();
    const int mmu_idx = cpu_mmu_index(env, false);
    intptr_t i, oprsz = simd_oprsz(desc);
    unsigned rd = simd_data(desc);
    uint64_t *d1 = &env->vfp.zregs[rd].d[0];
    uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
    uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
    uint64_t *d4 = &env->vfp.zregs[(rd + 3) & 31].d[0];
    uint8_t *pg = vg;
    void *d1 = &env->vfp.zregs[rd];
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];
    void *d3 = &env->vfp.zregs[(rd + 2) & 31];
    void *d4 = &env->vfp.zregs[(rd + 3) & 31];

    for (i = 0; i < oprsz; i += 1) {
        if (pg[H1(i)] & 1) {
            cpu_stq_data_ra(env, addr, d1[i], ra);
            cpu_stq_data_ra(env, addr + 8, d2[i], ra);
            cpu_stq_data_ra(env, addr + 16, d3[i], ra);
            cpu_stq_data_ra(env, addr + 24, d4[i], ra);
    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                tlb_fn(env, d1, i, addr, mmu_idx, ra);
                tlb_fn(env, d2, i, addr + msize, mmu_idx, ra);
                tlb_fn(env, d3, i, addr + 2 * msize, mmu_idx, ra);
                tlb_fn(env, d4, i, addr + 3 * msize, mmu_idx, ra);
            }
        addr += 4 * 8;
            i += esize, pg >>= esize;
            addr += 4 * msize;
        } while (i & 15);
    }
    set_helper_retaddr(0);
}

#define DO_STN_1(N, NAME, ESIZE) \
void __attribute__((flatten)) HELPER(sve_st##N##NAME##_r)           \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc)  \
{                                                                   \
    sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, 1,           \
                  sve_st1##NAME##_tlb);                             \
}

#define DO_STN_2(N, NAME, ESIZE, MSIZE) \
void __attribute__((flatten)) HELPER(sve_st##N##NAME##_r)             \
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc)    \
{                                                                     \
    sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE,         \
                  arm_cpu_data_is_big_endian(env)                     \
                  ? sve_st1##NAME##_be_tlb : sve_st1##NAME##_le_tlb); \
}

DO_STN_1(1, bb, 1)
DO_STN_1(1, bh, 2)
DO_STN_1(1, bs, 4)
DO_STN_1(1, bd, 8)
DO_STN_1(2, bb, 1)
DO_STN_1(3, bb, 1)
DO_STN_1(4, bb, 1)

DO_STN_2(1, hh, 2, 2)
DO_STN_2(1, hs, 4, 2)
DO_STN_2(1, hd, 8, 2)
DO_STN_2(2, hh, 2, 2)
DO_STN_2(3, hh, 2, 2)
DO_STN_2(4, hh, 2, 2)

DO_STN_2(1, ss, 4, 4)
DO_STN_2(1, sd, 8, 4)
DO_STN_2(2, ss, 4, 4)
DO_STN_2(3, ss, 4, 4)
DO_STN_2(4, ss, 4, 4)

DO_STN_2(1, dd, 8, 8)
DO_STN_2(2, dd, 8, 8)
DO_STN_2(3, dd, 8, 8)
DO_STN_2(4, dd, 8, 8)

#undef DO_STN_1
#undef DO_STN_2

/* Loads with a vector index.  */

#define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN)                            \