Commit 44f1441d authored by Richard Henderson

tcg/i386: Split subroutines out of tcg_expand_vec_op



This routine was becoming too large.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
parent dd0a0fcd
+224 −219
@@ -3079,22 +3079,16 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
    }
}

static void expand_vec_shi(TCGType type, unsigned vece, bool shr,
                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    tcg_debug_assert(vece == MO_8);

    t1 = tcg_temp_new_vec(type);
    t2 = tcg_temp_new_vec(type);

    /* Unpack to W, shift, and repack.  Tricky bits:
       (1) Use punpck*bw x,x to produce DDCCBBAA,
           i.e. duplicate in other half of the 16-bit lane.
@@ -3104,155 +3098,135 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
       (3) Step 2 leaves high half zero such that PACKUSWB
           (pack with unsigned saturation) does not modify
           the quantity.  */
    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));

    if (shr) {
        tcg_gen_shri_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_shri_vec(MO_16, t2, t2, imm + 8);
    } else {
        tcg_gen_shli_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_shli_vec(MO_16, t2, t2, imm + 8);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t2, t2, 8);
    }

    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t2);
}

static void expand_vec_sari(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    switch (vece) {
    case MO_8:
        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case MO_64:
        if (imm <= 32) {
            /* We can emulate a small sign extend by performing an arithmetic
             * 32-bit shift and overwriting the high half of a 64-bit logical
             * shift (note that the ISA says shift of 32 is valid).
             */
            t1 = tcg_temp_new_vec(type);
            tcg_gen_sari_vec(MO_32, t1, v1, imm);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
                      tcgv_vec_arg(t1), 0xaa);
            tcg_temp_free_vec(t1);
        } else {
            /* Otherwise we will need to use a compare vs 0 to produce
             * the sign-extend, shift and merge.
             */
            t1 = tcg_const_zeros_vec(type);
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
            tcg_gen_or_vec(MO_64, v0, v0, t1);
            tcg_temp_free_vec(t1);
        }
        break;

    default:
        g_assert_not_reached();
    }
}

static void expand_vec_mul(TCGType type, unsigned vece,
                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
    TCGv_vec t1, t2, t3, t4;

    tcg_debug_assert(vece == MO_8);

    /*
     * Unpack v1 bytes to words, 0 | x.
     * Unpack v2 bytes to words, y | 0.
     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
     * Shift logical right by 8 bits to clear the high 8 bytes before
     * using an unsigned saturated pack.
     *
     * The difference between the V64, V128 and V256 cases is merely how
     * we distribute the expansion between temporaries.
     */
    switch (type) {
    case TCG_TYPE_V64:
        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
        tcg_gen_dup16i_vec(t2, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_temp_new_vec(type);
        t4 = tcg_temp_new_vec(type);
        tcg_gen_dup16i_vec(t4, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_mul_vec(MO_16, t3, t3, t4);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t3, t3, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        tcg_temp_free_vec(t3);

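For reference, the byte-shift, byte-multiply and 64-bit arithmetic-shift expansions above are easier to check one lane at a time. The scalar C below is illustrative only; it is not part of the patch, the names shi8, mul8 and sari64 are invented, and it assumes two's-complement wrapping and arithmetic right shifts of signed values, as mainstream compilers provide.

#include <stdbool.h>
#include <stdint.h>

/* One 8-bit lane of the expand_vec_shi trick: punpck*bw x,x widens each
   byte to the 16-bit value (x << 8) | x, a 16-bit shift does the work,
   and packuswb keeps the zero-extended low byte. */
static uint8_t shi8(uint8_t x, unsigned imm, bool shr)    /* imm in [0, 7] */
{
    uint16_t w = (uint16_t)((x << 8) | x);
    if (shr) {
        w >>= imm + 8;                  /* right shift: high half becomes zero */
    } else {
        w = (uint16_t)(w << (imm + 8)); /* left shift: shift up ...            */
        w >>= 8;                        /* ... and back down to clear the top  */
    }
    return (uint8_t)w;
}

/* One 8-bit lane of the expand_vec_mul trick: (0 | x) * (y | 0) leaves the
   wrapped product x * y in the high byte of the 16-bit lane. */
static uint8_t mul8(uint8_t x, uint8_t y)
{
    uint16_t p = (uint16_t)(x * (y << 8));  /* 16-bit lane multiply */
    return (uint8_t)(p >> 8);               /* clear the padding; packuswb keeps the low byte */
}

/* One 64-bit lane of the expand_vec_sari imm <= 32 case: a 64-bit logical
   shift gives the correct low half, a 32-bit arithmetic shift gives the
   correct high half.  Restricted to imm < 32 here so the C shifts stay
   well defined. */
static int64_t sari64(int64_t v, unsigned imm)
{
    uint32_t lo = (uint32_t)((uint64_t)v >> imm);
    int32_t hi = (int32_t)(v >> 32) >> imm;
    return (int64_t)(((uint64_t)(uint32_t)hi << 32) | lo);
}

In the vector code the 0xaa immediate on the blend selects the odd 32-bit elements, i.e. the recomputed high half of each 64-bit lane, while packuswb/packsswb provide the final narrowing step.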
@@ -3262,9 +3236,10 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
    default:
        g_assert_not_reached();
    }
}

static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
{
    enum {
        NEED_SWAP = 1,
@@ -3284,13 +3259,9 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
        [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
        [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
    };
    TCGv_vec t1, t2;
    uint8_t fixup;

    fixup = fixups[cond & 15];
    tcg_debug_assert(fixup != 0xff);

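SSE/AVX2 integer compares provide only equality and signed greater-than; the fixups table above encodes how the remaining conditions are rewritten. Unsigned conditions take NEED_BIAS, subtracting the lane's sign bit from both operands so that a signed compare gives the unsigned answer. A scalar sketch for one byte lane (illustrative only, not part of the patch; ltu8 is an invented name, and the casts assume two's-complement wrapping):

#include <stdbool.h>
#include <stdint.h>

/* x <u y  ==  (x - 0x80) <s (y - 0x80)  ==  (y - 0x80) >s (x - 0x80),
   so TCG_COND_LTU becomes NEED_BIAS | NEED_SWAP and ends up as the
   signed greater-than that pcmpgtb implements. */
static bool ltu8(uint8_t x, uint8_t y)
{
    int8_t xs = (int8_t)(x - 0x80);   /* wrapping subtract of the sign bit */
    int8_t ys = (int8_t)(y - 0x80);
    return ys > xs;                   /* signed GT, operands swapped */
}

TCG_COND_LEU additionally sets NEED_INV, so the expansion computes the biased greater-than and then inverts the result with tcg_gen_not_vec.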

@@ -3298,8 +3269,7 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
    if (fixup & NEED_INV) {
        cond = tcg_invert_cond(cond);
    }
    if (fixup & NEED_SWAP) {
        t1 = v1, v1 = v2, v2 = t1;
        cond = tcg_swap_cond(cond);
    }
@@ -3308,24 +3278,59 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
    t1 = t2 = NULL;
    if (fixup & NEED_BIAS) {
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
        tcg_gen_sub_vec(vece, t1, v1, t2);
        tcg_gen_sub_vec(vece, t2, v2, t2);
        v1 = t1;
        v2 = t2;
        cond = tcg_signed_cond(cond);
    }

    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
    /* Expand directly; do not recurse.  */
    vec_gen_4(INDEX_op_cmp_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);

    if (t1) {
        tcg_temp_free_vec(t1);
        if (t2) {
            tcg_temp_free_vec(t2);
        }
    }
    if (fixup & NEED_INV) {
        tcg_gen_not_vec(vece, v0, v0);
    }
}

void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a2;
    TCGv_vec v0, v1, v2;

    va_start(va, a0);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
    a2 = va_arg(va, TCGArg);

    switch (opc) {
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2);
        break;

    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
        break;

    default: