Loading tcg/i386/tcg-target.inc.c +224 −219 Original line number Diff line number Diff line Loading @@ -3079,22 +3079,16 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) } } void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, TCGArg a0, ...) static void expand_vec_shi(TCGType type, unsigned vece, bool shr, TCGv_vec v0, TCGv_vec v1, TCGArg imm) { va_list va; TCGArg a1, a2; TCGv_vec v0, t1, t2, t3, t4; TCGv_vec t1, t2; va_start(va, a0); v0 = temp_tcgv_vec(arg_temp(a0)); switch (opc) { case INDEX_op_shli_vec: case INDEX_op_shri_vec: tcg_debug_assert(vece == MO_8); a1 = va_arg(va, TCGArg); a2 = va_arg(va, TCGArg); t1 = tcg_temp_new_vec(type); t2 = tcg_temp_new_vec(type); /* Unpack to W, shift, and repack. Tricky bits: (1) Use punpck*bw x,x to produce DDCCBBAA, i.e. duplicate in other half of the 16-bit lane. Loading @@ -3104,155 +3098,135 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, (3) Step 2 leaves high half zero such that PACKUSWB (pack with unsigned saturation) does not modify the quantity. */ t1 = tcg_temp_new_vec(type); t2 = tcg_temp_new_vec(type); vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, tcgv_vec_arg(t1), a1, a1); tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, tcgv_vec_arg(t2), a1, a1); if (opc == INDEX_op_shri_vec) { vec_gen_3(INDEX_op_shri_vec, type, MO_16, tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); vec_gen_3(INDEX_op_shri_vec, type, MO_16, tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); } else { vec_gen_3(INDEX_op_shli_vec, type, MO_16, tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); vec_gen_3(INDEX_op_shli_vec, type, MO_16, tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); vec_gen_3(INDEX_op_shri_vec, type, MO_16, tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8); vec_gen_3(INDEX_op_shri_vec, type, MO_16, tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8); tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); if (shr) { tcg_gen_shri_vec(MO_16, t1, t1, imm + 8); tcg_gen_shri_vec(MO_16, t2, t2, imm + 8); } else { tcg_gen_shli_vec(MO_16, t1, t1, imm + 8); tcg_gen_shli_vec(MO_16, t2, t2, imm + 8); tcg_gen_shri_vec(MO_16, t1, t1, 8); tcg_gen_shri_vec(MO_16, t2, t2, 8); } vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2)); tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); tcg_temp_free_vec(t1); tcg_temp_free_vec(t2); break; } case INDEX_op_sari_vec: a1 = va_arg(va, TCGArg); a2 = va_arg(va, TCGArg); if (vece == MO_8) { /* Unpack to W, shift, and repack, as above. */ static void expand_vec_sari(TCGType type, unsigned vece, TCGv_vec v0, TCGv_vec v1, TCGArg imm) { TCGv_vec t1, t2; switch (vece) { case MO_8: /* Unpack to W, shift, and repack, as in expand_vec_shi. */ t1 = tcg_temp_new_vec(type); t2 = tcg_temp_new_vec(type); vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, tcgv_vec_arg(t1), a1, a1); tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, tcgv_vec_arg(t2), a1, a1); vec_gen_3(INDEX_op_sari_vec, type, MO_16, tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); vec_gen_3(INDEX_op_sari_vec, type, MO_16, tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2)); tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); tcg_temp_free_vec(t1); tcg_temp_free_vec(t2); break; } tcg_debug_assert(vece == MO_64); /* MO_64: If the shift is <= 32, we can emulate the sign extend by performing an arithmetic 32-bit shift and overwriting the high half of the result (note that the ISA says shift of 32 is valid). */ if (a2 <= 32) { case MO_64: if (imm <= 32) { /* We can emulate a small sign extend by performing an arithmetic * 32-bit shift and overwriting the high half of a 64-bit logical * shift (note that the ISA says shift of 32 is valid). */ t1 = tcg_temp_new_vec(type); vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2); vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2); tcg_gen_sari_vec(MO_32, t1, v1, imm); tcg_gen_shri_vec(MO_64, v0, v1, imm); vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, a0, a0, tcgv_vec_arg(t1), 0xaa); tcgv_vec_arg(v0), tcgv_vec_arg(v0), tcgv_vec_arg(t1), 0xaa); tcg_temp_free_vec(t1); break; } /* Otherwise we will need to use a compare vs 0 to produce the sign-extend, shift and merge. */ t1 = tcg_temp_new_vec(type); t2 = tcg_const_zeros_vec(type); vec_gen_4(INDEX_op_cmp_vec, type, MO_64, tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT); tcg_temp_free_vec(t2); vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2); vec_gen_3(INDEX_op_shli_vec, type, MO_64, tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2); vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1)); } else { /* Otherwise we will need to use a compare vs 0 to produce * the sign-extend, shift and merge. */ t1 = tcg_const_zeros_vec(type); tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1); tcg_gen_shri_vec(MO_64, v0, v1, imm); tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); tcg_gen_or_vec(MO_64, v0, v0, t1); tcg_temp_free_vec(t1); } break; case INDEX_op_mul_vec: default: g_assert_not_reached(); } } static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) { TCGv_vec t1, t2, t3, t4; tcg_debug_assert(vece == MO_8); a1 = va_arg(va, TCGArg); a2 = va_arg(va, TCGArg); /* * Unpack v1 bytes to words, 0 | x. * Unpack v2 bytes to words, y | 0. * This leaves the 8-bit result, x * y, with 8 bits of right padding. * Shift logical right by 8 bits to clear the high 8 bytes before * using an unsigned saturated pack. * * The difference between the V64, V128 and V256 cases is merely how * we distribute the expansion between temporaries. */ switch (type) { case TCG_TYPE_V64: t1 = tcg_temp_new_vec(TCG_TYPE_V128); t2 = tcg_temp_new_vec(TCG_TYPE_V128); tcg_gen_dup16i_vec(t2, 0); vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2)); tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2)); vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2); tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2)); tcg_gen_mul_vec(MO_16, t1, t1, t2); tcg_gen_shri_vec(MO_16, t1, t1, 8); vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1)); tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); tcg_temp_free_vec(t1); tcg_temp_free_vec(t2); break; case TCG_TYPE_V128: t1 = tcg_temp_new_vec(TCG_TYPE_V128); t2 = tcg_temp_new_vec(TCG_TYPE_V128); t3 = tcg_temp_new_vec(TCG_TYPE_V128); t4 = tcg_temp_new_vec(TCG_TYPE_V128); tcg_gen_dup16i_vec(t4, 0); vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4)); vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2); vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8, tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4)); vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8, tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2); tcg_gen_mul_vec(MO_16, t1, t1, t2); tcg_gen_mul_vec(MO_16, t3, t3, t4); tcg_gen_shri_vec(MO_16, t1, t1, 8); tcg_gen_shri_vec(MO_16, t3, t3, 8); vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3)); tcg_temp_free_vec(t1); tcg_temp_free_vec(t2); tcg_temp_free_vec(t3); tcg_temp_free_vec(t4); break; case TCG_TYPE_V256: t1 = tcg_temp_new_vec(TCG_TYPE_V256); t2 = tcg_temp_new_vec(TCG_TYPE_V256); t3 = tcg_temp_new_vec(TCG_TYPE_V256); t4 = tcg_temp_new_vec(TCG_TYPE_V256); t1 = tcg_temp_new_vec(type); t2 = tcg_temp_new_vec(type); t3 = tcg_temp_new_vec(type); t4 = tcg_temp_new_vec(type); tcg_gen_dup16i_vec(t4, 0); /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7] t1: extends of B[0-7], D[0-7] t2: extends of X[0-7], Z[0-7] t3: extends of A[0-7], C[0-7] t4: extends of W[0-7], Y[0-7]. */ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8, tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4)); vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8, tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2); vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8, tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4)); vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8, tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2); /* t1: BX DZ; t2: AW CY. */ vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4)); vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2)); vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4)); vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2)); tcg_gen_mul_vec(MO_16, t1, t1, t2); tcg_gen_mul_vec(MO_16, t3, t3, t4); tcg_gen_shri_vec(MO_16, t1, t1, 8); tcg_gen_shri_vec(MO_16, t3, t3, 8); /* a0: AW BX CY DZ. */ vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8, a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3)); vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); tcg_temp_free_vec(t1); tcg_temp_free_vec(t2); tcg_temp_free_vec(t3); Loading @@ -3262,9 +3236,10 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, default: g_assert_not_reached(); } break; } case INDEX_op_cmp_vec: static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0, TCGv_vec v1, TCGv_vec v2, TCGCond cond) { enum { NEED_SWAP = 1, Loading @@ -3284,13 +3259,9 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, [TCG_COND_LEU] = NEED_BIAS | NEED_INV, [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV, }; TCGCond cond; TCGv_vec t1, t2; uint8_t fixup; a1 = va_arg(va, TCGArg); a2 = va_arg(va, TCGArg); cond = va_arg(va, TCGArg); fixup = fixups[cond & 15]; tcg_debug_assert(fixup != 0xff); Loading @@ -3298,8 +3269,7 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, cond = tcg_invert_cond(cond); } if (fixup & NEED_SWAP) { TCGArg t; t = a1, a1 = a2, a2 = t; t1 = v1, v1 = v2, v2 = t1; cond = tcg_swap_cond(cond); } Loading @@ -3308,24 +3278,59 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, t1 = tcg_temp_new_vec(type); t2 = tcg_temp_new_vec(type); tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1)); tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2); tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2); a1 = tcgv_vec_arg(t1); a2 = tcgv_vec_arg(t2); tcg_gen_sub_vec(vece, t1, v1, t2); tcg_gen_sub_vec(vece, t2, v2, t2); v1 = t1; v2 = t2; cond = tcg_signed_cond(cond); } tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT); vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond); /* Expand directly; do not recurse. */ vec_gen_4(INDEX_op_cmp_vec, type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond); if (fixup & NEED_BIAS) { if (t1) { tcg_temp_free_vec(t1); if (t2) { tcg_temp_free_vec(t2); } } if (fixup & NEED_INV) { tcg_gen_not_vec(vece, v0, v0); } } void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, TCGArg a0, ...) { va_list va; TCGArg a2; TCGv_vec v0, v1, v2; va_start(va, a0); v0 = temp_tcgv_vec(arg_temp(a0)); v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); a2 = va_arg(va, TCGArg); switch (opc) { case INDEX_op_shli_vec: case INDEX_op_shri_vec: expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2); break; case INDEX_op_sari_vec: expand_vec_sari(type, vece, v0, v1, a2); break; case INDEX_op_mul_vec: v2 = temp_tcgv_vec(arg_temp(a2)); expand_vec_mul(type, vece, v0, v1, v2); break; case INDEX_op_cmp_vec: v2 = temp_tcgv_vec(arg_temp(a2)); expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg)); break; default: Loading Loading
tcg/i386/tcg-target.inc.c +224 −219 Original line number Diff line number Diff line Loading @@ -3079,22 +3079,16 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) } } void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, TCGArg a0, ...) static void expand_vec_shi(TCGType type, unsigned vece, bool shr, TCGv_vec v0, TCGv_vec v1, TCGArg imm) { va_list va; TCGArg a1, a2; TCGv_vec v0, t1, t2, t3, t4; TCGv_vec t1, t2; va_start(va, a0); v0 = temp_tcgv_vec(arg_temp(a0)); switch (opc) { case INDEX_op_shli_vec: case INDEX_op_shri_vec: tcg_debug_assert(vece == MO_8); a1 = va_arg(va, TCGArg); a2 = va_arg(va, TCGArg); t1 = tcg_temp_new_vec(type); t2 = tcg_temp_new_vec(type); /* Unpack to W, shift, and repack. Tricky bits: (1) Use punpck*bw x,x to produce DDCCBBAA, i.e. duplicate in other half of the 16-bit lane. Loading @@ -3104,155 +3098,135 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, (3) Step 2 leaves high half zero such that PACKUSWB (pack with unsigned saturation) does not modify the quantity. */ t1 = tcg_temp_new_vec(type); t2 = tcg_temp_new_vec(type); vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, tcgv_vec_arg(t1), a1, a1); tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, tcgv_vec_arg(t2), a1, a1); if (opc == INDEX_op_shri_vec) { vec_gen_3(INDEX_op_shri_vec, type, MO_16, tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); vec_gen_3(INDEX_op_shri_vec, type, MO_16, tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); } else { vec_gen_3(INDEX_op_shli_vec, type, MO_16, tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); vec_gen_3(INDEX_op_shli_vec, type, MO_16, tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); vec_gen_3(INDEX_op_shri_vec, type, MO_16, tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8); vec_gen_3(INDEX_op_shri_vec, type, MO_16, tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8); tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); if (shr) { tcg_gen_shri_vec(MO_16, t1, t1, imm + 8); tcg_gen_shri_vec(MO_16, t2, t2, imm + 8); } else { tcg_gen_shli_vec(MO_16, t1, t1, imm + 8); tcg_gen_shli_vec(MO_16, t2, t2, imm + 8); tcg_gen_shri_vec(MO_16, t1, t1, 8); tcg_gen_shri_vec(MO_16, t2, t2, 8); } vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2)); tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); tcg_temp_free_vec(t1); tcg_temp_free_vec(t2); break; } case INDEX_op_sari_vec: a1 = va_arg(va, TCGArg); a2 = va_arg(va, TCGArg); if (vece == MO_8) { /* Unpack to W, shift, and repack, as above. */ static void expand_vec_sari(TCGType type, unsigned vece, TCGv_vec v0, TCGv_vec v1, TCGArg imm) { TCGv_vec t1, t2; switch (vece) { case MO_8: /* Unpack to W, shift, and repack, as in expand_vec_shi. */ t1 = tcg_temp_new_vec(type); t2 = tcg_temp_new_vec(type); vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, tcgv_vec_arg(t1), a1, a1); tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, tcgv_vec_arg(t2), a1, a1); vec_gen_3(INDEX_op_sari_vec, type, MO_16, tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); vec_gen_3(INDEX_op_sari_vec, type, MO_16, tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2)); tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); tcg_temp_free_vec(t1); tcg_temp_free_vec(t2); break; } tcg_debug_assert(vece == MO_64); /* MO_64: If the shift is <= 32, we can emulate the sign extend by performing an arithmetic 32-bit shift and overwriting the high half of the result (note that the ISA says shift of 32 is valid). */ if (a2 <= 32) { case MO_64: if (imm <= 32) { /* We can emulate a small sign extend by performing an arithmetic * 32-bit shift and overwriting the high half of a 64-bit logical * shift (note that the ISA says shift of 32 is valid). */ t1 = tcg_temp_new_vec(type); vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2); vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2); tcg_gen_sari_vec(MO_32, t1, v1, imm); tcg_gen_shri_vec(MO_64, v0, v1, imm); vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, a0, a0, tcgv_vec_arg(t1), 0xaa); tcgv_vec_arg(v0), tcgv_vec_arg(v0), tcgv_vec_arg(t1), 0xaa); tcg_temp_free_vec(t1); break; } /* Otherwise we will need to use a compare vs 0 to produce the sign-extend, shift and merge. */ t1 = tcg_temp_new_vec(type); t2 = tcg_const_zeros_vec(type); vec_gen_4(INDEX_op_cmp_vec, type, MO_64, tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT); tcg_temp_free_vec(t2); vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2); vec_gen_3(INDEX_op_shli_vec, type, MO_64, tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2); vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1)); } else { /* Otherwise we will need to use a compare vs 0 to produce * the sign-extend, shift and merge. */ t1 = tcg_const_zeros_vec(type); tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1); tcg_gen_shri_vec(MO_64, v0, v1, imm); tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); tcg_gen_or_vec(MO_64, v0, v0, t1); tcg_temp_free_vec(t1); } break; case INDEX_op_mul_vec: default: g_assert_not_reached(); } } static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) { TCGv_vec t1, t2, t3, t4; tcg_debug_assert(vece == MO_8); a1 = va_arg(va, TCGArg); a2 = va_arg(va, TCGArg); /* * Unpack v1 bytes to words, 0 | x. * Unpack v2 bytes to words, y | 0. * This leaves the 8-bit result, x * y, with 8 bits of right padding. * Shift logical right by 8 bits to clear the high 8 bytes before * using an unsigned saturated pack. * * The difference between the V64, V128 and V256 cases is merely how * we distribute the expansion between temporaries. */ switch (type) { case TCG_TYPE_V64: t1 = tcg_temp_new_vec(TCG_TYPE_V128); t2 = tcg_temp_new_vec(TCG_TYPE_V128); tcg_gen_dup16i_vec(t2, 0); vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2)); tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2)); vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2); tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2)); tcg_gen_mul_vec(MO_16, t1, t1, t2); tcg_gen_shri_vec(MO_16, t1, t1, 8); vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1)); tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); tcg_temp_free_vec(t1); tcg_temp_free_vec(t2); break; case TCG_TYPE_V128: t1 = tcg_temp_new_vec(TCG_TYPE_V128); t2 = tcg_temp_new_vec(TCG_TYPE_V128); t3 = tcg_temp_new_vec(TCG_TYPE_V128); t4 = tcg_temp_new_vec(TCG_TYPE_V128); tcg_gen_dup16i_vec(t4, 0); vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4)); vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2); vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8, tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4)); vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8, tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2); tcg_gen_mul_vec(MO_16, t1, t1, t2); tcg_gen_mul_vec(MO_16, t3, t3, t4); tcg_gen_shri_vec(MO_16, t1, t1, 8); tcg_gen_shri_vec(MO_16, t3, t3, 8); vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3)); tcg_temp_free_vec(t1); tcg_temp_free_vec(t2); tcg_temp_free_vec(t3); tcg_temp_free_vec(t4); break; case TCG_TYPE_V256: t1 = tcg_temp_new_vec(TCG_TYPE_V256); t2 = tcg_temp_new_vec(TCG_TYPE_V256); t3 = tcg_temp_new_vec(TCG_TYPE_V256); t4 = tcg_temp_new_vec(TCG_TYPE_V256); t1 = tcg_temp_new_vec(type); t2 = tcg_temp_new_vec(type); t3 = tcg_temp_new_vec(type); t4 = tcg_temp_new_vec(type); tcg_gen_dup16i_vec(t4, 0); /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7] t1: extends of B[0-7], D[0-7] t2: extends of X[0-7], Z[0-7] t3: extends of A[0-7], C[0-7] t4: extends of W[0-7], Y[0-7]. */ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8, tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4)); vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8, tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2); vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8, tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4)); vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8, tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2); /* t1: BX DZ; t2: AW CY. */ vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4)); vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2)); vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4)); vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2)); tcg_gen_mul_vec(MO_16, t1, t1, t2); tcg_gen_mul_vec(MO_16, t3, t3, t4); tcg_gen_shri_vec(MO_16, t1, t1, 8); tcg_gen_shri_vec(MO_16, t3, t3, 8); /* a0: AW BX CY DZ. */ vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8, a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3)); vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); tcg_temp_free_vec(t1); tcg_temp_free_vec(t2); tcg_temp_free_vec(t3); Loading @@ -3262,9 +3236,10 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, default: g_assert_not_reached(); } break; } case INDEX_op_cmp_vec: static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0, TCGv_vec v1, TCGv_vec v2, TCGCond cond) { enum { NEED_SWAP = 1, Loading @@ -3284,13 +3259,9 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, [TCG_COND_LEU] = NEED_BIAS | NEED_INV, [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV, }; TCGCond cond; TCGv_vec t1, t2; uint8_t fixup; a1 = va_arg(va, TCGArg); a2 = va_arg(va, TCGArg); cond = va_arg(va, TCGArg); fixup = fixups[cond & 15]; tcg_debug_assert(fixup != 0xff); Loading @@ -3298,8 +3269,7 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, cond = tcg_invert_cond(cond); } if (fixup & NEED_SWAP) { TCGArg t; t = a1, a1 = a2, a2 = t; t1 = v1, v1 = v2, v2 = t1; cond = tcg_swap_cond(cond); } Loading @@ -3308,24 +3278,59 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, t1 = tcg_temp_new_vec(type); t2 = tcg_temp_new_vec(type); tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1)); tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2); tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2); a1 = tcgv_vec_arg(t1); a2 = tcgv_vec_arg(t2); tcg_gen_sub_vec(vece, t1, v1, t2); tcg_gen_sub_vec(vece, t2, v2, t2); v1 = t1; v2 = t2; cond = tcg_signed_cond(cond); } tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT); vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond); /* Expand directly; do not recurse. */ vec_gen_4(INDEX_op_cmp_vec, type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond); if (fixup & NEED_BIAS) { if (t1) { tcg_temp_free_vec(t1); if (t2) { tcg_temp_free_vec(t2); } } if (fixup & NEED_INV) { tcg_gen_not_vec(vece, v0, v0); } } void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, TCGArg a0, ...) { va_list va; TCGArg a2; TCGv_vec v0, v1, v2; va_start(va, a0); v0 = temp_tcgv_vec(arg_temp(a0)); v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); a2 = va_arg(va, TCGArg); switch (opc) { case INDEX_op_shli_vec: case INDEX_op_shri_vec: expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2); break; case INDEX_op_sari_vec: expand_vec_sari(type, vece, v0, v1, a2); break; case INDEX_op_mul_vec: v2 = temp_tcgv_vec(arg_temp(a2)); expand_vec_mul(type, vece, v0, v1, v2); break; case INDEX_op_cmp_vec: v2 = temp_tcgv_vec(arg_temp(a2)); expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg)); break; default: Loading