tcg/i386: Implement INDEX_op_rotl{i,s,v}_vec (885b1706) · Commits · SUMMER2020 / students / proj-2021291

tcg/i386/tcg-target.inc.c

+100 −16

Original line number	Diff line number	Diff line
		@@ -3233,6 +3233,7 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
		case INDEX_op_shls_vec:
		case INDEX_op_shrs_vec:
		case INDEX_op_sars_vec:
		case INDEX_op_rotls_vec:
		case INDEX_op_cmp_vec:
		case INDEX_op_x86_shufps_vec:
		case INDEX_op_x86_blend_vec:
		@@ -3271,6 +3272,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
		case INDEX_op_xor_vec:
		case INDEX_op_andc_vec:
		return 1;
		case INDEX_op_rotli_vec:
		case INDEX_op_cmp_vec:
		case INDEX_op_cmpsel_vec:
		return -1;
		@@ -3297,12 +3299,17 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
		return vece >= MO_16;
		case INDEX_op_sars_vec:
		return vece >= MO_16 && vece <= MO_32;
		case INDEX_op_rotls_vec:
		return vece >= MO_16 ? -1 : 0;

		case INDEX_op_shlv_vec:
		case INDEX_op_shrv_vec:
		return have_avx2 && vece >= MO_32;
		case INDEX_op_sarv_vec:
		return have_avx2 && vece == MO_32;
		case INDEX_op_rotlv_vec:
		case INDEX_op_rotrv_vec:
		return have_avx2 && vece >= MO_32 ? -1 : 0;

		case INDEX_op_mul_vec:
		if (vece == MO_8) {
		@@ -3331,7 +3338,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
		}
		}

		static void expand_vec_shi(TCGType type, unsigned vece, bool shr,
		static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
		TCGv_vec v0, TCGv_vec v1, TCGArg imm)
		{
		TCGv_vec t1, t2;
		@@ -3341,26 +3348,31 @@ static void expand_vec_shi(TCGType type, unsigned vece, bool shr,
		t1 = tcg_temp_new_vec(type);
		t2 = tcg_temp_new_vec(type);

		/* Unpack to W, shift, and repack. Tricky bits:
		(1) Use punpck*bw x,x to produce DDCCBBAA,
		i.e. duplicate in other half of the 16-bit lane.
		(2) For right-shift, add 8 so that the high half of
		the lane becomes zero. For left-shift, we must
		shift up and down again.
		(3) Step 2 leaves high half zero such that PACKUSWB
		(pack with unsigned saturation) does not modify
		the quantity. */
		/*
		* Unpack to W, shift, and repack. Tricky bits:
		* (1) Use punpck*bw x,x to produce DDCCBBAA,
		* i.e. duplicate in other half of the 16-bit lane.
		* (2) For right-shift, add 8 so that the high half of the lane
		* becomes zero. For left-shift, and left-rotate, we must
		* shift up and down again.
		* (3) Step 2 leaves high half zero such that PACKUSWB
		* (pack with unsigned saturation) does not modify
		* the quantity.
		*/
		vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
		tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
		vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
		tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));

		if (shr) {
		tcg_gen_shri_vec(MO_16, t1, t1, imm + 8);
		tcg_gen_shri_vec(MO_16, t2, t2, imm + 8);
		if (opc != INDEX_op_rotli_vec) {
		imm += 8;
		}
		if (opc == INDEX_op_shri_vec) {
		tcg_gen_shri_vec(MO_16, t1, t1, imm);
		tcg_gen_shri_vec(MO_16, t2, t2, imm);
		} else {
		tcg_gen_shli_vec(MO_16, t1, t1, imm + 8);
		tcg_gen_shli_vec(MO_16, t2, t2, imm + 8);
		tcg_gen_shli_vec(MO_16, t1, t1, imm);
		tcg_gen_shli_vec(MO_16, t2, t2, imm);
		tcg_gen_shri_vec(MO_16, t1, t1, 8);
		tcg_gen_shri_vec(MO_16, t2, t2, 8);
		}
		@@ -3427,6 +3439,61 @@ static void expand_vec_sari(TCGType type, unsigned vece,
		}
		}

		static void expand_vec_rotli(TCGType type, unsigned vece,
		TCGv_vec v0, TCGv_vec v1, TCGArg imm)
		{
		TCGv_vec t;

		if (vece == MO_8) {
		expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
		return;
		}

		t = tcg_temp_new_vec(type);
		tcg_gen_shli_vec(vece, t, v1, imm);
		tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
		tcg_gen_or_vec(vece, v0, v0, t);
		tcg_temp_free_vec(t);
		}

		static void expand_vec_rotls(TCGType type, unsigned vece,
		TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
		{
		TCGv_i32 rsh;
		TCGv_vec t;

		tcg_debug_assert(vece != MO_8);

		t = tcg_temp_new_vec(type);
		rsh = tcg_temp_new_i32();

		tcg_gen_neg_i32(rsh, lsh);
		tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
		tcg_gen_shls_vec(vece, t, v1, lsh);
		tcg_gen_shrs_vec(vece, v0, v1, rsh);
		tcg_gen_or_vec(vece, v0, v0, t);
		tcg_temp_free_vec(t);
		tcg_temp_free_i32(rsh);
		}

		static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
		TCGv_vec v1, TCGv_vec sh, bool right)
		{
		TCGv_vec t = tcg_temp_new_vec(type);

		tcg_gen_dupi_vec(vece, t, 8 << vece);
		tcg_gen_sub_vec(vece, t, t, sh);
		if (right) {
		tcg_gen_shlv_vec(vece, t, v1, t);
		tcg_gen_shrv_vec(vece, v0, v1, sh);
		} else {
		tcg_gen_shrv_vec(vece, t, v1, t);
		tcg_gen_shlv_vec(vece, v0, v1, sh);
		}
		tcg_gen_or_vec(vece, v0, v0, t);
		tcg_temp_free_vec(t);
		}

		static void expand_vec_mul(TCGType type, unsigned vece,
		TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
		{
		@@ -3636,13 +3703,30 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
		switch (opc) {
		case INDEX_op_shli_vec:
		case INDEX_op_shri_vec:
		expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2);
		expand_vec_shi(type, vece, opc, v0, v1, a2);
		break;

		case INDEX_op_sari_vec:
		expand_vec_sari(type, vece, v0, v1, a2);
		break;

		case INDEX_op_rotli_vec:
		expand_vec_rotli(type, vece, v0, v1, a2);
		break;

		case INDEX_op_rotls_vec:
		expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
		break;

		case INDEX_op_rotlv_vec:
		v2 = temp_tcgv_vec(arg_temp(a2));
		expand_vec_rotv(type, vece, v0, v1, v2, false);
		break;
		case INDEX_op_rotrv_vec:
		v2 = temp_tcgv_vec(arg_temp(a2));
		expand_vec_rotv(type, vece, v0, v1, v2, true);
		break;

		case INDEX_op_mul_vec:
		v2 = temp_tcgv_vec(arg_temp(a2));
		expand_vec_mul(type, vece, v0, v1, v2);