Commit 633f6502 authored by Paolo Bonzini, committed by Blue Swirl

optimize: optimize using nonzero bits



This adds two optimizations based on the mask of possibly-nonzero bits.
First, in some cases involving shifts or ANDs the mask becomes zero,
i.e. the value is known to be zero, and the operation can be replaced
by a move of zero.  Second, a zero-extension or an AND with a constant
is useless, and can be replaced by a move, when it would only clear
bits that are already known to be zero.
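
The idea is to track, for each temp, a mask of bits that may be nonzero;
a clear bit is known to be zero.  Below is a minimal standalone C sketch
of that bookkeeping, not the QEMU code, and every name in it is made up:

 #include <stdint.h>
 #include <assert.h>

 /* AND: a result bit can be set only if it may be set in both inputs. */
 static uint64_t mask_and(uint64_t a, uint64_t b)
 {
     return a & b;
 }

 /* Constant right shift: the known-zero bits shift along with the value. */
 static uint64_t mask_shr(uint64_t a, unsigned c)
 {
     return a >> c;
 }

 int main(void)
 {
     uint64_t rcx = ~0ull;                       /* nothing known */
     uint64_t cc_dst = mask_and(rcx, 0xff0000);  /* and $0xff0000 */

     /* Case 1: the mask becomes zero, so the value is known to be
      * zero and the op can be replaced by "movi 0". */
     assert(mask_shr(cc_dst, 32) == 0);

     /* Case 2: a 32-bit zero-extension would only clear bits that are
      * already zero (the "affected" bits), so it is a plain move. */
     assert((cc_dst & ~0xffffffffull) == 0);
     return 0;
 }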

The main advantage of this optimization is that it turns zero-extensions
into moves, thus enabling much better copy propagation (around 1% code
reduction).  For example, here is a "test $0xff0000,%ecx + je" before
optimization:

 mov_i64 tmp0,rcx
 movi_i64 tmp1,$0xff0000
 discard cc_src
 and_i64 cc_dst,tmp0,tmp1
 movi_i32 cc_op,$0x1c
 ext32u_i64 tmp0,cc_dst
 movi_i64 tmp12,$0x0
 brcond_i64 tmp0,tmp12,eq,$0x0

and after (without the patch on the left, with it on the right):

 movi_i64 tmp1,$0xff0000                 movi_i64 tmp1,$0xff0000
 discard cc_src                          discard cc_src
 and_i64 cc_dst,rcx,tmp1                 and_i64 cc_dst,rcx,tmp1
 movi_i32 cc_op,$0x1c                    movi_i32 cc_op,$0x1c
 ext32u_i64 tmp0,cc_dst
 movi_i64 tmp12,$0x0                     movi_i64 tmp12,$0x0
 brcond_i64 tmp0,tmp12,eq,$0x0           brcond_i64 cc_dst,tmp12,eq,$0x0
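
To see why the ext32u disappears, follow the masks by hand (a hand
computation matching the patch below, not tool output):

 mask(rcx)    = 0xffffffffffffffff       nothing known about a guest reg
 mask(cc_dst) = mask(rcx) & 0xff0000
              = 0x0000000000ff0000       after the and_i64
 ext32u:  affected = mask(cc_dst) & ~0xffffffff = 0

With affected == 0 the extension only clears bits that are already zero,
so it degenerates to a mov, and copy propagation then lets the brcond
read cc_dst directly.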

Other similar cases: "test %eax,%eax + jne" where eax is already known
to fit in 32 bits (after optimization, without the patch on the left,
with it on the right):

 discard cc_src                          discard cc_src
 mov_i64 cc_dst,rax                      mov_i64 cc_dst,rax
 movi_i32 cc_op,$0x1c                    movi_i32 cc_op,$0x1c
 ext32u_i64 tmp0,cc_dst
 movi_i64 tmp12,$0x0                     movi_i64 tmp12,$0x0
 brcond_i64 tmp0,tmp12,ne,$0x0           brcond_i64 rax,tmp12,ne,$0x0

"test $0x1, %dl + je":

 movi_i64 tmp1,$0x1                      movi_i64 tmp1,$0x1
 discard cc_src                          discard cc_src
 and_i64 cc_dst,rdx,tmp1                 and_i64 cc_dst,rdx,tmp1
 movi_i32 cc_op,$0x1a                    movi_i32 cc_op,$0x1a
 ext8u_i64 tmp0,cc_dst
 movi_i64 tmp12,$0x0                     movi_i64 tmp12,$0x0
 brcond_i64 tmp0,tmp12,eq,$0x0           brcond_i64 cc_dst,tmp12,eq,$0x0

In some cases TCG even outsmarts GCC. :)  Here the input code has
"and $0x2,%eax + movslq %eax,%rbx + test %rbx, %rbx" and the optimizer,
thanks to copy propagation, does the following:

 movi_i64 tmp12,$0x2                     movi_i64 tmp12,$0x2
 and_i64 rax,rax,tmp12                   and_i64 rax,rax,tmp12
 mov_i64 cc_dst,rax                      mov_i64 cc_dst,rax
 ext32s_i64 tmp0,rax                  -> nop
 mov_i64 rbx,tmp0                     -> mov_i64 rbx,cc_dst
 and_i64 cc_dst,rbx,rbx               -> nop
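
The key step is the ext32s: after the and, mask(rax) is 0x2, so the sign
bit is known to be clear and the sign-extension behaves exactly like a
zero-extension (in the patch below, the sign-extension cases fall through
to the unsigned handling when the sign bit is known zero).  That yields
affected = 0x2 & ~0xffffffff = 0, the extension degenerates to a copy,
and copy propagation removes it entirely.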

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
parent 3a9d8b17
tcg/optimize.c  +28 −2
@@ -484,7 +484,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                                     TCGArg *args, TCGOpDef *tcg_op_defs)
 {
     int i, nb_ops, op_index, nb_temps, nb_globals, nb_call_args;
-    tcg_target_ulong mask;
+    tcg_target_ulong mask, affected;
     TCGOpcode op;
     const TCGOpDef *def;
     TCGArg *gen_args;
@@ -629,6 +629,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
 
         /* Simplify using known-zero bits */
         mask = -1;
+        affected = -1;
         switch (op) {
         CASE_OP_32_64(ext8s):
             if ((temps[args[1]].mask & 0x80) != 0) {
@@ -656,7 +657,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             mask = temps[args[2]].mask;
             if (temps[args[2]].state == TCG_TEMP_CONST) {
         and_const:
-                ;
+                affected = temps[args[1]].mask & ~mask;
             }
             mask = temps[args[1]].mask & mask;
             break;
@@ -708,6 +709,31 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             break;
         }
 
+        if (mask == 0) {
+            assert(def->nb_oargs == 1);
+            s->gen_opc_buf[op_index] = op_to_movi(op);
+            tcg_opt_gen_movi(gen_args, args[0], 0);
+            args += def->nb_oargs + def->nb_iargs + def->nb_cargs;
+            gen_args += 2;
+            continue;
+        }
+        if (affected == 0) {
+            assert(def->nb_oargs == 1);
+            if (temps_are_copies(args[0], args[1])) {
+                s->gen_opc_buf[op_index] = INDEX_op_nop;
+            } else if (temps[args[1]].state != TCG_TEMP_CONST) {
+                s->gen_opc_buf[op_index] = op_to_mov(op);
+                tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
+                gen_args += 2;
+            } else {
+                s->gen_opc_buf[op_index] = op_to_movi(op);
+                tcg_opt_gen_movi(gen_args, args[0], temps[args[1]].val);
+                gen_args += 2;
+            }
+            args += def->nb_iargs + 1;
+            continue;
+        }
+
         /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
         switch (op) {
         CASE_OP_32_64(and):
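
In short, the two checks added above boil down to the decision below.
This is a standalone sketch with simplified types and made-up names,
not the patch itself:

 #include <stdint.h>
 #include <stdbool.h>
 #include <assert.h>

 enum rewrite { KEEP, TO_MOVI_ZERO, TO_NOP, TO_MOV };

 /* mask: result bits that may be nonzero.
  * affected: input bits the op would actually change. */
 static enum rewrite simplify(uint64_t mask, uint64_t affected,
                              bool dst_is_copy_of_src)
 {
     if (mask == 0) {
         return TO_MOVI_ZERO;        /* every result bit is known zero */
     }
     if (affected == 0) {
         /* The op only clears bits that are already zero: it is a
          * plain copy, or nothing at all when dst already equals src. */
         return dst_is_copy_of_src ? TO_NOP : TO_MOV;
     }
     return KEEP;
 }

 int main(void)
 {
     /* ext32u of a value whose mask is 0xff0000: affected == 0, and
      * dst and src differ, so the op becomes a mov. */
     assert(simplify(0xff0000, 0, false) == TO_MOV);
     return 0;
 }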