Commit 00c8fa9f authored by Emilio G. Cota's avatar Emilio G. Cota Committed by Richard Henderson
Browse files

tcg: optimise memory layout of TCGTemp



This brings down the size of the struct from 56 to 32 bytes on 64-bit,
and to 20 bytes on 32-bit. This leads to memory savings:

Before:
$ find . -name 'tcg.o' | xargs size
   text    data     bss     dec     hex filename
  41131   29800      88   71019   1156b ./aarch64-softmmu/tcg/tcg.o
  37969   29416      96   67481   10799 ./x86_64-linux-user/tcg/tcg.o
  39354   28816      96   68266   10aaa ./arm-linux-user/tcg/tcg.o
  40802   29096      88   69986   11162 ./arm-softmmu/tcg/tcg.o
  39417   29672      88   69177   10e39 ./x86_64-softmmu/tcg/tcg.o

After:
$ find . -name 'tcg.o' | xargs size
   text    data     bss     dec     hex filename
  40883   29800      88   70771   11473 ./aarch64-softmmu/tcg/tcg.o
  37473   29416      96   66985   105a9 ./x86_64-linux-user/tcg/tcg.o
  38858   28816      96   67770   108ba ./arm-linux-user/tcg/tcg.o
  40554   29096      88   69738   1106a ./arm-softmmu/tcg/tcg.o
  39169   29672      88   68929   10d41 ./x86_64-softmmu/tcg/tcg.o

Note that using an entire byte for some enums that need less than
that wastes a few bits (noticeable in 32 bits, where we use
20 bytes instead of 16) but avoids extraction code, which overall
is a win--I've tested several variations of the patch, and the appended
is the best performer for OpenSSL's bntest by a very small margin:

Before:
$ taskset -c 0 perf stat -r 15 -- x86_64-linux-user/qemu-x86_64 img/bntest-x86_64 >/dev/null
[...]
 Performance counter stats for 'x86_64-linux-user/qemu-x86_64 img/bntest-x86_64' (15 runs):

      10538.479833 task-clock (msec)  # 0.999 CPUs utilized  ( +-  0.38% )
               772 context-switches   # 0.073 K/sec          ( +-  2.03% )
                 0 cpu-migrations     # 0.000 K/sec          ( +-100.00% )
             2,207 page-faults        # 0.209 K/sec          ( +-  0.08% )
      10.552871687 seconds time elapsed                      ( +-  0.39% )

After:
$ taskset -c 0 perf stat -r 15 -- x86_64-linux-user/qemu-x86_64 img/bntest-x86_64 >/dev/null
 Performance counter stats for 'x86_64-linux-user/qemu-x86_64 img/bntest-x86_64' (15 runs):

      10459.968847 task-clock (msec)  # 0.999 CPUs utilized  ( +-  0.30% )
               739 context-switches   # 0.071 K/sec          ( +-  1.71% )
                 0 cpu-migrations     # 0.000 K/sec          ( +- 68.14% )
             2,204 page-faults        # 0.211 K/sec          ( +-  0.10% )
      10.473900411 seconds time elapsed                      ( +-  0.30% )

Suggested-by: default avatarStefan Weil <sw@weilnetz.de>
Suggested-by: default avatarRichard Henderson <rth@twiddle.net>
Reviewed-by: default avatarAlex Bennée <alex.bennee@linaro.org>
Signed-off-by: default avatarEmilio G. Cota <cota@braap.org>
Signed-off-by: default avatarRichard Henderson <rth@twiddle.net>
parent 874e9aee
Loading
Loading
Loading
Loading
+14 −12
Original line number Diff line number Diff line
@@ -417,20 +417,19 @@ static inline TCGCond tcg_high_cond(TCGCond c)
    }
}

#define TEMP_VAL_DEAD  0
#define TEMP_VAL_REG   1
#define TEMP_VAL_MEM   2
#define TEMP_VAL_CONST 3
typedef enum TCGTempVal {
    TEMP_VAL_DEAD,
    TEMP_VAL_REG,
    TEMP_VAL_MEM,
    TEMP_VAL_CONST,
} TCGTempVal;

/* XXX: optimize memory layout */
typedef struct TCGTemp {
    TCGType base_type;
    TCGType type;
    int val_type;
    int reg;
    tcg_target_long val;
    int mem_reg;
    intptr_t mem_offset;
    unsigned int reg:8;
    unsigned int mem_reg:8;
    TCGTempVal val_type:8;
    TCGType base_type:8;
    TCGType type:8;
    unsigned int fixed_reg:1;
    unsigned int mem_coherent:1;
    unsigned int mem_allocated:1;
@@ -438,6 +437,9 @@ typedef struct TCGTemp {
                                  basic blocks. Otherwise, it is not
                                  preserved across basic blocks. */
    unsigned int temp_allocated:1; /* never used for code gen */

    tcg_target_long val;
    intptr_t mem_offset;
    const char *name;
} TCGTemp;