Commit c878da3b authored by malc
Browse files

tcg/ppc32: Use trampolines to trim the code size for mmu slow path accessors



mmu access looks something like:

<check tlb>
if miss goto slow_path
<fast path>
done:
...

; end of the TB
slow_path:
 <pre process>
 mr r3, r27         ; move areg0 to r3
                    ; (r3 holds the first argument for all the PPC32 ABIs)
 <call mmu_helper>
 b $+8
 .long done
 <post process>
 b done

On ppc32 <call mmu_helper> is:

(SysV and Darwin)

mmu_helper is most likely not within direct branching distance from
the call site, necessitating

a. moving 32 bit offset of mmu_helper into a GPR ; 8 bytes
b. moving GPR to CTR/LR                          ; 4 bytes
c. (finally) branching to CTR/LR                 ; 4 bytes

r3 setting              - 4 bytes
call                    - 16 bytes
dummy jump over retaddr - 4 bytes
embedded retaddr        - 4 bytes
         Total overhead - 28 bytes

(PowerOpen (AIX))
a. moving 32 bit offset of mmu_helper's TOC into GPR1   ; 8 bytes
b. loading 32 bit function pointer into GPR2            ; 4 bytes
c. moving GPR2 to CTR/LR                                ; 4 bytes
d. loading 32 bit small area pointer into R2            ; 4 bytes
e. (finally) branching to CTR/LR                        ; 4 bytes

r3 setting              - 4 bytes
call                    - 24 bytes
dummy jump over retaddr - 4 bytes
embedded retaddr        - 4 bytes
         Total overhead - 36 bytes

Following is done to trim the code size of slow path sections:

In tcg_target_qemu_prologue trampolines are emitted that look like this:

trampoline:
mfspr r3, LR
addi  r3, r3, 4
mtspr LR, r3      ; fixup LR to point over embedded retaddr
mr    r3, r27
<jump mmu_helper> ; tail call of sorts

And slow path becomes:

slow_path:
 <pre process>
 <call trampoline>
 .long done
 <post process>
 b done

call                    - 4 bytes (trampoline is within code gen buffer
                                   and most likely accessible via
                                   direct branch)
embedded retaddr        - 4 bytes
         Total overhead - 8 bytes

In the end the icache pressure is decreased by 20 bytes (SysV and
Darwin: 28 - 8) or 28 bytes (PowerOpen: 36 - 8) per slow path, at the
cost of an extra jump to the trampoline and adjusting LR (to skip over
the embedded retaddr) once inside.

Signed-off-by: malc <av1474@comtv.ru>
parent 1cfd981f
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -337,7 +337,7 @@ extern uintptr_t tci_tb_ptr;
                                    *(int32_t *)((void *)GETRA() + 3) - 1))
# elif defined (_ARCH_PPC) && !defined (_ARCH_PPC64)
#  define GETRA() ((uintptr_t)__builtin_return_address(0))
#  define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() + 4)) - 1))
#  define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() - 4)) - 1))
# else
#  error "CONFIG_QEMU_LDST_OPTIMIZATION needs GETPC_LDST() implementation!"
# endif
+24 −8
Original line number Diff line number Diff line
@@ -569,6 +569,9 @@ static const void * const qemu_st_helpers[4] = {
    helper_stq_mmu,
};

/*
 * Addresses of the load/store slow-path trampolines.  Slot i is set in
 * tcg_target_qemu_prologue to the code emitted for qemu_ld_helpers[i]
 * and qemu_st_helpers[i] respectively; the slow-path generators call
 * through these tables instead of calling the helpers directly.
 */
static void *ld_trampolines[4];
static void *st_trampolines[4];

static void tcg_out_tlb_check (TCGContext *s, int r0, int r1, int r2,
                               int addr_reg, int addr_reg2, int s_bits,
                               int offset1, int offset2, uint8_t **label_ptr)
@@ -848,8 +851,7 @@ static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
    reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr);

    /* slow path */
    ir = 3;
    tcg_out_mov (s, TCG_TYPE_I32, ir++, TCG_AREG0);
    ir = 4;
#if TARGET_LONG_BITS == 32
    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
#else
@@ -860,8 +862,7 @@ static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
#endif
    tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
    tcg_out_call (s, (tcg_target_long) qemu_ld_helpers[s_bits], 1);
    tcg_out32 (s, B | 8);
    tcg_out_call (s, (tcg_target_long) ld_trampolines[s_bits], 1);
    tcg_out32 (s, (tcg_target_long) raddr);
    switch (opc) {
    case 0|4:
@@ -916,8 +917,7 @@ static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
    reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr);

    /* slow path */
    ir = 3;
    tcg_out_mov (s, TCG_TYPE_I32, ir++, TCG_AREG0);
    ir = 4;
#if TARGET_LONG_BITS == 32
    tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
#else
@@ -959,8 +959,7 @@ static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
    ir++;

    tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
    tcg_out_call (s, (tcg_target_long) qemu_st_helpers[opc], 1);
    tcg_out32 (s, B | 8);
    tcg_out_call (s, (tcg_target_long) st_trampolines[opc], 1);
    tcg_out32 (s, (tcg_target_long) raddr);
    tcg_out_b (s, 0, (tcg_target_long) raddr);
}
@@ -983,6 +982,15 @@ void tcg_out_tb_finalize(TCGContext *s)
}
#endif

/*
 * Emit a trampoline that forwards to the helper located at PTR.
 *
 * The generated sequence is:
 *   - copy LR into r3, add 4 to it and write it back to LR, so that the
 *     helper returns past the 32-bit return-address word the slow path
 *     embeds right after its call instruction;
 *   - load r3 with TCG_AREG0;
 *   - jump to the helper (a tail call of sorts).
 */
static void emit_ldst_trampoline (TCGContext *s, const void *ptr)
{
    const int first_arg = 3;       /* r3, also used as scratch for LR */
    const int retaddr_bytes = 4;   /* size of the embedded retaddr word */
    const tcg_target_long helper = (tcg_target_long) ptr;

    /* LR += retaddr_bytes, going through r3 */
    tcg_out32 (s, MFSPR | RT (first_arg) | LR);
    tcg_out32 (s, ADDI | RT (first_arg) | RA (first_arg) | retaddr_bytes);
    tcg_out32 (s, MTSPR | RS (first_arg) | LR);

    /* r3 = TCG_AREG0 */
    tcg_out_mov (s, TCG_TYPE_I32, first_arg, TCG_AREG0);

    /* hand control over to the helper */
    tcg_out_b (s, 0, helper);
}

static void tcg_target_qemu_prologue (TCGContext *s)
{
    int i, frame_size;
@@ -1043,6 +1051,14 @@ static void tcg_target_qemu_prologue (TCGContext *s)
    tcg_out32 (s, MTSPR | RS (0) | LR);
    tcg_out32 (s, ADDI | RT (1) | RA (1) | frame_size);
    tcg_out32 (s, BCLR | BO_ALWAYS);

    for (i = 0; i < 4; ++i) {
        ld_trampolines[i] = s->code_ptr;
        emit_ldst_trampoline (s, qemu_ld_helpers[i]);

        st_trampolines[i] = s->code_ptr;
        emit_ldst_trampoline (s, qemu_st_helpers[i]);
    }
}

static void tcg_out_ld (TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,