Commit 057b6e37 authored by Richard Henderson

tcg/arm: Use LDRD to load tlb mask+table
This changes the code generation for the tlb from e.g.

	ldr      ip, [r6, #-0x10]
	ldr      r2, [r6, #-0xc]
	and      ip, ip, r4, lsr #8
	ldrd     r0, r1, [r2, ip]!
	ldr      r2, [r2, #0x18]

to

	ldrd     r0, r1, [r6, #-0x10]
	and      r0, r0, r4, lsr #8
	ldrd     r2, r3, [r1, r0]!
	ldr      r1, [r1, #0x18]

for armv7 hosts.  Rearranging the register allocation in
order to avoid overlap between the two ldrd pairs causes
the patch to be larger than it ordinarily would be.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
parent 65b23204
+40 −26
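
The patch leans on two facts. First, A32 LDRD loads a pair of consecutive registers (an even/odd pair such as r0/r1 or r2/r3) from two adjacent words of memory, which is why the new QEMU_BUILD_BUG_ON lines pin mask to offset 0 and table to offset 4 within CPUTLBDescFast. Second, the mask is pre-scaled so that ANDing it with the shifted guest address yields a byte offset directly into the table. Below is a minimal C sketch of both points, not QEMU's actual definitions: FastSketch, page_bits and entry_bits are illustrative stand-ins for CPUTLBDescFast, TARGET_PAGE_BITS and CPU_TLB_ENTRY_BITS.

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the fast-path TLB descriptor: two adjacent 32-bit words,
     * so one "ldrd r0, r1, [env, #off]" can fetch both at once. */
    typedef struct {
        uint32_t mask;   /* index mask, must sit at offset 0 */
        uint32_t table;  /* CPUTLBEntry array address, at offset 4 */
    } FastSketch;

    /* Mirrors the QEMU_BUILD_BUG_ON checks the patch adds. */
    _Static_assert(offsetof(FastSketch, mask) == 0, "mask must be first");
    _Static_assert(offsetof(FastSketch, table) == 4, "table must follow mask");

    /* What the emitted "and r0, r0, r4, lsr #8" / "ldrd r2, r3, [r1, r0]!"
     * sequence computes: shift the guest address down, mask it to a byte
     * offset, and add it to the table base to form the CPUTLBEntry address. */
    static inline uint32_t tlb_entry_addr(const FastSketch *f, uint32_t addr,
                                          unsigned page_bits,
                                          unsigned entry_bits)
    {
        uint32_t off = (addr >> (page_bits - entry_bits)) & f->mask;
        return f->table + off;
    }

Keeping the two LDRD destination pairs disjoint (r0/r1 for mask/table, r2/r3 for the comparator) is also why the first hunk reserves TCG_REG_R3 alongside R0-R2 and R14 in the constraint set.
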
@@ -267,6 +267,7 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
 #endif
         break;
@@ -1224,6 +1225,10 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -256);
 
+/* These offsets are built into the LDRD below.  */
+QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
+QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4);
+
 /* Load and compare a TLB entry, leaving the flags set.  Returns the register
    containing the addend of the tlb entry.  Clobbers R0, R1, R2, TMP.  */
 
@@ -1238,47 +1243,54 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     unsigned s_bits = opc & MO_SIZE;
     unsigned a_bits = get_alignment_bits(opc);
 
-    /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
-    tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP, TCG_AREG0, mask_off);
-    tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R2, TCG_AREG0, table_off);
+    /*
+     * We don't support inline unaligned acceses, but we can easily
+     * support overalignment checks.
+     */
+    if (a_bits < s_bits) {
+        a_bits = s_bits;
+    }
 
-    /* Extract the tlb index from the address into TMP.  */
-    tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_TMP, TCG_REG_TMP, addrlo,
+    /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}.  */
+    if (use_armv6_instructions) {
+        tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
+    } else {
+        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R0, TCG_AREG0, mask_off);
+        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R1, TCG_AREG0, table_off);
+    }
+
+    /* Extract the tlb index from the address into R0.  */
+    tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo,
                     SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
 
     /*
-     * Add the tlb_table pointer, creating the CPUTLBEntry address in R2.
-     * Load the tlb comparator into R0/R1 and the fast path addend into R2.
+     * Add the tlb_table pointer, creating the CPUTLBEntry address in R1.
+     * Load the tlb comparator into R2/R3 and the fast path addend into R1.
      */
     if (cmp_off == 0) {
         if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
-            tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP);
+            tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
         } else {
-            tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP);
+            tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
         }
     } else {
         tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
-                        TCG_REG_R2, TCG_REG_R2, TCG_REG_TMP, 0);
+                        TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0);
         if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
-            tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
+            tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
         } else {
-            tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
+            tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
         }
     }
     if (!use_armv6_instructions && TARGET_LONG_BITS == 64) {
-        tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, cmp_off + 4);
+        tcg_out_ld32_12(s, COND_AL, TCG_REG_R3, TCG_REG_R1, cmp_off + 4);
     }
 
     /* Load the tlb addend.  */
-    tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2,
+    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R1,
                     offsetof(CPUTLBEntry, addend));
 
-    /* Check alignment.  We don't support inline unaligned acceses,
-       but we can easily support overalignment checks.  */
-    if (a_bits < s_bits) {
-        a_bits = s_bits;
-    }
-
+    /* Check alignment, check comparators.  */
     if (use_armv7_instructions) {
         tcg_target_ulong mask = ~(TARGET_PAGE_MASK | ((1 << a_bits) - 1));
         int rot = encode_imm(mask);
@@ -1291,22 +1303,24 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
             tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP,
                             addrlo, TCG_REG_TMP, 0);
         }
-        tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R0, TCG_REG_TMP, 0);
+        tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, 0);
     } else {
         if (a_bits) {
            tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo,
                            (1 << a_bits) - 1);
        }
         tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, addrlo,
                         SHIFT_IMM_LSR(TARGET_PAGE_BITS));
         tcg_out_dat_reg(s, (a_bits ? COND_EQ : COND_AL), ARITH_CMP,
-                        0, TCG_REG_R0, TCG_REG_TMP,
+                        0, TCG_REG_R2, TCG_REG_TMP,
                         SHIFT_IMM_LSL(TARGET_PAGE_BITS));
     }
 
     if (TARGET_LONG_BITS == 64) {
-        tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R1, addrhi, 0);
+        tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R3, addrhi, 0);
     }
 
-    return TCG_REG_R2;
+    return TCG_REG_R1;
 }
 
 /* Record the context of a call to the out of line helper code for the slow