[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] Re: [PATCH 3/3] target-sparc: Inline some generation of carry for ADDX/SUBX.
From: Blue Swirl
Subject: [Qemu-devel] Re: [PATCH 3/3] target-sparc: Inline some generation of carry for ADDX/SUBX.
Date: Thu, 20 May 2010 19:59:33 +0000
Thanks, applied.
On Wed, May 12, 2010 at 6:04 PM, Richard Henderson <address@hidden> wrote:
> Computing carry is trivial for some inputs. By avoiding an
> external function call, we generate near-optimal code for
> the common cases of add+addx (double-word arithmetic) and
> cmp+addx (a setcc pattern).
>
> Signed-off-by: Richard Henderson <address@hidden>
> ---
> target-sparc/helper.h | 2 +-
> target-sparc/op_helper.c | 2 +-
> target-sparc/translate.c | 272
> +++++++++++++++++++++++++++++++++-------------
> 3 files changed, 200 insertions(+), 76 deletions(-)
>
> diff --git a/target-sparc/helper.h b/target-sparc/helper.h
> index 04c1306..6f103e7 100644
> --- a/target-sparc/helper.h
> +++ b/target-sparc/helper.h
> @@ -158,6 +158,6 @@ VIS_CMPHELPER(cmpne);
> #undef VIS_HELPER
> #undef VIS_CMPHELPER
> DEF_HELPER_0(compute_psr, void);
> -DEF_HELPER_0(compute_C_icc, tl);
> +DEF_HELPER_0(compute_C_icc, i32);
>
> #include "def-helper.h"
> diff --git a/target-sparc/op_helper.c b/target-sparc/op_helper.c
> index 3783b02..125cd67 100644
> --- a/target-sparc/op_helper.c
> +++ b/target-sparc/op_helper.c
> @@ -1342,7 +1342,7 @@ void helper_compute_psr(void)
> CC_OP = CC_OP_FLAGS;
> }
>
> -target_ulong helper_compute_C_icc(void)
> +uint32_t helper_compute_C_icc(void)
> {
> uint32_t ret;
>
> diff --git a/target-sparc/translate.c b/target-sparc/translate.c
> index ea7c71b..713d3e1 100644
> --- a/target-sparc/translate.c
> +++ b/target-sparc/translate.c
> @@ -332,24 +332,132 @@ static inline void gen_op_add_cc(TCGv dst, TCGv src1,
> TCGv src2)
> tcg_gen_mov_tl(dst, cpu_cc_dst);
> }
>
> -static inline void gen_op_addxi_cc(TCGv dst, TCGv src1, target_long src2)
> +static TCGv_i32 gen_add32_carry32(void)
> {
> - gen_helper_compute_C_icc(cpu_tmp0);
> - tcg_gen_mov_tl(cpu_cc_src, src1);
> - tcg_gen_movi_tl(cpu_cc_src2, src2);
> - tcg_gen_add_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
> - tcg_gen_addi_tl(cpu_cc_dst, cpu_cc_dst, src2);
> - tcg_gen_mov_tl(dst, cpu_cc_dst);
> + TCGv_i32 carry_32, cc_src1_32, cc_src2_32;
> +
> + /* Carry is computed from a previous add: (dst < src) */
> +#if TARGET_LONG_BITS == 64
> + cc_src1_32 = tcg_temp_new_i32();
> + cc_src2_32 = tcg_temp_new_i32();
> + tcg_gen_trunc_i64_i32(cc_src1_32, cpu_cc_dst);
> + tcg_gen_trunc_i64_i32(cc_src2_32, cpu_cc_src);
> +#else
> + cc_src1_32 = cpu_cc_dst;
> + cc_src2_32 = cpu_cc_src;
> +#endif
> +
> + carry_32 = tcg_temp_new_i32();
> + tcg_gen_setcond_i32(TCG_COND_LTU, carry_32, cc_src1_32, cc_src2_32);
> +
> +#if TARGET_LONG_BITS == 64
> + tcg_temp_free_i32(cc_src1_32);
> + tcg_temp_free_i32(cc_src2_32);
> +#endif
> +
> + return carry_32;
> }
>
> -static inline void gen_op_addx_cc(TCGv dst, TCGv src1, TCGv src2)
> +static TCGv_i32 gen_sub32_carry32(void)
> {
> - gen_helper_compute_C_icc(cpu_tmp0);
> - tcg_gen_mov_tl(cpu_cc_src, src1);
> - tcg_gen_mov_tl(cpu_cc_src2, src2);
> - tcg_gen_add_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
> - tcg_gen_add_tl(cpu_cc_dst, cpu_cc_dst, cpu_cc_src2);
> - tcg_gen_mov_tl(dst, cpu_cc_dst);
> + TCGv_i32 carry_32, cc_src1_32, cc_src2_32;
> +
> + /* Carry is computed from a previous borrow: (src1 < src2) */
> +#if TARGET_LONG_BITS == 64
> + cc_src1_32 = tcg_temp_new_i32();
> + cc_src2_32 = tcg_temp_new_i32();
> + tcg_gen_trunc_i64_i32(cc_src1_32, cpu_cc_src);
> + tcg_gen_trunc_i64_i32(cc_src2_32, cpu_cc_src2);
> +#else
> + cc_src1_32 = cpu_cc_src;
> + cc_src2_32 = cpu_cc_src2;
> +#endif
> +
> + carry_32 = tcg_temp_new_i32();
> + tcg_gen_setcond_i32(TCG_COND_LTU, carry_32, cc_src1_32, cc_src2_32);
> +
> +#if TARGET_LONG_BITS == 64
> + tcg_temp_free_i32(cc_src1_32);
> + tcg_temp_free_i32(cc_src2_32);
> +#endif
> +
> + return carry_32;
> +}
> +
> +static void gen_op_addx_int(DisasContext *dc, TCGv dst, TCGv src1,
> + TCGv src2, int update_cc)
> +{
> + TCGv_i32 carry_32;
> + TCGv carry;
> +
> + switch (dc->cc_op) {
> + case CC_OP_DIV:
> + case CC_OP_LOGIC:
> + /* Carry is known to be zero. Fall back to plain ADD. */
> + if (update_cc) {
> + gen_op_add_cc(dst, src1, src2);
> + } else {
> + tcg_gen_add_tl(dst, src1, src2);
> + }
> + return;
> +
> + case CC_OP_ADD:
> + case CC_OP_TADD:
> + case CC_OP_TADDTV:
> +#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
> + {
> + /* For 32-bit hosts, we can re-use the host's hardware carry
> + generation by using an ADD2 opcode. We discard the low
> + part of the output. Ideally we'd combine this operation
> + with the add that generated the carry in the first place. */
> + TCGv dst_low = tcg_temp_new();
> + tcg_gen_op6_i32(INDEX_op_add2_i32, dst_low, dst,
> + cpu_cc_src, src1, cpu_cc_src2, src2);
> + tcg_temp_free(dst_low);
> + goto add_done;
> + }
> +#endif
> + carry_32 = gen_add32_carry32();
> + break;
> +
> + case CC_OP_SUB:
> + case CC_OP_TSUB:
> + case CC_OP_TSUBTV:
> + carry_32 = gen_sub32_carry32();
> + break;
> +
> + default:
> + /* We need external help to produce the carry. */
> + carry_32 = tcg_temp_new_i32();
> + gen_helper_compute_C_icc(carry_32);
> + break;
> + }
> +
> +#if TARGET_LONG_BITS == 64
> + carry = tcg_temp_new();
> + tcg_gen_extu_i32_i64(carry, carry_32);
> +#else
> + carry = carry_32;
> +#endif
> +
> + tcg_gen_add_tl(dst, src1, src2);
> + tcg_gen_add_tl(dst, dst, carry);
> +
> + tcg_temp_free_i32(carry_32);
> +#if TARGET_LONG_BITS == 64
> + tcg_temp_free(carry);
> +#endif
> +
> +#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
> + add_done:
> +#endif
> + if (update_cc) {
> + tcg_gen_mov_tl(cpu_cc_src, src1);
> + tcg_gen_mov_tl(cpu_cc_src2, src2);
> + tcg_gen_mov_tl(cpu_cc_dst, dst);
> + tcg_gen_movi_i32(cpu_cc_op, CC_OP_ADDX);
> + dc->cc_op = CC_OP_ADDX;
> + }
> }
>
> static inline void gen_op_tadd_cc(TCGv dst, TCGv src1, TCGv src2)
> @@ -415,24 +523,80 @@ static inline void gen_op_sub_cc(TCGv dst, TCGv src1,
> TCGv src2)
> tcg_gen_mov_tl(dst, cpu_cc_dst);
> }
>
> -static inline void gen_op_subxi_cc(TCGv dst, TCGv src1, target_long src2)
> +static void gen_op_subx_int(DisasContext *dc, TCGv dst, TCGv src1,
> + TCGv src2, int update_cc)
> {
> - gen_helper_compute_C_icc(cpu_tmp0);
> - tcg_gen_mov_tl(cpu_cc_src, src1);
> - tcg_gen_movi_tl(cpu_cc_src2, src2);
> - tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
> - tcg_gen_subi_tl(cpu_cc_dst, cpu_cc_dst, src2);
> - tcg_gen_mov_tl(dst, cpu_cc_dst);
> -}
> + TCGv_i32 carry_32;
> + TCGv carry;
>
> -static inline void gen_op_subx_cc(TCGv dst, TCGv src1, TCGv src2)
> -{
> - gen_helper_compute_C_icc(cpu_tmp0);
> - tcg_gen_mov_tl(cpu_cc_src, src1);
> - tcg_gen_mov_tl(cpu_cc_src2, src2);
> - tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
> - tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_dst, cpu_cc_src2);
> - tcg_gen_mov_tl(dst, cpu_cc_dst);
> + switch (dc->cc_op) {
> + case CC_OP_DIV:
> + case CC_OP_LOGIC:
> + /* Carry is known to be zero. Fall back to plain SUB. */
> + if (update_cc) {
> + gen_op_sub_cc(dst, src1, src2);
> + } else {
> + tcg_gen_sub_tl(dst, src1, src2);
> + }
> + return;
> +
> + case CC_OP_ADD:
> + case CC_OP_TADD:
> + case CC_OP_TADDTV:
> + carry_32 = gen_add32_carry32();
> + break;
> +
> + case CC_OP_SUB:
> + case CC_OP_TSUB:
> + case CC_OP_TSUBTV:
> +#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
> + {
> + /* For 32-bit hosts, we can re-use the host's hardware carry
> + generation by using a SUB2 opcode. We discard the low
> + part of the output. Ideally we'd combine this operation
> + with the add that generated the carry in the first place. */
> + TCGv dst_low = tcg_temp_new();
> + tcg_gen_op6_i32(INDEX_op_sub2_i32, dst_low, dst,
> + cpu_cc_src, src1, cpu_cc_src2, src2);
> + tcg_temp_free(dst_low);
> + goto sub_done;
> + }
> +#endif
> + carry_32 = gen_sub32_carry32();
> + break;
> +
> + default:
> + /* We need external help to produce the carry. */
> + carry_32 = tcg_temp_new_i32();
> + gen_helper_compute_C_icc(carry_32);
> + break;
> + }
> +
> +#if TARGET_LONG_BITS == 64
> + carry = tcg_temp_new();
> + tcg_gen_extu_i32_i64(carry, carry_32);
> +#else
> + carry = carry_32;
> +#endif
> +
> + tcg_gen_sub_tl(dst, src1, src2);
> + tcg_gen_sub_tl(dst, dst, carry);
> +
> + tcg_temp_free_i32(carry_32);
> +#if TARGET_LONG_BITS == 64
> + tcg_temp_free(carry);
> +#endif
> +
> +#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
> + sub_done:
> +#endif
> + if (update_cc) {
> + tcg_gen_mov_tl(cpu_cc_src, src1);
> + tcg_gen_mov_tl(cpu_cc_src2, src2);
> + tcg_gen_mov_tl(cpu_cc_dst, dst);
> + tcg_gen_movi_i32(cpu_cc_op, CC_OP_SUBX);
> + dc->cc_op = CC_OP_SUBX;
> + }
> }
>
> static inline void gen_op_tsub_cc(TCGv dst, TCGv src1, TCGv src2)
> @@ -2950,28 +3114,8 @@ static void disas_sparc_insn(DisasContext * dc)
> }
> break;
> case 0x8: /* addx, V9 addc */
> - if (IS_IMM) {
> - simm = GET_FIELDs(insn, 19, 31);
> - if (xop & 0x10) {
> - gen_op_addxi_cc(cpu_dst, cpu_src1, simm);
> - tcg_gen_movi_i32(cpu_cc_op, CC_OP_ADDX);
> - dc->cc_op = CC_OP_ADDX;
> - } else {
> - gen_helper_compute_C_icc(cpu_tmp0);
> - tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, simm);
> - tcg_gen_add_tl(cpu_dst, cpu_src1, cpu_tmp0);
> - }
> - } else {
> - if (xop & 0x10) {
> - gen_op_addx_cc(cpu_dst, cpu_src1, cpu_src2);
> - tcg_gen_movi_i32(cpu_cc_op, CC_OP_ADDX);
> - dc->cc_op = CC_OP_ADDX;
> - } else {
> - gen_helper_compute_C_icc(cpu_tmp0);
> - tcg_gen_add_tl(cpu_tmp0, cpu_src2, cpu_tmp0);
> - tcg_gen_add_tl(cpu_dst, cpu_src1, cpu_tmp0);
> - }
> - }
> + gen_op_addx_int(dc, cpu_dst, cpu_src1, cpu_src2,
> + (xop & 0x10));
> break;
> #ifdef TARGET_SPARC64
> case 0x9: /* V9 mulx */
> @@ -3002,28 +3146,8 @@ static void disas_sparc_insn(DisasContext * dc)
> }
> break;
> case 0xc: /* subx, V9 subc */
> - if (IS_IMM) {
> - simm = GET_FIELDs(insn, 19, 31);
> - if (xop & 0x10) {
> - gen_op_subxi_cc(cpu_dst, cpu_src1, simm);
> - tcg_gen_movi_i32(cpu_cc_op, CC_OP_SUBX);
> - dc->cc_op = CC_OP_SUBX;
> - } else {
> - gen_helper_compute_C_icc(cpu_tmp0);
> - tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, simm);
> - tcg_gen_sub_tl(cpu_dst, cpu_src1, cpu_tmp0);
> - }
> - } else {
> - if (xop & 0x10) {
> - gen_op_subx_cc(cpu_dst, cpu_src1, cpu_src2);
> - tcg_gen_movi_i32(cpu_cc_op, CC_OP_SUBX);
> - dc->cc_op = CC_OP_SUBX;
> - } else {
> - gen_helper_compute_C_icc(cpu_tmp0);
> - tcg_gen_add_tl(cpu_tmp0, cpu_src2, cpu_tmp0);
> - tcg_gen_sub_tl(cpu_dst, cpu_src1, cpu_tmp0);
> - }
> - }
> + gen_op_subx_int(dc, cpu_dst, cpu_src1, cpu_src2,
> + (xop & 0x10));
> break;
> #ifdef TARGET_SPARC64
> case 0xd: /* V9 udivx */
> --
> 1.7.0.1
>
>
- Re: [Qemu-devel] [PATCH 2/3] target-sparc: Simplify ICC generation; fix ADDX carry generation., (continued)