[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH 40/57] target-i386: Use CC_SRC2 for ADC and SBB
From: |
Richard Henderson |
Subject: |
[Qemu-devel] [PATCH 40/57] target-i386: Use CC_SRC2 for ADC and SBB |
Date: |
Wed, 23 Jan 2013 20:03:24 -0800 |
Now that we've got two slots in ENV, store two of the three inputs.
This lets us do less work when carry-out is not needed, and avoids
the unpredictable CC_OP after translating these insns.
Signed-off-by: Richard Henderson <address@hidden>
---
target-i386/cc_helper_template.h | 44 +++++++++++++++++-------------
target-i386/cpu.h | 7 ++---
target-i386/translate.c | 58 ++++++++++------------------------------
3 files changed, 44 insertions(+), 65 deletions(-)
diff --git a/target-i386/cc_helper_template.h b/target-i386/cc_helper_template.h
index 951ceaf..fcb14db 100644
--- a/target-i386/cc_helper_template.h
+++ b/target-i386/cc_helper_template.h
@@ -61,16 +61,19 @@ static int glue(compute_all_add, SUFFIX)(CPUX86State *env)
static int glue(compute_all_adc, SUFFIX)(CPUX86State *env)
{
int cf, pf, af, zf, sf, of;
- target_long src1, src2;
+ DATA_TYPE dst, src1, src2, src3;
+ dst = CC_DST;
src1 = CC_SRC;
- src2 = CC_DST - CC_SRC - 1;
- cf = (DATA_TYPE)CC_DST <= (DATA_TYPE)src1;
- pf = parity_table[(uint8_t)CC_DST];
- af = (CC_DST ^ src1 ^ src2) & 0x10;
- zf = ((DATA_TYPE)CC_DST == 0) << 6;
- sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80;
- of = lshift((src1 ^ src2 ^ -1) & (src1 ^ CC_DST), 12 - DATA_BITS) & CC_O;
+ src3 = CC_SRC2; /* carry-in: always 0/1. */
+ src2 = dst - src1 - src3; /* true second operand, for AF/OF. */
+ /* With carry-in, dst == src1 means src2 + 1 wrapped: also a carry. */
+ cf = src3 ? dst <= src1 : dst < src1;
+ pf = parity_table[(uint8_t)dst];
+ af = (dst ^ src1 ^ src2) & 0x10;
+ zf = (dst == 0) << 6;
+ sf = lshift(dst, 8 - DATA_BITS) & 0x80;
+ of = lshift((src1 ^ src2 ^ -1) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
return cf | pf | af | zf | sf | of;
}
@@ -93,16 +96,21 @@ static int glue(compute_all_sub, SUFFIX)(CPUX86State *env)
static int glue(compute_all_sbb, SUFFIX)(CPUX86State *env)
{
int cf, pf, af, zf, sf, of;
- target_long src1, src2;
-
- src1 = CC_DST + CC_SRC + 1;
- src2 = CC_SRC;
- cf = (DATA_TYPE)src1 <= (DATA_TYPE)src2;
- pf = parity_table[(uint8_t)CC_DST];
- af = (CC_DST ^ src1 ^ src2) & 0x10;
- zf = ((DATA_TYPE)CC_DST == 0) << 6;
- sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80;
- of = lshift((src1 ^ src2) & (src1 ^ CC_DST), 12 - DATA_BITS) & CC_O;
+ DATA_TYPE dst, src1, src2, src3;
+
+ dst = CC_DST;
+ src3 = CC_SRC2; /* borrow-in: always 0/1. */
+ src2 = CC_SRC;
+ src1 = dst + src2 + src3;
+
+ /* Keep src2 free of the borrow so AF/OF see the real operand.  With
+ borrow-in, src1 == src2 also borrows out (src1 - src2 - 1 wraps). */
+ cf = src3 ? src1 <= src2 : src1 < src2;
+ pf = parity_table[(uint8_t)dst];
+ af = (dst ^ src1 ^ src2) & 0x10;
+ zf = (dst == 0) << 6;
+ sf = lshift(dst, 8 - DATA_BITS) & 0x80;
+ of = lshift((src1 ^ src2) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
return cf | pf | af | zf | sf | of;
}
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 868627e..fa34ff2 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -1119,9 +1119,10 @@ static inline int cpu_mmu_index (CPUX86State *env)
#define EIP (env->eip)
#define DF (env->df)
-#define CC_SRC (env->cc_src)
-#define CC_DST (env->cc_dst)
-#define CC_OP (env->cc_op)
+#define CC_DST (env->cc_dst)
+#define CC_SRC (env->cc_src)
+#define CC_SRC2 (env->cc_src2)
+#define CC_OP (env->cc_op)
/* n must be a constant to be efficient */
static inline target_long lshift(target_long x, int n)
diff --git a/target-i386/translate.c b/target-i386/translate.c
index aaee393..77d86b0 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -196,9 +196,9 @@ static const uint8_t cc_op_live[CC_OP_NB] = {
[CC_OP_EFLAGS] = USES_CC_SRC,
[CC_OP_MULB ... CC_OP_MULQ] = USES_CC_DST | USES_CC_SRC,
[CC_OP_ADDB ... CC_OP_ADDQ] = USES_CC_DST | USES_CC_SRC,
- [CC_OP_ADCB ... CC_OP_ADCQ] = USES_CC_DST | USES_CC_SRC,
+ [CC_OP_ADCB ... CC_OP_ADCQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
[CC_OP_SUBB ... CC_OP_SUBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
- [CC_OP_SBBB ... CC_OP_SBBQ] = USES_CC_DST | USES_CC_SRC,
+ [CC_OP_SBBB ... CC_OP_SBBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
[CC_OP_LOGICB ... CC_OP_LOGICQ] = USES_CC_DST,
[CC_OP_INCB ... CC_OP_INCQ] = USES_CC_DST | USES_CC_SRC,
[CC_OP_DECB ... CC_OP_DECQ] = USES_CC_DST | USES_CC_SRC,
@@ -876,6 +876,13 @@ static void gen_op_update2_cc(void)
tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
}
+static void gen_op_update3_cc(TCGv reg)
+{
+ tcg_gen_mov_tl(cpu_cc_src2, reg); /* extra input chosen by the caller */
+ tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]); /* operand held in T1 */
+ tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); /* value held in T0 */
+}
+
static inline void gen_op_testl_T0_T1_cc(void)
{
tcg_gen_and_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]);
@@ -936,30 +943,6 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
return (CCPrepare) { .cond = TCG_COND_LTU, .reg = t0,
.reg2 = t1, .mask = -1, .use_reg2 = true };
- case CC_OP_SBBB ... CC_OP_SBBQ:
- /* (DATA_TYPE)(CC_DST + CC_SRC + 1) <= (DATA_TYPE)CC_SRC */
- size = s->cc_op - CC_OP_SBBB;
- t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
- if (TCGV_EQUAL(t1, reg) && TCGV_EQUAL(reg, cpu_cc_src)) {
- tcg_gen_mov_tl(cpu_tmp0, cpu_cc_src);
- t1 = cpu_tmp0;
- }
-
- tcg_gen_add_tl(reg, cpu_cc_dst, cpu_cc_src);
- tcg_gen_addi_tl(reg, reg, 1);
- gen_extu(size, reg);
- t0 = reg;
- goto adc_sbb;
-
- case CC_OP_ADCB ... CC_OP_ADCQ:
- /* (DATA_TYPE)CC_DST <= (DATA_TYPE)CC_SRC */
- size = s->cc_op - CC_OP_ADCB;
- t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
- t0 = gen_ext_tl(reg, cpu_cc_dst, size, false);
- adc_sbb:
- return (CCPrepare) { .cond = TCG_COND_LEU, .reg = t0,
- .reg2 = t1, .mask = -1, .use_reg2 = true };
-
case CC_OP_LOGICB ... CC_OP_LOGICQ:
return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
@@ -1421,18 +1404,10 @@ static void gen_op(DisasContext *s1, int op, int ot, int d)
gen_op_mov_reg_T0(ot, d);
else
gen_op_st_T0_A0(ot + s1->mem_index);
- tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
- tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
- tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4);
- tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2);
- tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_ADDB + ot);
- set_cc_op(s1, CC_OP_DYNAMIC);
+ gen_op_update3_cc(cpu_tmp4);
+ set_cc_op(s1, CC_OP_ADCB + ot);
break;
case OP_SBBL:
- /*
- * No need to store cpu_cc_src2, because it is used only
- * when the cc_op is known.
- */
gen_compute_eflags_c(s1, cpu_tmp4);
tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_tmp4);
@@ -1440,12 +1415,8 @@ static void gen_op(DisasContext *s1, int op, int ot, int d)
gen_op_mov_reg_T0(ot, d);
else
gen_op_st_T0_A0(ot + s1->mem_index);
- tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
- tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
- tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4);
- tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2);
- tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_SUBB + ot);
- set_cc_op(s1, CC_OP_DYNAMIC);
+ gen_op_update3_cc(cpu_tmp4);
+ set_cc_op(s1, CC_OP_SBBB + ot);
break;
case OP_ADDL:
gen_op_addl_T0_T1();
@@ -1463,8 +1434,7 @@ static void gen_op(DisasContext *s1, int op, int ot, int d)
gen_op_mov_reg_T0(ot, d);
else
gen_op_st_T0_A0(ot + s1->mem_index);
- gen_op_update2_cc();
- tcg_gen_mov_tl(cpu_cc_src2, cpu_tmp0);
+ gen_op_update3_cc(cpu_tmp0);
set_cc_op(s1, CC_OP_SUBB + ot);
break;
default:
--
1.7.11.7
- [Qemu-devel] [PATCH 44/57] target-i386: Decode the VEX prefixes, (continued)
- [Qemu-devel] [PATCH 44/57] target-i386: Decode the VEX prefixes, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 43/57] target-i386: Tidy prefix parsing, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 42/57] target-i386: Make helper_cc_compute_all const, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 53/57] target-i386: Implement RORX, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 55/57] target-i386: Use clz/ctz for bsf/bsr helpers, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 54/57] target-i386: Implement ADX extension, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 56/57] target-i386: Simplify bsf/bsr flags computation, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 57/57] target-i386: Implement tzcnt and fix lzcnt, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 33/57] target-i386: introduce gen_cmovcc1, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 31/57] target-i386: inline gen_prepare_cc_slow, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 40/57] target-i386: Use CC_SRC2 for ADC and SBB,
Richard Henderson <=
- [Qemu-devel] [PATCH 35/57] target-i386: kill cpu_T3, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 49/57] target-i386: Implement BZHI, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 39/57] target-i386: optimize flags checking after sub using CC_SRC2, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 36/57] target-i386: use gen_op for cmps/scas, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 48/57] target-i386: Implement BLSR, BLSMSK, BLSI, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 46/57] target-i386: Implement ANDN, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 51/57] target-i386: Implement PDEP, PEXT, Richard Henderson, 2013/01/23
- [Qemu-devel] [PATCH 38/57] target-i386: Update cc_op before TCG branches, Richard Henderson, 2013/01/23