[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH 56/57] target-i386: Implement tzcnt and fix lzcnt
From: Richard Henderson
Subject: [Qemu-devel] [PATCH 56/57] target-i386: Implement tzcnt and fix lzcnt
Date: Tue, 19 Feb 2013 09:40:30 -0800
We weren't computing flags for lzcnt at all. At the same time,
adjust the implementation of bsf/bsr to avoid the local branch,
using movcond instead.
Signed-off-by: Richard Henderson <address@hidden>
---
target-i386/helper.h | 5 ++-
target-i386/int_helper.c | 11 ++-----
target-i386/translate.c | 86 +++++++++++++++++++++++++++---------------------
3 files changed, 54 insertions(+), 48 deletions(-)
diff --git a/target-i386/helper.h b/target-i386/helper.h
index e1ecdb8..26a0cc8 100644
--- a/target-i386/helper.h
+++ b/target-i386/helper.h
@@ -195,9 +195,8 @@ DEF_HELPER_3(frstor, void, env, tl, int)
DEF_HELPER_3(fxsave, void, env, tl, int)
DEF_HELPER_3(fxrstor, void, env, tl, int)
-DEF_HELPER_FLAGS_1(bsf, TCG_CALL_NO_RWG_SE, tl, tl)
-DEF_HELPER_FLAGS_1(bsr, TCG_CALL_NO_RWG_SE, tl, tl)
-DEF_HELPER_FLAGS_2(lzcnt, TCG_CALL_NO_RWG_SE, tl, tl, int)
+DEF_HELPER_FLAGS_1(clz, TCG_CALL_NO_RWG_SE, tl, tl)
+DEF_HELPER_FLAGS_1(ctz, TCG_CALL_NO_RWG_SE, tl, tl)
DEF_HELPER_FLAGS_2(pdep, TCG_CALL_NO_RWG_SE, tl, tl, tl)
DEF_HELPER_FLAGS_2(pext, TCG_CALL_NO_RWG_SE, tl, tl, tl)
diff --git a/target-i386/int_helper.c b/target-i386/int_helper.c
index 7bec4eb..3b56075 100644
--- a/target-i386/int_helper.c
+++ b/target-i386/int_helper.c
@@ -456,19 +456,14 @@ void helper_idivq_EAX(CPUX86State *env, target_ulong t0)
#endif
/* bit operations */
-target_ulong helper_bsf(target_ulong t0)
+target_ulong helper_ctz(target_ulong t0)
{
return ctztl(t0);
}
-target_ulong helper_lzcnt(target_ulong t0, int wordsize)
+target_ulong helper_clz(target_ulong t0)
{
- return clztl(t0) - (TARGET_LONG_BITS - wordsize);
-}
-
-target_ulong helper_bsr(target_ulong t0)
-{
- return clztl(t0) ^ (TARGET_LONG_BITS - 1);
+ return clztl(t0);
}
target_ulong helper_pdep(target_ulong src, target_ulong mask)
diff --git a/target-i386/translate.c b/target-i386/translate.c
index 7edfb55..30e88da 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -7157,46 +7157,58 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
tcg_gen_movi_tl(cpu_cc_dst, 0);
}
break;
- case 0x1bc: /* bsf */
- case 0x1bd: /* bsr */
- {
- int label1;
- TCGv t0;
-
- ot = dflag + OT_WORD;
- modrm = cpu_ldub_code(env, s->pc++);
- reg = ((modrm >> 3) & 7) | rex_r;
- gen_ldst_modrm(env, s,modrm, ot, OR_TMP0, 0);
- gen_extu(ot, cpu_T[0]);
- t0 = tcg_temp_local_new();
- tcg_gen_mov_tl(t0, cpu_T[0]);
- if ((b & 1) && (prefixes & PREFIX_REPZ) &&
- (s->cpuid_ext3_features & CPUID_EXT3_ABM)) {
- switch(ot) {
- case OT_WORD: gen_helper_lzcnt(cpu_T[0], t0,
- tcg_const_i32(16)); break;
- case OT_LONG: gen_helper_lzcnt(cpu_T[0], t0,
- tcg_const_i32(32)); break;
- case OT_QUAD: gen_helper_lzcnt(cpu_T[0], t0,
- tcg_const_i32(64)); break;
- }
- gen_op_mov_reg_T0(ot, reg);
+ case 0x1bc: /* bsf / tzcnt */
+ case 0x1bd: /* bsr / lzcnt */
+ ot = dflag + OT_WORD;
+ modrm = cpu_ldub_code(env, s->pc++);
+ reg = ((modrm >> 3) & 7) | rex_r;
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+ gen_extu(ot, cpu_T[0]);
+
+ /* Note that lzcnt and tzcnt are in different extensions. */
+ if ((prefixes & PREFIX_REPZ)
+ && (b & 1
+ ? s->cpuid_ext3_features & CPUID_EXT3_ABM
+ : s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)) {
+ int size = 8 << ot;
+ tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+ if (b & 1) {
+ /* For lzcnt, reduce the target_ulong result by the
+ number of zeros that we expect to find at the top. */
+ gen_helper_clz(cpu_T[0], cpu_T[0]);
+ tcg_gen_subi_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - size);
} else {
- label1 = gen_new_label();
- tcg_gen_movi_tl(cpu_cc_dst, 0);
- tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0, label1);
- if (b & 1) {
- gen_helper_bsr(cpu_T[0], t0);
- } else {
- gen_helper_bsf(cpu_T[0], t0);
- }
- gen_op_mov_reg_T0(ot, reg);
- tcg_gen_movi_tl(cpu_cc_dst, 1);
- gen_set_label(label1);
- set_cc_op(s, CC_OP_LOGICB + ot);
+ /* For tzcnt, a zero input must return the operand size:
+ force all bits outside the operand size to 1. */
+ target_ulong mask = (target_ulong)-2 << (size - 1);
+ tcg_gen_ori_tl(cpu_T[0], cpu_T[0], mask);
+ gen_helper_ctz(cpu_T[0], cpu_T[0]);
+ }
+ /* For lzcnt/tzcnt, C and Z bits are defined and are
+ related to the result. */
+ gen_op_update1_cc();
+ set_cc_op(s, CC_OP_BMILGB + ot);
+ } else {
+ /* For bsr/bsf, only the Z bit is defined and it is related
+ to the input and not the result. */
+ tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+ set_cc_op(s, CC_OP_LOGICB + ot);
+ if (b & 1) {
+ /* For bsr, return the bit index of the first 1 bit,
+ not the count of leading zeros. */
+ gen_helper_clz(cpu_T[0], cpu_T[0]);
+ tcg_gen_xori_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - 1);
+ } else {
+ gen_helper_ctz(cpu_T[0], cpu_T[0]);
}
- tcg_temp_free(t0);
+ /* ??? The manual says that the output is undefined when the
+ input is zero, but real hardware leaves it unchanged, and
+ real programs appear to depend on that. */
+ tcg_gen_movi_tl(cpu_tmp0, 0);
+ tcg_gen_movcond_tl(TCG_COND_EQ, cpu_T[0], cpu_cc_dst, cpu_tmp0,
+ cpu_regs[reg], cpu_T[0]);
}
+ gen_op_mov_reg_T0(ot, reg);
break;
/************************/
/* bcd */
--
1.8.1.2
- [Qemu-devel] [PATCH 04/57] target-i386: introduce gen_ext_tl, (continued)
- [Qemu-devel] [PATCH 04/57] target-i386: introduce gen_ext_tl, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 16/57] target-i386: Use gen_update_cc_op everywhere, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 17/57] target-i386: add helper functions to get other flags, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 23/57] target-i386: convert gen_compute_eflags_c to TCG, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 18/57] target-i386: do not compute eflags multiple times consecutively, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 25/57] target-i386: optimize setbe, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 24/57] target-i386: change gen_setcc_slow_T0 to gen_setcc_slow, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 26/57] target-i386: optimize setle, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 22/57] target-i386: use inverted setcond when computing NS or NZ, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 38/57] target-i386: Update cc_op before TCG branches, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 56/57] target-i386: Implement tzcnt and fix lzcnt,
Richard Henderson <=
- [Qemu-devel] [PATCH 52/57] target-i386: Implement SHLX, SARX, SHRX, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 31/57] target-i386: inline gen_prepare_cc_slow, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 36/57] target-i386: use gen_op for cmps/scas, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 41/57] target-i386: Make helper_cc_compute_{all, c} const, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 49/57] target-i386: Implement BZHI, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 37/57] target-i386: introduce gen_jcc1_noeob, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 33/57] target-i386: introduce gen_cmovcc1, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 39/57] target-i386: optimize flags checking after sub using CC_SRCT, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 42/57] target-i386: Use CC_SRC2 for ADC and SBB, Richard Henderson, 2013/02/19
- [Qemu-devel] [PATCH 28/57] target-i386: introduce CCPrepare, Richard Henderson, 2013/02/19