[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH v2 5/9] target/arm: Convert Neon narrowing shifts with op==8 to decodetree
From: Peter Maydell
Subject: [PATCH v2 5/9] target/arm: Convert Neon narrowing shifts with op==8 to decodetree
Date: Fri, 22 May 2020 15:55:16 +0100
Convert the Neon narrowing shifts where op==8 to decodetree:
* VSHRN
* VRSHRN
* VQSHRUN
* VQRSHRUN
Signed-off-by: Peter Maydell <address@hidden>
---
target/arm/neon-dp.decode | 27 +++++
target/arm/translate-neon.inc.c | 168 ++++++++++++++++++++++++++++++++
target/arm/translate.c | 1 +
3 files changed, 196 insertions(+)
diff --git a/target/arm/neon-dp.decode b/target/arm/neon-dp.decode
index f9183060a51..01887240b4a 100644
--- a/target/arm/neon-dp.decode
+++ b/target/arm/neon-dp.decode
@@ -230,6 +230,17 @@ VMINNM_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 1
.... @3same_fp
@2reg_shl_b .... ... . . . 001 shift:3 .... .... 0 q:1 . . .... \
&2reg_shift vm=%vm_dp vd=%vd_dp size=0
+# Narrowing right shifts: here the Q bit is part of the opcode decode
+@2reg_shrn_d .... ... . . . 1 ..... .... .... 0 . . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=3 q=0 \
+ shift=%neon_rshift_i5
+@2reg_shrn_s .... ... . . . 01 .... .... .... 0 . . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=2 q=0 \
+ shift=%neon_rshift_i4
+@2reg_shrn_h .... ... . . . 001 ... .... .... 0 . . . .... \
+ &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0 \
+ shift=%neon_rshift_i3
+
VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_d
VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_s
VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_h
@@ -299,3 +310,19 @@ VQSHL_U_64_2sh 1111 001 1 1 . ...... .... 0111 . . . 1
.... @2reg_shl_d
VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s
VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h
VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b
+
+VSHRN_64_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_d
+VSHRN_32_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_s
+VSHRN_16_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_h
+
+VRSHRN_64_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_d
+VRSHRN_32_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_s
+VRSHRN_16_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_h
+
+VQSHRUN_64_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_d
+VQSHRUN_32_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_s
+VQSHRUN_16_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_h
+
+VQRSHRUN_64_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_d
+VQRSHRUN_32_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_s
+VQRSHRUN_16_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_h
diff --git a/target/arm/translate-neon.inc.c b/target/arm/translate-neon.inc.c
index 396db55565f..18ea7255e38 100644
--- a/target/arm/translate-neon.inc.c
+++ b/target/arm/translate-neon.inc.c
@@ -1504,3 +1504,171 @@ static bool do_2shift_env_32(DisasContext *s,
arg_2reg_shift *a,
DO_2SHIFT_ENV(VQSHLU, qshlu_s)
DO_2SHIFT_ENV(VQSHL_U, qshl_u)
DO_2SHIFT_ENV(VQSHL_S, qshl_s)
+
+static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
+ NeonGenTwo64OpFn *shiftfn,
+ NeonGenNarrowEnvFn *narrowfn)
+{
+ /* 2-reg-and-shift narrowing shifts, size == 3: i64 inputs, i32 results */
+ TCGv_i64 constimm, rm1, rm2;
+ TCGv_i32 rd;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (a->vm & 1) { /* vm and vm + 1 are read as a pair below */
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true; /* NOTE(review): presumably the check emitted the exception */
+ }
+
+ /*
+ * This is always a right shift, and the shiftfn is always a
+ * left-shift helper, which thus needs the negated shift count.
+ */
+ constimm = tcg_const_i64(-a->shift);
+ rm1 = tcg_temp_new_i64();
+ rm2 = tcg_temp_new_i64();
+
+ /* Load both inputs first to avoid potential overwrite if rm == rd */
+ neon_load_reg64(rm1, a->vm);
+ neon_load_reg64(rm2, a->vm + 1);
+
+ shiftfn(rm1, rm1, constimm); /* shift in place, then narrow into rd */
+ rd = tcg_temp_new_i32(); /* not freed here; presumably neon_store_reg() does */
+ narrowfn(rd, cpu_env, rm1);
+ neon_store_reg(a->vd, 0, rd); /* result from vm goes to pass 0 of vd */
+
+ shiftfn(rm2, rm2, constimm);
+ rd = tcg_temp_new_i32(); /* fresh temp for the second result */
+ narrowfn(rd, cpu_env, rm2);
+ neon_store_reg(a->vd, 1, rd); /* result from vm + 1 goes to pass 1 of vd */
+
+ tcg_temp_free_i64(rm1);
+ tcg_temp_free_i64(rm2);
+ tcg_temp_free_i64(constimm);
+
+ return true;
+}
+
+static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
+ NeonGenTwoOpFn *shiftfn,
+ NeonGenNarrowEnvFn *narrowfn)
+{
+ /* 2-reg-and-shift narrowing-shift operations, size < 3 (1 or 2) case */
+ TCGv_i32 constimm, rm1, rm2, rm3, rm4;
+ TCGv_i64 rtmp;
+ uint32_t imm;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (a->vm & 1) { /* vm and vm + 1 are read as a pair below */
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true; /* NOTE(review): presumably the check emitted the exception */
+ }
+
+ /*
+ * This is always a right shift, and the shiftfn is always a
+ * left-shift helper, which thus needs the negated shift count
+ * duplicated into each lane of the immediate value.
+ */
+ if (a->size == 1) {
+ imm = (uint16_t)(-a->shift);
+ imm |= imm << 16; /* same 16-bit count in both halves of imm */
+ } else {
+ /* size == 2 */
+ imm = -a->shift;
+ }
+ constimm = tcg_const_i32(imm);
+
+ /* Load all inputs first to avoid potential overwrite */
+ rm1 = neon_load_reg(a->vm, 0);
+ rm2 = neon_load_reg(a->vm, 1);
+ rm3 = neon_load_reg(a->vm + 1, 0);
+ rm4 = neon_load_reg(a->vm + 1, 1);
+ rtmp = tcg_temp_new_i64();
+
+ /* TODO: expand out the shift-narrow and the narrow-op */
+ shiftfn(rm1, rm1, constimm);
+ shiftfn(rm2, rm2, constimm);
+
+ tcg_gen_concat_i32_i64(rtmp, rm1, rm2); /* rm1 is the low half, rm2 the high */
+ tcg_temp_free_i32(rm2);
+
+ narrowfn(rm1, cpu_env, rtmp); /* rm1 is reused to hold the narrowed result */
+ neon_store_reg(a->vd, 0, rm1); /* rm1 not freed here; presumably consumed */
+
+ shiftfn(rm3, rm3, constimm);
+ shiftfn(rm4, rm4, constimm);
+ tcg_temp_free_i32(constimm);
+
+ tcg_gen_concat_i32_i64(rtmp, rm3, rm4); /* rm3 is the low half, rm4 the high */
+ tcg_temp_free_i32(rm4);
+
+ narrowfn(rm3, cpu_env, rtmp); /* rm3 is reused to hold the narrowed result */
+ tcg_temp_free_i64(rtmp);
+ neon_store_reg(a->vd, 1, rm3); /* rm3 not freed here; presumably consumed */
+ return true;
+}
+
+#define DO_2SN_64(INSN, FUNC, NARROWFUNC) \
+ static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
+ { /* trans_<INSN>_2sh(): FUNC is the shiftfn, NARROWFUNC the narrowfn */ \
+ return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC); \
+ }
+#define DO_2SN_32(INSN, FUNC, NARROWFUNC) \
+ static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
+ { /* trans_<INSN>_2sh(): FUNC is the shiftfn, NARROWFUNC the narrowfn */ \
+ return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC); \
+ }
+
+static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+{
+ tcg_gen_extrl_i64_i32(dest, src); /* env is unused; dest = low half of src */
+}
+
+static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+{
+ gen_helper_neon_narrow_u16(dest, src); /* env unused: helper takes no env */
+}
+
+static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+{
+ gen_helper_neon_narrow_u8(dest, src); /* env unused: helper takes no env */
+}
+/* trans_*_2sh() instances: _64 is size == 3, _32 size == 2, _16 size == 1 */
+DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
+DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
+DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
+
+DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
+DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
+DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
+
+DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
+DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
+DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
+
+DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
+DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
+DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
diff --git a/target/arm/translate.c b/target/arm/translate.c
index d711d39eb9d..f884db535b4 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -5280,6 +5280,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t
insn)
case 5: /* VSHL, VSLI */
case 6: /* VQSHLU */
case 7: /* VQSHL */
+ case 8: /* VSHRN, VRSHRN, VQSHRUN, VQRSHRUN */
return 1; /* handled by decodetree */
default:
break;
--
2.20.1
- [PATCH v2 0/9] target/arm: Convert 2-reg-shift and 1-reg-imm Neon insns to decodetree, Peter Maydell, 2020/05/22
- [PATCH v2 1/9] target/arm: Convert Neon VSHL and VSLI 2-reg-shift insn to decodetree, Peter Maydell, 2020/05/22
- [PATCH v2 2/9] target/arm: Convert Neon VSHR 2-reg-shift insns to decodetree, Peter Maydell, 2020/05/22
- [PATCH v2 3/9] target/arm: Convert Neon VSRA, VSRI, VRSHR, VRSRA 2-reg-shift insns to decodetree, Peter Maydell, 2020/05/22
- [PATCH v2 4/9] target/arm: Convert VQSHLU, VQSHL 2-reg-shift insns to decodetree, Peter Maydell, 2020/05/22
- [PATCH v2 5/9] target/arm: Convert Neon narrowing shifts with op==8 to decodetree,
Peter Maydell <=
- [PATCH v2 6/9] target/arm: Convert Neon narrowing shifts with op==9 to decodetree, Peter Maydell, 2020/05/22
- [PATCH v2 8/9] target/arm: Convert VCVT fixed-point ops to decodetree, Peter Maydell, 2020/05/22
- [PATCH v2 9/9] target/arm: Convert Neon one-register-and-immediate insns to decodetree, Peter Maydell, 2020/05/22
- [PATCH v2 7/9] target/arm: Convert Neon VSHLL, VMOVL to decodetree, Peter Maydell, 2020/05/22
- Re: [PATCH v2 0/9] target/arm: Convert 2-reg-shift and 1-reg-imm Neon insns to decodetree, no-reply, 2020/05/22