[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH v3 25/29] target/arm: Convert VQSHL, VQSHLU to gvec
From: Richard Henderson
Subject: [PATCH v3 25/29] target/arm: Convert VQSHL, VQSHLU to gvec
Date: Wed, 11 Sep 2024 19:41:10 -0700
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/helper.h | 12 ++++
target/arm/tcg/translate.h | 7 ++
target/arm/tcg/gengvec.c | 36 +++++++++++
target/arm/tcg/neon_helper.c | 33 ++++++++++
target/arm/tcg/translate-neon.c | 110 +-------------------------------
target/arm/tcg/neon-dp.decode | 6 +-
6 files changed, 94 insertions(+), 110 deletions(-)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index b463be38c5..b40589d329 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -324,6 +324,18 @@ DEF_HELPER_FLAGS_5(neon_uqrshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32
DEF_HELPER_FLAGS_5(neon_uqrshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(neon_uqrshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(neon_uqrshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_srshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_srshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
index 45990ae292..7721c627e9 100644
--- a/target/arm/tcg/translate.h
+++ b/target/arm/tcg/translate.h
@@ -471,6 +471,13 @@ void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void gen_neon_sqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ int64_t c, uint32_t opr_sz, uint32_t max_sz);
+void gen_neon_uqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ int64_t c, uint32_t opr_sz, uint32_t max_sz);
+void gen_neon_sqshlui(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ int64_t c, uint32_t opr_sz, uint32_t max_sz);
+
void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
diff --git a/target/arm/tcg/gengvec.c b/target/arm/tcg/gengvec.c
index 3abdc57202..f652520b65 100644
--- a/target/arm/tcg/gengvec.c
+++ b/target/arm/tcg/gengvec.c
@@ -1313,6 +1313,42 @@ void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
opr_sz, max_sz, 0, fns[vece]);
}
+void gen_neon_sqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ int64_t c, uint32_t opr_sz, uint32_t max_sz)
+{
+ static gen_helper_gvec_2_ptr * const fns[] = {
+ gen_helper_neon_sqshli_b, gen_helper_neon_sqshli_h,
+ gen_helper_neon_sqshli_s, gen_helper_neon_sqshli_d,
+ };
+ tcg_debug_assert(vece <= MO_64);
+ tcg_debug_assert(c >= 0 && c <= (8 << vece));
+ tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
+}
+
+void gen_neon_uqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ int64_t c, uint32_t opr_sz, uint32_t max_sz)
+{
+ static gen_helper_gvec_2_ptr * const fns[] = {
+ gen_helper_neon_uqshli_b, gen_helper_neon_uqshli_h,
+ gen_helper_neon_uqshli_s, gen_helper_neon_uqshli_d,
+ };
+ tcg_debug_assert(vece <= MO_64);
+ tcg_debug_assert(c >= 0 && c <= (8 << vece));
+ tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
+}
+
+void gen_neon_sqshlui(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ int64_t c, uint32_t opr_sz, uint32_t max_sz)
+{
+ static gen_helper_gvec_2_ptr * const fns[] = {
+ gen_helper_neon_sqshlui_b, gen_helper_neon_sqshlui_h,
+ gen_helper_neon_sqshlui_s, gen_helper_neon_sqshlui_d,
+ };
+ tcg_debug_assert(vece <= MO_64);
+ tcg_debug_assert(c >= 0 && c <= (8 << vece));
+ tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
+}
+
void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
diff --git a/target/arm/tcg/neon_helper.c b/target/arm/tcg/neon_helper.c
index 082bfd88ad..739e16e441 100644
--- a/target/arm/tcg/neon_helper.c
+++ b/target/arm/tcg/neon_helper.c
@@ -141,6 +141,19 @@ void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
clear_tail(d, opr_sz, simd_maxsz(desc)); \
}
+#define NEON_GVEC_VOP2i_ENV(name, vtype) \
+void HELPER(name)(void *vd, void *vn, void *venv, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int imm = simd_data(desc); \
+ vtype *d = vd, *n = vn; \
+ CPUARMState *env = venv; \
+ for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
+ NEON_FN(d[i], n[i], imm); \
+ } \
+ clear_tail(d, opr_sz, simd_maxsz(desc)); \
+}
+
/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
the elementwise and pairwise operations are the same. */
@@ -271,22 +284,26 @@ uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
(dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
#undef NEON_FN
uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
@@ -303,22 +320,26 @@ uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
(dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
#undef NEON_FN
uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
@@ -334,11 +355,13 @@ uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
#define NEON_FN(dest, src1, src2) \
(dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
#undef NEON_FN
uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
@@ -351,6 +374,16 @@ uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
+#undef NEON_FN
+
#define NEON_FN(dest, src1, src2) \
(dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c
index a31a78c347..6dd70d1c53 100644
--- a/target/arm/tcg/translate-neon.c
+++ b/target/arm/tcg/translate-neon.c
@@ -1101,113 +1101,9 @@ DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)
DO_2SH(VSHR_S, gen_gvec_sshr)
DO_2SH(VSHR_U, gen_gvec_ushr)
-
-static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
- NeonGenTwo64OpEnvFn *fn)
-{
- /*
- * 2-reg-and-shift operations, size == 3 case, where the
- * function needs to be passed tcg_env.
- */
- TCGv_i64 constimm;
- int pass;
-
- if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
- return false;
- }
-
- /* UNDEF accesses to D16-D31 if they don't exist. */
- if (!dc_isar_feature(aa32_simd_r32, s) &&
- ((a->vd | a->vm) & 0x10)) {
- return false;
- }
-
- if ((a->vm | a->vd) & a->q) {
- return false;
- }
-
- if (!vfp_access_check(s)) {
- return true;
- }
-
- /*
- * To avoid excessive duplication of ops we implement shift
- * by immediate using the variable shift operations.
- */
- constimm = tcg_constant_i64(dup_const(a->size, a->shift));
-
- for (pass = 0; pass < a->q + 1; pass++) {
- TCGv_i64 tmp = tcg_temp_new_i64();
-
- read_neon_element64(tmp, a->vm, pass, MO_64);
- fn(tmp, tcg_env, tmp, constimm);
- write_neon_element64(tmp, a->vd, pass, MO_64);
- }
- return true;
-}
-
-static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
- NeonGenTwoOpEnvFn *fn)
-{
- /*
- * 2-reg-and-shift operations, size < 3 case, where the
- * helper needs to be passed tcg_env.
- */
- TCGv_i32 constimm, tmp;
- int pass;
-
- if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
- return false;
- }
-
- /* UNDEF accesses to D16-D31 if they don't exist. */
- if (!dc_isar_feature(aa32_simd_r32, s) &&
- ((a->vd | a->vm) & 0x10)) {
- return false;
- }
-
- if ((a->vm | a->vd) & a->q) {
- return false;
- }
-
- if (!vfp_access_check(s)) {
- return true;
- }
-
- /*
- * To avoid excessive duplication of ops we implement shift
- * by immediate using the variable shift operations.
- */
- constimm = tcg_constant_i32(dup_const(a->size, a->shift));
- tmp = tcg_temp_new_i32();
-
- for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
- read_neon_element32(tmp, a->vm, pass, MO_32);
- fn(tmp, tcg_env, tmp, constimm);
- write_neon_element32(tmp, a->vd, pass, MO_32);
- }
- return true;
-}
-
-#define DO_2SHIFT_ENV(INSN, FUNC) \
- static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
- { \
- return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64); \
- } \
- static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
- { \
- static NeonGenTwoOpEnvFn * const fns[] = { \
- gen_helper_neon_##FUNC##8, \
- gen_helper_neon_##FUNC##16, \
- gen_helper_neon_##FUNC##32, \
- }; \
- assert(a->size < ARRAY_SIZE(fns)); \
- return do_2shift_env_32(s, a, fns[a->size]); \
- }
-
-DO_2SHIFT_ENV(VQSHLU, qshlu_s)
-DO_2SHIFT_ENV(VQSHL_U, qshl_u)
-DO_2SHIFT_ENV(VQSHL_S, qshl_s)
+DO_2SH(VQSHLU, gen_neon_sqshlui)
+DO_2SH(VQSHL_U, gen_neon_uqshli)
+DO_2SH(VQSHL_S, gen_neon_sqshli)
static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
NeonGenTwo64OpFn *shiftfn,
diff --git a/target/arm/tcg/neon-dp.decode b/target/arm/tcg/neon-dp.decode
index 788578c8fa..e883c6ab58 100644
--- a/target/arm/tcg/neon-dp.decode
+++ b/target/arm/tcg/neon-dp.decode
@@ -291,17 +291,17 @@ VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_s
VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_h
VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_b
-VQSHLU_64_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_d
+VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_d
VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_s
VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_h
VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_b
-VQSHL_S_64_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
+VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s
VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h
VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b
-VQSHL_U_64_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
+VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s
VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h
VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b
--
2.43.0
- [PATCH v3 13/29] target/arm: Convert MOVI, FMOV, ORR, BIC (vector immediate) to decodetree, (continued)
- [PATCH v3 13/29] target/arm: Convert MOVI, FMOV, ORR, BIC (vector immediate) to decodetree, Richard Henderson, 2024/09/11
- [PATCH v3 16/29] target/arm: Convert handle_vec_simd_shri to decodetree, Richard Henderson, 2024/09/11
- [PATCH v3 19/29] target/arm: Convert SSHLL, USHLL to decodetree, Richard Henderson, 2024/09/11
- [PATCH v3 15/29] target/arm: Fix whitespace near gen_srshr64_i64, Richard Henderson, 2024/09/11
- [PATCH v3 17/29] target/arm: Convert handle_vec_simd_shli to decodetree, Richard Henderson, 2024/09/11
- [PATCH v3 14/29] target/arm: Introduce gen_gvec_sshr, gen_gvec_ushr, Richard Henderson, 2024/09/11
- [PATCH v3 18/29] target/arm: Use {, s}extract in handle_vec_simd_wshli, Richard Henderson, 2024/09/11
- [PATCH v3 20/29] target/arm: Push tcg_rnd into handle_shri_with_rndacc, Richard Henderson, 2024/09/11
- [PATCH v3 23/29] target/arm: Convert handle_scalar_simd_shri to decodetree, Richard Henderson, 2024/09/11
- [PATCH v3 21/29] target/arm: Split out subroutines of handle_shri_with_rndacc, Richard Henderson, 2024/09/11
- [PATCH v3 25/29] target/arm: Convert VQSHL, VQSHLU to gvec, Richard Henderson <=
- [PATCH v3 27/29] target/arm: Convert SQSHL, UQSHL, SQSHLU (immediate) to decodetree, Richard Henderson, 2024/09/11
- [PATCH v3 26/29] target/arm: Widen NeonGenNarrowEnvFn return to 64 bits, Richard Henderson, 2024/09/11
- [PATCH v3 28/29] target/arm: Convert vector [US]QSHRN, [US]QRSHRN, SQSHRUN to decodetree, Richard Henderson, 2024/09/11
- [PATCH v3 22/29] target/arm: Convert SHRN, RSHRN to decodetree, Richard Henderson, 2024/09/11
- [PATCH v3 24/29] target/arm: Convert handle_scalar_simd_shli to decodetree, Richard Henderson, 2024/09/11
- [PATCH v3 29/29] target/arm: Convert scalar [US]QSHRN, [US]QRSHRN, SQSHRUN to decodetree, Richard Henderson, 2024/09/11
- Re: [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4, Peter Maydell, 2024/09/16