[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 29/65] target/riscv: Add integer merge and move instructions for
From: |
Huang Tao |
Subject: |
[PATCH 29/65] target/riscv: Add integer merge and move instructions for XTheadVector |
Date: |
Fri, 12 Apr 2024 15:36:59 +0800 |
The instructions have the same function as RVV1.0. Overall there are only
general differences between XTheadVector and RVV1.0, except for
th.vmv.v.x: XTheadVector has no SEW limit of 8 to 64, so it is not
suitable to use acceleration when xlen < SEW.
Signed-off-by: Huang Tao <eric.huang@linux.alibaba.com>
---
target/riscv/helper.h | 17 +++
.../riscv/insn_trans/trans_xtheadvector.c.inc | 124 +++++++++++++++++-
target/riscv/xtheadvector_helper.c | 104 +++++++++++++++
3 files changed, 239 insertions(+), 6 deletions(-)
diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index 8b8dd62761..ba548ebdc9 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -1868,3 +1868,20 @@ DEF_HELPER_6(th_vwmaccsu_vx_w, void, ptr, ptr, tl, ptr,
env, i32)
DEF_HELPER_6(th_vwmaccus_vx_b, void, ptr, ptr, tl, ptr, env, i32)
DEF_HELPER_6(th_vwmaccus_vx_h, void, ptr, ptr, tl, ptr, env, i32)
DEF_HELPER_6(th_vwmaccus_vx_w, void, ptr, ptr, tl, ptr, env, i32)
+
+DEF_HELPER_6(th_vmerge_vvm_b, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vvm_h, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vvm_w, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vvm_d, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vxm_b, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vxm_h, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vxm_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(th_vmerge_vxm_d, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_4(th_vmv_v_v_b, void, ptr, ptr, env, i32)
+DEF_HELPER_4(th_vmv_v_v_h, void, ptr, ptr, env, i32)
+DEF_HELPER_4(th_vmv_v_v_w, void, ptr, ptr, env, i32)
+DEF_HELPER_4(th_vmv_v_v_d, void, ptr, ptr, env, i32)
+DEF_HELPER_4(th_vmv_v_x_b, void, ptr, i64, env, i32)
+DEF_HELPER_4(th_vmv_v_x_h, void, ptr, i64, env, i32)
+DEF_HELPER_4(th_vmv_v_x_w, void, ptr, i64, env, i32)
+DEF_HELPER_4(th_vmv_v_x_d, void, ptr, i64, env, i32)
diff --git a/target/riscv/insn_trans/trans_xtheadvector.c.inc
b/target/riscv/insn_trans/trans_xtheadvector.c.inc
index bfa3a26f78..6d0ce9f966 100644
--- a/target/riscv/insn_trans/trans_xtheadvector.c.inc
+++ b/target/riscv/insn_trans/trans_xtheadvector.c.inc
@@ -1576,18 +1576,130 @@ GEN_OPIVX_WIDEN_TRANS_TH(th_vwmacc_vx,
opivx_widen_check_th)
GEN_OPIVX_WIDEN_TRANS_TH(th_vwmaccsu_vx, opivx_widen_check_th)
GEN_OPIVX_WIDEN_TRANS_TH(th_vwmaccus_vx, opivx_widen_check_th)
+/* Vector Integer Merge and Move Instructions */
+
+/*
+ * This function is almost the copy of trans_vmv_v_v, except:
+ * 1) XTheadVector simplifies the judgment logic of whether
+ * to accelerate or not, due to its lack of fractional LMUL and
+ * VTA.
+ */
+static bool trans_th_vmv_v_v(DisasContext *s, arg_th_vmv_v_v *a)
+{
+ if (require_xtheadvector(s) &&
+ vext_check_isa_ill(s) &&
+ th_check_reg(s, a->rd, false) &&
+ th_check_reg(s, a->rs1, false)) {
+
+ if (s->vl_eq_vlmax) {
+ tcg_gen_gvec_mov(s->sew, vreg_ofs(s, a->rd),
+ vreg_ofs(s, a->rs1),
+ MAXSZ(s), MAXSZ(s));
+ } else {
+ uint32_t data = FIELD_DP32(0, VDATA_TH, LMUL, s->lmul);
+ static gen_helper_gvec_2_ptr * const fns[4] = {
+ gen_helper_th_vmv_v_v_b, gen_helper_th_vmv_v_v_h,
+ gen_helper_th_vmv_v_v_w, gen_helper_th_vmv_v_v_d,
+ };
+
+ tcg_gen_gvec_2_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, a->rs1),
+ tcg_env, s->cfg_ptr->vlenb,
+ s->cfg_ptr->vlenb, data,
+ fns[s->sew]);
+ }
+ finalize_rvv_inst(s);
+ return true;
+ }
+ return false;
+}
+
+
+#define gen_helper_vmv_vx_th gen_helper_vmv_vx
+/*
+ * This function is almost the copy of trans_vmv_v_x, except:
+ * 1) Simpler judgment logic of acceleration
+ * 2) XTheadVector has no limit of SEW of 8 to 64; therefore, it is not
+ * suitable to use acceleration when xlen < SEW.
+ */
+static bool trans_th_vmv_v_x(DisasContext *s, arg_th_vmv_v_x *a)
+{
+ if (require_xtheadvector(s) &&
+ vext_check_isa_ill(s) &&
+ th_check_reg(s, a->rd, false)) {
+
+ TCGv s1;
+ s1 = get_gpr(s, a->rs1, EXT_SIGN);
+
+ if (s->vl_eq_vlmax && (8 << s->sew) <= get_xlen(s)) {
+ tcg_gen_gvec_dup_tl(s->sew, vreg_ofs(s, a->rd),
+ MAXSZ(s), MAXSZ(s), s1);
+ } else {
+ TCGv_i32 desc;
+ TCGv_i64 s1_i64 = tcg_temp_new_i64();
+ TCGv_ptr dest = tcg_temp_new_ptr();
+ uint32_t data = FIELD_DP32(0, VDATA_TH, LMUL, s->lmul);
+ static gen_helper_vmv_vx_th * const fns[4] = {
+ gen_helper_th_vmv_v_x_b, gen_helper_th_vmv_v_x_h,
+ gen_helper_th_vmv_v_x_w, gen_helper_th_vmv_v_x_d,
+ };
+
+ tcg_gen_ext_tl_i64(s1_i64, s1);
+ desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
+ s->cfg_ptr->vlenb, data));
+ tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, a->rd));
+ fns[s->sew](dest, s1_i64, tcg_env, desc);
+ }
+
+ finalize_rvv_inst(s);
+ return true;
+ }
+ return false;
+}
+
+/* The difference is the same as in trans_th_vmv_v_v */
+static bool trans_th_vmv_v_i(DisasContext *s, arg_th_vmv_v_i *a)
+{
+ if (require_xtheadvector(s) &&
+ vext_check_isa_ill(s) &&
+ th_check_reg(s, a->rd, false)) {
+
+ int64_t simm = sextract64(a->rs1, 0, 5);
+ if (s->vl_eq_vlmax) {
+ tcg_gen_gvec_dup_imm(s->sew, vreg_ofs(s, a->rd),
+ MAXSZ(s), MAXSZ(s), simm);
+ } else {
+ TCGv_i32 desc;
+ TCGv_i64 s1;
+ TCGv_ptr dest;
+ uint32_t data = FIELD_DP32(0, VDATA_TH, LMUL, s->lmul);
+ static gen_helper_vmv_vx_th * const fns[4] = {
+ gen_helper_th_vmv_v_x_b, gen_helper_th_vmv_v_x_h,
+ gen_helper_th_vmv_v_x_w, gen_helper_th_vmv_v_x_d,
+ };
+
+ s1 = tcg_constant_i64(simm);
+ dest = tcg_temp_new_ptr();
+ desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
+ s->cfg_ptr->vlenb, data));
+ tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, a->rd));
+ fns[s->sew](dest, s1, tcg_env, desc);
+ }
+ finalize_rvv_inst(s);
+ return true;
+ }
+ return false;
+}
+
+GEN_OPIVV_TRANS_TH(th_vmerge_vvm, opivv_vadc_check_th)
+GEN_OPIVX_TRANS_TH(th_vmerge_vxm, opivx_vadc_check_th)
+GEN_OPIVI_TRANS_TH(th_vmerge_vim, IMM_SX, th_vmerge_vxm, opivx_vadc_check_th)
+
#define TH_TRANS_STUB(NAME) \
static bool trans_##NAME(DisasContext *s, arg_##NAME *a) \
{ \
return require_xtheadvector(s); \
}
-TH_TRANS_STUB(th_vmv_v_v)
-TH_TRANS_STUB(th_vmv_v_x)
-TH_TRANS_STUB(th_vmv_v_i)
-TH_TRANS_STUB(th_vmerge_vvm)
-TH_TRANS_STUB(th_vmerge_vxm)
-TH_TRANS_STUB(th_vmerge_vim)
TH_TRANS_STUB(th_vsaddu_vv)
TH_TRANS_STUB(th_vsaddu_vx)
TH_TRANS_STUB(th_vsaddu_vi)
diff --git a/target/riscv/xtheadvector_helper.c
b/target/riscv/xtheadvector_helper.c
index 19aad626c9..d8a0e3af90 100644
--- a/target/riscv/xtheadvector_helper.c
+++ b/target/riscv/xtheadvector_helper.c
@@ -1923,3 +1923,107 @@ GEN_TH_VX(th_vwmaccsu_vx_w, 4, 8, clearq_th)
GEN_TH_VX(th_vwmaccus_vx_b, 1, 2, clearh_th)
GEN_TH_VX(th_vwmaccus_vx_h, 2, 4, clearl_th)
GEN_TH_VX(th_vwmaccus_vx_w, 4, 8, clearq_th)
+
+/* Vector Integer Merge and Move Instructions */
+
+/*
+ * The functions below of VMV and vmerge are all copies of RVV1.0 functions,
+ * except:
+ * 1) different desc encoding
+ * 2) different tail/masked element process policy
+ * 3) different mask layout
+ */
+#define GEN_TH_VMV_VV(NAME, ETYPE, H, CLEAR_FN) \
+void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env, \
+ uint32_t desc) \
+{ \
+ uint32_t vl = env->vl; \
+ uint32_t esz = sizeof(ETYPE); \
+ uint32_t vlmax = th_maxsz(desc) / esz; \
+ uint32_t i; \
+ \
+ VSTART_CHECK_EARLY_EXIT(env); \
+ for (i = env->vstart; i < vl; i++) { \
+ ETYPE s1 = *((ETYPE *)vs1 + H(i)); \
+ *((ETYPE *)vd + H(i)) = s1; \
+ } \
+ env->vstart = 0; \
+ CLEAR_FN(vd, vl, vl * esz, vlmax * esz); \
+}
+
+GEN_TH_VMV_VV(th_vmv_v_v_b, int8_t, H1, clearb_th)
+GEN_TH_VMV_VV(th_vmv_v_v_h, int16_t, H2, clearh_th)
+GEN_TH_VMV_VV(th_vmv_v_v_w, int32_t, H4, clearl_th)
+GEN_TH_VMV_VV(th_vmv_v_v_d, int64_t, H8, clearq_th)
+
+#define GEN_TH_VMV_VX(NAME, ETYPE, H, CLEAR_FN) \
+void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env, \
+ uint32_t desc) \
+{ \
+ uint32_t vl = env->vl; \
+ uint32_t esz = sizeof(ETYPE); \
+ uint32_t vlmax = th_maxsz(desc) / esz; \
+ uint32_t i; \
+ \
+ VSTART_CHECK_EARLY_EXIT(env); \
+ for (i = env->vstart; i < vl; i++) { \
+ *((ETYPE *)vd + H(i)) = (ETYPE)s1; \
+ } \
+ env->vstart = 0; \
+ CLEAR_FN(vd, vl, vl * esz, vlmax * esz); \
+}
+
+GEN_TH_VMV_VX(th_vmv_v_x_b, int8_t, H1, clearb_th)
+GEN_TH_VMV_VX(th_vmv_v_x_h, int16_t, H2, clearh_th)
+GEN_TH_VMV_VX(th_vmv_v_x_w, int32_t, H4, clearl_th)
+GEN_TH_VMV_VX(th_vmv_v_x_d, int64_t, H8, clearq_th)
+
+#define GEN_TH_VMERGE_VV(NAME, ETYPE, H, CLEAR_FN) \
+void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
+ CPURISCVState *env, uint32_t desc) \
+{ \
+ uint32_t mlen = th_mlen(desc); \
+ uint32_t vl = env->vl; \
+ uint32_t esz = sizeof(ETYPE); \
+ uint32_t vlmax = th_maxsz(desc) / esz; \
+ uint32_t i; \
+ \
+ VSTART_CHECK_EARLY_EXIT(env); \
+ for (i = env->vstart; i < vl; i++) { \
+ ETYPE *vt = (!th_elem_mask(v0, mlen, i) ? vs2 : vs1); \
+ *((ETYPE *)vd + H(i)) = *(vt + H(i)); \
+ } \
+ env->vstart = 0; \
+ CLEAR_FN(vd, vl, vl * esz, vlmax * esz); \
+}
+
+GEN_TH_VMERGE_VV(th_vmerge_vvm_b, int8_t, H1, clearb_th)
+GEN_TH_VMERGE_VV(th_vmerge_vvm_h, int16_t, H2, clearh_th)
+GEN_TH_VMERGE_VV(th_vmerge_vvm_w, int32_t, H4, clearl_th)
+GEN_TH_VMERGE_VV(th_vmerge_vvm_d, int64_t, H8, clearq_th)
+
+#define GEN_TH_VMERGE_VX(NAME, ETYPE, H, CLEAR_FN) \
+void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \
+ void *vs2, CPURISCVState *env, uint32_t desc) \
+{ \
+ uint32_t mlen = th_mlen(desc); \
+ uint32_t vl = env->vl; \
+ uint32_t esz = sizeof(ETYPE); \
+ uint32_t vlmax = th_maxsz(desc) / esz; \
+ uint32_t i; \
+ \
+ VSTART_CHECK_EARLY_EXIT(env); \
+ for (i = env->vstart; i < vl; i++) { \
+ ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
+ ETYPE d = (!th_elem_mask(v0, mlen, i) ? s2 : \
+ (ETYPE)(target_long)s1); \
+ *((ETYPE *)vd + H(i)) = d; \
+ } \
+ env->vstart = 0; \
+ CLEAR_FN(vd, vl, vl * esz, vlmax * esz); \
+}
+
+GEN_TH_VMERGE_VX(th_vmerge_vxm_b, int8_t, H1, clearb_th)
+GEN_TH_VMERGE_VX(th_vmerge_vxm_h, int16_t, H2, clearh_th)
+GEN_TH_VMERGE_VX(th_vmerge_vxm_w, int32_t, H4, clearl_th)
+GEN_TH_VMERGE_VX(th_vmerge_vxm_d, int64_t, H8, clearq_th)
--
2.44.0
- [PATCH 19/65] target/riscv: Add bitwise logical instructions for XTheadVector, (continued)
- [PATCH 19/65] target/riscv: Add bitwise logical instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 20/65] target/riscv: Add single-width bit shift instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 21/65] target/riscv: Add narrowing integer right shift instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 22/65] target/riscv: Add integer compare instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 23/65] target/riscv: Add integer min/max instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 24/65] target/riscv: Add single-width integer multiply instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 25/65] target/riscv: Add integer divide instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 26/65] target/riscv: Add widening integer multiply instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 27/65] target/riscv: Add single-width integer multiply-add instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 28/65] target/riscv: Add widening integer multiply-add instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 29/65] target/riscv: Add integer merge and move instructions for XTheadVector,
Huang Tao <=
- [PATCH 30/65] target/riscv: Add single-width saturating add and sub instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 31/65] target/riscv: Add single-width average add and sub instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 32/65] target/riscv: Add single-width fractional mul with rounding and saturation for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 33/65] target/riscv: Add widening saturating scaled multiply-add instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 34/65] target/riscv: Add single-width scaling shift instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 35/65] target/riscv: Add narrowing fixed-point clip instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 36/65] target/riscv: Add single-width floating-point add/sub instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 37/65] target/riscv: Add widening floating-point add/sub instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 38/65] target/riscv: Add single-width floating-point multiply/divide instructions for XTheadVector, Huang Tao, 2024/04/12
- [PATCH 39/65] target/riscv: Add widening floating-point multiply instructions for XTheadVector, Huang Tao, 2024/04/12