[Qemu-devel] [PATCH v8 1/6] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions

qemu-devel

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH v8 1/6] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions

From:	Mateja Marjanovic
Subject:	[Qemu-devel] [PATCH v8 1/6] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions
Date:	Thu, 18 Apr 2019 13:42:41 +0200

From: Mateja Marjanovic <address@hidden>

Optimize the set of MSA instructions ILVOD.<B|H|W|D> by operating
directly on TCG registers and performing the logic on them, instead
of calling helpers.

In the following table, the first column is the performance
before this patch. The second column is the performance
after converting from helpers to TCG, but without using the
tcg_gen_deposit function. The third column is with the deposit
function and with a uint64_t constant bit mask, and
the fourth is with the deposit function and with a mask
that is a TCG constant. The fourth version is the one implemented
in this patch.

Performance measurement is done by executing the
instructions 10 million times on a computer
with Intel Core i7-3770 CPU @ 3.40GHz×8.

==================================================================
|| instruction ||     1     ||     2    ||     3    ||     4    ||
==================================================================
||   ilvod.b   || 117.50 ms || 24.13 ms || 24.45 ms || 23.24 ms ||
||   ilvod.h   ||  93.16 ms || 24.21 ms || 24.28 ms || 23.20 ms ||
||   ilvod.w   || 119.90 ms || 24.15 ms || 23.19 ms || 22.95 ms ||
||   ilvod.d   ||  43.01 ms || 21.17 ms || 23.07 ms || 22.59 ms ||
==================================================================
1 - before
2 - no-deposit-no-mask-as-tcg-constant
3 - with-deposit-no-mask-as-tcg-constant
4 - with-deposit-with-mask-as-tcg-constant (final)

The deposit function is used only in ILVOD.W.

No-deposit version of the ILVOD.W implementation:

/*
 * No-deposit variant of ILVOD.W, shown for comparison with the
 * deposit-based version in the patch.
 *
 * For each of the two 64-bit entries msa_wr_d[r * 2] and
 * msa_wr_d[r * 2 + 1], the destination receives the upper 32 bits of ws
 * in place and the upper 32 bits of wt moved down into bits 31..0.
 *
 * wd, ws and wt are register numbers used to index msa_wr_d (presumably
 * the two 64-bit halves of each MSA vector register -- confirm against the
 * msa_wr_d definition). env is not used in the body.
 */
static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd,
                               uint32_t ws, uint32_t wt)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    /* Selects bits 63..32, i.e. the odd-indexed word of each 64-bit entry. */
    TCGv_i64 mask = tcg_const_i64(0xffffffff00000000ULL);

    /* First 64-bit entry: t1 = upper word of wt moved to the lower word. */
    tcg_gen_and_i64(t1, msa_wr_d[wt * 2], mask);
    tcg_gen_shri_i64(t1, t1, 32);
    /* t2 = upper word of ws, left in place. */
    tcg_gen_and_i64(t2, msa_wr_d[ws * 2], mask);
    /* t1 and t2 occupy disjoint bit ranges, so OR combines them. */
    tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t2);

    /* Second 64-bit entry: same sequence on index r * 2 + 1. */
    tcg_gen_and_i64(t1, msa_wr_d[wt * 2 + 1], mask);
    tcg_gen_shri_i64(t1, t1, 32);
    tcg_gen_and_i64(t2, msa_wr_d[ws * 2 + 1], mask);
    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t2);

    /* Release the constant and both temporaries allocated above. */
    tcg_temp_free_i64(mask);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

Reviewed-by: Richard Henderson <address@hidden>
Suggested-by: Richard Henderson <address@hidden>
Signed-off-by: Mateja Marjanovic <address@hidden>
---
 target/mips/helper.h     |  1 -
 target/mips/msa_helper.c |  7 ----
 target/mips/translate.c  | 91 +++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 90 insertions(+), 9 deletions(-)

diff --git a/target/mips/helper.h b/target/mips/helper.h
index a6d687e..d162836 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index c74e3cd..9e52a31 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -1206,13 +1206,6 @@ MSA_FN_DF(ilvr_df)
 MSA_FN_DF(ilvev_df)
 #undef MSA_DO
 
-#define MSA_DO(DF)                          \
-    do {                                    \
-        pwx->DF[2*i]   = pwt->DF[2*i+1];    \
-        pwx->DF[2*i+1] = pws->DF[2*i+1];    \
-    } while (0)
-MSA_FN_DF(ilvod_df)
-#undef MSA_DO
 #undef MSA_LOOP_COND
 
 #define MSA_LOOP_COND(DF) \
diff --git a/target/mips/translate.c b/target/mips/translate.c
index 364bd6d..99bd441 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -28001,6 +28001,80 @@ static void gen_msa_bit(CPUMIPSState *env, DisasContext *ctx)
     tcg_temp_free_i32(tws);
 }
 
+/*
+ * [MSA] ILVOD.<B|H> wd, ws, wt
+ *
+ *   Vector Interleave Odd (<byte|halfword> data elements)
+ *
+ */
+static inline void gen_ilvod_bh(CPUMIPSState *env, uint32_t wd,
+                                uint32_t ws, uint32_t wt,
+                                uint64_t mask, uint32_t shift)
+{
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+    TCGv_i64 mask_tcg = tcg_const_i64(mask);
+
+    tcg_gen_and_i64(t1, msa_wr_d[wt * 2], mask_tcg);
+    tcg_gen_shri_i64(t1, t1, shift);
+    tcg_gen_and_i64(t2, msa_wr_d[ws * 2], mask_tcg);
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t2);
+
+    tcg_gen_and_i64(t1, msa_wr_d[wt * 2 + 1], mask_tcg);
+    tcg_gen_shri_i64(t1, t1, shift);
+    tcg_gen_and_i64(t2, msa_wr_d[ws * 2 + 1], mask_tcg);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t2);
+
+    tcg_temp_free_i64(mask_tcg);
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+}
+
+static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    gen_ilvod_bh(env, wd, ws, wt, 0xff00ff00ff00ff00ULL, 8);
+}
+
+static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    gen_ilvod_bh(env, wd, ws, wt, 0xffff0000ffff0000ULL, 16);
+}
+
+/*
+ * [MSA] ILVOD.W wd, ws, wt
+ *
+ *   Vector Interleave Odd (word data elements)
+ *
+ */
+static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    tcg_gen_shri_i64(t1, msa_wr_d[wt * 2], 32);
+    tcg_gen_deposit_i64(msa_wr_d[wd * 2], msa_wr_d[ws * 2], t1, 0, 32);
+
+    tcg_gen_shri_i64(t1, msa_wr_d[wt * 2 + 1], 32);
+    tcg_gen_deposit_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1], t1, 0, 32);
+
+    tcg_temp_free_i64(t1);
+}
+
+/*
+ * [MSA] ILVOD.D wd, ws, wt
+ *
+ *   Vector Interleave Odd (doubleword data elements)
+ *
+ */
+static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2 + 1]);
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
+}
+
 static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
 {
 #define MASK_MSA_3R(op)    (MASK_MSA_MINOR(op) | (op & (0x7 << 23)))
@@ -28172,7 +28246,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
         gen_helper_msa_mod_u_df(cpu_env, tdf, twd, tws, twt);
         break;
     case OPC_ILVOD_df:
-        gen_helper_msa_ilvod_df(cpu_env, tdf, twd, tws, twt);
+        switch (df) {
+        case DF_BYTE:
+            gen_ilvod_b(env, wd, ws, wt);
+            break;
+        case DF_HALF:
+            gen_ilvod_h(env, wd, ws, wt);
+            break;
+        case DF_WORD:
+            gen_ilvod_w(env, wd, ws, wt);
+            break;
+        case DF_DOUBLE:
+            gen_ilvod_d(env, wd, ws, wt);
+            break;
+        default:
+            assert(0);
+        }
         break;
 
     case OPC_DOTP_S_df:
-- 
2.7.4

[Prev in Thread]

Current Thread

[Next in Thread]

[Qemu-devel] [PATCH v8 0/6] target/mips: Optimize MSA interleave instructions, Mateja Marjanovic, 2019/04/18
- [Qemu-devel] [PATCH v8 1/6] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions, Mateja Marjanovic <=
- [Qemu-devel] [PATCH v8 2/6] target/mips: Optimize ILVEV.<B|H|W|D> MSA instructions, Mateja Marjanovic, 2019/04/18
- [Qemu-devel] [PATCH v8 3/6] target/mips: Optimize ILVL.<B|H|W|D> MSA instructions, Mateja Marjanovic, 2019/04/18
- [Qemu-devel] [PATCH v8 4/6] target/mips: Optimize ILVR.<B|H|W|D> MSA instructions, Mateja Marjanovic, 2019/04/18
- [Qemu-devel] [PATCH v8 6/6] target/mips: Merge implementation of ILVOD.D and ILVL.D, Mateja Marjanovic, 2019/04/18
- [Qemu-devel] [PATCH v8 5/6] target/mips: Merge implementation of ILVEV.D and ILVR.D, Mateja Marjanovic, 2019/04/18

Prev by Date: Re: [Qemu-devel] [Qemu-block] [PATCH] scsi-generic: prevent guest from exceeding SG_IO limits
Next by Date: [Qemu-devel] [PATCH v8 0/6] target/mips: Optimize MSA interleave instructions
Previous by thread: [Qemu-devel] [PATCH v8 0/6] target/mips: Optimize MSA interleave instructions
Next by thread: [Qemu-devel] [PATCH v8 2/6] target/mips: Optimize ILVEV.<B|H|W|D> MSA instructions
Index(es):
- Date
- Thread