qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v2 13/14] tcg/riscv: Implement vector roti/v/x shi ops


From: LIU Zhiwei
Subject: Re: [PATCH v2 13/14] tcg/riscv: Implement vector roti/v/x shi ops
Date: Wed, 4 Sep 2024 23:25:20 +0800
User-agent: Mozilla Thunderbird


On 2024/9/3 23:15, Richard Henderson wrote:
On 8/29/24 23:16, LIU Zhiwei wrote:
@@ -2589,6 +2605,69 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
              }
          }
          break;
+    case INDEX_op_shli_vec:
+        if (a2 > 31) {
+            tcg_gen_shls_vec(vece, v0, v1, tcg_constant_i32(a2));
+        } else {
+            vec_gen_3(INDEX_op_rvv_shli_vec, type, vece, tcgv_vec_arg(v0),
+                      tcgv_vec_arg(v1), a2);
+        }
+        break;
+    case INDEX_op_shri_vec:
+        if (a2 > 31) {
+            tcg_gen_shrs_vec(vece, v0, v1, tcg_constant_i32(a2));
+        } else {
+            vec_gen_3(INDEX_op_rvv_shri_vec, type, vece, tcgv_vec_arg(v0),
+                      tcgv_vec_arg(v1), a2);
+        }
+        break;
+    case INDEX_op_sari_vec:
+        if (a2 > 31) {
+            tcg_gen_sars_vec(vece, v0, v1, tcg_constant_i32(a2));
+        } else {
+            vec_gen_3(INDEX_op_rvv_sari_vec, type, vece, tcgv_vec_arg(v0),
+                      tcgv_vec_arg(v1), a2);
+        }
+        break;
+    case INDEX_op_rotli_vec:
+        t1 = tcg_temp_new_vec(type);
+        tcg_gen_shli_vec(vece, t1, v1, a2);
+        tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - a2);
+        tcg_gen_or_vec(vece, v0, v0, t1);
+        tcg_temp_free_vec(t1);
+        break;
+    case INDEX_op_rotls_vec:
+        t1 = tcg_temp_new_vec(type);
+        t2 = tcg_temp_new_i32();
+        tcg_gen_neg_i32(t2, temp_tcgv_i32(arg_temp(a2)));
+        tcg_gen_shrs_vec(vece, v0, v1, t2);
+        tcg_gen_shls_vec(vece, t1, v1, temp_tcgv_i32(arg_temp(a2)));
+        tcg_gen_or_vec(vece, v0, v0, t1);
+        tcg_temp_free_vec(t1);
+        tcg_temp_free_i32(t2);
+        break;

I'm trying to work out how much benefit there is here of expanding these early, as opposed to simply using TCG_REG_TMP0 when the immediate doesn't fit,

We find for rotli,  it just copied code from the implementation of INDEX_op_shli_vec and INDEX_op_shri_vec if we don't expand it.

  case INDEX_op_rotli_vec:
        if (a2 > 31) {
            tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, TCG_REG_ZERO, a2);
            tcg_out_opc_vx(s, OPC_VSLL_VX, TCG_REG_V0, a1, TCG_REG_TMP0, true);
        } else {
            tcg_out_opc_vi(s, OPC_VSLL_VI, TCG_REG_V0, a1, a2, true);
        }

        if ((8 << vece) - a2) > 31) {
            tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, TCG_REG_ZERO, 8 << vece) - a2);
            tcg_out_opc_vx(s, OPC_VSRL_VX, a0, a1, TCG_REG_TMP0, true);
        } else {
            tcg_out_opc_vi(s, OPC_VSRL_VI, a0, a1, 8 << vece) - a2, true);
        }
        tcg_out_opc_vv(s, OPC_VOR_VV, a0, a0, TCG_REG_V0, true);
        break;

Thus, I prefer to expand it early, at least for rotli_vec.

Thanks,
Zhiwei

or for rotls_vec negation.

+    case INDEX_op_rotlv_vec:
+        v2 = temp_tcgv_vec(arg_temp(a2));
+        t1 = tcg_temp_new_vec(type);
+        tcg_gen_neg_vec(vece, t1, v2);
+        vec_gen_3(INDEX_op_shrv_vec, type, vece, tcgv_vec_arg(t1),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(t1));
+        vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(v0),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
+        tcg_gen_or_vec(vece, v0, v0, t1);
+        tcg_temp_free_vec(t1);
+        break;
+    case INDEX_op_rotrv_vec:
+        v2 = temp_tcgv_vec(arg_temp(a2));
+        t1 = tcg_temp_new_vec(type);
+        tcg_gen_neg_vec(vece, t1, v2);
+        vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(t1));
+        vec_gen_3(INDEX_op_shrv_vec, type, vece, tcgv_vec_arg(v0),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
+        tcg_gen_or_vec(vece, v0, v0, t1);
+        tcg_temp_free_vec(t1);
+        break;

And here we can use TCG_REG_V0 as the temporary, both for negation and shift intermediate.

    vrsub_vi  V0, a2, 0
    vshlv_vv  V0, a1, V0
    vshrv_vv  a0, a1, a2
    vor_vv    a0, a0, V0


r~

reply via email to

[Prev in Thread] Current Thread [Next in Thread]