|
From: | LIU Zhiwei |
Subject: | Re: [PATCH v2 13/14] tcg/riscv: Implement vector roti/v/x shi ops |
Date: | Wed, 4 Sep 2024 23:25:20 +0800 |
User-agent: | Mozilla Thunderbird |
On 8/29/24 23:16, LIU Zhiwei wrote:
@@ -2589,6 +2605,69 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
}
}
break;
+ case INDEX_op_shli_vec:
+ if (a2 > 31) {
+ tcg_gen_shls_vec(vece, v0, v1, tcg_constant_i32(a2));
+ } else {
+ vec_gen_3(INDEX_op_rvv_shli_vec, type, vece, tcgv_vec_arg(v0),
+ tcgv_vec_arg(v1), a2);
+ }
+ break;
+ case INDEX_op_shri_vec:
+ if (a2 > 31) {
+ tcg_gen_shrs_vec(vece, v0, v1, tcg_constant_i32(a2));
+ } else {
+ vec_gen_3(INDEX_op_rvv_shri_vec, type, vece, tcgv_vec_arg(v0),
+ tcgv_vec_arg(v1), a2);
+ }
+ break;
+ case INDEX_op_sari_vec:
+ if (a2 > 31) {
+ tcg_gen_sars_vec(vece, v0, v1, tcg_constant_i32(a2));
+ } else {
+ vec_gen_3(INDEX_op_rvv_sari_vec, type, vece, tcgv_vec_arg(v0),
+ tcgv_vec_arg(v1), a2);
+ }
+ break;
+ case INDEX_op_rotli_vec:
+ t1 = tcg_temp_new_vec(type);
+ tcg_gen_shli_vec(vece, t1, v1, a2);
+ tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - a2);
+ tcg_gen_or_vec(vece, v0, v0, t1);
+ tcg_temp_free_vec(t1);
+ break;
+ case INDEX_op_rotls_vec:
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_i32();
+ tcg_gen_neg_i32(t2, temp_tcgv_i32(arg_temp(a2)));
+ tcg_gen_shrs_vec(vece, v0, v1, t2);
+ tcg_gen_shls_vec(vece, t1, v1, temp_tcgv_i32(arg_temp(a2)));
+ tcg_gen_or_vec(vece, v0, v0, t1);
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_i32(t2);
+ break;
I'm trying to work out how much benefit there is here of expanding these early, as opposed to simply using TCG_REG_TMP0 when the immediate doesn't fit,
We found that if we don't expand rotli early, its backend implementation just
duplicates the code from the implementations of INDEX_op_shli_vec and INDEX_op_shri_vec:
case INDEX_op_rotli_vec:
    if (a2 > 31) {
        tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, TCG_REG_ZERO, a2);
        tcg_out_opc_vx(s, OPC_VSLL_VX, TCG_REG_V0, a1, TCG_REG_TMP0, true);
    } else {
        tcg_out_opc_vi(s, OPC_VSLL_VI, TCG_REG_V0, a1, a2, true);
    }
    if (((8 << vece) - a2) > 31) {
        tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, TCG_REG_ZERO, (8 << vece) - a2);
        tcg_out_opc_vx(s, OPC_VSRL_VX, a0, a1, TCG_REG_TMP0, true);
    } else {
        tcg_out_opc_vi(s, OPC_VSRL_VI, a0, a1, (8 << vece) - a2, true);
    }
    tcg_out_opc_vv(s, OPC_VOR_VV, a0, a0, TCG_REG_V0, true);
    break;
Thus, I prefer to expand it early, at least for rotli_vec.
Thanks,
Zhiwei
or for rotls_vec negation.
+ case INDEX_op_rotlv_vec:
+ v2 = temp_tcgv_vec(arg_temp(a2));
+ t1 = tcg_temp_new_vec(type);
+ tcg_gen_neg_vec(vece, t1, v2);
+ vec_gen_3(INDEX_op_shrv_vec, type, vece, tcgv_vec_arg(t1),
+ tcgv_vec_arg(v1), tcgv_vec_arg(t1));
+ vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(v0),
+ tcgv_vec_arg(v1), tcgv_vec_arg(v2));
+ tcg_gen_or_vec(vece, v0, v0, t1);
+ tcg_temp_free_vec(t1);
+ break;
+ case INDEX_op_rotrv_vec:
+ v2 = temp_tcgv_vec(arg_temp(a2));
+ t1 = tcg_temp_new_vec(type);
+ tcg_gen_neg_vec(vece, t1, v2);
+ vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
+ tcgv_vec_arg(v1), tcgv_vec_arg(t1));
+ vec_gen_3(INDEX_op_shrv_vec, type, vece, tcgv_vec_arg(v0),
+ tcgv_vec_arg(v1), tcgv_vec_arg(v2));
+ tcg_gen_or_vec(vece, v0, v0, t1);
+ tcg_temp_free_vec(t1);
+ break;
And here we can use TCG_REG_V0 as the temporary, both for negation and shift intermediate.
vrsub_vi V0, a2, 0
vshlv_vv V0, a1, V0
vshrv_vv a0, a1, a2
vor_vv a0, a0, V0
r~
[Prev in Thread] | Current Thread | [Next in Thread] |