[Qemu-devel] [PATCH v2 8/8] target-arm: A64: Add SIMD shift by immediate

qemu-devel

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH v2 8/8] target-arm: A64: Add SIMD shift by immediate

From:	C Fontana
Subject:	[Qemu-devel] [PATCH v2 8/8] target-arm: A64: Add SIMD shift by immediate
Date:	Thu, 23 Jan 2014 21:08:21 +0100

Hi Peter, just two nits, answering from the tablet so sorry if arrives with strange formatting, hope not..

On Thursday, January 23, 2014, Peter Maydell <address@hidden> wrote:

From: Alex Bennée <address@hidden>

This implements a subset of the AdvSIMD shift operations (namely all the
none saturating or narrowing ones). The actual shift generation code
itself is common for both the scalar and vector cases but wrapped with
either vector element iteration or the fp reg access.

The rounding operations need to take special care to correctly reflect
the result of adding rounding bits on high bits as the intermediates do
not truncate.

Signed-off-by: Alex Bennée <address@hidden>
Reviewed-by: Richard Henderson <address@hidden>
---
target-arm/translate-a64.c | 381 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 379 insertions(+), 2 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 5eabf24..9eb91fc4 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5531,15 +5531,220 @@ static void disas_simd_scalar_pairwise(DisasContext *s, uint32_t insn)
unsupported_encoding(s, insn);
}

+/*
+ * Common SSHR[RA]/USHR[RA] - Shift right (optional rounding/accumulate)
+ *
+ * This code is handles the common shifting code and is used by both
+ * the vector and scalar code.
+ */
+static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
+ TCGv_i64 tcg_rnd, bool accumulate,
+ bool is_u, int size, int shift)
+{
+ bool extended_result = false;
+ bool round = !TCGV_IS_UNUSED_I64(tcg_rnd);
+ int ext_lshift = 0;
+ TCGv_i64 tcg_src_hi;
+
+ if (round && size == 3) {
+ extended_result = true;
+ ext_lshift = 64 - shift;
+ tcg_src_hi = tcg_temp_new_i64();
+ } else if (shift == 64) {
+ if (!accumulate && is_u) {
+ /* result is zero */
+ tcg_gen_movi_i64(tcg_res, 0);
+ return;
+ }
+ }
+
+ /* Deal with the rounding step */
+ if (round) {
+ if (extended_result) {
+ TCGv_i64 tcg_zero = tcg_const_i64(0);
+ if (!is_u) {
+ /* take care of sign extending tcg_res */
+ tcg_gen_sari_i64(tcg_src_hi, tcg_src, 63);
+ tcg_gen_add2_i64(tcg_src, tcg_src_hi,
+ tcg_src, tcg_src_hi,
+ tcg_rnd, tcg_zero);
+ } else {
+ tcg_gen_add2_i64(tcg_src, tcg_src_hi,
+ tcg_src, tcg_zero,
+ tcg_rnd, tcg_zero);
+ }
+ tcg_temp_free_i64(tcg_zero);
+ } else {
+ tcg_gen_add_i64(tcg_src, tcg_src, tcg_rnd);
+ }
+ }
+
+ /* Now do the shift right */
+ if (round && extended_result) {
+ /* extended case, >64 bit precision required */
+ if (ext_lshift == 0) {
+ /* special case, only high bits matter */
+ tcg_gen_mov_i64(tcg_src, tcg_src_hi);
+ } else {
+ tcg_gen_shri_i64(tcg_src, tcg_src, shift);
+ tcg_gen_shli_i64(tcg_src_hi, tcg_src_hi, ext_lshift);
+ tcg_gen_or_i64(tcg_src, tcg_src, tcg_src_hi);
+ }
+ } else {
+ if (is_u) {
+ if (shift == 64) {
+ /* essentially shifting in 64 zeros */
+ tcg_gen_movi_i64(tcg_src, 0);
+ } else {
+ tcg_gen_shri_i64(tcg_src, tcg_src, shift);
+ }
+ } else {
+ if (shift == 64) {
+ /* effectively extending the sign-bit */
+ tcg_gen_sari_i64(tcg_src, tcg_src, 63);
+ } else {
+ tcg_gen_sari_i64(tcg_src, tcg_src, shift);
+ }
+ }
+ }
+
+ if (accumulate) {
+ tcg_gen_add_i64(tcg_res, tcg_res, tcg_src);
+ } else {
+ tcg_gen_mov_i64(tcg_res, tcg_src);
+ }
+
+ if (extended_result) {
+ tcg_temp_free(tcg_src_hi);

should this be tcg_temp_free_i64 ?

+ }
+}
+
+/* Common SHL/SLI - Shift left with an optional insert */
+static void handle_shli_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
+ bool insert, int shift)
+{
+ if (insert) { /* SLI */
+ tcg_gen_deposit_i64(tcg_res, tcg_res, tcg_src, shift, 64 - shift);
+ } else { /* SHL */
+ tcg_gen_shli_i64(tcg_res, tcg_src, shift);
+ }
+}
+
+/* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */
+static void handle_scalar_simd_shri(DisasContext *s,
+ bool is_u, int immh, int immb,
+ int opcode, int rn, int rd)
+{
+ const int size = 3;
+ int immhb = immh << 3 | immb;
+ int shift = 2 * (8 << size) - immhb;
+ bool accumulate = false;
+ bool round = false;
+ TCGv_i64 tcg_rn;
+ TCGv_i64 tcg_rd;
+ TCGv_i64 tcg_round;
+
+ if (!extract32(immh, 3, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (opcode) {
+ case 0x02: /* SSRA / USRA (accumulate) */
+ accumulate = true;
+ break;
+ case 0x04: /* SRSHR / URSHR (rounding) */
+ round = true;
+ break;
+ case 0x06: /* SRSRA / URSRA (accum + rounding) */
+ accumulate = round = true;
+ break;
+ }
+
+ if (round) {
+ uint64_t round_const = 1ULL << (shift - 1);
+ tcg_round = tcg_const_i64(round_const);
+ } else {
+ TCGV_UNUSED_I64(tcg_round);
+ }
+
+ tcg_rn = read_fp_dreg(s, rn);
+ tcg_rd = accumulate ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
+
+ handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+ accumulate, is_u, size, shift);
+
+ write_fp_dreg(s, rd, tcg_rd);
+
+ tcg_temp_free_i64(tcg_rn);
+ tcg_temp_free_i64(tcg_rd);
+ if (round) {
+ tcg_temp_free_i64(tcg_round);
+ }
+}
+
+/* SHL/SLI - Scalar shift left */
+static void handle_scalar_simd_shli(DisasContext *s, bool insert,
+ int immh, int immb, int opcode,
+ int rn, int rd)
+{
+ int size = 32 - clz32(immh) - 1;
+ int immhb = immh << 3 | immb;
+ int shift = immhb - (8 << size);
+ TCGv_i64 tcg_rn = new_tmp_a64(s);
+ TCGv_i64 tcg_rd = new_tmp_a64(s);
+
+ if (!extract32(immh, 3, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ tcg_rn = read_fp_dreg(s, rn);
+ tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
+
+ handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift);
+
+ write_fp_dreg(s, rd, tcg_rd);
+
+ tcg_temp_free_i64(tcg_rn);
+ tcg_temp_free_i64(tcg_rd);
+
+ return;

no harm but maybe remove return?

+}
+
/* C3.6.9 AdvSIMD scalar shift by immediate
* 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0
* +-----+---+-------------+------+------+--------+---+------+------+
* | 0 1 | U | 1 1 1 1 1 0 | immh | immb | opcode | 1 | Rn | Rd |
* +-----+---+-------------+------+------+--------+---+------+------+
+ *
+ * This is the scalar version so it works on a fixed sized registers
*/
static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
{
- unsupported_encoding(s, insn);
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 11, 5);
+ int immb = extract32(insn, 16, 3);
+ int immh = extract32(insn, 19, 4);
+ bool is_u = extract32(insn, 29, 1);
+
+ switch (opcode) {
+ case 0x00: /* SSHR / USHR */
+ case 0x02: /* SSRA / USRA */
+ case 0x04: /* SRSHR / URSHR */
+ case 0x06: /* SRSRA / URSRA */
+ handle_scalar_simd_shri(s, is_u, immh, immb, opcode, rn, rd);
+ break;
+ case 0x0a: /* SHL / SLI */
+ handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd);
+ break;
+ default:
+ unsupported_encoding(s, insn);
+ break;
+ }
+
+ return;

also here

}

/* C3.6.10 AdvSIMD scalar three different
@@ -5845,6 +6050,150 @@ static void disas_simd_scalar_indexed(DisasContext *s, uint32_t insn)
unsupported_encoding(s, insn);
}

+/* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */
+static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
+ int immh, int immb, int opcode, int rn, int rd)
+{
+ int size = 32 - clz32(immh) - 1;
+ int immhb = immh << 3 | immb;
+ int shift = 2 * (8 << size) - immhb;
+ bool accumulate = false;
+ bool round = false;
+ int dsize = is_q ? 128 : 64;
+ int esize = 8 << size;
+ int elements = dsize/esize;
+ TCGMemOp memop = size | (is_u ? 0 : MO_SIGN);
+ TCGv_i64 tcg_rn = new_tmp_a64(s);
+ TCGv_i64 tcg_rd = new_tmp_a64(s);
+ TCGv_i64 tcg_round;
+ int i;
+
+ if (extract32(immh, 3, 1) && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (size > 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (opcode) {
+ case 0x02: /* SSRA / USRA (accumulate) */
+ accumulate = true;
+ break;
+ case 0x04: /* SRSHR / URSHR (rounding) */
+ round = true;
+ break;
+ case 0x06: /* SRSRA / URSRA (accum + rounding) */
+ accumulate = round = true;
+ break;
+ }
+
+ if (round) {
+ uint64_t round_const = 1ULL << (shift - 1);
+ tcg_round = tcg_const_i64(round_const);
+ } else {
+ TCGV_UNUSED_I64(tcg_round);
+ }
+
+ for (i = 0; i < elements; i++) {
+ read_vec_element(s, tcg_rn, rn, i, memop);
+ if (accumulate) {
+ read_vec_element(s, tcg_rd, rd, i, memop);
+ }
+
+ handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+ accumulate, is_u, size, shift);
+
+ write_vec_element(s, tcg_rd, rd, i, size);
+ }
+
+ if (!is_q) {
+ clear_vec_high(s, rd);
+ }
+
+ if (round) {
+ tcg_temp_free_i64(tcg_round);
+ }
+}
+
+/* SHL/SLI - Vector shift left */
+static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert,
+ int immh, int immb, int opcode, int rn, int rd)
+{
+ int size = 32 - clz32(immh) - 1;
+ int immhb = immh << 3 | immb;
+ int shift = immhb - (8 << size);
+ int dsize = is_q ? 128 : 64;
+ int esize = 8 << size;
+ int elements = dsize/esize;
+ TCGv_i64 tcg_rn = new_tmp_a64(s);
+ TCGv_i64 tcg_rd = new_tmp_a64(s);
+ int i;
+
+ if (extract32(immh, 3, 1) && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (size > 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ for (i = 0; i < elements; i++) {
+ read_vec_element(s, tcg_rn, rn, i, size);
+ if (insert) {
+ read_vec_element(s, tcg_rd, rd, i, size);
+ }
+
+ handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift);
+
+ write_vec_element(s, tcg_rd, rd, i, size);
+ }
+
+ if (!is_q) {
+ clear_vec_high(s, rd);
+ }
+
+ return;

also here.

Ciao

Claudio

+}
+
+/* USHLL/SHLL - Vector shift left with widening */
+static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u,
+ int immh, int immb, int opcode, int rn, int rd)
+{
+ int size = 32 - clz32(immh) - 1;
+ int immhb = immh << 3 | immb;
+ int shift = immhb - (8 << size);
+ int dsize = 64;
+ int esize = 8 << size;
+ int elements = dsize/esize;
+ TCGv_i64 tcg_rn = new_tmp_a64(s);
+ TCGv_i64 tcg_rd = new_tmp_a64(s);
+ int i;
+
+ if (size >= 3) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ /* For the LL variants the store is larger than the load,
+ * so if rd == rn we would overwrite parts of our input.
+ * So load everything right now and use shifts in the main loop.
+ */
+ read_vec_element(s, tcg_rn, rn, is_q ? 1 : 0, MO_64);
+
+ for (i = 0; i < elements; i++) {
+ tcg_gen_shri_i64(tcg_rd, tcg_rn, i * esize);
+ ext_and_shift_reg(tcg_rd, tcg_rd, size | (!is_u << 2), 0);
+ tcg_gen_shli_i64(tcg_rd, tcg_rd, shift);
+ write_vec_element(s, tcg_rd, rd, i, size + 1);
+ }
+}
+
+
/* C3.6.14 --
1.8.5

[Prev in Thread]

Current Thread

[Next in Thread]

[Qemu-devel] [PATCH v2 1/8] target-arm: A64: Add SIMD three-different multiply accumulate insns, (continued)
- [Qemu-devel] [PATCH v2 1/8] target-arm: A64: Add SIMD three-different multiply accumulate insns, Peter Maydell, 2014/01/23
  - Re: [Qemu-devel] [PATCH v2 1/8] target-arm: A64: Add SIMD three-different multiply accumulate insns, Richard Henderson, 2014/01/23
- [Qemu-devel] [PATCH v2 4/8] target-arm: A64: Add top level decode for SIMD 3-same group, Peter Maydell, 2014/01/23
- [Qemu-devel] [PATCH v2 6/8] target-arm: A64: Add integer ops from SIMD 3-same group, Peter Maydell, 2014/01/23
  - Re: [Qemu-devel] [PATCH v2 6/8] target-arm: A64: Add integer ops from SIMD 3-same group, Richard Henderson, 2014/01/23
  - Re: [Qemu-devel] [PATCH v2 6/8] target-arm: A64: Add integer ops from SIMD 3-same group, Peter Maydell, 2014/01/25
- [Qemu-devel] [PATCH v2 2/8] target-arm: A64: Add SIMD three-different ABDL instructions, Peter Maydell, 2014/01/23
- [Qemu-devel] [PATCH v2 5/8] target-arm: A64: Add logic ops from SIMD 3 same group, Peter Maydell, 2014/01/23
- [Qemu-devel] [PATCH v2 3/8] target-arm: A64: Add SIMD scalar 3 same add, sub and compare ops, Peter Maydell, 2014/01/23
- [Qemu-devel] [PATCH v2 8/8] target-arm: A64: Add SIMD shift by immediate, Peter Maydell, 2014/01/23
  - [Qemu-devel] [PATCH v2 8/8] target-arm: A64: Add SIMD shift by immediate, C Fontana <=
    - Re: [Qemu-devel] [PATCH v2 8/8] target-arm: A64: Add SIMD shift by immediate, Peter Maydell, 2014/01/23
- [Qemu-devel] [PATCH v2 7/8] target-arm: A64: Add simple SIMD 3-same floating point ops, Peter Maydell, 2014/01/23
  - Re: [Qemu-devel] [PATCH v2 7/8] target-arm: A64: Add simple SIMD 3-same floating point ops, Richard Henderson, 2014/01/23
    - Re: [Qemu-devel] [PATCH v2 7/8] target-arm: A64: Add simple SIMD 3-same floating point ops, Peter Maydell, 2014/01/23

Prev by Date: Re: [Qemu-devel] Emulating Ethernet controller based on Cortex-A15 platforms
Next by Date: Re: [Qemu-devel] Emulating Ethernet controller based on Cortex-A15 platforms
Previous by thread: [Qemu-devel] [PATCH v2 8/8] target-arm: A64: Add SIMD shift by immediate
Next by thread: Re: [Qemu-devel] [PATCH v2 8/8] target-arm: A64: Add SIMD shift by immediate
Index(es):
- Date
- Thread