[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH RFC v2] target/arm: Implement SVE2 MATCH, NMATCH
From: |
Richard Henderson |
Subject: |
Re: [PATCH RFC v2] target/arm: Implement SVE2 MATCH, NMATCH |
Date: |
Tue, 14 Apr 2020 19:05:10 -0700 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Thunderbird/68.4.1 |
On 4/14/20 4:16 PM, Stephen Long wrote:
> Signed-off-by: Stephen Long <address@hidden>
> ---
> target/arm/helper-sve.h | 10 ++++++++
> target/arm/sve.decode | 5 ++++
> target/arm/sve_helper.c | 51 ++++++++++++++++++++++++++++++++++++++
> target/arm/translate-sve.c | 22 ++++++++++++++++
> 4 files changed, 88 insertions(+)
>
> diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
> index 5dd880cf6d..bc4a463bc7 100644
> --- a/target/arm/helper-sve.h
> +++ b/target/arm/helper-sve.h
> @@ -2516,6 +2516,16 @@ DEF_HELPER_FLAGS_3(sve2_uqrshrnt_h, TCG_CALL_NO_RWG,
> void, ptr, ptr, i32)
> DEF_HELPER_FLAGS_3(sve2_uqrshrnt_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> DEF_HELPER_FLAGS_3(sve2_uqrshrnt_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
>
> +DEF_HELPER_FLAGS_5(sve2_match_ppzz_b, TCG_CALL_NO_RWG,
> + i32, ptr, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_5(sve2_match_ppzz_h, TCG_CALL_NO_RWG,
> + i32, ptr, ptr, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_5(sve2_nmatch_ppzz_b, TCG_CALL_NO_RWG,
> + i32, ptr, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_5(sve2_nmatch_ppzz_h, TCG_CALL_NO_RWG,
> + i32, ptr, ptr, ptr, ptr, i32)
> +
> DEF_HELPER_FLAGS_6(sve2_faddp_zpzz_h, TCG_CALL_NO_RWG,
> void, ptr, ptr, ptr, ptr, ptr, i32)
> DEF_HELPER_FLAGS_6(sve2_faddp_zpzz_s, TCG_CALL_NO_RWG,
> diff --git a/target/arm/sve.decode b/target/arm/sve.decode
> index 374e47fb05..652668df02 100644
> --- a/target/arm/sve.decode
> +++ b/target/arm/sve.decode
> @@ -1305,6 +1305,11 @@ UQSHRNT 01000101 .. 1 ..... 00 1101 .....
> ..... @rd_rn_tszimm_shr
> UQRSHRNB 01000101 .. 1 ..... 00 1110 ..... ..... @rd_rn_tszimm_shr
> UQRSHRNT 01000101 .. 1 ..... 00 1111 ..... ..... @rd_rn_tszimm_shr
>
> +### SVE2 Character Match
> +
> +MATCH 01000101 .. 1 ..... 100 ... ..... 0 .... @pd_pg_rn_rm
> +NMATCH 01000101 .. 1 ..... 100 ... ..... 1 .... @pd_pg_rn_rm
> +
> ## SVE2 floating-point pairwise operations
>
> FADDP 01100100 .. 010 00 0 100 ... ..... ..... @rdn_pg_rm
> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
> index b68f62cd7f..78abd8b62a 100644
> --- a/target/arm/sve_helper.c
> +++ b/target/arm/sve_helper.c
> @@ -6890,3 +6890,54 @@ DO_ST1_ZPZ_D(dd_be, zd, MO_64)
>
> #undef DO_ST1_ZPZ_S
> #undef DO_ST1_ZPZ_D
> +
> +#define DO_PPZZ_CHAR_MATCH(NAME, TYPE, OP, H, MASK, DEFAULT_VAL)
> \
> +static inline bool NAME##_inner_loop(TYPE nn, void *segmentbase)
> \
> +{
> \
> + intptr_t i = 128;
> \
> + do {
> \
> + do {
> \
> + i -= sizeof(TYPE) * 8;
> \
> + TYPE mm = *(TYPE *)(segmentbase + H1(i));
> \
> + if (nn OP mm) {
> \
> + return !DEFAULT_VAL;
> \
> + }
> \
> + } while (i & 63);
> \
> + } while (i > 0);
> \
> + return DEFAULT_VAL;
> \
> +}
> \
You seem to be mixing up bits and bytes here, with 128 bits and H1 as a byte
index.
I note that we don't need to keep re-loading the Zm segment elements from
memory. Perhaps something like
/*
 * Compare the field extract64(n, 0, 8 << esz) against every field of the
 * same width taken from the 64-bit word @m.
 *
 * Returns true as soon as one field of @m is equal to it, false when the
 * whole word has been scanned without a hit.
 */
static inline bool do_match1(uint64_t n, uint64_t m, int esz)
{
    const int width = 8 << esz;
    const uint64_t needle = extract64(n, 0, width);
    bool found = false;
    int pos = 0;

    /* Walk @m one element at a time, stopping at the first equal field. */
    while (!found && pos < 64) {
        found = (extract64(m, pos, width) == needle);
        pos += width;
    }
    return found;
}
/*
 * Search both 64-bit halves (@m0, @m1) of a segment for the element held
 * in @n; true if do_match1() reports a hit in either half.
 */
static inline bool do_match2(uint64_t n, uint64_t m0,
                             uint64_t m1, int esz)
{
    /* The second half is only examined when the first one has no hit. */
    if (do_match1(n, m0, esz)) {
        return true;
    }
    return do_match1(n, m1, esz);
}
As an improvement, we can use the zero-in-word bit trick from
https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord:
/*
 * Branch-free variant: report whether the element in @n occurs in either
 * 64-bit word @m0 or @m1, using the "zero in word" bit trick.
 *
 * XOR-ing each word with dup_const(esz, n) turns an equal element into an
 * all-zero element; (x - lsb) & ~x & msb is then non-zero iff x contains a
 * zero element.
 * NOTE(review): this relies on dup_const(esz, c) replicating the low
 * (8 << esz)-bit element of c across the whole 64-bit word -- confirm.
 */
static inline bool do_match2(uint64_t n, uint64_t m0,
                             uint64_t m1, int esz)
{
    const int width = 8 << esz;
    const uint64_t lsb = dup_const(esz, 1);        /* low bit of each element */
    const uint64_t msb = lsb << (width - 1);       /* top bit of each element */
    const uint64_t needle = dup_const(esz, n);
    const uint64_t diff0 = needle ^ m0;
    const uint64_t diff1 = needle ^ m1;
    const uint64_t zero0 = (diff0 - lsb) & ~diff0;
    const uint64_t zero1 = (diff1 - lsb) & ~diff1;

    return ((zero0 | zero1) & msb) != 0;
}
> +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
> \
> +{
> \
> + intptr_t opr_sz = simd_oprsz(desc);
> \
> + uint32_t flags = PREDTEST_INIT;
> \
> + intptr_t i = opr_sz;
> \
> + do {
> \
> + uint64_t out = 0;
> \
> + uint64_t pg;
> \
> + do {
> \
> + i -= sizeof(TYPE), out <<= sizeof(TYPE);
> \
> + TYPE nn = *(TYPE *)(vn + H(i));
> \
> + out = (out & ~1ull) | DEFAULT_VAL;
> \
> + out |= NAME##_inner_loop(nn, vm + (i & -16));
> \
> + } while (i & 63);
> \
> + pg = *(uint64_t *)(vg + (i >> 3)) & MASK;
> \
> + out &= pg;
> \
> + *(uint64_t *)(vd + (i >> 3)) = out;
> \
> + flags = iter_predtest_bwd(out, pg, flags);
> \
> + } while (i > 0);
> \
> + return 0;
> \
/*
 * Shared body of the MATCH / NMATCH helpers.
 *
 * The operands are processed in 16-byte segments.  For each segment the two
 * 64-bit words of @vm are loaded once, 16 predicate bits are read from @vg,
 * and every element of @vn whose predicate bit is set is looked up in the
 * @vm segment with do_match2().  The result bit stored to @vd for an active
 * element is the lookup result XOR @nmatch; bits of inactive elements are
 * left zero.
 *
 * @vd:     destination predicate
 * @vn:     vector whose elements are searched for
 * @vm:     vector providing the per-segment set of candidates
 * @vg:     governing predicate
 * @desc:   descriptor; only simd_oprsz(desc) is used here
 * @esz:    log2 of the element size in bytes
 * @nmatch: true to invert the sense of the comparison
 *
 * Returns the flags accumulated by iter_predtest_fwd() over all segments.
 */
static inline uint32_t do_match(void *vd, void *vn,
                                void *vm, void *vg, uint32_t desc,
                                int esz, bool nmatch)
{
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    for (i = 0; i < opr_sz; i += 16) {
        /* Load the Zm segment once; it is reused for every Zn element. */
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        /*
         * One predicate bit per vector byte: 16 bits per segment.
         * NOTE(review): pg is not masked to the element size here, whereas
         * the quoted patch applied "& MASK" to the governing predicate
         * before the predtest -- confirm whether that is needed for
         * esz > 0 so stray bits do not reach iter_predtest_fwd().
         */
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    /* Shift the element at byte offset k down to bit 0. */
                    bool o = do_match2(n >> (k * 8),
                                       m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}
/*
 * Emit one out-of-line helper called HELPER(NAME) that forwards its
 * arguments to do_match() with a fixed element size (ESZ) and a fixed
 * sense (INV: false for MATCH, true for NMATCH).
 */
#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                             \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
}

/* Byte elements. */
DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)

/* Halfword elements. */
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
r~