From: Richard Henderson
Subject: Re: [PATCH v7 1/1] target/riscv: rvv: Use wider accesses for unit stride load/store
Date: Fri, 20 Dec 2024 08:28:37 -0800
User-agent: Mozilla Thunderbird
On 12/20/24 04:21, Craig Blackmore wrote:
Use atomic load/store functions to access multiple elements from host.
Co-authored-by: Paolo Savini <paolo.savini@embecosm.com>
Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
Signed-off-by: Craig Blackmore <craig.blackmore@embecosm.com>
---
target/riscv/vector_helper.c | 107 +++++++++++++++++++++++++++++++++--
1 file changed, 101 insertions(+), 6 deletions(-)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index a85dd1d200..c0179165ce 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -206,10 +206,102 @@ vext_continus_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
}
}
+#if !HOST_BIG_ENDIAN
+/* Atomic operations for load/store */
+
+/*
+ * Return true if there are enough elements for this size access and the
+ * pointer is aligned to the required atomicity.
+ */
+
+static inline QEMU_ALWAYS_INLINE bool
+ok_for_atomic(uint32_t size, void *host, uint32_t reg_start, uint32_t evl,
+ uint32_t log2_esz)
+{
+ return (reg_start + (size >> log2_esz)) <= evl
+ && ((uintptr_t) host % size) == 0;
+}
+
+#define GEN_VEXT_LDST_ATOMIC_HOST(SIZE, TYPE) \
+static inline QEMU_ALWAYS_INLINE void \
+vext_ldst_atom_##SIZE##_host(void *vd, uint32_t byte_offset, TYPE *host, \
+ bool is_load) \
+{ \
+ TYPE *vd_ptr = (TYPE *) (vd + byte_offset); \
+ if (is_load) { \
+ *vd_ptr = qatomic_read__nocheck(host); \
+ } else { \
+ qatomic_set__nocheck(host, *vd_ptr); \
+ } \
+} \
+
+GEN_VEXT_LDST_ATOMIC_HOST(2, uint16_t)
+GEN_VEXT_LDST_ATOMIC_HOST(4, uint32_t)
+#ifdef CONFIG_ATOMIC64
+GEN_VEXT_LDST_ATOMIC_HOST(8, uint64_t)
+#endif
+
+#ifdef CONFIG_INT128_TYPE
+static inline QEMU_ALWAYS_INLINE void
+vext_ldst_atom_16_host(void *vd, uint32_t byte_offset, Int128 *host,
+ bool is_load)
+{
+ Int128 *vd_ptr = (Int128 *) (vd + byte_offset);
+ if (is_load) {
+ *vd_ptr = atomic16_read_ro(host);
+ } else {
+ atomic16_set(host, *vd_ptr);
+ }
+}
+#endif
+#endif
Using CONFIG_INT128_TYPE is peeking under the hood a bit too far.
As it happens, the ifdef should not be required, because the atomic16_{read,set} symbols
are always present.
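For illustration, here is the same helper from the hunk above with the CONFIG_INT128_TYPE guard dropped (a sketch of that suggestion only, not a tested change):

  /* Defined unconditionally; whether this path is actually taken is
   * decided by the HAVE_ATOMIC128_RO / HAVE_ATOMIC128_RW checks at
   * the call site. */
  static inline QEMU_ALWAYS_INLINE void
  vext_ldst_atom_16_host(void *vd, uint32_t byte_offset, Int128 *host,
                         bool is_load)
  {
      Int128 *vd_ptr = (Int128 *) (vd + byte_offset);
      if (is_load) {
          *vd_ptr = atomic16_read_ro(host);
      } else {
          atomic16_set(host, *vd_ptr);
      }
  }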
+#ifdef CONFIG_INT128_TYPE
+ if (((is_load && HAVE_ATOMIC128_RO) || (!is_load && HAVE_ATOMIC128_RW))
+ && ok_for_atomic(16, host, reg_start, evl, log2_esz)) {
+ vext_ldst_atom_16_host(vd, byte_offset, host, is_load);
+ return 16;
+ }
+#endif
The tricky part is that if !HAVE_ATOMIC128_RO, then atomic16_set may be implemented with a
16-byte compare and store loop. This is expensive, and you certainly don't want to use
that. You'd prefer two 8-byte stores.
So I think the condition should be
if (HAVE_ATOMIC128_RO && (is_load || HAVE_ATOMIC128_RW))
with the above paragraph added as a comment.
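Concretely, that might look something like the following sketch (the comment wording is of course up to you):

  /*
   * If !HAVE_ATOMIC128_RO, atomic16_set may be implemented with a 16-byte
   * compare-and-store loop, which is more expensive than two 8-byte
   * stores; in that case, skip this and fall through to the 8-byte path.
   */
  if (HAVE_ATOMIC128_RO && (is_load || HAVE_ATOMIC128_RW)
      && ok_for_atomic(16, host, reg_start, evl, log2_esz)) {
      vext_ldst_atom_16_host(vd, byte_offset, host, is_load);
      return 16;
  }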
That said, RVV does not have 128-bit elements, so at no point do you actually require
128-bit atomicity.
+
+#ifdef CONFIG_ATOMIC64
+ if (ok_for_atomic(8, host, reg_start, evl, log2_esz)) {
+ vext_ldst_atom_8_host(vd, byte_offset, host, is_load);
+ return 8;
+ }
+#endif
+
+ if (ok_for_atomic(4, host, reg_start, evl, log2_esz)) {
+ vext_ldst_atom_4_host(vd, byte_offset, host, is_load);
+ return 4;
+ }
+
+ if (ok_for_atomic(2, host, reg_start, evl, log2_esz)) {
+ vext_ldst_atom_2_host(vd, byte_offset, host, is_load);
+ return 2;
+ }
+#endif
+
+ ldst_host(vd, reg_start, host);
+ return 1;
+}
+
static inline QEMU_ALWAYS_INLINE void
vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
void *vd, uint32_t evl, uint32_t reg_start, void *host,
- uint32_t esz, bool is_load)
+ uint32_t esz, bool is_load, uint32_t log2_esz)
{
#if HOST_BIG_ENDIAN
for (; reg_start < evl; reg_start++, host += esz) {
@@ -225,10 +317,13 @@ vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
} else {
memcpy(host, vd + byte_offset, size);
}
- } else {
- for (; reg_start < evl; reg_start++, host += esz) {
- ldst_host(vd, reg_start, host);
- }
+ return;
+ }
+ uint32_t chunk = 0;
+ for (; reg_start < evl; reg_start += (chunk >> log2_esz),
+ host += chunk) {
+ chunk = vext_ldst_atomic_chunk_host(env, ldst_host, vd, evl, reg_start,
+ host, esz, is_load, log2_esz);
}
My guess is the arrangement of the loop is not great, in that ok_for_atomic is buried at
the bottom of the call stack within this loop.
It would be better to test for alignment once, here outside the loop. If the alignment is
below esz, then the individual elements are misaligned, at which point the architecture
does not require *any* atomicity. In that case, I believe you could use the memcpy path
just above.
Perhaps something like
  uint32_t byte_offset = reg_start * esz;
  uint32_t size = (evl - reg_start) * esz;
  uint32_t test = (uintptr_t)host;

  if (esz == 1 || unlikely(test % esz != 0)) {
      if (is_load) {
          memcpy(vd + byte_offset, host, size);
      } else {
          memcpy(host, vd + byte_offset, size);
      }
      return;
  }

  /* Test that both alignment and size are multiples. */
  test |= size;

  if (HAVE_ATOMIC128_RO && (is_load || HAVE_ATOMIC128_RW) && test % 16 == 0) {
      do {
          vext_ldst_atom_16_host(vd, reg_start * esz, host, is_load);
          reg_start += 16 >> log2_esz;
          host += 16;
      } while (reg_start < evl);
      return;
  }

  if (esz < 8 && test % 8 == 0) {
      ...
  }
  ...

  for (; reg_start < evl; reg_start++, host += esz) {
      ldst_host(vd, reg_start, host);
  }
r~