qemu-devel

Re: [PATCH v7 1/1] target/riscv: rvv: Use wider accesses for unit stride load/store


From: Richard Henderson
Subject: Re: [PATCH v7 1/1] target/riscv: rvv: Use wider accesses for unit stride load/store
Date: Fri, 20 Dec 2024 08:28:37 -0800
User-agent: Mozilla Thunderbird

On 12/20/24 04:21, Craig Blackmore wrote:
Use atomic load/store functions to access multiple elements from host.

Co-authored-by: Paolo Savini <paolo.savini@embecosm.com>

Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
Signed-off-by: Craig Blackmore <craig.blackmore@embecosm.com>
---
  target/riscv/vector_helper.c | 107 +++++++++++++++++++++++++++++++++--
  1 file changed, 101 insertions(+), 6 deletions(-)

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index a85dd1d200..c0179165ce 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -206,10 +206,102 @@ vext_continus_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
      }
  }
+#if !HOST_BIG_ENDIAN
+/* Atomic operations for load/store */
+
+/*
+ * Return true if there are enough elements for this size access and the
+ * pointer is aligned to the required atomicity.
+ */
+
+static inline QEMU_ALWAYS_INLINE bool
+ok_for_atomic(uint32_t size, void *host, uint32_t reg_start, uint32_t evl,
+              uint32_t log2_esz)
+{
+    return (reg_start + (size >> log2_esz)) <= evl
+            && ((uintptr_t) host % size) == 0;
+}
+
+#define GEN_VEXT_LDST_ATOMIC_HOST(SIZE, TYPE)                             \
+static inline QEMU_ALWAYS_INLINE void                                     \
+vext_ldst_atom_##SIZE##_host(void *vd, uint32_t byte_offset, TYPE *host,  \
+                             bool is_load)                                \
+{                                                                         \
+    TYPE *vd_ptr = (TYPE *) (vd + byte_offset);                           \
+    if (is_load) {                                                        \
+        *vd_ptr = qatomic_read__nocheck(host);                            \
+    } else {                                                              \
+        qatomic_set__nocheck(host, *vd_ptr);                              \
+    }                                                                     \
+}                                                                         \
+
+GEN_VEXT_LDST_ATOMIC_HOST(2, uint16_t)
+GEN_VEXT_LDST_ATOMIC_HOST(4, uint32_t)
+#ifdef CONFIG_ATOMIC64
+GEN_VEXT_LDST_ATOMIC_HOST(8, uint64_t)
+#endif
+
+#ifdef CONFIG_INT128_TYPE
+static inline QEMU_ALWAYS_INLINE void
+vext_ldst_atom_16_host(void *vd, uint32_t byte_offset, Int128 *host,
+                       bool is_load)
+{
+    Int128 *vd_ptr = (Int128 *) (vd + byte_offset);
+    if (is_load) {
+        *vd_ptr = atomic16_read_ro(host);
+    } else {
+        atomic16_set(host, *vd_ptr);
+    }
+}
+#endif
+#endif

Using CONFIG_INT128_TYPE is peeking under the hood a bit too far.
As it happens, the ifdef should not be required, because the atomic16_{read,set} symbols are always present.
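
I.e. the helper could simply be defined unconditionally, something like this (untested sketch; the run-time checks stay at the call site):

    /*
     * No CONFIG_INT128_TYPE guard: the atomic16_read_ro/atomic16_set
     * declarations are always present.  Callers still test
     * HAVE_ATOMIC128_RO/RW before taking this path.
     */
    static inline QEMU_ALWAYS_INLINE void
    vext_ldst_atom_16_host(void *vd, uint32_t byte_offset, Int128 *host,
                           bool is_load)
    {
        Int128 *vd_ptr = (Int128 *) (vd + byte_offset);
        if (is_load) {
            *vd_ptr = atomic16_read_ro(host);
        } else {
            atomic16_set(host, *vd_ptr);
        }
    }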

+#ifdef CONFIG_INT128_TYPE
+    if (((is_load && HAVE_ATOMIC128_RO) || (!is_load && HAVE_ATOMIC128_RW))
+        && ok_for_atomic(16, host, reg_start, evl, log2_esz)) {
+        vext_ldst_atom_16_host(vd, byte_offset, host, is_load);
+        return 16;
+    }
+#endif

The tricky part is that if !HAVE_ATOMIC128_RO, then atomic16_set may be implemented with a 16-byte compare and store loop. This is expensive, and you certainly don't want to use that. You'd prefer two 8-byte stores.

So I think the condition should be

    if (HAVE_ATOMIC128_RO && (is_load || HAVE_ATOMIC128_RW))

with the above paragraph added as a comment.
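
Applied to the hunk quoted above, that might look like (untested):

    /*
     * Without HAVE_ATOMIC128_RO, atomic16_set may be implemented with a
     * 16-byte compare and store loop, which is more expensive than the
     * two 8-byte stores we would otherwise use.
     */
    if (HAVE_ATOMIC128_RO && (is_load || HAVE_ATOMIC128_RW)
        && ok_for_atomic(16, host, reg_start, evl, log2_esz)) {
        vext_ldst_atom_16_host(vd, byte_offset, host, is_load);
        return 16;
    }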

That said, RVV does not have 128-bit elements, so at no point do you actually require 128-bit atomicity.

+
+#ifdef CONFIG_ATOMIC64
+    if (ok_for_atomic(8, host, reg_start, evl, log2_esz)) {
+        vext_ldst_atom_8_host(vd, byte_offset, host, is_load);
+        return 8;
+    }
+#endif
+
+    if (ok_for_atomic(4, host, reg_start, evl, log2_esz)) {
+        vext_ldst_atom_4_host(vd, byte_offset, host, is_load);
+        return 4;
+    }
+
+    if (ok_for_atomic(2, host, reg_start, evl, log2_esz)) {
+        vext_ldst_atom_2_host(vd, byte_offset, host, is_load);
+        return 2;
+    }
+#endif
+
+    ldst_host(vd, reg_start, host);
+    return 1;
+}
+
  static inline QEMU_ALWAYS_INLINE void
  vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
                          void *vd, uint32_t evl, uint32_t reg_start, void *host,
-                        uint32_t esz, bool is_load)
+                        uint32_t esz, bool is_load, uint32_t log2_esz)
  {
  #if HOST_BIG_ENDIAN
      for (; reg_start < evl; reg_start++, host += esz) {
@@ -225,10 +317,13 @@ vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
          } else {
              memcpy(host, vd + byte_offset, size);
          }
-    } else {
-        for (; reg_start < evl; reg_start++, host += esz) {
-            ldst_host(vd, reg_start, host);
-        }
+        return;
+    }
+    uint32_t chunk = 0;
+    for (; reg_start < evl; reg_start += (chunk >> log2_esz),
+                            host += chunk) {
+        chunk = vext_ldst_atomic_chunk_host(env, ldst_host, vd, evl, reg_start,
+                                            host, esz, is_load, log2_esz);
      }

My guess is the arrangement of the loop is not great, in that ok_for_atomic is buried at the bottom of the call stack within this loop.

It would be better to test for alignment once, here at the outside. If the alignment is below esz, then the individual elements are misaligned, at which point the architecture does not require *any* atomicity, and I believe you could use the memcpy path just above.

Perhaps something like

    uint32_t byte_offset = reg_start * esz;
    uint32_t size = (evl - reg_start) * esz;
    uint32_t test = (uintptr_t)host;

    if (esz == 1 || unlikely(test % esz != 0)) {
        if (is_load) {
            memcpy(vd + byte_offset, host, size);
        } else {
            memcpy(host, vd + byte_offset, size);
        }
        return;
    }

    /* Test that both alignment and size are multiples. */
    test |= size;
    if (HAVE_ATOMIC128_RO && (is_load || HAVE_ATOMIC128_RW)
        && (test % 16 == 0)) {
        do {
            vext_ldst_atom_16_host(vd, reg_start * esz, host, is_load);
            reg_start += 16 >> log2_esz;
            host += 16;
        } while (reg_start < evl);
        return;
    }
    if (esz < 8 && test % 8 == 0) {
        ...
    }
    ...
    for (; reg_start < evl; reg_start++, host += esz) {
        ldst_host(vd, reg_start, host);
    }
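
The elided 8- and 4-byte cases would repeat the same shape; e.g. for 8 bytes (untested, and still modulo the CONFIG_ATOMIC64 guard from the patch):

    if (esz < 8 && test % 8 == 0) {
        do {
            vext_ldst_atom_8_host(vd, reg_start * esz, host, is_load);
            reg_start += 8 >> log2_esz;
            host += 8;
        } while (reg_start < evl);
        return;
    }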


r~


