From: Richard Henderson
Subject: Re: [PATCH v7 1/1] target/riscv: rvv: Use wider accesses for unit stride load/store
Date: Fri, 20 Dec 2024 08:28:37 -0800
User-agent: Mozilla Thunderbird
On 12/20/24 04:21, Craig Blackmore wrote:
Use atomic load/store functions to access multiple elements from host.
Co-authored-by: Paolo Savini <paolo.savini@embecosm.com>
Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
Signed-off-by: Craig Blackmore <craig.blackmore@embecosm.com>
---
target/riscv/vector_helper.c | 107 +++++++++++++++++++++++++++++++++--
1 file changed, 101 insertions(+), 6 deletions(-)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index a85dd1d200..c0179165ce 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -206,10 +206,102 @@ vext_continus_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
}
}
+#if !HOST_BIG_ENDIAN
+/* Atomic operations for load/store */
+
+/*
+ * Return true if there are enough elements for this size access and the
+ * pointer is aligned to the required atomicity.
+ */
+
+static inline QEMU_ALWAYS_INLINE bool
+ok_for_atomic(uint32_t size, void *host, uint32_t reg_start, uint32_t evl,
+ uint32_t log2_esz)
+{
+ return (reg_start + (size >> log2_esz)) <= evl
+ && ((uintptr_t) host % size) == 0;
+}
+
+#define GEN_VEXT_LDST_ATOMIC_HOST(SIZE, TYPE) \
+static inline QEMU_ALWAYS_INLINE void \
+vext_ldst_atom_##SIZE##_host(void *vd, uint32_t byte_offset, TYPE *host, \
+ bool is_load) \
+{ \
+ TYPE *vd_ptr = (TYPE *) (vd + byte_offset); \
+ if (is_load) { \
+ *vd_ptr = qatomic_read__nocheck(host); \
+ } else { \
+ qatomic_set__nocheck(host, *vd_ptr); \
+ } \
+} \
+
+GEN_VEXT_LDST_ATOMIC_HOST(2, uint16_t)
+GEN_VEXT_LDST_ATOMIC_HOST(4, uint32_t)
+#ifdef CONFIG_ATOMIC64
+GEN_VEXT_LDST_ATOMIC_HOST(8, uint64_t)
+#endif
+
+#ifdef CONFIG_INT128_TYPE
+static inline QEMU_ALWAYS_INLINE void
+vext_ldst_atom_16_host(void *vd, uint32_t byte_offset, Int128 *host,
+ bool is_load)
+{
+ Int128 *vd_ptr = (Int128 *) (vd + byte_offset);
+ if (is_load) {
+ *vd_ptr = atomic16_read_ro(host);
+ } else {
+ atomic16_set(host, *vd_ptr);
+ }
+}
+#endif
+#endif
Using CONFIG_INT128_TYPE is peeking under the hood a bit too far.
As it happens, the ifdef should not be required, because the atomic16_{read,set} symbols
are always present.
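For illustration, here is the same helper from the hunk above with the CONFIG_INT128_TYPE guard dropped (a sketch of that suggestion only, not a tested change):

  /* Defined unconditionally; whether this path is actually taken is
   * decided by the HAVE_ATOMIC128_RO / HAVE_ATOMIC128_RW checks at
   * the call site. */
  static inline QEMU_ALWAYS_INLINE void
  vext_ldst_atom_16_host(void *vd, uint32_t byte_offset, Int128 *host,
                         bool is_load)
  {
      Int128 *vd_ptr = (Int128 *) (vd + byte_offset);
      if (is_load) {
          *vd_ptr = atomic16_read_ro(host);
      } else {
          atomic16_set(host, *vd_ptr);
      }
  }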
+#ifdef CONFIG_INT128_TYPE
+ if (((is_load && HAVE_ATOMIC128_RO) || (!is_load && HAVE_ATOMIC128_RW))
+ && ok_for_atomic(16, host, reg_start, evl, log2_esz)) {
+ vext_ldst_atom_16_host(vd, byte_offset, host, is_load);
+ return 16;
+ }
+#endif
The tricky part is that if !HAVE_ATOMIC128_RO, then atomic16_set may be implemented with a
16-byte compare and store loop. This is expensive, and you certainly don't want to use
that. You'd prefer two 8-byte stores.
So I think the condition should be
if (HAVE_ATOMIC128_RO && (is_load || HAVE_ATOMIC128_RW))
with the above paragraph added as a comment.
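Concretely, that might look something like the following sketch (the comment wording is of course up to you):

  /*
   * If !HAVE_ATOMIC128_RO, atomic16_set may be implemented with a 16-byte
   * compare-and-store loop, which is more expensive than two 8-byte
   * stores; in that case, skip this and fall through to the 8-byte path.
   */
  if (HAVE_ATOMIC128_RO && (is_load || HAVE_ATOMIC128_RW)
      && ok_for_atomic(16, host, reg_start, evl, log2_esz)) {
      vext_ldst_atom_16_host(vd, byte_offset, host, is_load);
      return 16;
  }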
That said, RVV does not have 128-bit elements, so at no point do you actually require
128-bit atomicity.
+
+#ifdef CONFIG_ATOMIC64
+ if (ok_for_atomic(8, host, reg_start, evl, log2_esz)) {
+ vext_ldst_atom_8_host(vd, byte_offset, host, is_load);
+ return 8;
+ }
+#endif
+
+ if (ok_for_atomic(4, host, reg_start, evl, log2_esz)) {
+ vext_ldst_atom_4_host(vd, byte_offset, host, is_load);
+ return 4;
+ }
+
+ if (ok_for_atomic(2, host, reg_start, evl, log2_esz)) {
+ vext_ldst_atom_2_host(vd, byte_offset, host, is_load);
+ return 2;
+ }
+#endif
+
+ ldst_host(vd, reg_start, host);
+ return 1;
+}
+
static inline QEMU_ALWAYS_INLINE void
vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
void *vd, uint32_t evl, uint32_t reg_start, void *host,
- uint32_t esz, bool is_load)
+ uint32_t esz, bool is_load, uint32_t log2_esz)
{
#if HOST_BIG_ENDIAN
for (; reg_start < evl; reg_start++, host += esz) {
@@ -225,10 +317,13 @@ vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
} else {
memcpy(host, vd + byte_offset, size);
}
- } else {
- for (; reg_start < evl; reg_start++, host += esz) {
- ldst_host(vd, reg_start, host);
- }
+ return;
+ }
+ uint32_t chunk = 0;
+ for (; reg_start < evl; reg_start += (chunk >> log2_esz),
+ host += chunk) {
+ chunk = vext_ldst_atomic_chunk_host(env, ldst_host, vd, evl, reg_start,
+ host, esz, is_load, log2_esz);
}
My guess is the arrangement of the loop is not great, in that ok_for_atomic is buried at
the bottom of the call stack within this loop.
It would be better to test for alignment once, here outside the loop. If the alignment is
below esz, then the individual elements are misaligned, at which point the architecture
does not require *any* atomicity. In that case, I believe you could use the memcpy path
just above.
Perhaps something like
  uint32_t byte_offset = reg_start * esz;
  uint32_t size = (evl - reg_start) * esz;
  uint32_t test = (uintptr_t)host;

  if (esz == 1 || unlikely(test % esz != 0)) {
      if (is_load) {
          memcpy(vd + byte_offset, host, size);
      } else {
          memcpy(host, vd + byte_offset, size);
      }
      return;
  }

  /* Test that both alignment and size are multiples. */
  test |= size;

  if (HAVE_ATOMIC128_RO && (is_load || HAVE_ATOMIC128_RW) && test % 16 == 0) {
      do {
          vext_ldst_atom_16_host(vd, reg_start * esz, host, is_load);
          reg_start += 16 >> log2_esz;
          host += 16;
      } while (reg_start < evl);
      return;
  }

  if (esz < 8 && test % 8 == 0) {
      ...
  }
  ...

  for (; reg_start < evl; reg_start++, host += esz) {
      ldst_host(vd, reg_start, host);
  }
r~