[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[RFC PATCH v3 5/5] target/riscv: Inline unit-stride ld/st and corresponding functions for performance
From: |
Max Chou |
Subject: |
[RFC PATCH v3 5/5] target/riscv: Inline unit-stride ld/st and corresponding functions for performance |
Date: |
Thu, 13 Jun 2024 22:19:06 +0800 |
In the vector unit-stride load/store helper functions, the vext_ldst_us
and vext_ldst_whole functions account for most of the execution time.
Inlining these functions avoids the function call overhead and improves
the performance of the helper functions.
Signed-off-by: Max Chou <max.chou@sifive.com>
---
target/riscv/vector_helper.c | 64 +++++++++++++++++++-----------------
1 file changed, 34 insertions(+), 30 deletions(-)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 09c9b231c3f..4a21064a366 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -408,20 +408,22 @@ typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env,
abi_ptr addr,
uint32_t idx, void *vd, uintptr_t retaddr);
typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
-#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
-static void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \
- uint32_t byte_off, void *vd, uintptr_t retaddr) \
-{ \
- uint8_t *reg = ((uint8_t *)vd + byte_off); \
- ETYPE *cur = ((ETYPE *)reg); \
- *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
-} \
- \
-static void NAME##_host(void *vd, uint32_t byte_off, void *host) \
-{ \
- ETYPE val = LDSUF##_p(host); \
- uint8_t *reg = (uint8_t *)(vd + byte_off); \
- *(ETYPE *)(reg) = val; \
+#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
+static inline QEMU_ALWAYS_INLINE \
+void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \
+ uint32_t byte_off, void *vd, uintptr_t retaddr) \
+{ \
+ uint8_t *reg = ((uint8_t *)vd + byte_off); \
+ ETYPE *cur = ((ETYPE *)reg); \
+ *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
+} \
+ \
+static inline QEMU_ALWAYS_INLINE \
+void NAME##_host(void *vd, uint32_t byte_off, void *host) \
+{ \
+ ETYPE val = LDSUF##_p(host); \
+ uint8_t *reg = (uint8_t *)(vd + byte_off); \
+ *(ETYPE *)(reg) = val; \
}
GEN_VEXT_LD_ELEM(lde_b, uint8_t, H1, ldub)
@@ -429,20 +431,22 @@ GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
-#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
-static void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \
- uint32_t byte_off, void *vd, uintptr_t retaddr) \
-{ \
- uint8_t *reg = ((uint8_t *)vd + byte_off); \
- ETYPE data = *((ETYPE *)reg); \
- cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
-} \
- \
-static void NAME##_host(void *vd, uint32_t byte_off, void *host) \
-{ \
- uint8_t *reg = ((uint8_t *)vd + byte_off); \
- ETYPE val = *(ETYPE *)(reg); \
- STSUF##_p(host, val); \
+#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
+static inline QEMU_ALWAYS_INLINE \
+void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \
+ uint32_t byte_off, void *vd, uintptr_t retaddr) \
+{ \
+ uint8_t *reg = ((uint8_t *)vd + byte_off); \
+ ETYPE data = *((ETYPE *)reg); \
+ cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
+} \
+ \
+static inline QEMU_ALWAYS_INLINE \
+void NAME##_host(void *vd, uint32_t byte_off, void *host) \
+{ \
+ uint8_t *reg = ((uint8_t *)vd + byte_off); \
+ ETYPE val = *(ETYPE *)(reg); \
+ STSUF##_p(host, val); \
}
GEN_VEXT_ST_ELEM(ste_b, uint8_t, H1, stb)
@@ -604,7 +608,7 @@ GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
*/
/* unmasked unit-stride load and store operation */
-static void
+static inline QEMU_ALWAYS_INLINE void
vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
vext_ldst_elem_fn_tlb *ldst_tlb,
vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
@@ -1006,7 +1010,7 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb)
/*
* load and store whole register instructions
*/
-static void
+static inline QEMU_ALWAYS_INLINE void
vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
vext_ldst_elem_fn_tlb *ldst_tlb,
vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
--
2.34.1
- [RFC PATCH v3 0/5] Improve the performance of RISC-V vector unit-stride/whole register ld/st instructions, Max Chou, 2024/06/13
- [RFC PATCH v3 1/5] accel/tcg: Avoid unnecessary call overhead from qemu_plugin_vcpu_mem_cb, Max Chou, 2024/06/13
- [RFC PATCH v3 3/5] target/riscv: rvv: Provide a fast path using direct access to host ram for unit-stride whole register load/store, Max Chou, 2024/06/13
- [RFC PATCH v3 4/5] target/riscv: rvv: Provide group continuous ld/st flow for unit-stride ld/st instructions, Max Chou, 2024/06/13
- [RFC PATCH v3 2/5] target/riscv: rvv: Provide a fast path using direct access to host ram for unmasked unit-stride load/store, Max Chou, 2024/06/13
- [RFC PATCH v3 5/5] target/riscv: Inline unit-stride ld/st and corresponding functions for performance,
Max Chou <=
- Re: [RFC PATCH v3 0/5] Improve the performance of RISC-V vector unit-stride/whole register ld/st instructions, Daniel Henrique Barboza, 2024/06/13