qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [RFC PATCH v5 2/5] target/riscv: rvv: Provide a fast path using dire


From: Max Chou
Subject: Re: [RFC PATCH v5 2/5] target/riscv: rvv: Provide a fast path using direct access to host ram for unmasked unit-stride load/store
Date: Tue, 30 Jul 2024 22:24:52 +0800
User-agent: Mozilla Thunderbird

On 2024/7/25 1:51 PM, Richard Henderson wrote:
On 7/17/24 23:39, Max Chou wrote:
@@ -199,7 +212,7 @@ static void
  vext_ldst_stride(void *vd, void *v0, target_ulong base,
                   target_ulong stride, CPURISCVState *env,
                   uint32_t desc, uint32_t vm,
-                 vext_ldst_elem_fn *ldst_elem,
+                 vext_ldst_elem_fn_tlb * ldst_elem,

Extra space: "type *var"
I will fix this part at v6.
Thanks.


                   uint32_t log2_esz, uintptr_t ra)
  {
      uint32_t i, k;
@@ -221,7 +234,8 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base,
                  continue;
              }
              target_ulong addr = base + stride * i + (k << log2_esz);
-            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
+            ldst_elem(env, adjust_addr(env, addr),
+                      (i + k * max_elems) << log2_esz, vd, ra);

Is this some sort of bug fix?  It doesn't seem related...
If it is a bug fix, it should be a separate patch.
This modification is caused by replacing the idx by byte offset for the vector register accessing flow in the basic GEN_VEXT_LD/ST_ELEMENT macros. But I think that it's not a good idea now, it will get unexpected result when the endian between host and guest are different.

I'll fix this part at v6.

  /*
   * unit-stride: access elements stored contiguously in memory
   */
    /* unmasked unit-stride load and store operation */
+static void
+vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
+                  uint32_t elems, uint32_t nf, uint32_t max_elems,
+                  uint32_t log2_esz, bool is_load,
+                  vext_ldst_elem_fn_tlb *ldst_tlb,
+                  vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
+{
+    void *host;
+    int i, k, flags;
+    uint32_t esz = 1 << log2_esz;
+    uint32_t size = (elems * nf) << log2_esz;
+    uint32_t evl = env->vstart + elems;
+    int mmu_index = riscv_env_mmu_index(env, false);
+    MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;

You may want to pass in mmu_index, so that it is computed once in the caller.
Thanks for the suggestion.
I'll try to pass in mmu_index at v6.


+
+    /* Check page permission/pmp/watchpoint/etc. */
+    flags = probe_access_flags(env, adjust_addr(env, addr), size, access_type,
+                               mmu_index, true, &host, ra);
+
+    if (host && flags == 0) {

If flags == 0, host will always be non-null.
You only need flags == 0.
Thanks for the suggestion.
I'll update this part at v6.


  static void
  vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, -             vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
-             uintptr_t ra)
+             vext_ldst_elem_fn_tlb *ldst_tlb,
+             vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
+             uint32_t evl, uintptr_t ra, bool is_load)
  {
-    uint32_t i, k;
+    uint32_t k;
+    target_ulong page_split, elems, addr;
      uint32_t nf = vext_nf(desc);
      uint32_t max_elems = vext_max_elems(desc, log2_esz);
      uint32_t esz = 1 << log2_esz;
+    uint32_t msize = nf * esz;
        VSTART_CHECK_EARLY_EXIT(env);
  -    /* load bytes from guest memory */
-    for (i = env->vstart; i < evl; env->vstart = ++i) {
-        k = 0;
-        while (k < nf) {
-            target_ulong addr = base + ((i * nf + k) << log2_esz);
-            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
-            k++;
+    while (env->vstart < evl) {

VSTART_CHECK_EARLY_EXIT has taken care of this condition for the first page. We know that one contiguous operation can only consume 1024 bytes, so cannot cross two pages.  Therefore this loop executes exactly once or twice.

I think it would be better to unroll this by hand:

    calc page range
    if (likely(elems)) {
        vext_page_ldst_us(... elems ...);
    }
    if (unlikely(env->vstart < evl)) {
        if (unlikely(page_split % msize)) {
           ...
        }
        vext_page_ldst_us(... evl - vstart ...);
    }
Yes, one contiguous operation can only consume 1024 bytes in vector unit-stride ld/st. It's a good idea to unroll it here.
I'll unroll this part at v6.
Thanks for the suggestion.

I'll try to extend the original loop implementation to other vector ld/st instructions that may cross multiple pages in the future.

+        /* Calculate page range */
+        addr = base + ((env->vstart * nf) << log2_esz);
+        page_split = -(addr | TARGET_PAGE_MASK);
+        /* Get number of elements */
+        elems = page_split / msize;
+        if (unlikely(env->vstart + elems >= evl)) {
+            elems = evl - env->vstart;
+        }
+
+        /* Load/store elements in page */
+        vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
+                          is_load, ldst_tlb, ldst_host, ra);
+
+        /* Cross page element */
+        if (unlikely((page_split % msize) != 0 && (env->vstart + 1) < evl)) {
+            for (k = 0; k < nf; k++) {
+                addr = base + ((env->vstart * nf + k) << log2_esz);
+                ldst_tlb(env, adjust_addr(env, addr),
+                         (k * max_elems + env->vstart) << log2_esz, vd, ra);
+            }
+            env->vstart++;
          }
      }
-    env->vstart = 0;
  +    env->vstart = 0;
      vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
  }


r~




reply via email to

[Prev in Thread] Current Thread [Next in Thread]