qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH 5/5] tcg-arm: Better pipeline for softmmu tlb access


From: Richard Henderson
Subject: [Qemu-devel] [PATCH 5/5] tcg-arm: Better pipeline for softmmu tlb access
Date: Wed, 28 Aug 2013 15:33:33 -0700

Moves the load of the tlb addend earlier, to better hide load latency.
Avoids the writeback from the comparator load, since we know how
to adjust the offset between the two loads.

 :  e2862c03      add   r2, r6, #768    ; 0x300
 :  e20c00ff      and   r0, ip, #255    ; 0xff
 :  e0822280      add   r2, r2, r0, lsl #5
-:  e1e209d8      ldrd  r0, [r2, #152]!
+:  e1c209d8      ldrd  r0, [r2, #152]
 :  e31b0007      tst   fp, #7  ; 0x7
+:  e59220a8      ldr   r2, [r2, #168]
 :  0150068c      cmpeq r0, ip, lsl #13
 :  01510007      cmpeq r1, r7
-:  e5921010      ldr   r1, [r2, #16]
-:  018b40f1      strdeq        r4, [fp, r1]
+:  018b40f2      strdeq        r4, [fp, r2]
 :  1b0000cb      blne  0x75e7e6e4

Signed-off-by: Richard Henderson <address@hidden>
---
 tcg/arm/tcg-target.c | 97 +++++++++++++++++++++++-----------------------------
 1 file changed, 42 insertions(+), 55 deletions(-)

diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index f1e547f..6d03d6b 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1179,13 +1179,18 @@ QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8);
 QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1])
                   > 0xffff);
 
-/* Load and compare a TLB entry, leaving the flags set.  Leaves R2 pointing
-   to the tlb entry.  Clobbers R1 and TMP.  */
+/* Load and compare a TLB entry, leaving the flags set.  Leaves R2 
+   containing the tlb addend.  Clobbers R0, R1 and TMP.  */
 
 static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
-                             int s_bits, int tlb_offset)
+                             int s_bits, int mem_index, bool is_load)
 {
     TCGReg base = TCG_AREG0;
+    int tlb_offset = 
+        (is_load
+         ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
+         : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
+    int add_offset = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
 
     /* Should generate something like the following:
      * pre-v7:
@@ -1193,18 +1198,10 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
      *   add    r2, env, #off & 0xff00
      *   and    r0, tmp, #(CPU_TLB_SIZE - 1)                      (2)
      *   add    r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS               (3)
-     *   ldr    r0, [r2, #off & 0xff]!                            (4)
+     *   ldr    r0, [r2, #off & 0xff]                             (4)
      *   tst    addr_reg, #s_mask
-     *   cmpeq  r0, tmp, lsl #TARGET_PAGE_BITS                    (5)
-     *
-     * v7 (not implemented yet):
-     *   ubfx   r2, addr_reg, #TARGET_PAGE_BITS, #CPU_TLB_BITS    (1)
-     *   movw   tmp, #~TARGET_PAGE_MASK & ~s_mask
-     *   movw   r0, #off
-     *   add    r2, env, r2, lsl #CPU_TLB_ENTRY_BITS              (2)
-     *   bic    tmp, addr_reg, tmp
-     *   ldr    r0, [r2, r0]!                                     (3)
-     *   cmp    r0, tmp                                           (4)
+     *   ldr    r2, [r2, #addoff]                                 (5)
+     *   cmpeq  r0, tmp, lsl #TARGET_PAGE_BITS
      */
     tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
                     0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
@@ -1213,7 +1210,6 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     if (tlb_offset > 0xff) {
         tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
                         (24 << 7) | (tlb_offset >> 8));
-        tlb_offset &= 0xff;
         base = TCG_REG_R2;
     }
 
@@ -1226,14 +1222,12 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
        but due to how the pointer needs setting up, ldm isn't useful.
        Base arm5 doesn't have ldrd, but armv5te does.  */
     if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
-        tcg_out_memop_8(s, COND_AL, INSN_LDRD_IMM, TCG_REG_R0,
-                        TCG_REG_R2, tlb_offset, 1, 1);
+        tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, tlb_offset & 0xff);
     } else {
-        tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R0,
-                         TCG_REG_R2, tlb_offset, 1, 1);
+        tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, tlb_offset & 0xff);
         if (TARGET_LONG_BITS == 64) {
-            tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R1,
-                             TCG_REG_R2, 4, 1, 0);
+            tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
+                           (tlb_offset & 0xff) + 4);
         }
     }
 
@@ -1243,6 +1237,10 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                         0, addrlo, (1 << s_bits) - 1);
     }
 
+    /* Load the tlb addend.  */
+    tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2,
+                    add_offset - (tlb_offset & 0xff00));
+
     tcg_out_dat_reg(s, (s_bits ? COND_EQ : COND_AL), ARITH_CMP, 0,
                     TCG_REG_R0, TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
 
@@ -1360,53 +1358,48 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
     mem_index = *args;
     s_bits = opc & 3;
 
-    tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
-                     offsetof(CPUArchState, 
tlb_table[mem_index][0].addr_read));
-
-    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
-                    offsetof(CPUTLBEntry, addend)
-                    - offsetof(CPUTLBEntry, addr_read));
+    tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 1);
 
     switch (opc) {
     case 0:
-        tcg_out_ld8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         break;
     case 0 | 4:
-        tcg_out_ld8s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld8s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         break;
     case 1:
-        tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         if (bswap) {
             tcg_out_bswap16(s, COND_EQ, data_reg, data_reg);
         }
         break;
     case 1 | 4:
         if (bswap) {
-            tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
             tcg_out_bswap16s(s, COND_EQ, data_reg, data_reg);
         } else {
-            tcg_out_ld16s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_ld16s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         }
         break;
     case 2:
     default:
-        tcg_out_ld32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         if (bswap) {
             tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
         }
         break;
     case 3:
         if (bswap) {
-            tcg_out_ld32_rwb(s, COND_EQ, data_reg2, TCG_REG_R1, addr_reg);
-            tcg_out_ld32_12(s, COND_EQ, data_reg, TCG_REG_R1, 4);
+            tcg_out_ld32_rwb(s, COND_EQ, data_reg2, TCG_REG_R2, addr_reg);
+            tcg_out_ld32_12(s, COND_EQ, data_reg, TCG_REG_R2, 4);
             tcg_out_bswap32(s, COND_EQ, data_reg2, data_reg2);
             tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
         } else if (use_armv6_instructions
                    && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
-            tcg_out_ldrd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_ldrd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         } else {
-            tcg_out_ld32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
-            tcg_out_ld32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
+            tcg_out_ld32_rwb(s, COND_EQ, data_reg, TCG_REG_R2, addr_reg);
+            tcg_out_ld32_12(s, COND_EQ, data_reg2, TCG_REG_R2, 4);
         }
         break;
     }
@@ -1506,47 +1499,41 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
     mem_index = *args;
     s_bits = opc & 3;
 
-    tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
-                     offsetof(CPUArchState,
-                              tlb_table[mem_index][0].addr_write));
-
-    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
-                    offsetof(CPUTLBEntry, addend)
-                    - offsetof(CPUTLBEntry, addr_write));
+    tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 0);
 
     switch (opc) {
     case 0:
-        tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         break;
     case 1:
         if (bswap) {
             tcg_out_bswap16st(s, COND_EQ, TCG_REG_R0, data_reg);
-            tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
+            tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R2);
         } else {
-            tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         }
         break;
     case 2:
     default:
         if (bswap) {
             tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
-            tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
+            tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R2);
         } else {
-            tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         }
         break;
     case 3:
         if (bswap) {
             tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg2);
-            tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, addr_reg);
+            tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R2, addr_reg);
             tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
-            tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, 4);
+            tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R2, 4);
         } else if (use_armv6_instructions
                    && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
-            tcg_out_strd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_strd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         } else {
-            tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
-            tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
+            tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R2, addr_reg);
+            tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R2, 4);
         }
         break;
     }
-- 
1.8.1.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]