qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH 1/1] tcg/aarch64: Implement tlb lookup fast path


From: Jani Kokkonen
Subject: [Qemu-devel] [PATCH 1/1] tcg/aarch64: Implement tlb lookup fast path
Date: Thu, 20 Jun 2013 12:53:23 +0200
User-agent: Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20130509 Thunderbird/17.0.6

Supports CONFIG_QEMU_LDST_OPTIMIZATION

Signed-off-by: Jani Kokkonen <address@hidden>
---
 configure                |    2 +-
 include/exec/exec-all.h  |   16 +++-
 tcg/aarch64/tcg-target.c |  197 ++++++++++++++++++++++++++++++++++------------
 3 files changed, 162 insertions(+), 53 deletions(-)

diff --git a/configure b/configure
index bed0104..6a36682 100755
--- a/configure
+++ b/configure
@@ -3599,7 +3599,7 @@ echo "libs_softmmu=$libs_softmmu" >> $config_host_mak
 echo "ARCH=$ARCH" >> $config_host_mak
 
 case "$cpu" in
-  arm|i386|x86_64|ppc)
+  arm|i386|x86_64|ppc|aarch64)
     # The TCG interpreter currently does not support ld/st optimization.
     if test "$tcg_interpreter" = "no" ; then
         echo "CONFIG_QEMU_LDST_OPTIMIZATION=y" >> $config_host_mak
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 5c31863..39c28d1 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -19,9 +19,7 @@
 
 #ifndef _EXEC_ALL_H_
 #define _EXEC_ALL_H_
-
 #include "qemu-common.h"
-
 /* allow to see translation results - the slowdown should be negligible, so we 
leave it */
 #define DEBUG_DISAS
 
@@ -358,6 +356,20 @@ static inline uintptr_t tcg_getpc_ldst(uintptr_t ra)
                                    not the start of the next opcode  */
     return ra;
 }
+#elif defined(__aarch64__)
+#  define GETRA()       ((uintptr_t)__builtin_return_address(0))
+#  define GETPC_LDST()  tcg_getpc_ldst(GETRA())
+static inline uintptr_t tcg_getpc_ldst(uintptr_t ra)
+{
+    int32_t b;
+    ra += 4;                    /* skip one instruction */
+    b = *(int32_t *)ra;         /* load the branch insn */
+    b = (b << 6) >> (6 - 2);    /* extract the displacement */
+    ra += b;                    /* apply the displacement  */
+    ra -= 4;                    /* return a pointer into the current opcode,
+                                   not the start of the next opcode  */
+    return ra;
+}
 # else
 #  error "CONFIG_QEMU_LDST_OPTIMIZATION needs GETPC_LDST() implementation!"
 # endif
diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index 8bb195e..03da082 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -706,6 +706,41 @@ static inline void tcg_out_uxt(TCGContext *s, int s_bits,
     tcg_out_ubfm(s, 0, rd, rn, 0, bits);
 }
 
+static inline void tcg_out_addi(TCGContext *s, int ext,
+                                TCGReg rd, TCGReg rn, unsigned int aimm)
+{
+    /* add immediate aimm unsigned 12bit value (with LSL 0 or 12) */
+    /* using ADD 0x11000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
+    unsigned int base = ext ? 0x91000000 : 0x11000000;
+
+    if (aimm <= 0xfff) {
+        aimm <<= 10;
+    } else {
+        /* we can only shift left by 12, on assert we cannot represent */
+        assert(!(aimm & 0xfff));
+        assert(aimm <= 0xfff000);
+        base |= 1 << 22; /* apply LSL 12 */
+        aimm >>= 2;
+    }
+
+    tcg_out32(s, base | aimm | (rn << 5) | rd);
+}
+
+static inline void tcg_out_subi(TCGContext *s, int ext,
+                                TCGReg rd, TCGReg rn, unsigned int aimm)
+{
+    /* sub immediate aimm unsigned 12bit value (we use LSL 0 - no shift) */
+    /* using SUB 0x51000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
+    unsigned int base = ext ? 0xd1000000 : 0x51000000;
+    assert(aimm <= 0xfff);
+    tcg_out32(s, base | (aimm << 10) | (rn << 5) | rd);
+}
+
+static inline void tcg_out_nop(TCGContext *s)
+{
+    tcg_out32(s, 0xd503201f);
+}
+
 #ifdef CONFIG_SOFTMMU
 #include "exec/softmmu_defs.h"
 
@@ -727,7 +762,106 @@ static const void * const qemu_st_helpers[4] = {
     helper_stq_mmu,
 };
 
-#else /* !CONFIG_SOFTMMU */
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+{
+    reloc_pc19(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
+    tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
+    tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, lb->addrlo_reg);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
+                 (tcg_target_long)qemu_ld_helpers[lb->opc & 3]);
+    tcg_out_callr(s, TCG_REG_TMP);
+    if (lb->opc & 0x04) {
+        tcg_out_sxt(s, 1, lb->opc & 3, lb->datalo_reg, TCG_REG_X0);
+    } else {
+        tcg_out_movr(s, 1, lb->datalo_reg, TCG_REG_X0);
+    }
+
+    tcg_out_goto(s, (tcg_target_long)lb->raddr);
+}
+
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+{
+    reloc_pc19(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
+
+    tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
+    tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, lb->addrlo_reg);
+    tcg_out_movr(s, 1, TCG_REG_X2, lb->datalo_reg);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
+                 (tcg_target_long)qemu_st_helpers[lb->opc & 3]);
+    tcg_out_callr(s, TCG_REG_TMP);
+
+    tcg_out_nop(s);
+    tcg_out_goto(s, (tcg_target_long)lb->raddr);
+}
+
+void tcg_out_tb_finalize(TCGContext *s)
+{
+    int i;
+    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
+        TCGLabelQemuLdst *label = &s->qemu_ldst_labels[i];
+        if (label->is_ld) {
+            tcg_out_qemu_ld_slow_path(s, label);
+        } else {
+            tcg_out_qemu_st_slow_path(s, label);
+        }
+    }
+}
+
+static void add_qemu_ldst_label(TCGContext *s, int is_ld, int opc,
+                                int data_reg, int addr_reg,
+                                int mem_index,
+                                uint8_t *raddr, uint8_t *label_ptr)
+{
+    int idx;
+    TCGLabelQemuLdst *label;
+
+    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
+        tcg_abort();
+    }
+
+    idx = s->nb_qemu_ldst_labels++;
+    label = &s->qemu_ldst_labels[idx];
+    label->is_ld = is_ld;
+    label->opc = opc;
+    label->datalo_reg = data_reg;
+    label->addrlo_reg = addr_reg;
+    label->mem_index = mem_index;
+    label->raddr = raddr;
+    label->label_ptr[0] = label_ptr;
+}
+
+/* Load and compare a TLB entry, emitting the conditional jump to the
+slow path for the failure case, which will be patched later when finalizing
+the slow pathClobbers X0,X1,X2,X3 and TMP.  */
+
+static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg,
+            int s_bits, uint8_t **label_ptr, int tlb_offset, int is_read)
+{
+    TCGReg base = TCG_AREG0;
+
+    tcg_out_ubfm(s, TARGET_LONG_BITS == 64 ? 1 : 0, TCG_REG_X0, addr_reg,
+           TARGET_PAGE_BITS, TARGET_PAGE_BITS + CPU_TLB_BITS);
+
+    tcg_out_andi(s, TARGET_LONG_BITS == 64 ? 1 : 0, TCG_REG_X3, addr_reg,
+           TARGET_LONG_BITS - TARGET_PAGE_BITS + s_bits,
+                   TARGET_LONG_BITS - TARGET_PAGE_BITS);
+
+    tcg_out_addi(s, 1, TCG_REG_X2, base, tlb_offset & 0xfff000);
+    tcg_out_arith(s, ARITH_ADD, 1, TCG_REG_X2, TCG_REG_X2,
+                   TCG_REG_X0, -CPU_TLB_ENTRY_BITS);
+    tcg_out_ldst(s, TARGET_LONG_BITS == 64 ? LDST_64 : LDST_32,
+                  LDST_LD, TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff);
+    tcg_out_ldst(s, LDST_64, LDST_LD, TCG_REG_X1, TCG_REG_X2,
+        (tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend) -
+             (is_read ? offsetof(CPUTLBEntry, addr_read) :
+                   offsetof(CPUTLBEntry, addr_write))));
+
+    tcg_out_cmp(s, 1, TCG_REG_X0, TCG_REG_X3, 0);
+    *label_ptr = s->code_ptr;
+    tcg_out_goto_cond_noaddr(s, TCG_COND_NE);
+}
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, int opc, TCGReg data_r,
                                    TCGReg addr_r, TCGReg off_r)
@@ -822,6 +956,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg 
*args, int opc)
     TCGReg addr_reg, data_reg;
 #ifdef CONFIG_SOFTMMU
     int mem_index, s_bits;
+    uint8_t *label_ptr;
 #endif
     data_reg = args[0];
     addr_reg = args[1];
@@ -829,23 +964,11 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg 
*args, int opc)
 #ifdef CONFIG_SOFTMMU
     mem_index = args[2];
     s_bits = opc & 3;
-
-    /* TODO: insert TLB lookup here */
-
-    /* all arguments passed via registers */
-    tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
-    tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, addr_reg);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, mem_index);
-    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
-                 (tcg_target_long)qemu_ld_helpers[s_bits]);
-    tcg_out_callr(s, TCG_REG_TMP);
-
-    if (opc & 0x04) { /* sign extend */
-        tcg_out_sxt(s, 1, s_bits, data_reg, TCG_REG_X0);
-    } else {
-        tcg_out_movr(s, 1, data_reg, TCG_REG_X0);
-    }
-
+    tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr,
+              offsetof(CPUArchState, tlb_table[mem_index][0].addr_read), 1);
+    tcg_out_qemu_ld_direct(s, opc, data_reg, addr_reg, TCG_REG_X1);
+    add_qemu_ldst_label(s, 1, opc, data_reg, addr_reg,
+                        mem_index, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     tcg_out_qemu_ld_direct(s, opc, data_reg, addr_reg,
                            GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR);
@@ -857,25 +980,19 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg 
*args, int opc)
     TCGReg addr_reg, data_reg;
 #ifdef CONFIG_SOFTMMU
     int mem_index, s_bits;
+    uint8_t *label_ptr;
 #endif
     data_reg = args[0];
     addr_reg = args[1];
-
 #ifdef CONFIG_SOFTMMU
     mem_index = args[2];
     s_bits = opc & 3;
 
-    /* TODO: insert TLB lookup here */
-
-    /* all arguments passed via registers */
-    tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
-    tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, addr_reg);
-    tcg_out_movr(s, 1, TCG_REG_X2, data_reg);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, mem_index);
-    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
-                 (tcg_target_long)qemu_st_helpers[s_bits]);
-    tcg_out_callr(s, TCG_REG_TMP);
-
+    tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr,
+             offsetof(CPUArchState, tlb_table[mem_index][0].addr_write), 0);
+    tcg_out_qemu_st_direct(s, opc, data_reg, addr_reg, TCG_REG_X1);
+    add_qemu_ldst_label(s, 0, opc, data_reg, addr_reg,
+                        mem_index, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     tcg_out_qemu_st_direct(s, opc, data_reg, addr_reg,
                            GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR);
@@ -1318,26 +1435,6 @@ static void tcg_target_init(TCGContext *s)
     tcg_add_target_add_op_defs(aarch64_op_defs);
 }
 
-static inline void tcg_out_addi(TCGContext *s, int ext,
-                                TCGReg rd, TCGReg rn, unsigned int aimm)
-{
-    /* add immediate aimm unsigned 12bit value (we use LSL 0 - no shift) */
-    /* using ADD 0x11000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
-    unsigned int base = ext ? 0x91000000 : 0x11000000;
-    assert(aimm <= 0xfff);
-    tcg_out32(s, base | (aimm << 10) | (rn << 5) | rd);
-}
-
-static inline void tcg_out_subi(TCGContext *s, int ext,
-                                TCGReg rd, TCGReg rn, unsigned int aimm)
-{
-    /* sub immediate aimm unsigned 12bit value (we use LSL 0 - no shift) */
-    /* using SUB 0x51000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
-    unsigned int base = ext ? 0xd1000000 : 0x51000000;
-    assert(aimm <= 0xfff);
-    tcg_out32(s, base | (aimm << 10) | (rn << 5) | rd);
-}
-
 static void tcg_target_qemu_prologue(TCGContext *s)
 {
     /* NB: frame sizes are in 16 byte stack units! */
-- 
1.7.9.5





reply via email to

[Prev in Thread] Current Thread [Next in Thread]