qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH] tcg/arm: improve direct jump


From: Aurelien Jarno
Subject: [Qemu-devel] [PATCH] tcg/arm: improve direct jump
Date: Thu, 10 Dec 2015 09:02:16 +0100

Use a ldr pc, [pc, #-4] kind of branch for direct jumps. This removes the
need to flush the icache on TB linking, and allows us to remove the limit
on the code generation buffer.

Cc: Richard Henderson <address@hidden>
Cc: TeLeMan <address@hidden>
Cc: Andrzej Zaborowski <address@hidden>
Signed-off-by: Aurelien Jarno <address@hidden>
---
 include/exec/exec-all.h | 24 ++++--------------------
 tcg/arm/tcg-target.c    |  8 +++-----
 translate-all.c         |  2 --
 3 files changed, 7 insertions(+), 27 deletions(-)

Note: I don't really get the reason for the current 16MB limit. With the
standard branch instructions the offset is coded on 24 bits, but shifted
right by 2, which should give us +/-32MB jumps, and therefore a 32MB
limit.

Therefore it might be a good idea to benchmark the original QEMU vs a
QEMU with a 32MB buffer vs QEMU with this patch.

If mixing data and instructions like in this patch causes too much
performance trouble, at least on ARMv7 we might want to try movw +
movt + movpc. It's only 4 bytes longer than the current patch.

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index d900b0d..3cd63c9 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -274,26 +274,10 @@ void aarch64_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr);
 #elif defined(__arm__)
 static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
 {
-#if !QEMU_GNUC_PREREQ(4, 1)
-    register unsigned long _beg __asm ("a1");
-    register unsigned long _end __asm ("a2");
-    register unsigned long _flg __asm ("a3");
-#endif
-
-    /* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */
-    *(uint32_t *)jmp_addr =
-        (*(uint32_t *)jmp_addr & ~0xffffff)
-        | (((addr - (jmp_addr + 8)) >> 2) & 0xffffff);
-
-#if QEMU_GNUC_PREREQ(4, 1)
-    __builtin___clear_cache((char *) jmp_addr, (char *) jmp_addr + 4);
-#else
-    /* flush icache */
-    _beg = jmp_addr;
-    _end = jmp_addr + 4;
-    _flg = 0;
-    __asm __volatile__ ("swi 0x9f0002" : : "r" (_beg), "r" (_end), "r" (_flg));
-#endif
+    /* Patch the branch destination. It uses a ldr pc, [pc, #-4] kind
+       of branch so we write absolute address and we don't need to
+       flush icache. */
+    *(uint32_t *)jmp_addr = addr;
 }
 #elif defined(__sparc__) || defined(__mips__)
 void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr);
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index 3edf6a6..f28b9ba 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -986,10 +986,6 @@ static inline void tcg_out_st8(TCGContext *s, int cond,
         tcg_out_st8_12(s, cond, rd, rn, offset);
 }
 
-/* The _goto case is normally between TBs within the same code buffer, and
- * with the code buffer limited to 16MB we wouldn't need the long case.
- * But we also use it for the tail-call to the qemu_ld/st helpers, which does.
- */
 static inline void tcg_out_goto(TCGContext *s, int cond, tcg_insn_unit *addr)
 {
     intptr_t addri = (intptr_t)addr;
@@ -1649,8 +1645,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
             /* Direct jump method */
+            tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4);
             s->tb_jmp_offset[args[0]] = tcg_current_code_size(s);
-            tcg_out_b_noaddr(s, COND_AL);
+            /* Skip over address */
+            s->code_ptr++;
         } else {
             /* Indirect jump method */
             intptr_t ptr = (intptr_t)(s->tb_next + args[0]);
diff --git a/translate-all.c b/translate-all.c
index 042a857..1ca113c 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -472,8 +472,6 @@ static inline PageDesc *page_find(tb_page_addr_t index)
 # define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
 #elif defined(__aarch64__)
 # define MAX_CODE_GEN_BUFFER_SIZE  (128ul * 1024 * 1024)
-#elif defined(__arm__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (16u * 1024 * 1024)
 #elif defined(__s390x__)
   /* We have a +- 4GB range on the branches; leave some slop.  */
 # define MAX_CODE_GEN_BUFFER_SIZE  (3ul * 1024 * 1024 * 1024)
-- 
2.6.2




reply via email to

[Prev in Thread] Current Thread [Next in Thread]