qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available


From: Richard Henderson
Subject: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
Date: Fri, 20 Dec 2013 15:00:12 -0800

As present on Atom and Haswell processors.

Signed-off-by: Richard Henderson <address@hidden>
---
 disas/i386.c          |   8 ++--
 tcg/i386/tcg-target.c | 127 ++++++++++++++++++++++++++++++++++----------------
 2 files changed, 91 insertions(+), 44 deletions(-)

Here's to "stress testing" a Haswell laptop before Santa delivers it.  ;-)


r~


diff --git a/disas/i386.c b/disas/i386.c
index 47f1f2e..beb33f0 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -2632,16 +2632,16 @@ static const struct dis386 prefix_user_table[][4] = {
 
   /* PREGRP87 */
   {
-    { "(bad)", { XX } },
-    { "(bad)", { XX } },
+    { "movbe", { Gv, M } },
+    { "movbe", { Gw, M } },
     { "(bad)", { XX } },
     { "crc32", { Gdq, { CRC32_Fixup, b_mode } } },
   },
 
   /* PREGRP88 */
   {
-    { "(bad)", { XX } },
-    { "(bad)", { XX } },
+    { "movbe", { M, Gv } },
+    { "movbe", { M, Gw } },
     { "(bad)", { XX } },
     { "crc32", { Gdq, { CRC32_Fixup, v_mode } } },
   },
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 7ac8e45..e76f3f3 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -99,18 +99,28 @@ static const int tcg_target_call_oarg_regs[] = {
 # define TCG_REG_L1 TCG_REG_EDX
 #endif
 
+#ifdef CONFIG_CPUID_H
+#include <cpuid.h>
+#endif
+
 /* For 32-bit, we are going to attempt to determine at runtime whether cmov
    is available.  However, the host compiler must supply <cpuid.h>, as we're
    not going to go so far as our own inline assembly.  */
 #if TCG_TARGET_REG_BITS == 64
 # define have_cmov 1
 #elif defined(CONFIG_CPUID_H)
-#include <cpuid.h>
 static bool have_cmov;
 #else
 # define have_cmov 0
 #endif
 
+/* There is no pre-processor definition for MOVBE to fall back on.  */
+#ifdef CONFIG_CPUID_H
+static bool have_movbe;
+#else
+#define have_movbe 0
+#endif
+
 static uint8_t *tb_ret_addr;
 
 static void patch_reloc(uint8_t *code_ptr, int type,
@@ -254,6 +264,7 @@ static inline int tcg_target_const_match(tcg_target_long 
val,
 # define P_REXB_RM     0
 # define P_GS           0
 #endif
+#define P_EXT38         0x8000          /* 0x0f 0x38 opcode prefix */
 
 #define OPC_ARITH_EvIz (0x81)
 #define OPC_ARITH_EvIb (0x83)
@@ -279,6 +290,8 @@ static inline int tcg_target_const_match(tcg_target_long 
val,
 #define OPC_MOVB_EvIz   (0xc6)
 #define OPC_MOVL_EvIz  (0xc7)
 #define OPC_MOVL_Iv     (0xb8)
+#define OPC_MOVBE_GyMy  (0xf0 | P_EXT | P_EXT38)
+#define OPC_MOVBE_MyGy  (0xf1 | P_EXT | P_EXT38)
 #define OPC_MOVSBL     (0xbe | P_EXT)
 #define OPC_MOVSWL     (0xbf | P_EXT)
 #define OPC_MOVSLQ     (0x63 | P_REXW)
@@ -400,6 +413,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int 
rm, int x)
 
     if (opc & P_EXT) {
         tcg_out8(s, 0x0f);
+        if (opc & P_EXT38) {
+            tcg_out8(s, 0x38);
+        }
     }
     tcg_out8(s, opc);
 }
@@ -411,6 +427,9 @@ static void tcg_out_opc(TCGContext *s, int opc)
     }
     if (opc & P_EXT) {
         tcg_out8(s, 0x0f);
+        if (opc & P_EXT38) {
+            tcg_out8(s, 0x38);
+        }
     }
     tcg_out8(s, opc);
 }
@@ -1336,7 +1355,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg 
datalo, TCGReg datahi,
                                    TCGReg base, intptr_t ofs, int seg,
                                    TCGMemOp memop)
 {
-    const TCGMemOp bswap = memop & MO_BSWAP;
+    const TCGMemOp real_bswap = memop & MO_BSWAP;
+    TCGMemOp bswap = real_bswap;
+    int movop = OPC_MOVL_GvEv;
+
+    if (real_bswap && have_movbe) {
+        bswap = 0;
+        movop = OPC_MOVBE_GyMy;
+    }
 
     switch (memop & MO_SSIZE) {
     case MO_UB:
@@ -1346,32 +1372,45 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, 
TCGReg datalo, TCGReg datahi,
         tcg_out_modrm_offset(s, OPC_MOVSBL + P_REXW + seg, datalo, base, ofs);
         break;
     case MO_UW:
-        tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
-        if (bswap) {
-            tcg_out_rolw_8(s, datalo);
+        if (real_bswap && have_movbe) {
+            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                 datalo, base, ofs);
+            tcg_out_ext16u(s, datalo, datalo);
+        } else {
+            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+            if (bswap) {
+                tcg_out_rolw_8(s, datalo);
+            }
         }
         break;
     case MO_SW:
-        if (bswap) {
-            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
-            tcg_out_rolw_8(s, datalo);
-            tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
+        if (real_bswap) {
+            if (have_movbe) {
+                tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                     datalo, base, ofs);
+            } else {
+                tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+                tcg_out_rolw_8(s, datalo);
+            }
+            tcg_out_ext16s(s, datalo, datalo, P_REXW);
         } else {
             tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
                                  datalo, base, ofs);
         }
         break;
     case MO_UL:
-        tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
         if (bswap) {
             tcg_out_bswap32(s, datalo);
         }
         break;
 #if TCG_TARGET_REG_BITS == 64
     case MO_SL:
-        if (bswap) {
-            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
-            tcg_out_bswap32(s, datalo);
+        if (real_bswap) {
+            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            if (bswap) {
+                tcg_out_bswap32(s, datalo);
+            }
             tcg_out_ext32s(s, datalo, datalo);
         } else {
             tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
@@ -1380,27 +1419,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, 
TCGReg datalo, TCGReg datahi,
 #endif
     case MO_Q:
         if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
             if (bswap) {
                 tcg_out_bswap64(s, datalo);
             }
         } else {
-            if (bswap) {
+            if (real_bswap) {
                 int t = datalo;
                 datalo = datahi;
                 datahi = t;
             }
             if (base != datalo) {
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datalo, base, ofs);
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
             } else {
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datahi, base, ofs + 4);
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datalo, base, ofs);
+                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
             }
             if (bswap) {
                 tcg_out_bswap32(s, datalo);
@@ -1476,13 +1510,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s, 
TCGReg datalo, TCGReg datahi,
                                    TCGReg base, intptr_t ofs, int seg,
                                    TCGMemOp memop)
 {
-    const TCGMemOp bswap = memop & MO_BSWAP;
-
     /* ??? Ideally we wouldn't need a scratch register.  For user-only,
        we could perform the bswap twice to restore the original value
        instead of moving to the scratch.  But as it is, the L constraint
        means that TCG_REG_L0 is definitely free here.  */
     const TCGReg scratch = TCG_REG_L0;
+    const TCGMemOp real_bswap = memop & MO_BSWAP;
+    TCGMemOp bswap = real_bswap;
+    int movop = OPC_MOVL_EvGv;
+
+    if (real_bswap && have_movbe) {
+        bswap = 0;
+        movop = OPC_MOVBE_MyGy;
+    }
 
     switch (memop & MO_SIZE) {
     case MO_8:
@@ -1501,8 +1541,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg 
datalo, TCGReg datahi,
             tcg_out_rolw_8(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg,
-                             datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
         break;
     case MO_32:
         if (bswap) {
@@ -1510,7 +1549,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg 
datalo, TCGReg datahi,
             tcg_out_bswap32(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
         break;
     case MO_64:
         if (TCG_TARGET_REG_BITS == 64) {
@@ -1519,8 +1558,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg 
datalo, TCGReg datahi,
                 tcg_out_bswap64(s, scratch);
                 datalo = scratch;
             }
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
         } else if (bswap) {
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
             tcg_out_bswap32(s, scratch);
@@ -1529,8 +1567,13 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg 
datalo, TCGReg datahi,
             tcg_out_bswap32(s, scratch);
             tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
         } else {
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datahi, base, ofs+4);
+            if (real_bswap) {
+                int t = datalo;
+                datalo = datahi;
+                datahi = t;
+            }
+            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
         }
         break;
     default:
@@ -2157,15 +2200,19 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 
 static void tcg_target_init(TCGContext *s)
 {
-    /* For 32-bit, 99% certainty that we're running on hardware that supports
-       cmov, but we still need to check.  In case cmov is not available, we'll
-       use a small forward branch.  */
+    unsigned a, b, c, d;
+
+    if (__get_cpuid(1, &a, &b, &c, &d)) {
 #ifndef have_cmov
-    {
-        unsigned a, b, c, d;
-        have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV));
-    }
+        /* For 32-bit, 99% certainty that we're running on hardware that
+           supports cmov, but we still need to check.  In case cmov is not
+           available, we'll use a small forward branch.  */
+        have_cmov = (d & bit_CMOV) != 0;
 #endif
+#ifndef have_movbe
+        have_movbe = (c & bit_MOVBE) != 0;
+#endif
+    }
 
     if (TCG_TARGET_REG_BITS == 64) {
         tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
-- 
1.8.4.2




reply via email to

[Prev in Thread] Current Thread [Next in Thread]