[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [RFC v3 PATCH 02/14] tcg/i386: Add support for fence
From: Paolo Bonzini
Subject: Re: [Qemu-devel] [RFC v3 PATCH 02/14] tcg/i386: Add support for fence
Date: Tue, 21 Jun 2016 09:24:46 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Thunderbird/45.1.1
On 18/06/2016 06:03, Pranith Kumar wrote:
> Generate mfence/sfence/lfence instruction on SSE2 enabled
> processors. For older processors, generate a 'lock orl $0,0(%esp)'
> instruction which has full ordering semantics.
>
> Signed-off-by: Pranith Kumar <address@hidden>
> [rth: Check for sse2, fallback to locked memory op otherwise.]
> Signed-off-by: Richard Henderson <address@hidden>
> ---
> tcg/i386/tcg-target.inc.c | 47
> +++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 47 insertions(+)
>
> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index 317484c..0748652 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c
> @@ -121,6 +121,16 @@ static bool have_cmov;
> # define have_cmov 0
> #endif
>
> +/* For 32-bit, we are going to attempt to determine at runtime whether
> + sse2 support is available. */
> +#if TCG_TARGET_REG_BITS == 64 || defined(__SSE2__)
> +# define have_sse2 1
> +#elif defined(CONFIG_CPUID_H) && defined(bit_SSE2)
> +static bool have_sse2;
> +#else
> +# define have_sse2 0
> +#endif
> +
> /* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
> going to attempt to determine at runtime whether movbe is available. */
> #if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
> @@ -686,6 +696,32 @@ static inline void tcg_out_pushi(TCGContext *s,
> tcg_target_long val)
> }
> }
>
> +static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
> +{
> + if (have_sse2) {
> + tcg_out16(s, 0xae0f);
> + switch (a0 & TCG_MO_ALL) {
> + case TCG_MO_LD_LD:
> + /* lfence */
> + tcg_out8(s, 0xe8);
> + break;
> + case TCG_MO_ST_ST:
> + /* sfence */
> + tcg_out8(s, 0xf8);
> + break;
These two barriers are unnecessary on x86, and so is TCG_MO_LD_ST.
> + default:
> + /* mfence */
> + tcg_out8(s, 0xf0);
> + break;
Please use lock orl here too; it turns out to be faster.
> + }
> + } else {
> + /* lock orl $0,0(%esp) */
> + tcg_out8(s, 0xf0);
> + tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
> + tcg_out8(s, 0);
This is only needed for TCG_MO_ST_LD.
Paolo
> + }
> +}
> +
> static inline void tcg_out_push(TCGContext *s, int reg)
> {
> tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
> @@ -2120,6 +2156,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode
> opc,
> }
> break;
>
> + case INDEX_op_mb:
> + assert(args[0] != 0);
> + tcg_out_mb(s, args[0]);
> + break;
> case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
> case INDEX_op_mov_i64:
> case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
> @@ -2185,6 +2225,8 @@ static const TCGTargetOpDef x86_op_defs[] = {
> { INDEX_op_add2_i32, { "r", "r", "0", "1", "ri", "ri" } },
> { INDEX_op_sub2_i32, { "r", "r", "0", "1", "ri", "ri" } },
>
> + { INDEX_op_mb, { } },
> +
> #if TCG_TARGET_REG_BITS == 32
> { INDEX_op_brcond2_i32, { "r", "r", "ri", "ri" } },
> { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },
> @@ -2362,6 +2404,11 @@ static void tcg_target_init(TCGContext *s)
> available, we'll use a small forward branch. */
> have_cmov = (d & bit_CMOV) != 0;
> #endif
> +#ifndef have_sse2
> + /* Likewise, almost all hardware supports SSE2, but we do
> + have a locked memory operation to use as a substitute. */
> + have_sse2 = (d & bit_SSE2) != 0;
> +#endif
> #ifndef have_movbe
> /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
> need to probe for it. */
>