qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 1/9] TCG "sync" op


From: Ulrich Hecht
Subject: Re: [Qemu-devel] [PATCH 1/9] TCG "sync" op
Date: Fri, 16 Oct 2009 18:37:31 +0200
User-agent: KMail/1.9.10

On Friday 16 October 2009, Aurelien Jarno wrote:
> IMHO, the correct way to do it is to use the following code, assuming
> you want to use 64-bit TCG regs to hold 32-bit values (that's
> something that is not really clear in your next patch):
>
> - for register load:
> | static TCGv load_reg(int reg)
> | {
> |    TCGv r = tcg_temp_new_i64();
> |    tcg_gen_ext32u_i64(r, tcgregs[reg]);
> |    return r;
> | }
> |
> | static void store_reg32(int reg, TCGv v)
> | {
> |    tcg_gen_ext32u_i64(v, v); /* may be optional */
> |    tcg_gen_andi_i64(tcgregs[reg], tcgregs[reg],
> | 0xffffffff00000000ULL); tcg_gen_or_i64(tcgregs[reg], tcgregs[reg],
> | v);
> | }

This is _extremely_ detrimental to performance. The point of the sync op 
is that in most cases it's a nop because registers are usually used with 
the same bitness again and again. The sign extension/masking stuff is 
done every time a register is accessed as 32 bits, which is the most 
common case. Compare the translation of the following sequence of 
instructions:

IN: _dl_aux_init
0x0000000080044ff6:  lhi        %r4,0
0x0000000080044ffa:  lhi        %r5,0
0x0000000080044ffe:  lhi        %r0,0

with sync:

OP:
 mov_i32 loc0,global_cc
 movi_i32 tmp1,$0x0
 sync_i64 R4
 mov_i32 r4,tmp1
 movi_i32 tmp1,$0x0
 sync_i64 R5
 mov_i32 r5,tmp1
 movi_i32 tmp1,$0x0
 sync_i64 R0
 mov_i32 r0,tmp1
 mov_i32 global_cc,loc0
 movi_i64 tmp2,$0x80045002
 st_i64 tmp2,env,$0x158
 exit_tb $0x0

OUT: [size=61]
0x6019a030:  mov    0x160(%r14),%ebp
0x6019a037:  mov    %rbp,%rbx
0x6019a03a:  mov    $0x80045002,%r12d
0x6019a040:  mov    %r12,0x158(%r14)
0x6019a047:  mov    %ebp,0xd1a0(%r14)
0x6019a04e:  mov    %ebx,0x160(%r14)
0x6019a055:  xor    %ebp,%ebp
0x6019a057:  mov    %ebp,(%r14)
0x6019a05a:  xor    %ebp,%ebp
0x6019a05c:  mov    %ebp,0x20(%r14)
0x6019a060:  xor    %ebp,%ebp
0x6019a062:  mov    %ebp,0x28(%r14)
0x6019a066:  xor    %eax,%eax
0x6019a068:  jmpq   0x621dc8ce


with sign extension:

OP:
 mov_i32 loc0,global_cc
 movi_i32 tmp1,$0x0
 ext32u_i64 tmp1,tmp1
 movi_i64 tmp2,$0xffffffff00000000
 and_i64 R4,R4,tmp2
 or_i64 R4,R4,tmp1
 movi_i32 tmp1,$0x0
 ext32u_i64 tmp1,tmp1
 movi_i64 tmp2,$0xffffffff00000000
 and_i64 R5,R5,tmp2
 or_i64 R5,R5,tmp1
 movi_i32 tmp1,$0x0
 ext32u_i64 tmp1,tmp1
 movi_i64 tmp2,$0xffffffff00000000
 and_i64 R0,R0,tmp2
 or_i64 R0,R0,tmp1
 mov_i32 global_cc,loc0
 movi_i64 tmp2,$0x80045002
 st_i64 tmp2,env,$0x158
 exit_tb $0x0

OUT: [size=126]
0x6019af10:  mov    0x160(%r14),%ebp
0x6019af17:  xor    %ebx,%ebx
0x6019af19:  mov    %ebx,%ebx
0x6019af1b:  mov    0x20(%r14),%r12
0x6019af1f:  mov    $0xffffffff00000000,%r13
0x6019af29:  and    %r13,%r12
0x6019af2c:  or     %rbx,%r12
0x6019af2f:  xor    %ebx,%ebx
0x6019af31:  mov    %ebx,%ebx
0x6019af33:  mov    0x28(%r14),%r13
0x6019af37:  mov    $0xffffffff00000000,%r15
0x6019af41:  and    %r15,%r13
0x6019af44:  or     %rbx,%r13
0x6019af47:  xor    %ebx,%ebx
0x6019af49:  mov    %ebx,%ebx
0x6019af4b:  mov    (%r14),%r15
0x6019af4e:  mov    $0xffffffff00000000,%r10
0x6019af58:  and    %r10,%r15
0x6019af5b:  or     %rbx,%r15
0x6019af5e:  mov    %rbp,%rbx
0x6019af61:  mov    $0x80045002,%r10d
0x6019af67:  mov    %r10,0x158(%r14)
0x6019af6e:  mov    %ebp,0xd1a0(%r14)
0x6019af75:  mov    %ebx,0x160(%r14)
0x6019af7c:  mov    %r15,(%r14)
0x6019af7f:  mov    %r12,0x20(%r14)
0x6019af83:  mov    %r13,0x28(%r14)
0x6019af87:  xor    %eax,%eax
0x6019af89:  jmpq   0x621dd78e

It's more than twice the size and performs ten memory accesses instead of 
seven.

CU
Uli

-- 
SUSE LINUX Products GmbH, GF: Markus Rex, HRB 16746 (AG Nürnberg)




reply via email to

[Prev in Thread] Current Thread [Next in Thread]