[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [RFC 22/30] target-arm: emulate LL/SC using cmpxchg helpers
From: |
Emilio G. Cota |
Subject: |
[Qemu-devel] [RFC 22/30] target-arm: emulate LL/SC using cmpxchg helpers |
Date: |
Mon, 27 Jun 2016 15:02:08 -0400 |
Emulating LL/SC with cmpxchg is not correct, since it can
suffer from the ABA problem. Portable parallel code, however,
is written assuming only cmpxchg--and not LL/SC--is available.
This means that in practice emulating LL/SC with cmpxchg is
a viable alternative.
The appended patch emulates LL/SC pairs in ARM with cmpxchg helpers.
This works in both user and system mode. In usermode, it avoids
pausing all other CPUs to perform the LL/SC pair. The resulting
performance and scalability improvement is significant, as the
plots below show. They plot the throughput of atomic_add-bench
compiled for ARM and executed on a 64-core x86 machine.
Hi-res plots: http://imgur.com/a/aNQpB
atomic_add-bench: 1000000 ops/thread, [0,1] range
9 ++---------+----------+----------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
8 +Emaster +-H--+ ++
| | |
7 ++E ++
| | |
6 ++++ ++
| | |
5 ++ | ++
4 ++ | ++
| | |
3 ++ | ++
| | |
2 ++ | ++
|H++E+--- +++ ---+E+------+E+------+E|
1 +++ +E+-----+E+------+E+------+E+------+E+-- +++ +++ ++
++H+ + +++ + +++ ++++ + + + |
0 ++--H----H-+-----H----+----------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,2] range
16 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
14 ++master +-H--+ ++
| | |
12 ++| ++
| E |
10 ++| ++
| | |
8 ++++ ++
|E+| |
| | |
6 ++ | ++
| | |
4 ++ | ++
| +E+--- +++ +++ +++ ---+E+------+E|
2 +H+ +E+------E-------+E+-----+E+------+E+------+E+-- +++
+ | + +++ + ++++ + + + |
0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,128] range
70 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + ++++ + |
60 ++master +-H--+ ----E------+E+-------++
| -+E+--- +++ +++ +E|
| +++ ---- +++ ++|
50 ++ +++ ---+E+- ++
| -E--- |
40 ++ ---+++ ++
| +++--- |
| -+E+ |
30 ++ +++---- ++
| +E+ |
20 ++ +++-- ++
| +E+ |
|+E+ |
10 +E+ ++
+ + + + + + + |
0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,1024] range
120 ++---------+---------+----------+---------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
| master +-H--+ ++|
100 ++ ----E+
| +++ ---+E+--- ++|
| --E--- +++ |
80 ++ ---- +++ ++
| ---+E+- |
60 ++ -+E+-- ++
| +++ ---- +++ |
| -+E+- |
40 ++ +++---- ++
| +++ ---+E+ |
| -+E+--- |
20 ++ +E+ ++
|+E+++ |
+E+ + + + + + + |
0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
Signed-off-by: Emilio G. Cota <address@hidden>
---
target-arm/translate.c | 80 +++++++++-----------------------------------------
1 file changed, 14 insertions(+), 66 deletions(-)
diff --git a/target-arm/translate.c b/target-arm/translate.c
index bd5d5cb..0d4a1a9 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -7715,15 +7715,6 @@ static void gen_logicq_cc(TCGv_i32 lo, TCGv_i32 hi)
tcg_gen_or_i32(cpu_ZF, lo, hi);
}
-/* Load/Store exclusive instructions are implemented by remembering
- the value/address loaded, and seeing if these are the same
- when the store is performed. This should be sufficient to implement
- the architecturally mandated semantics, and avoids having to monitor
- regular stores.
-
- In system emulation mode only one CPU will be running at once, so
- this sequence is effectively atomic. In user emulation mode we
- throw an exception and handle the atomic operation elsewhere. */
static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
TCGv_i32 addr, int size)
{
@@ -7768,21 +7759,11 @@ static void gen_clrex(DisasContext *s)
tcg_gen_movi_i64(cpu_exclusive_addr, -1);
}
-#ifdef CONFIG_USER_ONLY
-static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
- TCGv_i32 addr, int size)
-{
- tcg_gen_extu_i32_i64(cpu_exclusive_test, addr);
- tcg_gen_movi_i32(cpu_exclusive_info,
- size | (rd << 4) | (rt << 8) | (rt2 << 12));
- gen_exception_internal_insn(s, 4, EXCP_STREX);
-}
-#else
static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
TCGv_i32 addr, int size)
{
- TCGv_i32 tmp;
- TCGv_i64 val64, extaddr;
+ TCGv_i32 t0, t1, t2;
+ TCGv_i64 extaddr;
TCGLabel *done_label;
TCGLabel *fail_label;
@@ -7799,69 +7780,36 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label);
tcg_temp_free_i64(extaddr);
- tmp = tcg_temp_new_i32();
+ t0 = tcg_temp_new_i32();
+ t1 = load_reg(s, rt);
switch (size) {
case 0:
- gen_aa32_ld8u(s, tmp, addr, get_mem_index(s));
+ gen_helper_cmpxchgb(t0, cpu_env, addr, cpu_exclusive_val, t1);
break;
case 1:
- gen_aa32_ld16u(s, tmp, addr, get_mem_index(s));
+ gen_helper_cmpxchgw(t0, cpu_env, addr, cpu_exclusive_val, t1);
break;
case 2:
- case 3:
- gen_aa32_ld32u(s, tmp, addr, get_mem_index(s));
- break;
- default:
- abort();
- }
-
- val64 = tcg_temp_new_i64();
- if (size == 3) {
- TCGv_i32 tmp2 = tcg_temp_new_i32();
- TCGv_i32 tmp3 = tcg_temp_new_i32();
- tcg_gen_addi_i32(tmp2, addr, 4);
- gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s));
- tcg_temp_free_i32(tmp2);
- tcg_gen_concat_i32_i64(val64, tmp, tmp3);
- tcg_temp_free_i32(tmp3);
- } else {
- tcg_gen_extu_i32_i64(val64, tmp);
- }
- tcg_temp_free_i32(tmp);
-
- tcg_gen_brcond_i64(TCG_COND_NE, val64, cpu_exclusive_val, fail_label);
- tcg_temp_free_i64(val64);
-
- tmp = load_reg(s, rt);
- switch (size) {
- case 0:
- gen_aa32_st8(s, tmp, addr, get_mem_index(s));
+ gen_helper_cmpxchgl(t0, cpu_env, addr, cpu_exclusive_val, t1);
break;
- case 1:
- gen_aa32_st16(s, tmp, addr, get_mem_index(s));
- break;
- case 2:
case 3:
- gen_aa32_st32(s, tmp, addr, get_mem_index(s));
+ t2 = load_reg(s, rt2);
+ gen_helper_cmpxchgq(t0, cpu_env, addr, cpu_exclusive_val, t1, t2);
+ tcg_temp_free_i32(t2);
break;
default:
abort();
}
- tcg_temp_free_i32(tmp);
- if (size == 3) {
- tcg_gen_addi_i32(addr, addr, 4);
- tmp = load_reg(s, rt2);
- gen_aa32_st32(s, tmp, addr, get_mem_index(s));
- tcg_temp_free_i32(tmp);
- }
- tcg_gen_movi_i32(cpu_R[rd], 0);
+ tcg_temp_free_i32(t1);
+ tcg_gen_mov_i32(cpu_R[rd], t0);
+ tcg_temp_free_i32(t0);
tcg_gen_br(done_label);
+
gen_set_label(fail_label);
tcg_gen_movi_i32(cpu_R[rd], 1);
gen_set_label(done_label);
tcg_gen_movi_i64(cpu_exclusive_addr, -1);
}
-#endif
/* gen_srs:
* @env: CPUARMState
--
2.5.0
- [Qemu-devel] [RFC 10/30] cpu_ldst: add cpu_atomic helpers, (continued)
- [Qemu-devel] [RFC 10/30] cpu_ldst: add cpu_atomic helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 11/30] target-i386: add atomic helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 06/30] target-i386: emulate LOCK'ed cmpxchg8b/16b using cmpxchg helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 05/30] target-i386: emulate LOCK'ed cmpxchg using cmpxchg helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 13/30] target-i386: emulate LOCK'ed INC using atomic helper, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 15/30] target-i386: emulate LOCK'ed NEG using cmpxchg helper, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 27/30] target-arm: emulate aarch64's LL/SC using cmpxchg helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 25/30] helper: add DEF_HELPER_6, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 16/30] target-i386: emulate LOCK'ed XADD using atomic helper, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 23/30] target-arm: add atomic_xchg helper, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 22/30] target-arm: emulate LL/SC using cmpxchg helpers,
Emilio G. Cota <=
- [Qemu-devel] [RFC 29/30] linux-user: remove handling of aarch64's EXCP_STREX, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 20/30] target-i386: remove helper_lock(), Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 26/30] target-arm: add cmpxchg helpers for aarch64, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 17/30] target-i386: emulate LOCK'ed BTX ops using atomic helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 24/30] target-arm: emulate SWP with atomic_xchg helper, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 30/30] target-arm: remove EXCP_STREX + cpu_exclusive_{test, info}, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 18/30] target-i386: emulate XCHG using atomic helper, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 28/30] linux-user: remove handling of ARM's EXCP_STREX, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 21/30] target-arm: add cmpxchg helpers, Emilio G. Cota, 2016/06/27
- [Qemu-devel] [RFC 19/30] tests: add atomic_add-bench, Emilio G. Cota, 2016/06/27