When running with a single vcpu, we can return a constant instead of a
load when accessing cpu_index.
A side effect is that all TCG operations using cpu_index are optimized,
most notably scoreboard accesses.
When running a simple loop in user-mode, the speedup is around 20%.
Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
---
accel/tcg/plugin-gen.c | 7 +++++++
plugins/core.c | 13 +++++++++++++
2 files changed, 20 insertions(+)
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index 0f47bfbb489..2eabeecbdcf 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -102,6 +102,13 @@ static void gen_disable_mem_helper(void)
static TCGv_i32 gen_cpu_index(void)
{
+ /*
+ * Optimize when we run with a single vcpu. All values using cpu_index,
+ * including scoreboard index, will be optimized out.
+ */
+ if (qemu_plugin_num_vcpus() == 1) {
+ return tcg_constant_i32(0);
+ }
TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
tcg_gen_ld_i32(cpu_index, tcg_env,
-offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
diff --git a/plugins/core.c b/plugins/core.c
index bb105e8e688..8e32ca5ee08 100644
--- a/plugins/core.c
+++ b/plugins/core.c
@@ -266,6 +266,19 @@ static void qemu_plugin_vcpu_init__async(CPUState *cpu,
run_on_cpu_data unused)
assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
qemu_rec_mutex_lock(&plugin.lock);
+
+ /*
+ * We want to flush the TBs when a second cpu appears: plugin code
+ * generated while num_vcpus == 1 uses a constant 0 for cpu_index.
+ */
+ if (plugin.num_vcpus == 1) {
+ qemu_rec_mutex_unlock(&plugin.lock);
+ start_exclusive();
+ qemu_rec_mutex_lock(&plugin.lock);
+ tb_flush(cpu);
+ end_exclusive();
+ }
+
plugin.num_vcpus = MAX(plugin.num_vcpus, cpu->cpu_index + 1);
plugin_cpu_update__locked(&cpu->cpu_index, NULL, NULL);
success = g_hash_table_insert(plugin.cpu_ht, &cpu->cpu_index,