[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH 03/16] tcg: track TBs with per-region BST's
From: |
Alex Bennée |
Subject: |
Re: [Qemu-devel] [PATCH 03/16] tcg: track TBs with per-region BST's |
Date: |
Thu, 29 Mar 2018 10:54:36 +0100 |
User-agent: |
mu4e 1.1.0; emacs 26.0.91 |
Emilio G. Cota <address@hidden> writes:
> This paves the way for enabling scalable parallel generation of TCG code.
>
> Instead of tracking TBs with a single binary search tree (BST), use a
> BST for each TCG region, protecting it with a lock. This is as scalable
> as it gets, since each TCG thread operates on a separate region.
>
> The core of this change is the introduction of struct tcg_region_tree,
> which contains a pointer to a GTree and an associated lock to serialize
> accesses to it. We then allocate an array of tcg_region_tree's, adding
> the appropriate padding to avoid false sharing based on
> qemu_dcache_linesize.
>
> Given a tc_ptr, we first find the corresponding region_tree. This
> is done by special-casing the first and last regions first, since they
> might be of size != region.size; otherwise we just divide the offset
> by region.stride. I was worried about this division (several dozen
> cycles of latency), but profiling shows that this is not a fast path.
> Note that region.stride is not required to be a power of two; it
> is only required to be a multiple of the host's page size.
>
> Note that with this design we can also provide consistent snapshots
> about all region trees at once; for instance, tcg_tb_foreach
> acquires/releases all region_tree locks before/after iterating over them.
> For this reason we now drop tb_lock in dump_exec_info().
>
> As an alternative I considered implementing a concurrent BST, but this
> can be tricky to get right, offers no consistent snapshots of the BST,
> and performance and scalability-wise I don't think it could ever beat
> having separate GTrees, given that our workload is insert-mostly (all
> concurrent BST designs I've seen focus, understandably, on making
> lookups fast, which comes at the expense of convoluted, non-wait-free
> insertions/removals).
>
> Signed-off-by: Emilio G. Cota <address@hidden>
Reviewed-by: Alex Bennée <address@hidden>
> ---
> accel/tcg/cpu-exec.c | 2 +-
> accel/tcg/translate-all.c | 101 ++++--------------------
> include/exec/exec-all.h | 1 -
> include/exec/tb-context.h | 1 -
> tcg/tcg.c | 191
> ++++++++++++++++++++++++++++++++++++++++++++++
> tcg/tcg.h | 6 ++
> 6 files changed, 213 insertions(+), 89 deletions(-)
>
> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> index ec57564..8c68727 100644
> --- a/accel/tcg/cpu-exec.c
> +++ b/accel/tcg/cpu-exec.c
> @@ -222,7 +222,7 @@ static void cpu_exec_nocache(CPUState *cpu, int
> max_cycles,
>
> tb_lock();
> tb_phys_invalidate(tb, -1);
> - tb_remove(tb);
> + tcg_tb_remove(tb);
> tb_unlock();
> }
> #endif
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index 1cf10f8..3a51d49 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -205,8 +205,6 @@ void tb_lock_reset(void)
> }
> }
>
> -static TranslationBlock *tb_find_pc(uintptr_t tc_ptr);
> -
> void cpu_gen_init(void)
> {
> tcg_context_init(&tcg_init_ctx);
> @@ -375,13 +373,13 @@ bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc)
>
> if (check_offset < tcg_init_ctx.code_gen_buffer_size) {
> tb_lock();
> - tb = tb_find_pc(host_pc);
> + tb = tcg_tb_lookup(host_pc);
> if (tb) {
> cpu_restore_state_from_tb(cpu, tb, host_pc);
> if (tb->cflags & CF_NOCACHE) {
> /* one-shot translation, invalidate it immediately */
> tb_phys_invalidate(tb, -1);
> - tb_remove(tb);
> + tcg_tb_remove(tb);
> }
> r = true;
> }
> @@ -731,48 +729,6 @@ static inline void *alloc_code_gen_buffer(void)
> }
> #endif /* USE_STATIC_CODE_GEN_BUFFER, WIN32, POSIX */
>
> -/* compare a pointer @ptr and a tb_tc @s */
> -static int ptr_cmp_tb_tc(const void *ptr, const struct tb_tc *s)
> -{
> - if (ptr >= s->ptr + s->size) {
> - return 1;
> - } else if (ptr < s->ptr) {
> - return -1;
> - }
> - return 0;
> -}
> -
> -static gint tb_tc_cmp(gconstpointer ap, gconstpointer bp)
> -{
> - const struct tb_tc *a = ap;
> - const struct tb_tc *b = bp;
> -
> - /*
> - * When both sizes are set, we know this isn't a lookup.
> - * This is the most likely case: every TB must be inserted; lookups
> - * are a lot less frequent.
> - */
> - if (likely(a->size && b->size)) {
> - if (a->ptr > b->ptr) {
> - return 1;
> - } else if (a->ptr < b->ptr) {
> - return -1;
> - }
> - /* a->ptr == b->ptr should happen only on deletions */
> - g_assert(a->size == b->size);
> - return 0;
> - }
> - /*
> - * All lookups have either .size field set to 0.
> - * From the glib sources we see that @ap is always the lookup key.
> However
> - * the docs provide no guarantee, so we just mark this case as likely.
> - */
> - if (likely(a->size == 0)) {
> - return ptr_cmp_tb_tc(a->ptr, b);
> - }
> - return ptr_cmp_tb_tc(b->ptr, a);
> -}
> -
> static inline void code_gen_alloc(size_t tb_size)
> {
> tcg_ctx->code_gen_buffer_size = size_code_gen_buffer(tb_size);
> @@ -781,7 +737,6 @@ static inline void code_gen_alloc(size_t tb_size)
> fprintf(stderr, "Could not allocate dynamic translator buffer\n");
> exit(1);
> }
> - tb_ctx.tb_tree = g_tree_new(tb_tc_cmp);
> qemu_mutex_init(&tb_ctx.tb_lock);
> }
>
> @@ -842,14 +797,6 @@ static TranslationBlock *tb_alloc(target_ulong pc)
> return tb;
> }
>
> -/* Called with tb_lock held. */
> -void tb_remove(TranslationBlock *tb)
> -{
> - assert_tb_locked();
> -
> - g_tree_remove(tb_ctx.tb_tree, &tb->tc);
> -}
> -
> static inline void invalidate_page_bitmap(PageDesc *p)
> {
> #ifdef CONFIG_SOFTMMU
> @@ -914,10 +861,10 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data
> tb_flush_count)
> }
>
> if (DEBUG_TB_FLUSH_GATE) {
> - size_t nb_tbs = g_tree_nnodes(tb_ctx.tb_tree);
> + size_t nb_tbs = tcg_nb_tbs();
> size_t host_size = 0;
>
> - g_tree_foreach(tb_ctx.tb_tree, tb_host_size_iter, &host_size);
> + tcg_tb_foreach(tb_host_size_iter, &host_size);
> printf("qemu: flush code_size=%zu nb_tbs=%zu avg_tb_size=%zu\n",
> tcg_code_size(), nb_tbs, nb_tbs > 0 ? host_size / nb_tbs : 0);
> }
> @@ -926,10 +873,6 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data
> tb_flush_count)
> cpu_tb_jmp_cache_clear(cpu);
> }
>
> - /* Increment the refcount first so that destroy acts as a reset */
> - g_tree_ref(tb_ctx.tb_tree);
> - g_tree_destroy(tb_ctx.tb_tree);
> -
> qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
> page_flush_tb();
>
> @@ -1409,7 +1352,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
> * through the physical hash table and physical page list.
> */
> tb_link_page(tb, phys_pc, phys_page2);
> - g_tree_insert(tb_ctx.tb_tree, &tb->tc, tb);
> + tcg_tb_insert(tb);
> return tb;
> }
>
> @@ -1513,7 +1456,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t
> start, tb_page_addr_t end,
> current_tb = NULL;
> if (cpu->mem_io_pc) {
> /* now we have a real cpu fault */
> - current_tb = tb_find_pc(cpu->mem_io_pc);
> + current_tb = tcg_tb_lookup(cpu->mem_io_pc);
> }
> }
> if (current_tb == tb &&
> @@ -1629,7 +1572,7 @@ static bool tb_invalidate_phys_page(tb_page_addr_t
> addr, uintptr_t pc)
> tb = p->first_tb;
> #ifdef TARGET_HAS_PRECISE_SMC
> if (tb && pc != 0) {
> - current_tb = tb_find_pc(pc);
> + current_tb = tcg_tb_lookup(pc);
> }
> if (cpu != NULL) {
> env = cpu->env_ptr;
> @@ -1672,18 +1615,6 @@ static bool tb_invalidate_phys_page(tb_page_addr_t
> addr, uintptr_t pc)
> }
> #endif
>
> -/*
> - * Find the TB 'tb' such that
> - * tb->tc.ptr <= tc_ptr < tb->tc.ptr + tb->tc.size
> - * Return NULL if not found.
> - */
> -static TranslationBlock *tb_find_pc(uintptr_t tc_ptr)
> -{
> - struct tb_tc s = { .ptr = (void *)tc_ptr };
> -
> - return g_tree_lookup(tb_ctx.tb_tree, &s);
> -}
> -
> #if !defined(CONFIG_USER_ONLY)
> void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr)
> {
> @@ -1711,7 +1642,7 @@ void tb_check_watchpoint(CPUState *cpu)
> {
> TranslationBlock *tb;
>
> - tb = tb_find_pc(cpu->mem_io_pc);
> + tb = tcg_tb_lookup(cpu->mem_io_pc);
> if (tb) {
> /* We can use retranslation to find the PC. */
> cpu_restore_state_from_tb(cpu, tb, cpu->mem_io_pc);
> @@ -1745,7 +1676,7 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
> uint32_t n;
>
> tb_lock();
> - tb = tb_find_pc(retaddr);
> + tb = tcg_tb_lookup(retaddr);
> if (!tb) {
> cpu_abort(cpu, "cpu_io_recompile: could not find TB for pc=%p",
> (void *)retaddr);
> @@ -1789,7 +1720,7 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
> * cpu_exec_nocache() */
> tb_phys_invalidate(tb->orig_tb, -1);
> }
> - tb_remove(tb);
> + tcg_tb_remove(tb);
> }
>
> /* TODO: If env->pc != tb->pc (i.e. the faulting instruction was not
> @@ -1860,6 +1791,7 @@ static void print_qht_statistics(FILE *f,
> fprintf_function cpu_fprintf,
> }
>
> struct tb_tree_stats {
> + size_t nb_tbs;
> size_t host_size;
> size_t target_size;
> size_t max_target_size;
> @@ -1873,6 +1805,7 @@ static gboolean tb_tree_stats_iter(gpointer key,
> gpointer value, gpointer data)
> const TranslationBlock *tb = value;
> struct tb_tree_stats *tst = data;
>
> + tst->nb_tbs++;
> tst->host_size += tb->tc.size;
> tst->target_size += tb->size;
> if (tb->size > tst->max_target_size) {
> @@ -1896,10 +1829,8 @@ void dump_exec_info(FILE *f, fprintf_function
> cpu_fprintf)
> struct qht_stats hst;
> size_t nb_tbs;
>
> - tb_lock();
> -
> - nb_tbs = g_tree_nnodes(tb_ctx.tb_tree);
> - g_tree_foreach(tb_ctx.tb_tree, tb_tree_stats_iter, &tst);
> + tcg_tb_foreach(tb_tree_stats_iter, &tst);
> + nb_tbs = tst.nb_tbs;
> /* XXX: avoid using doubles ? */
> cpu_fprintf(f, "Translation buffer state:\n");
> /*
> @@ -1934,8 +1865,6 @@ void dump_exec_info(FILE *f, fprintf_function
> cpu_fprintf)
> cpu_fprintf(f, "TB invalidate count %d\n",
> tb_ctx.tb_phys_invalidate_count);
> cpu_fprintf(f, "TLB flush count %zu\n", tlb_flush_count());
> tcg_dump_info(f, cpu_fprintf);
> -
> - tb_unlock();
> }
>
> void dump_opcount_info(FILE *f, fprintf_function cpu_fprintf)
> @@ -2203,7 +2132,7 @@ int page_unprotect(target_ulong address, uintptr_t pc)
> * set the page to PAGE_WRITE and did the TB invalidate for us.
> */
> #ifdef TARGET_HAS_PRECISE_SMC
> - TranslationBlock *current_tb = tb_find_pc(pc);
> + TranslationBlock *current_tb = tcg_tb_lookup(pc);
> if (current_tb) {
> current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
> }
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index e5afd2e..17e08b3 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -401,7 +401,6 @@ static inline uint32_t curr_cflags(void)
> | (use_icount ? CF_USE_ICOUNT : 0);
> }
>
> -void tb_remove(TranslationBlock *tb);
> void tb_flush(CPUState *cpu);
> void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
> TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
> diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h
> index 1d41202..d8472c8 100644
> --- a/include/exec/tb-context.h
> +++ b/include/exec/tb-context.h
> @@ -31,7 +31,6 @@ typedef struct TBContext TBContext;
>
> struct TBContext {
>
> - GTree *tb_tree;
> struct qht htable;
> /* any access to the tbs or the page table must use this lock */
> QemuMutex tb_lock;
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index bb24526..b471708 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -135,6 +135,12 @@ static TCGContext **tcg_ctxs;
> static unsigned int n_tcg_ctxs;
> TCGv_env cpu_env = 0;
>
> +struct tcg_region_tree {
> + QemuMutex lock;
> + GTree *tree;
> + /* padding to avoid false sharing is computed at run-time */
> +};
> +
> /*
> * We divide code_gen_buffer into equally-sized "regions" that TCG threads
> * dynamically allocate from as demand dictates. Given appropriate region
> @@ -158,6 +164,13 @@ struct tcg_region_state {
> };
>
> static struct tcg_region_state region;
> +/*
> + * This is an array of struct tcg_region_tree's, with padding.
> + * We use void * to simplify the computation of region_trees[i]; each
> + * struct is found every tree_size bytes.
> + */
> +static void *region_trees;
> +static size_t tree_size;
> static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT];
> static TCGRegSet tcg_target_call_clobber_regs;
>
> @@ -295,6 +308,180 @@ TCGLabel *gen_new_label(void)
>
> #include "tcg-target.inc.c"
>
> +/* compare a pointer @ptr and a tb_tc @s */
> +static int ptr_cmp_tb_tc(const void *ptr, const struct tb_tc *s)
> +{
> + if (ptr >= s->ptr + s->size) {
> + return 1;
> + } else if (ptr < s->ptr) {
> + return -1;
> + }
> + return 0;
> +}
> +
> +static gint tb_tc_cmp(gconstpointer ap, gconstpointer bp)
> +{
> + const struct tb_tc *a = ap;
> + const struct tb_tc *b = bp;
> +
> + /*
> + * When both sizes are set, we know this isn't a lookup.
> + * This is the most likely case: every TB must be inserted; lookups
> + * are a lot less frequent.
> + */
> + if (likely(a->size && b->size)) {
> + if (a->ptr > b->ptr) {
> + return 1;
> + } else if (a->ptr < b->ptr) {
> + return -1;
> + }
> + /* a->ptr == b->ptr should happen only on deletions */
> + g_assert(a->size == b->size);
> + return 0;
> + }
> + /*
> + * All lookups have either .size field set to 0.
> + * From the glib sources we see that @ap is always the lookup key.
> However
> + * the docs provide no guarantee, so we just mark this case as likely.
> + */
> + if (likely(a->size == 0)) {
> + return ptr_cmp_tb_tc(a->ptr, b);
> + }
> + return ptr_cmp_tb_tc(b->ptr, a);
> +}
> +
> +static void tcg_region_trees_init(void)
> +{
> + size_t i;
> +
> + tree_size = ROUND_UP(sizeof(struct tcg_region_tree),
> qemu_dcache_linesize);
> + region_trees = qemu_memalign(qemu_dcache_linesize, region.n * tree_size);
> + for (i = 0; i < region.n; i++) {
> + struct tcg_region_tree *rt = region_trees + i * tree_size;
> +
> + qemu_mutex_init(&rt->lock);
> + rt->tree = g_tree_new(tb_tc_cmp);
> + }
> +}
> +
> +static struct tcg_region_tree *tc_ptr_to_region_tree(void *p)
> +{
> + size_t region_idx;
> +
> + if (p < region.start_aligned) {
> + region_idx = 0;
> + } else {
> + ptrdiff_t offset = p - region.start_aligned;
> +
> + if (offset > region.stride * (region.n - 1)) {
> + region_idx = region.n - 1;
> + } else {
> + region_idx = offset / region.stride;
> + }
> + }
> + return region_trees + region_idx * tree_size;
> +}
> +
> +void tcg_tb_insert(TranslationBlock *tb)
> +{
> + struct tcg_region_tree *rt = tc_ptr_to_region_tree(tb->tc.ptr);
> +
> + qemu_mutex_lock(&rt->lock);
> + g_tree_insert(rt->tree, &tb->tc, tb);
> + qemu_mutex_unlock(&rt->lock);
> +}
> +
> +void tcg_tb_remove(TranslationBlock *tb)
> +{
> + struct tcg_region_tree *rt = tc_ptr_to_region_tree(tb->tc.ptr);
> +
> + qemu_mutex_lock(&rt->lock);
> + g_tree_remove(rt->tree, &tb->tc);
> + qemu_mutex_unlock(&rt->lock);
> +}
> +
> +/*
> + * Find the TB 'tb' such that
> + * tb->tc.ptr <= tc_ptr < tb->tc.ptr + tb->tc.size
> + * Return NULL if not found.
> + */
> +TranslationBlock *tcg_tb_lookup(uintptr_t tc_ptr)
> +{
> + struct tcg_region_tree *rt = tc_ptr_to_region_tree((void *)tc_ptr);
> + TranslationBlock *tb;
> + struct tb_tc s = { .ptr = (void *)tc_ptr };
> +
> + qemu_mutex_lock(&rt->lock);
> + tb = g_tree_lookup(rt->tree, &s);
> + qemu_mutex_unlock(&rt->lock);
> + return tb;
> +}
> +
> +static void tcg_region_tree_lock_all(void)
> +{
> + size_t i;
> +
> + for (i = 0; i < region.n; i++) {
> + struct tcg_region_tree *rt = region_trees + i * tree_size;
> +
> + qemu_mutex_lock(&rt->lock);
> + }
> +}
> +
> +static void tcg_region_tree_unlock_all(void)
> +{
> + size_t i;
> +
> + for (i = 0; i < region.n; i++) {
> + struct tcg_region_tree *rt = region_trees + i * tree_size;
> +
> + qemu_mutex_unlock(&rt->lock);
> + }
> +}
> +
> +void tcg_tb_foreach(GTraverseFunc func, gpointer user_data)
> +{
> + size_t i;
> +
> + tcg_region_tree_lock_all();
> + for (i = 0; i < region.n; i++) {
> + struct tcg_region_tree *rt = region_trees + i * tree_size;
> +
> + g_tree_foreach(rt->tree, func, user_data);
> + }
> + tcg_region_tree_unlock_all();
> +}
> +
> +size_t tcg_nb_tbs(void)
> +{
> + size_t nb_tbs = 0;
> + size_t i;
> +
> + tcg_region_tree_lock_all();
> + for (i = 0; i < region.n; i++) {
> + struct tcg_region_tree *rt = region_trees + i * tree_size;
> +
> + nb_tbs += g_tree_nnodes(rt->tree);
> + }
> + tcg_region_tree_unlock_all();
> + return nb_tbs;
> +}
> +
> +static void tcg_region_tree_reset_all(void)
> +{
> + size_t i;
> +
> + tcg_region_tree_lock_all();
> + for (i = 0; i < region.n; i++) {
> + struct tcg_region_tree *rt = region_trees + i * tree_size;
> +
> + /* Increment the refcount first so that destroy acts as a reset */
> + g_tree_ref(rt->tree);
> + g_tree_destroy(rt->tree);
> + }
> + tcg_region_tree_unlock_all();
> +}
> +
> static void tcg_region_bounds(size_t curr_region, void **pstart, void **pend)
> {
> void *start, *end;
> @@ -380,6 +567,8 @@ void tcg_region_reset_all(void)
> g_assert(!err);
> }
> qemu_mutex_unlock(®ion.lock);
> +
> + tcg_region_tree_reset_all();
> }
>
> #ifdef CONFIG_USER_ONLY
> @@ -496,6 +685,8 @@ void tcg_region_init(void)
> g_assert(!rc);
> }
>
> + tcg_region_trees_init();
> +
> /* In user-mode we support only one ctx, so do the initial allocation
> now */
> #ifdef CONFIG_USER_ONLY
> {
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 9e2d909..8bf29cc 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -850,6 +850,12 @@ void tcg_region_reset_all(void);
> size_t tcg_code_size(void);
> size_t tcg_code_capacity(void);
>
> +void tcg_tb_insert(TranslationBlock *tb);
> +void tcg_tb_remove(TranslationBlock *tb);
> +TranslationBlock *tcg_tb_lookup(uintptr_t tc_ptr);
> +void tcg_tb_foreach(GTraverseFunc func, gpointer user_data);
> +size_t tcg_nb_tbs(void);
> +
> /* user-mode: Called with tb_lock held. */
> static inline void *tcg_malloc(int size)
> {
--
Alex Bennée
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- Re: [Qemu-devel] [PATCH 03/16] tcg: track TBs with per-region BST's,
Alex Bennée <=