qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v3 20/25] tcg: Save insn data and use it in cpu_


From: Aurelien Jarno
Subject: Re: [Qemu-devel] [PATCH v3 20/25] tcg: Save insn data and use it in cpu_restore_state_from_tb
Date: Fri, 25 Sep 2015 23:10:36 +0200
User-agent: Mutt/1.5.23 (2014-03-12)

On 2015-09-22 13:25, Richard Henderson wrote:
> We can now restore state without retranslation.
> 
> Signed-off-by: Richard Henderson <address@hidden>
> ---
>  include/exec/exec-all.h |   1 +
>  tcg/tcg.c               |  40 ++++++++-----
>  tcg/tcg.h               |   4 +-
>  translate-all.c         | 149 
> +++++++++++++++++++++++++++++++++++-------------
>  4 files changed, 139 insertions(+), 55 deletions(-)
> 
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 6a69802..402dd87 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -199,6 +199,7 @@ struct TranslationBlock {
>  #define CF_USE_ICOUNT  0x20000
>  
>      void *tc_ptr;    /* pointer to the translated code */
> +    uint8_t *tc_search;  /* pointer to search data */
>      /* next matching tb for physical address. */
>      struct TranslationBlock *phys_hash_next;
>      /* original tb when cflags has CF_NOCACHE */
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index bdb83d9..a0fce5b 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -2294,7 +2294,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
>                                        tcg_insn_unit *gen_code_buf,
>                                        long search_pc)
>  {
> -    int i, oi, oi_next;
> +    int i, oi, oi_next, num_insns;
>  
>  #ifdef DEBUG_DISAS
>      if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) {
> @@ -2338,6 +2338,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
>  
>      tcg_out_tb_init(s);
>  
> +    num_insns = -1;
>      for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
>          TCGOp * const op = &s->gen_op_buf[oi];
>          TCGArg * const args = &s->gen_opparam_buf[op->args];
> @@ -2361,6 +2362,10 @@ static inline int tcg_gen_code_common(TCGContext *s,
>              tcg_reg_alloc_movi(s, args, dead_args, sync_args);
>              break;
>          case INDEX_op_insn_start:
> +            if (num_insns >= 0) {
> +                s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
> +            }
> +            num_insns++;
>              for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
>                  target_ulong a;
>  #if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
> @@ -2368,7 +2373,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
>  #else
>                  a = args[i];
>  #endif
> -                s->gen_opc_data[i] = a;
> +                s->gen_insn_data[num_insns][i] = a;
>              }
>              break;
>          case INDEX_op_discard:
> @@ -2400,6 +2405,8 @@ static inline int tcg_gen_code_common(TCGContext *s,
>          check_regs(s);
>  #endif
>      }
> +    tcg_debug_assert(num_insns >= 0);
> +    s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
>  
>      /* Generate TB finalization at the end of block */
>      tcg_out_tb_finalize(s);
> @@ -2448,24 +2455,26 @@ int tcg_gen_code_search_pc(TCGContext *s, 
> tcg_insn_unit *gen_code_buf,
>  void tcg_dump_info(FILE *f, fprintf_function cpu_fprintf)
>  {
>      TCGContext *s = &tcg_ctx;
> -    int64_t tot;
> +    int64_t tb_count = s->tb_count;
> +    int64_t tb_div_count = tb_count ? tb_count : 1;
> +    int64_t tot = s->interm_time + s->code_time;
>  
> -    tot = s->interm_time + s->code_time;
>      cpu_fprintf(f, "JIT cycles          %" PRId64 " (%0.3f s at 2.4 GHz)\n",
>                  tot, tot / 2.4e9);
>      cpu_fprintf(f, "translated TBs      %" PRId64 " (aborted=%" PRId64 " 
> %0.1f%%)\n", 
> -                s->tb_count, 
> -                s->tb_count1 - s->tb_count,
> -                s->tb_count1 ? (double)(s->tb_count1 - s->tb_count) / 
> s->tb_count1 * 100.0 : 0);
> +                tb_count, s->tb_count1 - tb_count,
> +                (double)(s->tb_count1 - s->tb_count)
> +                / (s->tb_count1 ? s->tb_count1 : 1) * 100.0);
>      cpu_fprintf(f, "avg ops/TB          %0.1f max=%d\n", 
> -                s->tb_count ? (double)s->op_count / s->tb_count : 0, 
> s->op_count_max);
> +                (double)s->op_count / tb_div_count, s->op_count_max);
>      cpu_fprintf(f, "deleted ops/TB      %0.2f\n",
> -                s->tb_count ? 
> -                (double)s->del_op_count / s->tb_count : 0);
> +                (double)s->del_op_count / tb_div_count);
>      cpu_fprintf(f, "avg temps/TB        %0.2f max=%d\n",
> -                s->tb_count ? 
> -                (double)s->temp_count / s->tb_count : 0,
> -                s->temp_count_max);
> +                (double)s->temp_count / tb_div_count, s->temp_count_max);
> +    cpu_fprintf(f, "avg host code/TB    %0.1f\n",
> +                (double)s->code_out_len / tb_div_count);
> +    cpu_fprintf(f, "avg search data/TB  %0.1f\n",
> +                (double)s->search_out_len / tb_div_count);
>      
>      cpu_fprintf(f, "cycles/op           %0.1f\n", 
>                  s->op_count ? (double)tot / s->op_count : 0);
> @@ -2473,8 +2482,11 @@ void tcg_dump_info(FILE *f, fprintf_function 
> cpu_fprintf)
>                  s->code_in_len ? (double)tot / s->code_in_len : 0);
>      cpu_fprintf(f, "cycles/out byte     %0.1f\n", 
>                  s->code_out_len ? (double)tot / s->code_out_len : 0);
> -    if (tot == 0)
> +    cpu_fprintf(f, "cycles/search byte     %0.1f\n", 
> +                s->search_out_len ? (double)tot / s->search_out_len : 0);
> +    if (tot == 0) {
>          tot = 1;
> +    }
>      cpu_fprintf(f, "  gen_interm time   %0.1f%%\n", 
>                  (double)s->interm_time / tot * 100.0);
>      cpu_fprintf(f, "  gen_code time     %0.1f%%\n", 
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 8fd1252..df499c6 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -532,6 +532,7 @@ struct TCGContext {
>      int64_t del_op_count;
>      int64_t code_in_len;
>      int64_t code_out_len;
> +    int64_t search_out_len;
>      int64_t interm_time;
>      int64_t code_time;
>      int64_t la_time;
> @@ -581,7 +582,8 @@ struct TCGContext {
>      uint16_t gen_opc_icount[OPC_BUF_SIZE];
>      uint8_t gen_opc_instr_start[OPC_BUF_SIZE];
>  
> -    target_ulong gen_opc_data[TARGET_INSN_START_WORDS];
> +    uint16_t gen_insn_end_off[TCG_MAX_INSNS];
> +    target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
>  };
>  
>  extern TCGContext tcg_ctx;
> diff --git a/translate-all.c b/translate-all.c
> index 9f801ae..f6b8148 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -168,61 +168,127 @@ void cpu_gen_init(void)
>      tcg_context_init(&tcg_ctx); 
>  }
>  
> +/* Encode VAL as a signed leb128 sequence at P.
> +   Return P incremented past the encoded value.  */
> +static uint8_t *encode_sleb128(uint8_t *p, target_long val)
> +{
> +    int more, byte;
> +
> +    do {
> +        byte = val & 0x7f;
> +        val >>= 7;
> +        more = !((val == 0 && (byte & 0x40) == 0)
> +                 || (val == -1 && (byte & 0x40) != 0));
> +        if (more)
> +          byte |= 0x80;

You are missing braces here.

> +        *p++ = byte;
> +    } while (more);
> +
> +    return p;
> +}
> +
> +/* Decode a signed leb128 sequence at *PP; increment *PP past the
> +   decoded value.  Return the decoded value.  */
> +static target_long decode_sleb128(uint8_t **pp)
> +{
> +    uint8_t *p = *pp;
> +    target_long val = 0;
> +    int byte, shift = 0;
> +
> +    do {
> +        byte = *p++;
> +        val |= (target_ulong)(byte & 0x7f) << shift;
> +        shift += 7;
> +    } while (byte & 0x80);
> +    if (shift < TARGET_LONG_BITS && (byte & 0x40)) {
> +        val |= -(target_ulong)1 << shift;
> +    }
> +
> +    *pp = p;
> +    return val;
> +}
> +
> +/* Encode the data collected about the instructions while compiling TB.
> +   Place the data at BLOCK, and return the number of bytes consumed.
> +
> +   The logical table consisits of TARGET_INSN_START_WORDS target_ulong's,
> +   which come from the target's insn_start data, followed by a uintptr_t
> +   which comes from the host pc of the end of the code implementing the insn.
> +
> +   Each line of the table is encoded as sleb128 deltas from the previous
> +   line.  The seed for the first line is { tb->pc, 0..., tb->tc_ptr }.
> +   That is, the first column is seeded with the guest pc, the last column
> +   with the host pc, and the middle columns with zeros.  */
> +
> +static int encode_search(TranslationBlock *tb, uint8_t *block)
> +{
> +    uint8_t *p = block;
> +    int i, j, n;
> +
> +    tb->tc_search = block;
> +
> +    for (i = 0, n = tb->icount; i < n; ++i) {
> +        target_ulong prev;
> +
> +        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
> +            if (i == 0) {
> +                prev = (j == 0 ? tb->pc : 0);
> +            } else {
> +                prev = tcg_ctx.gen_insn_data[i - 1][j];
> +            }
> +            p = encode_sleb128(p, tcg_ctx.gen_insn_data[i][j] - prev);
> +        }
> +        prev = (i == 0 ? 0 : tcg_ctx.gen_insn_end_off[i - 1]);
> +        p = encode_sleb128(p, tcg_ctx.gen_insn_end_off[i] - prev);
> +    }
> +
> +    return p - block;
> +}
> +

Given we save both the host and the guest PC in this structure, one
obvious optimization would be to skip saving data for host instructions
which cannot generate an exception. That means that none of the TCG ops
in this instruction can generate an exception either. We can easily test
that for all TCG ops except call by looking at the
TCG_OPF_SIDE_EFFECTS flag. For the call op, we have to look at the
TCG_CALL_NO_SIDE_EFFECTS flag, even if it doesn't necessarily mean the
helper might generate an exception.

That should save significant space on load/store architectures. That
said, we can probably do that at a later time.

>  /* The cpu state corresponding to 'searched_pc' is restored.  */
>  static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
>                                       uintptr_t searched_pc)
>  {
> +    target_ulong data[TARGET_INSN_START_WORDS] = { tb->pc };
> +    uintptr_t host_pc = (uintptr_t)tb->tc_ptr;
>      CPUArchState *env = cpu->env_ptr;
> -    TCGContext *s = &tcg_ctx;
> -    int j;
> -    uintptr_t tc_ptr;
> +    uint8_t *p = tb->tc_search;
> +    int i, j, num_insns = tb->icount;
>  #ifdef CONFIG_PROFILER
> -    int64_t ti;
> +    int64_t ti = profile_getclock();
>  #endif
>  
> -#ifdef CONFIG_PROFILER
> -    ti = profile_getclock();
> -#endif
> -    tcg_func_start(s);
> +    if (searched_pc < host_pc) {
> +        return -1;
> +    }
>  
> -    gen_intermediate_code_pc(env, tb);
> +    /* Reconstruct the stored insn data while looking for the point at
> +       which the end of the insn exceeds the searched_pc.  */
> +    for (i = 0; i < num_insns; ++i) {
> +        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
> +            data[j] += decode_sleb128(&p);
> +        }
> +        host_pc += decode_sleb128(&p);
> +        if (host_pc > searched_pc) {
> +            goto found;
> +        }
> +    }
> +    return -1;
>  
> + found:
>      if (tb->cflags & CF_USE_ICOUNT) {
>          assert(use_icount);
>          /* Reset the cycle counter to the start of the block.  */
> -        cpu->icount_decr.u16.low += tb->icount;
> +        cpu->icount_decr.u16.low += num_insns;
>          /* Clear the IO flag.  */
>          cpu->can_do_io = 0;
>      }
> -
> -    /* find opc index corresponding to search_pc */
> -    tc_ptr = (uintptr_t)tb->tc_ptr;
> -    if (searched_pc < tc_ptr)
> -        return -1;
> -
> -    s->tb_next_offset = tb->tb_next_offset;
> -#ifdef USE_DIRECT_JUMP
> -    s->tb_jmp_offset = tb->tb_jmp_offset;
> -    s->tb_next = NULL;
> -#else
> -    s->tb_jmp_offset = NULL;
> -    s->tb_next = tb->tb_next;
> -#endif
> -    j = tcg_gen_code_search_pc(s, (tcg_insn_unit *)tc_ptr,
> -                               searched_pc - tc_ptr);
> -    if (j < 0)
> -        return -1;
> -    /* now find start of instruction before */
> -    while (s->gen_opc_instr_start[j] == 0) {
> -        j--;
> -    }
> -    cpu->icount_decr.u16.low -= s->gen_opc_icount[j];
> -
> -    restore_state_to_opc(env, tb, s->gen_opc_data);
> +    cpu->icount_decr.u16.low -= i;
> +    restore_state_to_opc(env, tb, data);
>  
>  #ifdef CONFIG_PROFILER
> -    s->restore_time += profile_getclock() - ti;
> -    s->restore_count++;
> +    tcg_ctx.restore_time += profile_getclock() - ti;
> +    tcg_ctx.restore_count++;
>  #endif
>      return 0;
>  }
> @@ -969,7 +1035,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>      tb_page_addr_t phys_pc, phys_page2;
>      target_ulong virt_page2;
>      tcg_insn_unit *gen_code_buf;
> -    int gen_code_size;
> +    int gen_code_size, search_size;
>  #ifdef CONFIG_PROFILER
>      int64_t ti;
>  #endif
> @@ -1025,11 +1091,13 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>  #endif
>  
>      gen_code_size = tcg_gen_code(&tcg_ctx, gen_code_buf);
> +    search_size = encode_search(tb, (void *)gen_code_buf + gen_code_size);
>  
>  #ifdef CONFIG_PROFILER
>      tcg_ctx.code_time += profile_getclock();
>      tcg_ctx.code_in_len += tb->size;
>      tcg_ctx.code_out_len += gen_code_size;
> +    tcg_ctx.search_out_len += search_size;
>  #endif
>  
>  #ifdef DEBUG_DISAS
> @@ -1041,8 +1109,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>      }
>  #endif
>  
> -    tcg_ctx.code_gen_ptr = (void *)(((uintptr_t)gen_code_buf +
> -            gen_code_size + CODE_GEN_ALIGN - 1) & ~(CODE_GEN_ALIGN - 1));
> +    tcg_ctx.code_gen_ptr = (void *)
> +        ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
> +                 CODE_GEN_ALIGN);
>  
>      /* check next page if needed */
>      virt_page2 = (pc + tb->size - 1) & TARGET_PAGE_MASK;

If you fix the coding style issue I mentioned above, you get:

Reviewed-by: Aurelien Jarno <address@hidden>

-- 
Aurelien Jarno                          GPG: 4096R/1DDD8C9B
address@hidden                 http://www.aurel32.net



reply via email to

[Prev in Thread] Current Thread [Next in Thread]