From 2b28d4213e3b16a93d53eb2e4d522c5824de1647 Mon Sep 17 00:00:00 2001 From: Vladimir Prus Date: Tue, 11 Nov 2008 12:29:02 +0300 Subject: [PATCH] Fix movca.l/ocbi emulation. To: address@hidden X-KMail-Transport: CodeSourcery X-KMail-Identity: 901867920 * target-sh4/cpu.h (store_request_t): New. (CPUSH4State): New fields store_requests and store_request_tail. (TB_FLAG_PENDING_MOVCA): New. (cpu_get_tb_cpu_state): Set TB_FLAG_PENDING_MOVCA when store requests are pending. * target-sh4/helper.h (helper_movcal, helper_do_stores, helper_ocbi): New. * target-sh4/op_helper.c (helper_movcal, helper_do_stores) (helper_ocbi): New. * target-sh4/translate.c (DisasContext): New field has_movcal. (sh4_defs): Update CVS for SH7785. (cpu_sh4_init): Initialize env->store_request_tail. (_decode_opc): Flush pending movca.l-originated stores. Make use of helper_movcal and helper_ocbi. (gen_intermediate_code_internal): Initialize has_movcal from TB_FLAG_PENDING_MOVCA in tb->flags. --- cpu-exec.c | 2 +- target-sh4/cpu.h | 17 +++++++++++++-- target-sh4/helper.h | 4 +++ target-sh4/op_helper.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ target-sh4/translate.c | 44 +++++++++++++++++++++++++++++++++++++++--- 5 files changed, 108 insertions(+), 8 deletions(-) diff --git a/cpu-exec.c b/cpu-exec.c index 9a35a59..64b0845 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -174,7 +174,7 @@ static inline TranslationBlock *tb_find_fast(void) /* we record a subset of the CPU state. It will always be the same before a given translated block is executed. 
*/ - cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); + cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); tb = env->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]; if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base || tb->flags != flags)) { diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h index ae434d1..eed3b1b 100644 --- a/target-sh4/cpu.h +++ b/target-sh4/cpu.h @@ -93,6 +93,12 @@ enum sh_features { SH_FEATURE_SH4A = 1, }; +typedef struct store_request_t { + uint32_t address; + uint32_t value; + struct store_request_t *next; +} store_request_t; + typedef struct CPUSH4State { int id; /* CPU model */ @@ -141,6 +147,8 @@ typedef struct CPUSH4State { tlb_t itlb[ITLB_SIZE]; /* instruction translation table */ void *intc_handle; int intr_at_halt; /* SR_BL ignored during sleep */ + store_request_t *store_requests; + store_request_t **store_request_tail; } CPUSH4State; CPUSH4State *cpu_sh4_init(const char *cpu_model); @@ -281,16 +289,19 @@ static inline void cpu_pc_from_tb(CPUState *env, TranslationBlock *tb) env->flags = tb->flags; } +#define TB_FLAG_PENDING_MOVCA (1 << 4) + static inline void cpu_get_tb_cpu_state(CPUState *env, target_ulong *pc, target_ulong *cs_base, int *flags) { *pc = env->pc; *cs_base = 0; *flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL - | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME)) /* Bits 0- 3 */ - | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR)) /* Bits 19-21 */ + | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME)) /* Bits 0- 3 */ + | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR)) /* Bits 19-21 */ | (env->sr & (SR_MD | SR_RB)) /* Bits 29-30 */ - | (env->sr & SR_FD); /* Bit 15 */ + | (env->sr & SR_FD) /* Bit 15 */ + | (env->store_requests ? 
TB_FLAG_PENDING_MOVCA : 0); /* Bit 4 */ } #endif /* _CPU_SH4_H */ diff --git a/target-sh4/helper.h b/target-sh4/helper.h index 631e7e1..d995688 100644 --- a/target-sh4/helper.h +++ b/target-sh4/helper.h @@ -9,6 +9,10 @@ DEF_HELPER_0(debug, void) DEF_HELPER_1(sleep, void, i32) DEF_HELPER_1(trapa, void, i32) +DEF_HELPER_2(movcal, void, i32, i32) +DEF_HELPER_0(do_stores, void) +DEF_HELPER_1(ocbi, void, i32) + DEF_HELPER_2(addv, i32, i32, i32) DEF_HELPER_2(addc, i32, i32, i32) DEF_HELPER_2(subv, i32, i32, i32) diff --git a/target-sh4/op_helper.c b/target-sh4/op_helper.c index 6352219..b4982b0 100644 --- a/target-sh4/op_helper.c +++ b/target-sh4/op_helper.c @@ -122,6 +122,55 @@ void helper_trapa(uint32_t tra) cpu_loop_exit(); } +void helper_movcal(uint32_t address, uint32_t value) +{ + store_request_t *r = (store_request_t *)malloc (sizeof(store_request_t)); + r->address = address; + r->value = value; + r->next = NULL; + + *(env->store_request_tail) = r; + env->store_request_tail = &(r->next); +} + +void helper_do_stores(void) +{ + store_request_t *current = env->store_requests; + + while(current) + { + uint32_t a = current->address, v = current->value; + store_request_t *next = current->next; + free (current); + env->store_requests = current = next; + if (current == 0) + env->store_request_tail = &(env->store_requests); + + stl_data(a, v); + } +} + +void helper_ocbi(uint32_t address) +{ /* Unlink and free the first pending store request whose address equals ADDRESS. */ + store_request_t **current = &(env->store_requests); + for (; *current; current = &((*current)->next)) /* Step to the next link; without this a non-matching entry made the loop spin forever. */ + { + if ((*current)->address == address) + { + store_request_t *next = (*current)->next; + + if (next == 0) + { + env->store_request_tail = current; + } + + free (*current); + *current = next; + break; + } + } +} + uint32_t helper_addc(uint32_t arg0, uint32_t arg1) { uint32_t tmp0, tmp1; diff --git a/target-sh4/translate.c b/target-sh4/translate.c index ba9db14..949cb06 100644 --- a/target-sh4/translate.c +++ b/target-sh4/translate.c @@ -50,6 +50,7 @@ typedef struct DisasContext { uint32_t delayed_pc; int 
singlestep_enabled; uint32_t features; + int has_movcal; } DisasContext; #if defined(CONFIG_USER_ONLY) @@ -278,6 +279,7 @@ CPUSH4State *cpu_sh4_init(const char *cpu_model) return NULL; env->features = def->features; cpu_exec_init(env); + env->store_request_tail = &(env->store_requests); sh4_translate_init(); env->cpu_model_str = cpu_model; cpu_sh4_reset(env); @@ -490,6 +492,40 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg) static void _decode_opc(DisasContext * ctx) { + /* This code tries to make movca.l emulation sufficiently + accurate for Linux purposes. This instruction writes + memory, and prior to that, always allocates a cache line. + It is used in two contexts: + - in memcpy, where data is copied in blocks, the first write + to a block uses movca.l. I presume this is because writing + all data into cache, and then having the data sent into memory + later, via store buffer, is faster than waiting, in a write-through + cache configuration, for a memory write on each store. + - in arch/sh/mm/cache-sh4.c, a movca.l + ocbi combination is used + to flush the cache. Here, the data written by movca.l is never + written to memory, and the data written is just bogus. + + To simulate this, we keep a list of store requests initiated + by movca.l, see env->store_requests. movca.l only adds a new entry + to this list. When we see an instruction that is neither movca.l + nor ocbi, we perform the stores recorded in this list. When we see + ocbi, we check if the stores list has the address being invalidated. + If so, we remove the matching store request from the list. + + To optimize, we only try to flush stores when we're at the start of + a TB, or if we already saw movca.l in this TB and did not flush stores + yet. 
*/ + if (ctx->has_movcal) + { + int opcode = ctx->opcode & 0xf0ff; + if (opcode != 0x0093 /* ocbi */ + && opcode != 0x00c3 /* movca.l */) + { + gen_helper_do_stores (); + ctx->has_movcal = 0; + } + } + #if 0 fprintf(stderr, "Translating opcode 0x%04x\n", ctx->opcode); #endif @@ -1529,7 +1565,8 @@ static void _decode_opc(DisasContext * ctx) } return; case 0x00c3: /* movca.l R0,@Rm */ - tcg_gen_qemu_st32(REG(0), REG(B11_8), ctx->memidx); + gen_helper_movcal (REG(B11_8), REG(0)); + ctx->has_movcal = 1; return; case 0x40a9: /* MOVUA.L @Rm,R0 (Rm) -> R0 @@ -1578,9 +1615,7 @@ static void _decode_opc(DisasContext * ctx) break; case 0x0093: /* ocbi @Rn */ { - TCGv dummy = tcg_temp_new(); - tcg_gen_qemu_ld32s(dummy, REG(B11_8), ctx->memidx); - tcg_temp_free(dummy); + gen_helper_ocbi (REG(B11_8)); } return; case 0x00a3: /* ocbp @Rn */ @@ -1858,6 +1893,7 @@ gen_intermediate_code_internal(CPUState * env, TranslationBlock * tb, ctx.tb = tb; ctx.singlestep_enabled = env->singlestep_enabled; ctx.features = env->features; + ctx.has_movcal = (tb->flags & TB_FLAG_PENDING_MOVCA); #ifdef DEBUG_DISAS if (loglevel & CPU_LOG_TB_CPU) { -- 1.5.3.5