[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] sh : performance problem
From: |
Lionel Landwerlin |
Subject: |
Re: [Qemu-devel] sh : performance problem |
Date: |
Tue, 03 Mar 2009 00:58:47 +0100 |
Shin-ichiro,
Sorry, but I cannot apply your patch cleanly on the last qemu-svn.
Instead, I would like to try another approach. The patch you proposed to
find (or not) a valid TLB entry has a complexity of O(log2(n)) (or
something like that, if I remember correctly); here instead is a patch
with a complexity of O(1).
This patch has the same problem concerning the multiple hit exception
that is not raised anymore, but I think we can handle it with some more
variables in tlb entries.
The patch applies on top of
qemu-svn r6215
+
git://git.kernel.org/pub/scm/virt/qemu/lethal/qemu-sh.git patches
Regards,
--
Lionel Landwerlin <address@hidden>
[PATCH] SH4: Use O(1) algorithm instead of O(n) for utlb lookup
The current algorithm to determine whether a virtual address is covered by
a TLB entry is to scan all the TLB entries one by one. The complexity of
this algorithm is O(n).
The following patch implements a TLB entry cache, by keeping for each
virtual page number the associated TLB entry. As usual the complexity
gain is balanced by an overall memory consumption that is about 5 MB.
This patch also introduces a behavior divergence from the real SH4
processor, because the 'multiple hit exception' is no longer raised when 2
TLB entries map the same physical area. This will be fixed in another
iteration of this patch.
Signed-off-by: Lionel Landwerlin <address@hidden>
---
target-sh4/cpu.h | 8 +++
target-sh4/helper.c | 156 ++++++++++++++++++++++++++++++++++--------------
target-sh4/translate.c | 9 ++-
3 files changed, 127 insertions(+), 46 deletions(-)
diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h
index 4a9ddea..2b17a93 100644
--- a/target-sh4/cpu.h
+++ b/target-sh4/cpu.h
@@ -153,6 +153,14 @@ typedef struct CPUSH4State {
int intr_at_halt; /* SR_BL ignored during sleep */
store_request_t *store_requests;
store_request_t **store_request_tail;
+
+#if !defined(CONFIG_USER_ONLY)
+ /* vpn to utlb entry caches (too much space for user emulation) */
+ uint8_t utlbs_1k[4194304]; /* 2^22 => 4 Mb */
+ uint8_t utlbs_4k[1048576]; /* 2^20 => 1 Mb */
+ uint8_t utlbs_64k[65536]; /* 2^16 => 64 Kb */
+ uint8_t utlbs_1m[4096]; /* 2^12 => 4 Kb */
+#endif
} CPUSH4State;
CPUSH4State *cpu_sh4_init(const char *cpu_model);
diff --git a/target-sh4/helper.c b/target-sh4/helper.c
index 54a3f1f..c9b2ef0 100644
--- a/target-sh4/helper.c
+++ b/target-sh4/helper.c
@@ -240,6 +240,29 @@ static int itlb_replacement(CPUState * env)
assert(0);
}
+static inline int is_tlb_matching (tlb_t *entry, int use_asid,
+ uint8_t asid, target_ulong address)
+{
+ target_ulong start, end;
+
+ /* Invalid entry: entry->v is clear, so report no match (0) */
+ if (unlikely(!entry->v))
+ return 0;
+
+ /* Bad ASID: compared only when use_asid is set and entry->sh is clear */
+ if (unlikely (!entry->sh && use_asid && entry->asid != asid))
+ return 0;
+
+ start = (entry->vpn << 10) & ~(entry->size - 1);
+ end = start + entry->size - 1;
+
+ /* Match: address lies in [start, end], the size-aligned range at vpn << 10 */
+ if (likely (address >= start && address <= end))
+ return 1;
+
+ return 0;
+}
+
/* Find the corresponding entry in the right TLB
Return entry, MMU_DTLB_MISS or MMU_DTLB_MULTIPLE
*/
@@ -253,37 +276,59 @@ static int find_tlb_entry(CPUState * env, target_ulong
address,
asid = env->pteh & 0xff;
- for (i = 0; i < nbtlb; i++) {
- if (!entries[i].v)
- continue; /* Invalid entry */
- if (!entries[i].sh && use_asid && entries[i].asid != asid)
- continue; /* Bad ASID */
-#if 0
- switch (entries[i].sz) {
- case 0:
- size = 1024; /* 1kB */
- break;
- case 1:
- size = 4 * 1024; /* 4kB */
- break;
- case 2:
- size = 64 * 1024; /* 64kB */
- break;
- case 3:
- size = 1024 * 1024; /* 1MB */
- break;
- default:
- assert(0);
- }
-#endif
- start = (entries[i].vpn << 10) & ~(entries[i].size - 1);
- end = start + entries[i].size - 1;
- if (address >= start && address <= end) { /* Match */
- if (match != MMU_DTLB_MISS)
- return MMU_DTLB_MULTIPLE; /* Multiple match */
- match = i;
- }
+ /* In case we are looking at the utlbs. */
+ if (env->utlb == entries)
+ {
+ /* We are most likely to use 4k virtual pages. */
+ i = env->utlbs_4k[address >> 12];
+ if (unlikely (!is_tlb_matching (&entries[i], use_asid,
+ asid, address)))
+ {
+ /* Then huge pages */
+ i = env->utlbs_64k[address >> 14];
+ if (unlikely (!is_tlb_matching (&entries[i], use_asid,
+ asid, address)))
+ {
+ /* Or *HUGE* pages */
+ i = env->utlbs_1m[address >> 16];
+ if (unlikely (!is_tlb_matching (&entries[i], use_asid,
+ asid, address)))
+ {
+ /* And finally tiny pages (who uses that ??). */
+ i = env->utlbs_1k[address >> 10];
+ if (likely (is_tlb_matching (&entries[i], use_asid,
+ asid, address)))
+ match = i;
+ }
+ else
+ match = i;
+ }
+ else
+ match = i;
+ }
+ else
+ match = i;
+ }
+ else
+ {
+ /* We keep the traditional o(n) algorithm for itlbs (only 4 of
+ * them). */
+ for (i = 0; i < nbtlb; i++) {
+ if (!entries[i].v)
+ continue; /* Invalid entry */
+ if (!entries[i].sh && use_asid && entries[i].asid != asid)
+ continue; /* Bad ASID */
+
+ start = (entries[i].vpn << 10) & ~(entries[i].size - 1);
+ end = start + entries[i].size - 1;
+ if (address >= start && address <= end) { /* Match */
+ if (match != MMU_DTLB_MISS)
+ return MMU_DTLB_MULTIPLE; /* Multiple match */
+ match = i;
+ }
+ }
}
+
return match;
}
@@ -403,6 +448,7 @@ static int get_mmu_address(CPUState * env, target_ulong *
physical,
*prot = (rw == 1)? PAGE_WRITE : PAGE_READ;
break;
}
+
} else if (n == MMU_DTLB_MISS) {
n = (rw == 1) ? MMU_DTLB_MISS_WRITE :
MMU_DTLB_MISS_READ;
@@ -412,8 +458,8 @@ static int get_mmu_address(CPUState * env, target_ulong *
physical,
*physical = ((matching->ppn << 10) & ~(matching->size - 1)) |
(address & (matching->size - 1));
if ((rw == 1) & !matching->d)
- n = MMU_DTLB_INITIAL_WRITE;
- else
+ n = MMU_DTLB_INITIAL_WRITE;
+ else
n = MMU_OK;
}
return n;
@@ -545,18 +591,23 @@ void cpu_load_tlb(CPUState * env)
entry->v = (uint8_t)cpu_ptel_v(env->ptel);
entry->ppn = cpu_ptel_ppn(env->ptel);
entry->sz = (uint8_t)cpu_ptel_sz(env->ptel);
+
switch (entry->sz) {
case 0: /* 00 */
entry->size = 1024; /* 1K */
+ env->utlbs_1k[entry->vpn] = n;
break;
case 1: /* 01 */
entry->size = 1024 * 4; /* 4K */
+ env->utlbs_4k[entry->vpn >> 2] = n;
break;
case 2: /* 10 */
entry->size = 1024 * 64; /* 64K */
+ env->utlbs_64k[entry->vpn >> 6] = n;
break;
case 3: /* 11 */
entry->size = 1024 * 1024; /* 1M */
+ env->utlbs_1m[entry->vpn >> 10] = n;
break;
default:
assert(0);
@@ -571,7 +622,7 @@ void cpu_load_tlb(CPUState * env)
entry->tc = (uint8_t)cpu_ptea_tc(env->ptea);
}
-void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *s, target_phys_addr_t addr,
+void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *env, target_phys_addr_t addr,
uint32_t mem_value)
{
int associate = addr & 0x0000080;
@@ -579,7 +630,7 @@ void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *s,
target_phys_addr_t addr,
uint8_t d = (uint8_t)((mem_value & 0x00000200) >> 9);
uint8_t v = (uint8_t)((mem_value & 0x00000100) >> 8);
uint8_t asid = (uint8_t)(mem_value & 0x000000ff);
- int use_asid = (s->mmucr & MMUCR_SV) == 0 || (s->sr & SR_MD) == 0;
+ int use_asid = (env->mmucr & MMUCR_SV) == 0 || (env->sr & SR_MD) == 0;
if (associate) {
int i;
@@ -588,7 +639,7 @@ void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *s,
target_phys_addr_t addr,
/* search UTLB */
for (i = 0; i < UTLB_SIZE; i++) {
- tlb_t * entry = &s->utlb[i];
+ tlb_t * entry = &env->utlb[i];
if (!entry->v)
continue;
@@ -596,8 +647,8 @@ void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *s,
target_phys_addr_t addr,
&& (!use_asid || entry->asid == asid || entry->sh)) {
if (utlb_match_entry) {
/* Multiple TLB Exception */
- s->exception_index = 0x140;
- s->tea = addr;
+ env->exception_index = 0x140;
+ env->tea = addr;
break;
}
if (entry->v && !v)
@@ -606,12 +657,12 @@ void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *s,
target_phys_addr_t addr,
entry->d = d;
utlb_match_entry = entry;
}
- increment_urc(s); /* per utlb access */
+ increment_urc(env); /* per utlb access */
}
/* search ITLB */
for (i = 0; i < ITLB_SIZE; i++) {
- tlb_t * entry = &s->itlb[i];
+ tlb_t * entry = &env->itlb[i];
if (entry->vpn == vpn
&& (!use_asid || entry->asid == asid || entry->sh)) {
if (entry->v && !v)
@@ -625,23 +676,40 @@ void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *s,
target_phys_addr_t addr,
}
if (needs_tlb_flush)
- tlb_flush_page(s, vpn << 10);
+ tlb_flush_page(env, vpn << 10);
} else {
int index = (addr & 0x00003f00) >> 8;
- tlb_t * entry = &s->utlb[index];
+ tlb_t * entry = &env->utlb[index];
if (entry->v) {
/* Overwriting valid entry in utlb. */
target_ulong address = entry->vpn << 10;
- if (!same_tlb_entry_exists(s->itlb, ITLB_SIZE, entry)) {
- tlb_flush_page(s, address);
+ if (!same_tlb_entry_exists(env->itlb, ITLB_SIZE, entry)) {
+ tlb_flush_page(env, address);
}
}
entry->asid = asid;
entry->vpn = vpn;
entry->d = d;
entry->v = v;
- increment_urc(s);
+ switch (entry->sz) {
+ case 0: /* 00 */
+ env->utlbs_1k[entry->vpn] = index;
+ break;
+ case 1: /* 01 */
+ env->utlbs_4k[entry->vpn >> 2] = index;
+ break;
+ case 2: /* 10 */
+ env->utlbs_64k[entry->vpn >> 6] = index;
+ break;
+ case 3: /* 11 */
+ env->utlbs_1m[entry->vpn >> 10] = index;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ increment_urc(env);
}
}
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index df81052..eab21a6 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -202,6 +202,11 @@ static void cpu_sh4_reset(CPUSH4State * env)
set_float_rounding_mode(float_round_to_zero, &env->fp_status);
#endif
env->mmucr = 0;
+
+ memset (env->utlbs_1k, 0, sizeof (env->utlbs_1k));
+ memset (env->utlbs_4k, 0, sizeof (env->utlbs_4k));
+ memset (env->utlbs_64k, 0, sizeof (env->utlbs_64k));
+ memset (env->utlbs_1m, 0, sizeof (env->utlbs_1m));
}
typedef struct {
@@ -510,7 +515,7 @@ static void _decode_opc(DisasContext * ctx)
to this list. When we see an instruction that is neither movca.l
nor ocbi, we perform the stores recorded in this list. When we see
ocbi, we check if the stores list has the address being invalidated.
- If so, we remove the address from the list.
+ If so, we remove the address from the list.
To optimize, we only try to flush stores when we're at the start of
TB, or if we already saw movca.l in this TB and did not flush stores
@@ -1565,7 +1570,7 @@ static void _decode_opc(DisasContext * ctx)
}
return;
case 0x00c3: /* movca.l R0,@Rm */
- gen_helper_movcal (REG(B11_8), REG(0));
+ gen_helper_movcal (REG(B11_8), REG(0));
ctx->has_movcal = 1;
return;
case 0x40a9:
--
1.5.6.5
- Re: [Qemu-devel] sh : performance problem,
Lionel Landwerlin <=
- Re: [Qemu-devel] sh : performance problem, Lionel Landwerlin, 2009/03/02
- Re: [Qemu-devel] sh : performance problem, Shin-ichiro KAWASAKI, 2009/03/03
- Re: [Qemu-devel] sh : performance problem, Lionel Landwerlin, 2009/03/03
- Re: [Qemu-devel] sh : performance problem, Laurent Desnogues, 2009/03/03
- Re: [Qemu-devel] sh : performance problem, Lionel Landwerlin, 2009/03/03
- Re: [Qemu-devel] sh : performance problem, Paul Brook, 2009/03/03
- Re: [Qemu-devel] sh : performance problem, Shin-ichiro KAWASAKI, 2009/03/04
- Re: [Qemu-devel] sh : performance problem, Shin-ichiro KAWASAKI, 2009/03/04
- Re: [Qemu-devel] sh : performance problem, Lionel Landwerlin, 2009/03/03
- Re: [Qemu-devel] sh : performance problem, Shin-ichiro KAWASAKI, 2009/03/04