--- trunk/src/cpus/cpu_arm_instr.c 2007/10/08 16:19:05 17 +++ trunk/src/cpus/cpu_arm_instr.c 2007/10/08 16:19:11 18 @@ -25,7 +25,7 @@ * SUCH DAMAGE. * * - * $Id: cpu_arm_instr.c,v 1.29 2005/10/11 03:31:28 debug Exp $ + * $Id: cpu_arm_instr.c,v 1.39 2005/10/27 14:01:13 debug Exp $ * * ARM instructions. * @@ -36,6 +36,74 @@ */ +#include "arm_quick_pc_to_pointers.h" + +/* #define GATHER_BDT_STATISTICS */ + + +#ifdef GATHER_BDT_STATISTICS +/* + * update_bdt_statistics(): + * + * Gathers statistics about load/store multiple instructions. + * + * NOTE/TODO: Perhaps it would be more memory efficient to swap the high + * and low parts of the instruction word, so that the lllllll bits become + * the high bits; this would cause fewer host pages to be used. Anyway, the + * current implementation works on hosts with lots of RAM. + * + * The resulting file, bdt_statistics.txt, should then be processed like + * this to give a new cpu_arm_multi.txt: + * + * uniq -c bdt_statistics.txt|sort -nr|head -256|cut -f 2 > cpu_arm_multi.txt + */ +static void update_bdt_statistics(uint32_t iw) +{ + static FILE *f = NULL; + static long long *counts; + static char *counts_used; + static long long n = 0; + + if (f == NULL) { + size_t s = (1 << 24) * sizeof(long long); + f = fopen("bdt_statistics.txt", "w"); + if (f == NULL) { + fprintf(stderr, "update_bdt_statistics(): :-(\n"); + exit(1); + } + counts = zeroed_alloc(s); + counts_used = zeroed_alloc(65536); + } + + /* Drop the s-bit: xxxx100P USWLnnnn llllllll llllllll */ + iw = ((iw & 0x01800000) >> 1) | (iw & 0x003fffff); + + counts_used[iw & 0xffff] = 1; + counts[iw] ++; + + n ++; + if ((n % 500000) == 0) { + int i; + long long j; + fatal("[ update_bdt_statistics(): n = %lli ]\n", (long long) n); + fseek(f, 0, SEEK_SET); + for (i=0; i<0x1000000; i++) + if (counts_used[i & 0xffff] && counts[i] != 0) { + /* Recreate the opcode: */ + uint32_t opcode = ((i & 0x00c00000) << 1) + | (i & 0x003fffff) | 0x08000000; + for (j=0; jpc = cpu->cd.arm.r[ARM_PC]; /* Find the new physical page and update the translation pointers: */ - arm_pc_to_pointers(cpu); + quick_pc_to_pointers(cpu); } Y(b) @@ -208,7 +280,7 @@ cpu->pc &= ~3; /* Find the new physical page and update the translation pointers: */ - arm_pc_to_pointers(cpu); + quick_pc_to_pointers(cpu); } Y(bx) @@ -230,7 +302,7 @@ cpu_functioncall_trace_return(cpu); /* Find the new physical page and update the translation pointers: */ - arm_pc_to_pointers(cpu); + quick_pc_to_pointers(cpu); } Y(bx_trace) @@ -258,7 +330,7 @@ cpu->pc = cpu->cd.arm.r[ARM_PC] = lr - 4 + (int32_t)ic->arg[0]; /* Find the new physical page and update the translation pointers: */ - arm_pc_to_pointers(cpu); + quick_pc_to_pointers(cpu); } Y(bl) @@ -290,7 +362,7 @@ cpu->pc &= ~3; /* Find the new physical page and update the translation pointers: */ - arm_pc_to_pointers(cpu); + quick_pc_to_pointers(cpu); } Y(blx) @@ -320,7 +392,7 @@ cpu_functioncall_trace(cpu, cpu->pc); /* Find the new physical page and update the translation pointers: */ - arm_pc_to_pointers(cpu); + quick_pc_to_pointers(cpu); } Y(bl_trace) @@ -383,6 +455,9 @@ Y(bl_samepage_trace) +#include "cpu_arm_instr_misc.c" + + /* * mul: Multiplication * @@ -397,7 +472,8 @@ Y(mul) X(muls) { - uint32_t result = reg(ic->arg[1]) * reg(ic->arg[2]); + uint32_t result; + result = reg(ic->arg[1]) * reg(ic->arg[2]); cpu->cd.arm.cpsr &= ~(ARM_FLAG_Z | ARM_FLAG_N); if (result == 0) cpu->cd.arm.cpsr |= ARM_FLAG_Z; @@ -417,8 +493,9 @@ { /* xxxx0000 00ASdddd nnnnssss 1001mmmm (Rd,Rm,Rs[,Rn]) */ uint32_t iw = ic->arg[0]; - int rd = (iw >> 16) & 15, rn = (iw >> 12) & 15, - rs = (iw >> 8) & 15, rm = iw & 15; + int rd, rs, rn, rm; + rd = (iw >> 16) & 15; rn = (iw >> 12) & 15, + rs = (iw >> 8) & 15; rm = iw & 15; cpu->cd.arm.r[rd] = cpu->cd.arm.r[rm] * cpu->cd.arm.r[rs] + cpu->cd.arm.r[rn]; } @@ -427,8 +504,9 @@ { /* xxxx0000 00ASdddd nnnnssss 1001mmmm (Rd,Rm,Rs[,Rn]) */ uint32_t iw = ic->arg[0]; - int rd = (iw >> 16) & 15, rn = (iw >> 12) & 15, - rs = (iw >> 8) & 15, rm = iw & 15; + int rd, rs, rn, rm; + rd = (iw >> 16) & 15; rn = (iw >> 12) & 15, + rs = (iw >> 8) & 15; rm = iw & 15; cpu->cd.arm.r[rd] = cpu->cd.arm.r[rm] * cpu->cd.arm.r[rs] + cpu->cd.arm.r[rn]; cpu->cd.arm.cpsr &= ~(ARM_FLAG_Z | ARM_FLAG_N); @@ -448,9 +526,10 @@ X(mull) { /* xxxx0000 1UAShhhh llllssss 1001mmmm */ - uint32_t iw = ic->arg[0]; - int u_bit = (iw >> 22) & 1, a_bit = (iw >> 21) & 1; - uint64_t tmp = cpu->cd.arm.r[iw & 15]; + uint32_t iw; uint64_t tmp; int u_bit, a_bit; + iw = ic->arg[0]; + u_bit = (iw >> 22) & 1; a_bit = (iw >> 21) & 1; + tmp = cpu->cd.arm.r[iw & 15]; if (u_bit) tmp = (int64_t)(int32_t)tmp * (int64_t)(int32_t)cpu->cd.arm.r[(iw >> 8) & 15]; @@ -485,13 +564,15 @@ /* * ret_trace: "mov pc,lr" with trace enabled + * ret: "mov pc,lr" without trace enabled * * arg[0] = ignored */ X(ret_trace) { - uint32_t old_pc = cpu->cd.arm.r[ARM_PC]; - uint32_t mask_within_page = ((ARM_IC_ENTRIES_PER_PAGE-1) + uint32_t old_pc, mask_within_page; + old_pc = cpu->cd.arm.r[ARM_PC]; + mask_within_page = ((ARM_IC_ENTRIES_PER_PAGE-1) << ARM_INSTR_ALIGNMENT_SHIFT) | ((1 << ARM_INSTR_ALIGNMENT_SHIFT) - 1); @@ -509,10 +590,16 @@ ((cpu->pc & mask_within_page) >> ARM_INSTR_ALIGNMENT_SHIFT); } else { /* Find the new physical page and update pointers: */ - arm_pc_to_pointers(cpu); + quick_pc_to_pointers(cpu); } } Y(ret_trace) +X(ret) +{ + cpu->pc = cpu->cd.arm.r[ARM_PC] = cpu->cd.arm.r[ARM_LR]; + quick_pc_to_pointers(cpu); +} +Y(ret) /* @@ -672,7 +759,7 @@ cpu->pc = cpu->cd.arm.r[ARM_PC] = cpu->cd.arm.r[ARM_LR]; if (cpu->machine->show_trace_tree) cpu_functioncall_trace_return(cpu); - arm_pc_to_pointers(cpu); + quick_pc_to_pointers(cpu); } @@ -700,7 +787,7 @@ } else if (cpu->pc != old_pc) { /* PC was changed by the SWI call. Find the new physical page and update the translation pointers: */ - arm_pc_to_pointers(cpu); + quick_pc_to_pointers(cpu); } } Y(swi_useremul) @@ -792,6 +879,9 @@ struct arm_instr_call *); X(store_w0_byte_u1_p0_imm); X(store_w0_word_u1_p0_imm); +X(load_w0_word_u1_p0_imm); +X(load_w0_byte_u1_p1_imm); +X(load_w0_byte_u1_p1_reg); extern void (*arm_load_store_instr_pc[1024])(struct cpu *, struct arm_instr_call *); @@ -808,6 +898,7 @@ struct arm_instr_call *); X(cmps); X(sub); +X(add); X(subs); @@ -832,6 +923,11 @@ int i, return_flag = 0; uint32_t new_values[16]; +#ifdef GATHER_BDT_STATISTICS + if (!s_bit) + update_bdt_statistics(iw); +#endif + /* Synchronize the program counter: */ low_pc = ((size_t)ic - (size_t) cpu->cd.arm.cur_ic_page) / sizeof(struct arm_instr_call); @@ -990,7 +1086,7 @@ same page! */ /* Find the new physical page and update the translation pointers: */ - arm_pc_to_pointers(cpu); + quick_pc_to_pointers(cpu); } } Y(bdt_load) @@ -1015,6 +1111,11 @@ int w_bit = iw & 0x00200000; int i; +#ifdef GATHER_BDT_STATISTICS + if (!s_bit) + update_bdt_statistics(iw); +#endif + /* Synchronize the program counter: */ low_pc = ((size_t)ic - (size_t) cpu->cd.arm.cur_ic_page) / sizeof(struct arm_instr_call); @@ -1108,6 +1209,10 @@ Y(bdt_store) +/* Various load/store multiple instructions: */ +#include "tmp_arm_multi.c" + + /*****************************************************************************/ @@ -1256,6 +1361,199 @@ } +/* + * netbsd_memset: + * + * The core of a NetBSD/arm memset. + * + * f01bc420: e25XX080 subs rX,rX,#0x80 + * f01bc424: a8ac000c stmgeia ip!,{r2,r3} (16 of these) + * .. + * f01bc464: caffffed bgt 0xf01bc420 + */ +X(netbsd_memset) +{ + unsigned char *page; + uint32_t addr; + + do { + addr = cpu->cd.arm.r[ARM_IP]; + + instr(subs)(cpu, ic); + + if (((cpu->cd.arm.cpsr & ARM_FLAG_N)?1:0) != + ((cpu->cd.arm.cpsr & ARM_FLAG_V)?1:0)) { + cpu->n_translated_instrs += 16; + /* Skip the store multiples: */ + cpu->cd.arm.next_ic = &ic[17]; + return; + } + + /* Crossing a page boundary? Then continue non-combined. */ + if ((addr & 0xfff) + 128 > 0x1000) + return; + + /* R2/R3 non-zero? Not allowed here. */ + if (cpu->cd.arm.r[2] != 0 || cpu->cd.arm.r[3] != 0) + return; + + /* printf("addr = 0x%08x\n", addr); */ + + page = cpu->cd.arm.host_store[addr >> 12]; + /* No page translation? Continue non-combined. */ + if (page == NULL) + return; + + /* Clear: */ + memset(page + (addr & 0xfff), 0, 128); + cpu->cd.arm.r[ARM_IP] = addr + 128; + cpu->n_translated_instrs += 16; + + /* Branch back if greater: */ + cpu->n_translated_instrs += 1; + } while (((cpu->cd.arm.cpsr & ARM_FLAG_N)?1:0) == + ((cpu->cd.arm.cpsr & ARM_FLAG_V)?1:0) && + !(cpu->cd.arm.cpsr & ARM_FLAG_Z)); + + /* Continue at the instruction after the bgt: */ + cpu->cd.arm.next_ic = &ic[18]; +} + + +/* + * netbsd_memcpy: + * + * The core of a NetBSD/arm memcpy. + * + * f01bc530: e8b15018 ldmia r1!,{r3,r4,ip,lr} + * f01bc534: e8a05018 stmia r0!,{r3,r4,ip,lr} + * f01bc538: e8b15018 ldmia r1!,{r3,r4,ip,lr} + * f01bc53c: e8a05018 stmia r0!,{r3,r4,ip,lr} + * f01bc540: e2522020 subs r2,r2,#0x20 + * f01bc544: aafffff9 bge 0xf01bc530 + */ +X(netbsd_memcpy) +{ + unsigned char *page_0, *page_1; + uint32_t addr_r0, addr_r1; + + do { + addr_r0 = cpu->cd.arm.r[0]; + addr_r1 = cpu->cd.arm.r[1]; + + /* printf("addr_r0 = %08x r1 = %08x\n", addr_r0, addr_r1); */ + + /* Crossing a page boundary? Then continue non-combined. */ + if ((addr_r0 & 0xfff) + 32 > 0x1000 || + (addr_r1 & 0xfff) + 32 > 0x1000) { + instr(multi_0x08b15018)(cpu, ic); + return; + } + + page_0 = cpu->cd.arm.host_store[addr_r0 >> 12]; + page_1 = cpu->cd.arm.host_store[addr_r1 >> 12]; + + /* No page translations? Continue non-combined. */ + if (page_0 == NULL || page_1 == NULL) { + instr(multi_0x08b15018)(cpu, ic); + return; + } + + memcpy(page_0 + (addr_r0 & 0xfff), + page_1 + (addr_r1 & 0xfff), 32); + cpu->cd.arm.r[0] = addr_r0 + 32; + cpu->cd.arm.r[1] = addr_r1 + 32; + + cpu->n_translated_instrs += 4; + + instr(subs)(cpu, ic + 4); + cpu->n_translated_instrs ++; + + /* Loop while greater or equal: */ + cpu->n_translated_instrs ++; + } while (((cpu->cd.arm.cpsr & ARM_FLAG_N)?1:0) == + ((cpu->cd.arm.cpsr & ARM_FLAG_V)?1:0)); + + /* Continue at the instruction after the bge: */ + cpu->cd.arm.next_ic = &ic[6]; + cpu->n_translated_instrs --; +} + + +/* + * netbsd_cacheclean: + * + * The core of a NetBSD/arm cache clean routine, variant 1: + * + * f015f88c: e4902020 ldr r2,[r0],#32 + * f015f890: e2511020 subs r1,r1,#0x20 + * f015f894: 1afffffc bne 0xf015f88c + * f015f898: ee070f9a mcr 15,0,r0,cr7,cr10,4 + */ +X(netbsd_cacheclean) +{ + uint32_t r1 = cpu->cd.arm.r[1]; + cpu->n_translated_instrs += ((r1 >> 5) * 3); + cpu->cd.arm.next_ic = &ic[4]; +} + + +/* + * netbsd_cacheclean2: + * + * The core of a NetBSD/arm cache clean routine, variant 2: + * + * f015f93c: ee070f3a mcr 15,0,r0,cr7,cr10,1 + * f015f940: ee070f36 mcr 15,0,r0,cr7,cr6,1 + * f015f944: e2800020 add r0,r0,#0x20 + * f015f948: e2511020 subs r1,r1,#0x20 + * f015f94c: 8afffffa bhi 0xf015f93c + */ +X(netbsd_cacheclean2) +{ + cpu->n_translated_instrs += ((cpu->cd.arm.r[1] >> 5) * 5) - 1; + cpu->cd.arm.next_ic = &ic[5]; +} + + +/* + * netbsd_scanc: + * + * f01bccbc: e5d13000 ldrb r3,[r1] + * f01bccc0: e7d23003 ldrb r3,[r2,r3] + * f01bccc4: e113000c tsts r3,ip + */ +X(netbsd_scanc) +{ + unsigned char *page = cpu->cd.arm.host_load[cpu->cd.arm.r[1] >> 12]; + uint32_t t; + + if (page == NULL) { + instr(load_w0_byte_u1_p1_imm)(cpu, ic); + return; + } + + t = page[cpu->cd.arm.r[1] & 0xfff]; + t += cpu->cd.arm.r[2]; + page = cpu->cd.arm.host_load[t >> 12]; + + if (page == NULL) { + instr(load_w0_byte_u1_p1_imm)(cpu, ic); + return; + } + + cpu->cd.arm.r[3] = page[t & 0xfff]; + + t = cpu->cd.arm.r[3] & cpu->cd.arm.r[ARM_IP]; + cpu->cd.arm.cpsr &= ~(ARM_FLAG_Z | ARM_FLAG_N); + if (t == 0) + cpu->cd.arm.cpsr |= ARM_FLAG_Z; + + cpu->n_translated_instrs += 2; + cpu->cd.arm.next_ic = &ic[3]; +} + + /*****************************************************************************/ @@ -1269,7 +1567,7 @@ cpu->pc = cpu->cd.arm.r[ARM_PC]; /* Find the new physical page and update the translation pointers: */ - arm_pc_to_pointers(cpu); + quick_pc_to_pointers(cpu); /* end_of_page doesn't count as an executed instruction: */ cpu->n_translated_instrs --; @@ -1280,15 +1578,135 @@ /* - * arm_combine_instructions(): + * arm_combine_netbsd_memset(): + * + * Check for the core of a NetBSD/arm memset; large memsets use a sequence + * of 16 store-multiple instructions, each storing 2 registers at a time. + */ +void arm_combine_netbsd_memset(struct cpu *cpu, struct arm_instr_call *ic, + int low_addr) +{ + int n_back = (low_addr >> ARM_INSTR_ALIGNMENT_SHIFT) + & (ARM_IC_ENTRIES_PER_PAGE-1); + + if (n_back >= 17) { + int i; + for (i=-16; i<=-1; i++) + if (ic[i].f != instr(multi_0x08ac000c__ge)) + return; + if (ic[-17].f == instr(subs) && + ic[-17].arg[0]==ic[-17].arg[2] && ic[-17].arg[1] == 128 && + ic[ 0].f == instr(b_samepage__gt) && + ic[ 0].arg[0] == (size_t)&ic[-17]) { + ic[-17].f = instr(netbsd_memset); + combined; + } + } +} + + +/* + * arm_combine_netbsd_memcpy(): + * + * Check for the core of a NetBSD/arm memcpy; large memcpys use a + * sequence of ldmia instructions. + */ +void arm_combine_netbsd_memcpy(struct cpu *cpu, struct arm_instr_call *ic, + int low_addr) +{ + int n_back = (low_addr >> ARM_INSTR_ALIGNMENT_SHIFT) + & (ARM_IC_ENTRIES_PER_PAGE-1); + + if (n_back >= 5) { + if (ic[-5].f==instr(multi_0x08b15018) && + ic[-4].f==instr(multi_0x08a05018) && + ic[-3].f==instr(multi_0x08b15018) && + ic[-2].f==instr(multi_0x08a05018) && + ic[-1].f == instr(subs) && + ic[-1].arg[0]==ic[-1].arg[2] && ic[-1].arg[1] == 0x20 && + ic[ 0].f == instr(b_samepage__ge) && + ic[ 0].arg[0] == (size_t)&ic[-5]) { + ic[-5].f = instr(netbsd_memcpy); + combined; + } + } +} + + +/* + * arm_combine_netbsd_cacheclean(): + * + * Check for the core of a NetBSD/arm cache clean. (There are two variants.) + */ +void arm_combine_netbsd_cacheclean(struct cpu *cpu, struct arm_instr_call *ic, + int low_addr) +{ + int n_back = (low_addr >> ARM_INSTR_ALIGNMENT_SHIFT) + & (ARM_IC_ENTRIES_PER_PAGE-1); + + if (n_back >= 3) { + if (ic[-3].f==instr(load_w0_word_u1_p0_imm) && + ic[-2].f == instr(subs) && + ic[-2].arg[0]==ic[-2].arg[2] && ic[-2].arg[1] == 0x20 && + ic[-1].f == instr(b_samepage__ne) && + ic[-1].arg[0] == (size_t)&ic[-3]) { + ic[-3].f = instr(netbsd_cacheclean); + combined; + } + } +} + + +/* + * arm_combine_netbsd_cacheclean2(): * - * Combine two or more instructions, if possible, into a single function call. + * Check for the core of a NetBSD/arm cache clean. (Second variant.) */ -void arm_combine_instructions(struct cpu *cpu, struct arm_instr_call *ic, - uint32_t addr) +void arm_combine_netbsd_cacheclean2(struct cpu *cpu, struct arm_instr_call *ic, + int low_addr) { - int n_back; - n_back = (addr >> ARM_INSTR_ALIGNMENT_SHIFT) + int n_back = (low_addr >> ARM_INSTR_ALIGNMENT_SHIFT) + & (ARM_IC_ENTRIES_PER_PAGE-1); + + if (n_back >= 4) { + if (ic[-4].f == instr(mcr_mrc) && ic[-4].arg[0] == 0xee070f3a && + ic[-3].f == instr(mcr_mrc) && ic[-3].arg[0] == 0xee070f36 && + ic[-2].f == instr(add) && + ic[-2].arg[0]==ic[-2].arg[2] && ic[-2].arg[1] == 0x20 && + ic[-1].f == instr(subs) && + ic[-1].arg[0]==ic[-1].arg[2] && ic[-1].arg[1] == 0x20) { + ic[-4].f = instr(netbsd_cacheclean2); + combined; + } + } +} + + +/* + * arm_combine_netbsd_scanc(): + */ +void arm_combine_netbsd_scanc(struct cpu *cpu, struct arm_instr_call *ic, + int low_addr) +{ + int n_back = (low_addr >> ARM_INSTR_ALIGNMENT_SHIFT) + & (ARM_IC_ENTRIES_PER_PAGE-1); + + if (n_back >= 2) { + if (ic[-2].f == instr(load_w0_byte_u1_p1_imm) && + ic[-1].f == instr(load_w0_byte_u1_p1_reg)) { + ic[-2].f = instr(netbsd_scanc); + combined; + } + } +} + + +/* + * arm_combine_test2(): + */ +void arm_combine_test2(struct cpu *cpu, struct arm_instr_call *ic, int low_addr) +{ + int n_back = (low_addr >> ARM_INSTR_ALIGNMENT_SHIFT) & (ARM_IC_ENTRIES_PER_PAGE-1); if (n_back >= 2) { @@ -1299,9 +1717,15 @@ ic[ 0].f == instr(b_samepage__gt) && ic[ 0].arg[0] == (size_t)&ic[-2]) { ic[-2].f = instr(fill_loop_test2); +printf("YO test2\n"); combined; } } +} + + +#if 0 + /* TODO: This is another test hack. */ if (n_back >= 3) { if (ic[-3].f == instr(cmps) && @@ -1317,9 +1741,8 @@ combined; } } - /* TODO: Combine forward as well */ -} +#endif /*****************************************************************************/ @@ -1339,8 +1762,7 @@ unsigned char *page; unsigned char ib[4]; int condition_code, main_opcode, secondary_opcode, s_bit, rn, rd, r8; - int p_bit, u_bit, b_bit, w_bit, l_bit, regform, rm, c, t; - int any_pc_reg; + int p_bit, u_bit, b_bit, w_bit, l_bit, regform, rm, c, t, any_pc_reg; void (*samepage_function)(struct cpu *, struct arm_instr_call *); /* Figure out the address of the instruction: */ @@ -1363,7 +1785,7 @@ sizeof(ib), MEM_READ, CACHE_INSTRUCTION)) { fatal("to_be_translated(): " "read failed: TODO\n"); - goto bad; + return; } } @@ -1561,10 +1983,12 @@ goto bad; } - /* "mov pc,lr" with trace enabled: */ - if ((iword & 0x0fffffff) == 0x01a0f00e && - cpu->machine->show_trace_tree) { - ic->f = cond_instr(ret_trace); + /* "mov pc,lr": */ + if ((iword & 0x0fffffff) == 0x01a0f00e) { + if (cpu->machine->show_trace_tree) + ic->f = cond_instr(ret_trace); + else + ic->f = cond_instr(ret); break; } @@ -1577,6 +2001,50 @@ break; } + /* "mov reg,#0": */ + if ((iword & 0x0fff0fff) == 0x03a03000 && rd != ARM_PC) { + switch (rd) { + case 0: ic->f = cond_instr(clear_r0); break; + case 1: ic->f = cond_instr(clear_r1); break; + case 2: ic->f = cond_instr(clear_r2); break; + case 3: ic->f = cond_instr(clear_r3); break; + case 4: ic->f = cond_instr(clear_r4); break; + case 5: ic->f = cond_instr(clear_r5); break; + case 6: ic->f = cond_instr(clear_r6); break; + case 7: ic->f = cond_instr(clear_r7); break; + case 8: ic->f = cond_instr(clear_r8); break; + case 9: ic->f = cond_instr(clear_r9); break; + case 10: ic->f = cond_instr(clear_r10); break; + case 11: ic->f = cond_instr(clear_r11); break; + case 12: ic->f = cond_instr(clear_r12); break; + case 13: ic->f = cond_instr(clear_r13); break; + case 14: ic->f = cond_instr(clear_r14); break; + } + break; + } + + /* "mov reg,#1": */ + if ((iword & 0x0fff0fff) == 0x03a03001 && rd != ARM_PC) { + switch (rd) { + case 0: ic->f = cond_instr(mov1_r0); break; + case 1: ic->f = cond_instr(mov1_r1); break; + case 2: ic->f = cond_instr(mov1_r2); break; + case 3: ic->f = cond_instr(mov1_r3); break; + case 4: ic->f = cond_instr(mov1_r4); break; + case 5: ic->f = cond_instr(mov1_r5); break; + case 6: ic->f = cond_instr(mov1_r6); break; + case 7: ic->f = cond_instr(mov1_r7); break; + case 8: ic->f = cond_instr(mov1_r8); break; + case 9: ic->f = cond_instr(mov1_r9); break; + case 10: ic->f = cond_instr(mov1_r10); break; + case 11: ic->f = cond_instr(mov1_r11); break; + case 12: ic->f = cond_instr(mov1_r12); break; + case 13: ic->f = cond_instr(mov1_r13); break; + case 14: ic->f = cond_instr(mov1_r14); break; + } + break; + } + /* * Generic Data Processing Instructions: */ @@ -1613,6 +2081,9 @@ ic->f = arm_dpi_instr[condition_code + 16 * secondary_opcode + (s_bit? 256 : 0) + (any_pc_reg? 512 : 0) + (regform? 1024 : 0)]; + + if (iword == 0xe113000c) + cpu->combination_check = arm_combine_netbsd_scanc; break; case 0x4: /* Load and store... */ @@ -1645,12 +2116,41 @@ case 0x8: /* Multiple load/store... (Block data transfer) */ case 0x9: /* xxxx100P USWLnnnn llllllll llllllll */ + ic->arg[0] = (size_t)(&cpu->cd.arm.r[rn]); + ic->arg[1] = (size_t)iword; + /* Generic case: */ if (l_bit) ic->f = cond_instr(bdt_load); else ic->f = cond_instr(bdt_store); - ic->arg[0] = (size_t)(&cpu->cd.arm.r[rn]); - ic->arg[1] = (size_t)iword; +#if defined(HOST_LITTLE_ENDIAN) && !defined(GATHER_BDT_STATISTICS) + /* + * Check for availability of optimized implementation: + * xxxx100P USWLnnnn llllllll llllllll + * ^ ^ ^ ^ ^ ^ ^ ^ (0x00950154) + * These bits are used to select which list to scan, and then + * the list is scanned linearly. + * + * The optimized functions do not support show_trace_tree, + * but it's ok to use the unoptimized version in that case. + */ + if (!cpu->machine->show_trace_tree) { + int i = 0, j = iword; + j = ((j & 0x00800000) >> 16) | ((j & 0x00100000) >> 14) + | ((j & 0x00040000) >> 13) | ((j & 0x00010000) >> 12) + | ((j & 0x00000100) >> 5) | ((j & 0x00000040) >> 4) + | ((j & 0x00000010) >> 3) | ((j & 0x00000004) >> 2); + while (multi_opcode[j][i] != 0) { + if ((iword & 0x0fffffff) == + multi_opcode[j][i]) { + ic->f = multi_opcode_f[j] + [i*16 + condition_code]; + break; + } + i ++; + } + } +#endif if (rn == ARM_PC) { fatal("TODO: bdt with PC as base\n"); goto bad; @@ -1662,6 +2162,14 @@ if (main_opcode == 0x0a) { ic->f = cond_instr(b); samepage_function = cond_instr(b_samepage); + /* if (iword == 0xcafffffc) + cpu->combination_check = arm_combine_test2; */ + if (iword == 0xcaffffed) + cpu->combination_check = + arm_combine_netbsd_memset; + if (iword == 0xaafffff9) + cpu->combination_check = + arm_combine_netbsd_memcpy; } else { if (cpu->machine->show_trace_tree) { ic->f = cond_instr(bl_trace); @@ -1699,6 +2207,12 @@ ARM_INSTR_ALIGNMENT_SHIFT)); } } + +#if 0 + /* Hm. This doesn't really increase performance. */ + if (iword == 0x8afffffa) + cpu->combination_check = arm_combine_netbsd_cacheclean2; +#endif break; case 0xe: @@ -1711,6 +2225,8 @@ ic->arg[0] = iword; ic->f = cond_instr(cdp); } + if (iword == 0xee070f9a) + cpu->combination_check = arm_combine_netbsd_cacheclean; break; case 0xf: