--- trunk/src/cpus/cpu_mips_instr.c 2007/10/08 16:20:18 27 +++ trunk/src/cpus/cpu_mips_instr.c 2007/10/08 16:20:26 28 @@ -25,7 +25,7 @@ * SUCH DAMAGE. * * - * $Id: cpu_mips_instr.c,v 1.87 2006/06/25 02:46:08 debug Exp $ + * $Id: cpu_mips_instr.c,v 1.97 2006/07/20 03:20:03 debug Exp $ * * MIPS instructions. * @@ -135,6 +135,26 @@ } cpu->delay_slot = NOT_DELAYED; } +X(beq_samepage_addiu) +{ + MODE_uint_t rs = reg(ic->arg[0]), rt = reg(ic->arg[1]); + cpu->n_translated_instrs ++; + reg(ic[1].arg[1]) = (int32_t) + ((int32_t)reg(ic[1].arg[0]) + (int32_t)ic[1].arg[2]); + if (rs == rt) + cpu->cd.mips.next_ic = (struct mips_instr_call *) ic->arg[2]; + else + cpu->cd.mips.next_ic ++; +} +X(beq_samepage_nop) +{ + MODE_uint_t rs = reg(ic->arg[0]), rt = reg(ic->arg[1]); + cpu->n_translated_instrs ++; + if (rs == rt) + cpu->cd.mips.next_ic = (struct mips_instr_call *) ic->arg[2]; + else + cpu->cd.mips.next_ic ++; +} X(bne) { MODE_int_t old_pc = cpu->pc; @@ -172,6 +192,26 @@ } cpu->delay_slot = NOT_DELAYED; } +X(bne_samepage_addiu) +{ + MODE_uint_t rs = reg(ic->arg[0]), rt = reg(ic->arg[1]); + cpu->n_translated_instrs ++; + reg(ic[1].arg[1]) = (int32_t) + ((int32_t)reg(ic[1].arg[0]) + (int32_t)ic[1].arg[2]); + if (rs != rt) + cpu->cd.mips.next_ic = (struct mips_instr_call *) ic->arg[2]; + else + cpu->cd.mips.next_ic ++; +} +X(bne_samepage_nop) +{ + MODE_uint_t rs = reg(ic->arg[0]), rt = reg(ic->arg[1]); + cpu->n_translated_instrs ++; + if (rs != rt) + cpu->cd.mips.next_ic = (struct mips_instr_call *) ic->arg[2]; + else + cpu->cd.mips.next_ic ++; +} X(b) { MODE_int_t old_pc = cpu->pc; @@ -896,6 +936,16 @@ } else cpu->delay_slot = NOT_DELAYED; } +X(jr_ra_addiu) +{ + /* jr ra, followed by an addiu */ + MODE_int_t rs = cpu->cd.mips.gpr[MIPS_GPR_RA]; + reg(ic[1].arg[1]) = (int32_t) + ((int32_t)reg(ic[1].arg[0]) + (int32_t)ic[1].arg[2]); + cpu->pc = rs; + quick_pc_to_pointers(cpu); + cpu->n_translated_instrs ++; +} X(jr_ra_trace) { MODE_int_t rs = cpu->cd.mips.gpr[MIPS_GPR_RA]; @@ -1013,13 +1063,10 @@ */ X(cache) { - /* TODO. For now, just clear the rmw bit: */ - cpu->cd.mips.rmw = 0; + /* TODO: Implement cache operations. */ -/* TODO: fix */ -cpu->invalidate_code_translation(cpu, 0, INVALIDATE_ALL); -cpu->invalidate_translation_caches(cpu, 0, INVALIDATE_ALL); -/* cpu_create_or_reset_tc(cpu); */ + /* Make sure the rmw bit is cleared: */ + cpu->cd.mips.rmw = 0; } @@ -1849,19 +1896,36 @@ */ X(rfe) { - coproc_rfe(cpu); + /* Just rotate the interrupt/user bits: */ + cpu->cd.mips.coproc[0]->reg[COP0_STATUS] = + (cpu->cd.mips.coproc[0]->reg[COP0_STATUS] & ~0x3f) | + ((cpu->cd.mips.coproc[0]->reg[COP0_STATUS] & 0x3c) >> 2); - /* Note: no pc to pointers conversion is necessary here. */ + /* + * Note: no pc to pointers conversion is necessary here. Usually the + * rfe instruction resides in the delay slot of a jr k0/k1, and + * it is up to that instruction to do the pointer conversion. + */ } /* - * eret: Return from exception handler + * eret: Return from exception handler (non-R3000 style) */ X(eret) { - coproc_eret(cpu); + if (cpu->cd.mips.coproc[0]->reg[COP0_STATUS] & STATUS_ERL) { + cpu->pc = cpu->cd.mips.coproc[0]->reg[COP0_ERROREPC]; + cpu->cd.mips.coproc[0]->reg[COP0_STATUS] &= ~STATUS_ERL; + } else { + cpu->pc = cpu->cd.mips.coproc[0]->reg[COP0_EPC]; + cpu->delay_slot = 0; + cpu->cd.mips.coproc[0]->reg[COP0_STATUS] &= ~STATUS_EXL; + } + quick_pc_to_pointers(cpu); + + cpu->cd.mips.rmw = 0; /* the "LL bit" */ } @@ -2297,6 +2361,251 @@ /* + * sw_loop: + * + * s: addiu rX,rX,4 rX = arg[0] and arg[1] + * bne rY,rX,s (or rX,rY,s) rt=arg[1], rs=arg[0] + * sw rZ,-4(rX) rt=arg[0], rs=arg[1] + */ +X(sw_loop) +{ + MODE_uint_t rX = reg(ic->arg[0]), rZ = reg(ic[2].arg[0]); + uint64_t *rYp = (uint64_t *) ic[1].arg[0]; + MODE_uint_t rY, bytes_to_write; + unsigned char *page; + int partial = 0; + + page = cpu->cd.mips.host_store[rX >> 12]; + + /* Fallback: */ + if (cpu->delay_slot || page == NULL || (rX & 3) != 0 || rZ != 0) { + instr(addiu)(cpu, ic); + return; + } + + if (rYp == (uint64_t *) ic->arg[0]) + rYp = (uint64_t *) ic[1].arg[1]; + + rY = reg(rYp); + + bytes_to_write = rY - rX; + if ((rX & 0xfff) + bytes_to_write > 0x1000) { + bytes_to_write = 0x1000 - (rX & 0xfff); + partial = 1; + } + + /* printf("rX = %08x\n", (int)rX); + printf("rY = %08x\n", (int)rY); + printf("rZ = %08x\n", (int)rZ); + printf("%i bytes\n", (int)bytes_to_write); */ + + memset(page + (rX & 0xfff), 0, bytes_to_write); + + reg(ic->arg[0]) = rX + bytes_to_write; + + cpu->n_translated_instrs += bytes_to_write / 4 * 3 - 1; + cpu->cd.mips.next_ic = partial? + (struct mips_instr_call *) &ic[0] : + (struct mips_instr_call *) &ic[3]; +} + + +/* + * multi_sw_3: + * + * sw r?,ofs(rX) r?=arg[0], rX=arg[1], ofs=arg[2] + * sw r?,ofs(rX) r?=arg[0], rX=arg[1], ofs=arg[2] + * sw r?,ofs(rX) r?=arg[0], rX=arg[1], ofs=arg[2] + */ +X(multi_sw_3_le) +{ + uint32_t *page; + MODE_uint_t rX = reg(ic[0].arg[1]), r1, r2, r3; + MODE_uint_t addr0 = rX + (int32_t)ic[0].arg[2]; + MODE_uint_t addr1 = rX + (int32_t)ic[1].arg[2]; + MODE_uint_t addr2 = rX + (int32_t)ic[2].arg[2]; + uint32_t index0 = addr0 >> 12, index1 = addr1 >> 12, + index2 = addr2 >> 12; + + page = (uint32_t *) cpu->cd.mips.host_store[index0]; + + /* Fallback: */ + if (cpu->delay_slot || + page == NULL || (addr0 & 3) != 0 || (addr1 & 3) != 0 || + (addr2 & 3) != 0 || index0 != index1 || index0 != index2) { + /* Normal safe sw: */ + ic[1].f(cpu, ic); + return; + } + + addr0 = (addr0 >> 2) & 0x3ff; + addr1 = (addr1 >> 2) & 0x3ff; + addr2 = (addr2 >> 2) & 0x3ff; + + /* printf("addr0=%x 1=%x 2=%x\n", + (int)addr0, (int)addr1, (int)addr2); */ + + r1 = reg(ic[0].arg[0]); + r2 = reg(ic[1].arg[0]); + r3 = reg(ic[2].arg[0]); + + r1 = LE32_TO_HOST(r1); + r2 = LE32_TO_HOST(r2); + r3 = LE32_TO_HOST(r3); + + page[addr0] = r1; + page[addr1] = r2; + page[addr2] = r3; + + cpu->n_translated_instrs += 2; + cpu->cd.mips.next_ic = (struct mips_instr_call *) &ic[3]; +} +X(multi_sw_3_be) +{ + uint32_t *page; + MODE_uint_t rX = reg(ic[0].arg[1]), r1, r2, r3; + MODE_uint_t addr0 = rX + (int32_t)ic[0].arg[2]; + MODE_uint_t addr1 = rX + (int32_t)ic[1].arg[2]; + MODE_uint_t addr2 = rX + (int32_t)ic[2].arg[2]; + uint32_t index0 = addr0 >> 12, index1 = addr1 >> 12, + index2 = addr2 >> 12; + + page = (uint32_t *) cpu->cd.mips.host_store[index0]; + + /* Fallback: */ + if (cpu->delay_slot || + page == NULL || (addr0 & 3) != 0 || (addr1 & 3) != 0 || + (addr2 & 3) != 0 || index0 != index1 || index0 != index2) { + /* Normal safe sw: */ + ic[1].f(cpu, ic); + return; + } + + addr0 = (addr0 >> 2) & 0x3ff; + addr1 = (addr1 >> 2) & 0x3ff; + addr2 = (addr2 >> 2) & 0x3ff; + + /* printf("addr0=%x 1=%x 2=%x\n", + (int)addr0, (int)addr1, (int)addr2); */ + + r1 = reg(ic[0].arg[0]); + r2 = reg(ic[1].arg[0]); + r3 = reg(ic[2].arg[0]); + + r1 = BE32_TO_HOST(r1); + r2 = BE32_TO_HOST(r2); + r3 = BE32_TO_HOST(r3); + + page[addr0] = r1; + page[addr1] = r2; + page[addr2] = r3; + + cpu->n_translated_instrs += 2; + cpu->cd.mips.next_ic = (struct mips_instr_call *) &ic[3]; +} + + +/* + * netbsd_r3k_picache_do_inv: + * + * ic[0] mtc0 rV,status + * 1 nop + * 2 nop + * 3 s: addiu rX,rX,4 + * 4 bne rY,rX,s + * 5 sb zr,-4(rX) + * 6 nop + * 7 nop + * 8 mtc0 rT,status + */ +X(netbsd_r3k_picache_do_inv) +{ + MODE_uint_t rx = reg(ic[3].arg[0]), ry = reg(ic[4].arg[1]); + + /* Fallback if the environment isn't exactly right: */ + if (!(reg(ic[0].arg[0]) & MIPS1_ISOL_CACHES) || + (rx & 3) || (ry & 3) || cpu->delay_slot) { + instr(mtc0)(cpu, ic); + return; + } + + reg(ic[3].arg[0]) = ry; + cpu->n_translated_instrs += (ry - rx + 4) / 4 * 3 + 4; + + /* Run the last mtc0 instruction: */ + cpu->cd.mips.next_ic = (struct mips_instr_call *) &ic[8]; +} + + +#ifdef MODE32 +/* + * netbsd_strlen(): + * + * lb rV,0(rX) + * s: addiu rX,rX,1 + * bne zr,rV,s + * nop + */ +X(netbsd_strlen) +{ + MODE_uint_t rx = reg(ic[0].arg[1]); + MODE_int_t rv; + signed char *page; + uint32_t pageindex = rx >> 12; + int i; + + page = (signed char *) cpu->cd.mips.host_load[pageindex]; + + /* Fallback: */ + if (cpu->delay_slot || page == NULL) { + /* + * Normal lb: NOTE: It doesn't matter whether [1] or + * [16+1] is called here, because endianness for 8-bit + * loads is irrelevant. :-) + */ + mips32_loadstore[1](cpu, ic); + return; + } + + i = rx & 0xfff; + + /* + * TODO: This loop can be optimized further for optimal + * performance on the host, e.g. by reading full words... + */ + do { + rv = page[i ++]; + } while (i < 0x1000 && rv != 0); + + cpu->n_translated_instrs += (i - (rx & 0xfff)) * 4 - 1; + + reg(ic[0].arg[1]) = (rx & ~0xfff) + i; + reg(ic[2].arg[0]) = rv; + + /* Done with the loop? Or continue on the next rx page? */ + if (rv == 0) + cpu->cd.mips.next_ic = (struct mips_instr_call *) &ic[4]; + else + cpu->cd.mips.next_ic = (struct mips_instr_call *) &ic[0]; +} +#endif + + +/* + * lui_32bit: + * + * Combination of lui and addiu. + * Note: All 32 bits of arg[2] of the lui instr_call are used. + */ +X(lui_32bit) +{ + reg(ic[0].arg[0]) = (int32_t) ic[0].arg[2]; + cpu->n_translated_instrs ++; + cpu->cd.mips.next_ic ++; +} + + +/* * b_samepage_addiu: * * Combination of branch within the same page, followed by addiu. @@ -2402,20 +2711,207 @@ /* - * Combine: [Conditional] branch, followed by addiu. + * Combine: Memory fill loop (addiu, bne, sw) + * + * s: addiu rX,rX,4 + * bne rY,rX,s + * sw rZ,-4(rX) */ -void COMBINE(b_addiu)(struct cpu *cpu, struct mips_instr_call *ic, +void COMBINE(sw_loop)(struct cpu *cpu, struct mips_instr_call *ic, int low_addr) +{ + int n_back = (low_addr >> MIPS_INSTR_ALIGNMENT_SHIFT) + & (MIPS_IC_ENTRIES_PER_PAGE - 1); + + /* Only for 32-bit virtual address translation so far. */ + if (!cpu->is_32bit) + return; + + if (n_back < 2) + return; + + if (ic[-2].f == instr(addiu) && ic[-2].arg[0] == ic[-2].arg[1] && + (int32_t)ic[-2].arg[2] == 4 && + ic[-1].f == instr(bne_samepage) && + (ic[-1].arg[0] == ic[-2].arg[0] || + ic[-1].arg[1] == ic[-2].arg[0]) && + ic[-1].arg[0] != ic[-1].arg[1] && + ic[-1].arg[2] == (size_t) &ic[-2] && + ic[0].arg[0] != ic[0].arg[1] && + ic[0].arg[1] == ic[-2].arg[0] && (int32_t)ic[0].arg[2] == -4) { + ic[-2].f = instr(sw_loop); + combined; + } +} + + +/* + * Combine: Multiple SW in a row using the same base register + * + * sw r?,???(rX) + * sw r?,???(rX) + * sw r?,???(rX) + * ... + */ +void COMBINE(multi_sw)(struct cpu *cpu, struct mips_instr_call *ic, int low_addr) { int n_back = (low_addr >> MIPS_INSTR_ALIGNMENT_SHIFT) & (MIPS_IC_ENTRIES_PER_PAGE - 1); + /* Only for 32-bit virtual address translation so far. */ + if (!cpu->is_32bit) + return; + + if (n_back < 4) + return; + + /* Avoid "overlapping" instruction combinations: */ + if (ic[-4].f == instr(multi_sw_3_be)||ic[-3].f == instr(multi_sw_3_be)|| + ic[-4].f == instr(multi_sw_3_le)||ic[-3].f == instr(multi_sw_3_le)) + return; + + if (ic[-2].f == ic[0].f && ic[-1].f == ic[0].f && + ic[-2].arg[1] == ic[0].arg[1] && + ic[-1].arg[1] == ic[0].arg[1]) { + if (cpu->byte_order == EMUL_LITTLE_ENDIAN) + ic[-2].f = instr(multi_sw_3_le); + else + ic[-2].f = instr(multi_sw_3_be); + combined; + } +} + + +/* + * Combine: NetBSD/pmax 3.0 R2000/R3000 physical cache invalidation loop + * + * Instruction cache loop: + * + * ic[-8] mtc0 rV,status + * -7 nop + * -6 nop + * -5 s: addiu rX,rX,4 + * -4 bne rY,rX,s + * -3 sb zr,-4(rX) + * -2 nop + * -1 nop + * 0 mtc0 rT,status + */ +void COMBINE(netbsd_r3k_cache_inv)(struct cpu *cpu, + struct mips_instr_call *ic, int low_addr) +{ + int n_back = (low_addr >> MIPS_INSTR_ALIGNMENT_SHIFT) + & (MIPS_IC_ENTRIES_PER_PAGE - 1); + + if (n_back < 8) + return; + + if (ic[-8].f == instr(mtc0) && ic[-8].arg[1] == COP0_STATUS && + ic[-7].f == instr(nop) && ic[-6].f == instr(nop) && + ic[-5].f == instr(addiu) && ic[-5].arg[0] == ic[-5].arg[1] && + (int32_t)ic[-5].arg[2] == 4 && ic[-4].f == instr(bne_samepage) && + ic[-4].arg[0] == ic[-5].arg[0] && ic[-4].arg[0] != ic[-4].arg[1] && + ic[-4].arg[2] == (size_t) &ic[-5] && + ic[-3].arg[1] == ic[-5].arg[0] && + ic[-2].f == instr(nop) && ic[-1].f == instr(nop)) { + ic[-8].f = instr(netbsd_r3k_picache_do_inv); + combined; + } +} + + +/* + * Combine: something ending with a nop. + * + * NetBSD's strlen core. + * [Conditional] branch, followed by nop. + */ +void COMBINE(nop)(struct cpu *cpu, struct mips_instr_call *ic, int low_addr) +{ + int n_back = (low_addr >> MIPS_INSTR_ALIGNMENT_SHIFT) + & (MIPS_IC_ENTRIES_PER_PAGE - 1); + +#ifdef MODE32 + if (n_back < 3) + return; + + if ((ic[-3].f == mips32_loadstore[1] || + ic[-3].f == mips32_loadstore[16 + 1]) && + ic[-3].arg[2] == 0 && + ic[-3].arg[0] == ic[-1].arg[0] && ic[-3].arg[1] == ic[-2].arg[0] && + ic[-2].arg[0] == ic[-2].arg[1] && ic[-2].arg[2] == 1 && + ic[-2].f == instr(addiu) && ic[-1].arg[2] == (size_t) &ic[-3] && + ic[-1].arg[1] == (size_t) &cpu->cd.mips.gpr[MIPS_GPR_ZERO] && + ic[-1].f == instr(bne_samepage)) { + ic[-3].f = instr(netbsd_strlen); + combined; + return; + } +#endif + if (n_back < 1) return; + if (ic[-1].f == instr(bne_samepage)) { + ic[-1].f = instr(bne_samepage_nop); + combined; + return; + } + + if (ic[-1].f == instr(beq_samepage)) { + ic[-1].f = instr(beq_samepage_nop); + combined; + return; + } + + /* TODO: other branches that are followed by nop should be here */ +} + + +/* + * Combine: + * + * [Conditional] branch, followed by addiu. + * lui + addiu. + */ +void COMBINE(addiu)(struct cpu *cpu, struct mips_instr_call *ic, int low_addr) +{ + int n_back = (low_addr >> MIPS_INSTR_ALIGNMENT_SHIFT) + & (MIPS_IC_ENTRIES_PER_PAGE - 1); + + if (n_back < 1) + return; + + if (ic[-1].f == instr(set) && ic[-1].arg[0] == ic[0].arg[0] && + ic[0].arg[0] == ic[0].arg[1]) { + ic[-1].f = instr(lui_32bit); + ic[-1].arg[2] = (int32_t) (ic[-1].arg[1] + ic[0].arg[2]); + combined; + return; + } + if (ic[-1].f == instr(b_samepage)) { ic[-1].f = instr(b_samepage_addiu); combined; + return; + } + + if (ic[-1].f == instr(beq_samepage)) { + ic[-1].f = instr(beq_samepage_addiu); + combined; + return; + } + + if (ic[-1].f == instr(bne_samepage)) { + ic[-1].f = instr(bne_samepage_addiu); + combined; + return; + } + + if (ic[-1].f == instr(jr_ra)) { + ic[-1].f = instr(jr_ra_addiu); + combined; + return; } /* TODO: other branches that are followed by addiu should be here */ @@ -2883,11 +3379,19 @@ case HI6_XORI: ic->f = instr(xori); break; } + if (ic->arg[2] == 0) { + if ((cpu->is_32bit && ic->f == instr(addiu)) || + (!cpu->is_32bit && ic->f == instr(daddiu))) { + ic->f = instr(mov); + ic->arg[2] = ic->arg[1]; + } + } + if (rt == MIPS_GPR_ZERO) ic->f = instr(nop); if (ic->f == instr(addiu)) - cpu->cd.mips.combination_check = COMBINE(b_addiu); + cpu->cd.mips.combination_check = COMBINE(addiu); if (ic->f == instr(daddiu)) cpu->cd.mips.combination_check = COMBINE(b_daddiu); break; @@ -2896,6 +3400,8 @@ ic->f = instr(set); ic->arg[0] = (size_t)&cpu->cd.mips.gpr[rt]; ic->arg[1] = (int32_t) (imm << 16); + /* NOTE: Don't use arg[2] here. It can be used with + instruction combinations, to do lui + addiu, etc. */ if (rt == MIPS_GPR_ZERO) ic->f = instr(nop); break; @@ -3000,6 +3506,12 @@ ic->arg[1] = rd + ((iword & 7) << 5); ic->arg[2] = addr & 0xffc; ic->f = rs == COPz_MTCz? instr(mtc0) : instr(dmtc0); + + if (cpu->cd.mips.cpu_type.exc_model == EXC3K && + rs == COPz_MTCz && rd == COP0_STATUS) + cpu->cd.mips.combination_check = + COMBINE(netbsd_r3k_cache_inv); + break; case 8: if (iword == 0x4100ffff) { /* R2020 DECstation write-loop thingy. */ @@ -3317,6 +3829,12 @@ /* Load into the dummy scratch register, if rt = zero */ if (!store && rt == MIPS_GPR_ZERO) ic->arg[0] = (size_t)&cpu->cd.mips.scratch; + + /* Check for multiple stores in a row using the same + base register: */ + if (main_opcode == HI6_SW && rs == MIPS_GPR_SP) + cpu->cd.mips.combination_check = COMBINE(multi_sw); + break; case HI6_LL: @@ -3436,8 +3954,11 @@ cpu->cd.mips.cpu_type.isa_revision < 2) { static int warning = 0; if (!warning) { - fatal("[ WARNING! SPECIAL3 opcode used on a" - " cpu which doesn't implement it ]\n"); + fatal("[ WARNING! SPECIAL3 opcode used, but" + " the %s processor does not implement " + "such instructions. Only printing this " + "warning once. ]\n", + cpu->cd.mips.cpu_type.name); warning = 1; } ic->f = instr(reserved); @@ -3486,6 +4007,9 @@ } #endif + if (ic->f == instr(nop) && cpu->cd.mips.combination_check == NULL) + cpu->cd.mips.combination_check = COMBINE(nop); + #define DYNTRANS_TO_BE_TRANSLATED_TAIL #include "cpu_dyntrans.c"